diff --git a/.docker/data/cmip5/cmip5/output1/NOAA-GFDL/GFDL-CM3/1pctCO2/fx/ocean/fx/r0i0p0/v20120227/gridspec/gridspec_ocean_fx_GFDL-CM3_1pctCO2_r0i0p0.nc b/.docker/data/cmip5/cmip5/output1/NOAA-GFDL/GFDL-CM3/1pctCO2/fx/ocean/fx/r0i0p0/v20120227/gridspec/gridspec_ocean_fx_GFDL-CM3_1pctCO2_r0i0p0.nc new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.docker/data/cmip5/cmip5/output2/INM/inmcm4/esmHistorical/mon/land/Lmon/r1i1p1/v20110323/fVegSoil/fVegSoil_Lmon_inmcm4_esmHistorical_r1i1p1_185001-200512.nc b/.docker/data/cmip5/cmip5/output2/INM/inmcm4/esmHistorical/mon/land/Lmon/r1i1p1/v20110323/fVegSoil/fVegSoil_Lmon_inmcm4_esmHistorical_r1i1p1_185001-200512.nc new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.docker/data/cmip5/cmip5/output2/INM/inmcm4/esmrcp85/mon/land/Lmon/r1i1p1/v20110323/residualFrac/residualFrac_Lmon_inmcm4_esmrcp85_r1i1p1_200601-210012.nc b/.docker/data/cmip5/cmip5/output2/INM/inmcm4/esmrcp85/mon/land/Lmon/r1i1p1/v20110323/residualFrac/residualFrac_Lmon_inmcm4_esmrcp85_r1i1p1_200601-210012.nc new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.docker/data/cmip5/cmip5/output2/INM/inmcm4/historical/mon/land/Lmon/r1i1p1/v20110323/mrlso/mrlso_Lmon_inmcm4_historical_r1i1p1_185001-200512.nc b/.docker/data/cmip5/cmip5/output2/INM/inmcm4/historical/mon/land/Lmon/r1i1p1/v20110323/mrlso/mrlso_Lmon_inmcm4_historical_r1i1p1_185001-200512.nc new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.docker/data/cmip5/cmip5/output2/INM/inmcm4/piControl/mon/land/Lmon/r1i1p1/v20110323/fLuc/fLuc_Lmon_inmcm4_piControl_r1i1p1_185001-234912.nc b/.docker/data/cmip5/cmip5/output2/INM/inmcm4/piControl/mon/land/Lmon/r1i1p1/v20110323/fLuc/fLuc_Lmon_inmcm4_piControl_r1i1p1_185001-234912.nc 
new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.docker/data/cmip5/cmip5/output2/IPSL/IPSL-CM5A-LR/amip4K/mon/atmos/aero/r1i1p1/v20110429/ps/ps_aero_IPSL-CM5A-LR_amip4K_r1i1p1_197901-200912.nc b/.docker/data/cmip5/cmip5/output2/IPSL/IPSL-CM5A-LR/amip4K/mon/atmos/aero/r1i1p1/v20110429/ps/ps_aero_IPSL-CM5A-LR_amip4K_r1i1p1_197901-200912.nc new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.docker/data/cmip5/cmip5/output2/IPSL/IPSL-CM5A-LR/amip4K/mon/atmos/cfMon/r1i1p1/v20110429/ps/ps_cfMon_IPSL-CM5A-LR_amip4K_r1i1p1_197901-200912.nc b/.docker/data/cmip5/cmip5/output2/IPSL/IPSL-CM5A-LR/amip4K/mon/atmos/cfMon/r1i1p1/v20110429/ps/ps_cfMon_IPSL-CM5A-LR_amip4K_r1i1p1_197901-200912.nc new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.docker/data/cmip5/cmip5/output2/IPSL/IPSL-CM5A-LR/historical/mon/atmos/aero/r4i1p1/v20110406/ps/ps_aero_IPSL-CM5A-LR_historical_r4i1p1_185001-200512.nc b/.docker/data/cmip5/cmip5/output2/IPSL/IPSL-CM5A-LR/historical/mon/atmos/aero/r4i1p1/v20110406/ps/ps_aero_IPSL-CM5A-LR_historical_r4i1p1_185001-200512.nc new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.docker/data/cmip5/cmip5/output2/IPSL/IPSL-CM5A-LR/historical/mon/atmos/cfMon/r2i1p1/v20111119/ps/ps_cfMon_IPSL-CM5A-LR_historical_r2i1p1_185001-200512.nc b/.docker/data/cmip5/cmip5/output2/IPSL/IPSL-CM5A-LR/historical/mon/atmos/cfMon/r2i1p1/v20111119/ps/ps_cfMon_IPSL-CM5A-LR_historical_r2i1p1_185001-200512.nc new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.docker/data/cmip5/cmip5/output2/NASA-GISS/GISS-E2-H/historical/mon/land/Lmon/r5i1p1/v20120517/cSoil/cSoil_Lmon_GISS-E2-H_historical_r5i1p1_185001-190012.nc 
b/.docker/data/cmip5/cmip5/output2/NASA-GISS/GISS-E2-H/historical/mon/land/Lmon/r5i1p1/v20120517/cSoil/cSoil_Lmon_GISS-E2-H_historical_r5i1p1_185001-190012.nc new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020000-201609020030.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020000-201609020030.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020000-201609020030.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020000-201609020030.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020100-201609020130.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020100-201609020130.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020100-201609020130.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020100-201609020130.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020200-201609020230.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020200-201609020230.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020200-201609020230.nc 
rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020200-201609020230.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020300-201609020330.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020300-201609020330.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020300-201609020330.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020300-201609020330.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020400-201609020430.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020400-201609020430.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020400-201609020430.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020400-201609020430.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020500-201609020530.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020500-201609020530.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020500-201609020530.nc rename to 
.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020500-201609020530.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020600-201609020630.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020600-201609020630.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020600-201609020630.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020600-201609020630.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020700-201609020730.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020700-201609020730.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020700-201609020730.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020700-201609020730.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020800-201609020830.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020800-201609020830.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020800-201609020830.nc rename to 
.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020800-201609020830.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020900-201609020930.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020900-201609020930.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020900-201609020930.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609020900-201609020930.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021000-201609021030.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021000-201609021030.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021000-201609021030.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021000-201609021030.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021100-201609021130.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021100-201609021130.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021100-201609021130.nc rename to 
.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021100-201609021130.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021200-201609021230.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021200-201609021230.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021200-201609021230.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021200-201609021230.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021300-201609021330.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021300-201609021330.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021300-201609021330.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021300-201609021330.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021400-201609021430.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021400-201609021430.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021400-201609021430.nc rename to 
.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021400-201609021430.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021500-201609021530.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021500-201609021530.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021500-201609021530.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021500-201609021530.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021600-201609021630.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021600-201609021630.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021600-201609021630.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021600-201609021630.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021700-201609021730.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021700-201609021730.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021700-201609021730.nc rename to 
.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021700-201609021730.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021800-201609021830.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021800-201609021830.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021800-201609021830.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021800-201609021830.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021900-201609021930.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021900-201609021930.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021900-201609021930.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609021900-201609021930.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022000-201609022030.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022000-201609022030.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022000-201609022030.nc rename to 
.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022000-201609022030.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022100-201609022130.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022100-201609022130.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022100-201609022130.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022100-201609022130.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022200-201609022230.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022200-201609022230.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022200-201609022230.nc rename to .docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022200-201609022230.nc diff --git a/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022300-201609022330.nc b/.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022300-201609022330.nc similarity index 100% rename from .docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022300-201609022330.nc rename to 
.docker/data/obs/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr/pr_30min_CPC_cmorph_r1i1p1_201609022300-201609022330.nc diff --git a/.docker/drs_config.toml b/.docker/drs_config.toml index 25cdcb49a61d5b8b9b6cf40c0b0efab0f8f4373d..92204b0d0b5b17a0208e90d9a77ae79724c28608 100644 --- a/.docker/drs_config.toml +++ b/.docker/drs_config.toml @@ -1,6 +1,10 @@ +[cmip5_name] +root_dir = ".docker/data/cmip5" +drs_format = "cmip5" [observations] -root_dir = ".docker/data" +root_dir = ".docker/data/obs" +drs_format = "custom" parts_dir = [ "project", "product", @@ -13,44 +17,15 @@ parts_dir = [ "ensemble", "version", "variable", - "file_name", -] -parts_dataset = [ - "project", - "product", - "institute", - "model", - "experiment", - "time_frequency", - "realm", - "cmor_table", - "ensemble", - "", - "variable", -] -parts_versioned_dataset = [ - "project", - "product", - "institute", - "model", - "experiment", - "time_frequency", - "realm", - "cmor_table", - "ensemble", - "version", - "variable", ] parts_file_name = [ "variable", - "time_frequency", + "frequency", "experiment", "level", "version", "time", ] parts_time = "start_time-end_time" -data_type = "observations" - [observations.defaults] project = "observations" diff --git a/.docker/solr/managed-schema b/.docker/solr/managed-schema index dbe7efe6e399339981fcbfe9d648d2c70bef90e0..ca0ae799aec65d8868e1a435a2d1dd4c6d2b819e 100644 --- a/.docker/solr/managed-schema +++ b/.docker/solr/managed-schema @@ -2,99 +2,42 @@ <!-- Solr managed schema - automatically generated - DO NOT EDIT --> <schema name="file_system" version="1.6"> <uniqueKey>file</uniqueKey> - <fieldType name="ancestor_path" class="solr.TextField"> - <analyzer type="index"> - <tokenizer class="solr.KeywordTokenizerFactory"/> - </analyzer> - <analyzer type="query"> - <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/> - </analyzer> - </fieldType> - <fieldType name="binary" class="solr.BinaryField"/> - <fieldType 
name="boolean" class="solr.BoolField" sortMissingLast="true"/> - <fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/> - <fieldType name="descendent_path" class="solr.TextField"> - <analyzer type="index"> - <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/> - </analyzer> - <analyzer type="query"> - <tokenizer class="solr.KeywordTokenizerFactory"/> - </analyzer> - </fieldType> - <fieldType name="ignored" class="solr.StrField" indexed="false" stored="false" multiValued="true"/> - <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/> - <fieldType name="location_rpt" class="solr.SpatialRecursivePrefixTreeFieldType" geo="true" maxDistErr="0.001" distErrPct="0.025" distanceUnits="kilometers"/> - <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100"> - <analyzer> - <tokenizer class="solr.KeywordTokenizerFactory"/> - <filter class="solr.LowerCaseFilterFactory"/> - </analyzer> - </fieldType> - <fieldType name="phonetic_en" class="solr.TextField" indexed="true" stored="false"> - <analyzer> - <tokenizer class="solr.StandardTokenizerFactory"/> - <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/> - </analyzer> - </fieldType> <fieldType name="pdate" class="solr.DatePointField" docValues="true"/> <fieldType name="pdates" class="solr.DatePointField" docValues="true" multiValued="true"/> - <fieldType name="pdouble" class="solr.DoublePointField" docValues="true"/> <fieldType name="pdoubles" class="solr.DoublePointField" docValues="true" multiValued="true"/> <fieldType name="pfloat" class="solr.FloatPointField" docValues="true"/> - <fieldType name="pfloats" class="solr.FloatPointField" docValues="true" multiValued="true"/> - <fieldType name="pint" class="solr.IntPointField" docValues="true"/> - <fieldType name="pints" class="solr.IntPointField" docValues="true" multiValued="true"/> <fieldType name="plong" class="solr.LongPointField" docValues="true"/> 
<fieldType name="plongs" class="solr.LongPointField" docValues="true" multiValued="true"/> - <fieldType name="point" class="solr.PointType" subFieldSuffix="_d" dimension="2"/> - <fieldType name="random" class="solr.RandomSortField" indexed="true"/> <fieldType name="string" class="solr.StrField" sortMissingLast="true"/> - <fieldType name="strings" class="solr.StrField" sortMissingLast="true" multiValued="true"/> - <!-- The StrField type is not analyzed, but indexed/stored verbatim. --> + <fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/> + <fieldType name="version" class="solr.TextField" > <analyzer> <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="^v"/> <tokenizer class="solr.KeywordTokenizerFactory"/> </analyzer> - </fieldType> - <!-- A general text field that has reasonable, generic - cross-language defaults: it tokenizes with StandardTokenizer, - removes stop words from case-insensitive "stopwords.txt" - (empty by default), and down cases. At query time only, it - also applies synonyms. 
--> - <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100"> - <analyzer type="index"> - <tokenizer class="solr.KeywordTokenizerFactory"/> - <!-- in this example, we will only use synonyms at query time - <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> - <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> - --> - <filter class="solr.LowerCaseFilterFactory"/> - </analyzer> - <analyzer type="query"> - <tokenizer class="solr.KeywordTokenizerFactory"/> - <!-- - <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> - --> - <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> - <filter class="solr.LowerCaseFilterFactory"/> - </analyzer> - </fieldType> + </fieldType> + <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100"> + <analyzer type="index"> + <tokenizer class="solr.KeywordTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.KeywordTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + </fieldType> <field name="_version_" type="plong" indexed="true" stored="true"/> - <!-- base entries for the search (path and timestamp) --> <field name="file" type="string" multiValued="false" indexed="true" required="true" stored="true"/> <field name="timestamp" type="pfloat" indexed="false" stored="true"/> <field name="creation_time" type="pdate" indexed="true" stored="false" default="NOW"/> + <!-- we need this to search for latest version. 
If not it won't work for entries not being versioned at all --> <field name="version" type="version" stored="false" indexed="true" default="-1"/> <field name="file_no_version" type="string" stored="false" indexed="true"/> - - <!-- everything else will get indexed but not stored --> - <dynamicField name="*" type="text_general" stored="false" indexed="true"/> - <!-- - <copyField source="content" dest="text_shingles"/> - <copyField source="*" dest="_text_"/> --> + <field name="_root_" type="string" indexed="false" stored="false" docValues="false"/> + <dynamicField name="*" type="text_general" stored="true" indexed="true" multiValued="true"/> </schema> diff --git a/.envrc b/.envrc index f45792a58a7889bdb8d1f2ab97940e6612134c63..efd49c400f7ef7775c3404ebf7a50e7d68c1bbdb 100644 --- a/.envrc +++ b/.envrc @@ -1 +1 @@ -export EVALUATION_SYSTEM_CONFIG_FILE=$(PWD)/.docker \ No newline at end of file +export EVALUATION_SYSTEM_CONFIG_DIR=$(PWD)/.docker \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 35efed6c291d5a92eb81a55359265cea548ce1db..cef7a272f4d85ca5a4db66833958ee74d5cbb301 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -19,6 +19,7 @@ default: fmt: stage: test before_script: + - apt update && apt install -y cmake - rustup component add rustfmt script: - cargo fmt --check @@ -26,12 +27,15 @@ fmt: clippy: stage: test before_script: + - apt update && apt install -y cmake - rustup component add clippy script: - cargo clippy -- -D warnings test: stage: test + before_script: + - apt update && apt install -y cmake script: - cargo test diff --git a/Cargo.lock b/Cargo.lock index 9775de5a2459e546a479144f9a2cdb4240f4e596..c6309db486a7ab3ac773f2dcaaca6290896e34db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -22,15 +22,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.52" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84450d0b4a8bd1ba4144ce8ce718fbc5d071358b1e5384bace6536b3d1f2d5b3" - 
-[[package]] -name = "arrayvec" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" +checksum = "08f9b8508dccb7687a1d6c4ce66b2b0ecef467c94667de27d8d7fe1f8d2a9cdc" [[package]] name = "assert-json-diff" @@ -55,9 +49,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.52" +version = "0.1.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061a7acccaa286c011ddc30970520b98fa40e00c9d644633fb26b5fc63a265e3" +checksum = "ed6aa3524a2dfcf9fe180c51eae2b58738348d819517ceadf95789c51fff7600" dependencies = [ "proc-macro2", "quote", @@ -77,9 +71,9 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.0.1" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "base64" @@ -113,18 +107,18 @@ checksum = "c1db59621ec70f09c5e9b597b220c7a2b43611f4710dc03ceb8748637775692c" [[package]] name = "camino" -version = "1.0.7" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f3132262930b0522068049f5870a856ab8affc80c70d08b6ecb785771a6fc23" +checksum = "869119e97797867fd90f5e22af7d0bd274bd4635ebb9eb68c04f3f513ae6c412" dependencies = [ "serde", ] [[package]] name = "cc" -version = "1.0.72" +version = "1.0.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" [[package]] name = "cfg-if" @@ -147,16 +141,16 @@ dependencies = [ [[package]] name = "clap" -version = "3.0.9" +version = "3.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8c506244a13c87262f84bf16369740d0b7c3850901b6a642aa41b031a710c473" +checksum = "d2dbdf4bdacb33466e854ce889eee8dfd5729abf7ccd7664d0a2d60cd384440b" dependencies = [ "atty", "bitflags", "clap_derive", + "clap_lex", "indexmap", "lazy_static", - "os_str_bytes", "strsim", "termcolor", "textwrap", @@ -164,9 +158,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "3.0.6" +version = "3.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "517358c28fcef6607bf6f76108e02afad7e82297d132a6b846dcc1fc3efcd153" +checksum = "25320346e922cffe59c0bbc5410c8d8784509efb321488971081313cb1e1a33c" dependencies = [ "heck", "proc-macro-error", @@ -176,23 +170,30 @@ dependencies = [ ] [[package]] -name = "concurrent-queue" -version = "1.2.2" +name = "clap_lex" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30ed07550be01594c6026cff2a1d7fe9c8f683caa798e12b68694ac9e88286a3" +checksum = "a37c35f1112dad5e6e0b1adaff798507497a18fceeb30cceb3bae7d1427b9213" dependencies = [ - "cache-padded", + "os_str_bytes", ] [[package]] -name = "config" -version = "0.10.1" +name = "cmake" +version = "0.1.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b076e143e1d9538dde65da30f8481c2a6c44040edb8e02b9bf1351edb92ce3" +checksum = "e8ad8cef104ac57b68b89df3208164d228503abbdce70f6880ffa3d970e7443a" dependencies = [ - "lazy_static", - "nom", - "serde", + "cc", +] + +[[package]] +name = "concurrent-queue" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30ed07550be01594c6026cff2a1d7fe9c8f683caa798e12b68694ac9e88286a3" +dependencies = [ + "cache-padded", ] [[package]] @@ -202,39 +203,40 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06821ea598337a8412cf47c5b71c3bc694a7f0aed188ac28b836fab164a2c202" [[package]] -name = "crossbeam-queue" -version = "0.3.3" +name = "core-foundation" +version = "0.9.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b979d76c9fcb84dffc80a73f7290da0f83e4c95773494674cb44b76d13a7a110" +checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" dependencies = [ - "cfg-if", - "crossbeam-utils", + "core-foundation-sys", + "libc", ] [[package]] -name = "crossbeam-utils" -version = "0.8.6" +name = "core-foundation-sys" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfcae03edb34f947e64acdb1c33ec169824e20657e9ecb61cef6c8c74dcb8120" -dependencies = [ - "cfg-if", - "lazy_static", -] +checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" [[package]] name = "deadpool" -version = "0.7.0" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d126179d86aee4556e54f5f3c6bf6d9884e7cc52cef82f77ee6f90a7747616d" +checksum = "421fe0f90f2ab22016f32a9881be5134fdd71c65298917084b0c7477cbc3856e" dependencies = [ "async-trait", - "config", - "crossbeam-queue", + "deadpool-runtime", "num_cpus", - "serde", + "retain_mut", "tokio", ] +[[package]] +name = "deadpool-runtime" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaa37046cc0f6c3cc6090fbdbf73ef0b8ef4cfcc37f6befc0020f63e8cf121e1" + [[package]] name = "directories" version = "4.0.1" @@ -246,20 +248,32 @@ dependencies = [ [[package]] name = "dirs-sys" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03d86534ed367a67548dc68113a0f5db55432fdfbb6e6f9d77704397d95d5780" +checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" dependencies = [ "libc", "redox_users", "winapi", ] +[[package]] +name = "drs" +version = "0.1.0" +dependencies = [ + "camino", + "chrono", + "nom", + "serde", + "thiserror", + "tracing", +] + [[package]] name = "encoding_rs" -version = "0.8.30" +version = "0.8.31" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dc8abb250ffdda33912550faa54c88ec8b998dec0b2c55ab224921ce11df" +checksum = "9852635589dc9f9ea1b6fe9f05b50ef208c85c834a562f0c6abb1c475736ec2b" dependencies = [ "cfg-if", ] @@ -279,15 +293,15 @@ dependencies = [ [[package]] name = "event-listener" -version = "2.5.1" +version = "2.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7531096570974c3a9dcf9e4b8e1cede1ec26cf5046219fb3b9d897503b9be59" +checksum = "77f3309417938f28bf8228fcff79a4a37103981e3e186d2ccd19c74b38f4eb71" [[package]] name = "fastrand" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "779d043b6a0b90cc4c0ed7ee380a6504394cee7efd7db050e3774eee387324b2" +checksum = "c3fcf0cee53519c866c09b5de1f6c56ff9d647101f81c1964fa632e148896cdf" dependencies = [ "instant", ] @@ -298,6 +312,21 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.0.1" @@ -313,8 +342,13 @@ name = "freva" version = "0.1.0" dependencies = [ "camino", + "chrono", + "drs", "env_logger", "futures", + "lazy_static", + "netcdf", + "nom", "reqwest", "serde", "serde_json", @@ -322,6 +356,7 @@ dependencies = [ "tokio", "toml", "tracing", + "tracing-subscriber", "url", "walkdir", "wiremock", @@ -341,15 +376,16 @@ dependencies = [ "log", "stderrlog", "tokio", + "toml", 
"tracing", "tracing-subscriber", ] [[package]] name = "futures" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28560757fe2bb34e79f907794bb6b22ae8b0e5c669b638a1132f2592b19035b4" +checksum = "f73fe65f54d1e12b726f517d3e2135ca3125a437b6d998caf1962961f7172d9e" dependencies = [ "futures-channel", "futures-core", @@ -362,9 +398,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3dda0b6588335f360afc675d0564c17a77a2bda81ca178a4b6081bd86c7f0b" +checksum = "c3083ce4b914124575708913bca19bfe887522d6e2e6d0952943f5eac4a74010" dependencies = [ "futures-core", "futures-sink", @@ -372,15 +408,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0c8ff0461b82559810cdccfde3215c3f373807f5e5232b71479bff7bb2583d7" +checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" [[package]] name = "futures-executor" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29d6d2ff5bb10fb95c85b8ce46538a2e5f5e7fdc755623a7d4529ab8a4ed9d2a" +checksum = "9420b90cfa29e327d0429f19be13e7ddb68fa1cccb09d65e5706b8c7a749b8a6" dependencies = [ "futures-core", "futures-task", @@ -389,9 +425,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f9d34af5a1aac6fb380f735fe510746c38067c5bf16c7fd250280503c971b2" +checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b" [[package]] name = "futures-lite" @@ -410,9 +446,9 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "6dbd947adfffb0efc70599b3ddcf7b5597bb5fa9e245eb99f62b3a5f7bb8bd3c" +checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512" dependencies = [ "proc-macro2", "quote", @@ -421,15 +457,15 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3055baccb68d74ff6480350f8d6eb8fcfa3aa11bdc1a1ae3afdd0514617d508" +checksum = "21163e139fa306126e6eedaf49ecdb4588f939600f0b1e770f4205ee4b7fa868" [[package]] name = "futures-task" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ee7c6485c30167ce4dfb83ac568a849fe53274c831081476ee13e0dce1aad72" +checksum = "57c66a976bf5909d801bbef33416c41372779507e7a6b3a5e25e4749c58f776a" [[package]] name = "futures-timer" @@ -439,9 +475,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b5cf40b47a271f77a8b1bec03ca09044d99d2372c0de244e66430761127164" +checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a" dependencies = [ "futures-channel", "futures-core", @@ -468,20 +504,20 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418d37c8b1d42553c93648be529cb70f920d3baf8ef469b74b9638df426e0b4c" +checksum = "9be70c98951c83b8d2f8f60d7065fa6d5146873094452a1008da8c2f1e4205ad" dependencies = [ "cfg-if", "libc", - "wasi 0.10.0+wasi-snapshot-preview1", + "wasi 0.10.2+wasi-snapshot-preview1", ] [[package]] name = "h2" -version = "0.3.10" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c9de88456263e249e241fcd211d3954e2c9b0ef7ccfc235a444eb367cae3689" +checksum = 
"37a82c6d637fc9515a4694bbf1cb2457b79d81ce52b3108bdeea58b07dd34a57" dependencies = [ "bytes", "fnv", @@ -502,6 +538,33 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" +[[package]] +name = "hdf5-src" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01493db39ddc0519cf2a83d620d2c037fee60f4fed724cb72dc23763f1727a8" +dependencies = [ + "cmake", + "libz-sys", +] + +[[package]] +name = "hdf5-sys" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4842d5980dc311a7c8933c7b45534fdae84df5ae7939a0ae8e449a56d4beb3d2" +dependencies = [ + "hdf5-src", + "libc", + "libloading", + "libz-sys", + "pkg-config", + "regex", + "serde", + "serde_derive", + "winreg", +] + [[package]] name = "heck" version = "0.4.0" @@ -519,20 +582,20 @@ dependencies = [ [[package]] name = "http" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f4c6746584866f0feabcc69893c5b51beef3831656a968ed7ae254cdc4fd03" +checksum = "ff8670570af52249509a86f5e3e18a08c60b177071826898fde8997cf5f6bfbb" dependencies = [ "bytes", "fnv", - "itoa 1.0.1", + "itoa", ] [[package]] name = "http-body" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ff4f84919677303da5f147645dbea6b1881f368d03ac84e1dc09031ebd7b2c6" +checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" dependencies = [ "bytes", "http", @@ -562,9 +625,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.5.1" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acd94fdbe1d4ff688b67b04eee2e17bd50995534a61539e45adfefb45e5e5503" +checksum = "496ce29bb5a52785b44e0f7ca2847ae0bb839c9bd28f69acac9b99d461c0c04c" [[package]] name = "httpdate" @@ -580,9 +643,9 
@@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "0.14.16" +version = "0.14.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7ec3e62bdc98a2f0393a5048e4c30ef659440ea6e0e572965103e72bd836f55" +checksum = "b26ae0a80afebe130861d90abf98e3814a4f28a4c6ffeb5ab8ebb2be311e0ef2" dependencies = [ "bytes", "futures-channel", @@ -593,7 +656,7 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa 0.4.8", + "itoa", "pin-project-lite", "socket2", "tokio", @@ -603,16 +666,16 @@ dependencies = [ ] [[package]] -name = "hyper-rustls" -version = "0.23.0" +name = "hyper-tls" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d87c48c02e0dc5e3b849a2041db3029fd066650f8f717c07bf8ed78ccb895cac" +checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" dependencies = [ - "http", + "bytes", "hyper", - "rustls", + "native-tls", "tokio", - "tokio-rustls", + "tokio-native-tls", ] [[package]] @@ -628,9 +691,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.8.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282a6247722caba404c065016bbfa522806e51714c34f5dfc3e4a3a46fcb4223" +checksum = "0f647032dfaa1f8b6dc29bd3edb7bbef4861b8b8007ebb118d6db284fd59f6ee" dependencies = [ "autocfg", "hashbrown", @@ -653,27 +716,21 @@ dependencies = [ [[package]] name = "ipnet" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f2d64f2edebec4ce84ad108148e67e1064789bee435edc5b60ad398714a3a9" - -[[package]] -name = "itoa" -version = "0.4.8" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" +checksum = "879d54834c8c76457ef4293a689b2a8c59b076067ad77b15efafbb05f92a592b" [[package]] name = "itoa" -version = "1.0.1" +version = 
"1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" +checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d" [[package]] name = "js-sys" -version = "0.3.55" +version = "0.3.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cc9ffccd38c451a86bf13657df244e9c3f37493cce8e5e21e940963777acc84" +checksum = "671a26f820db17c2a2750743f1dd03bafd15b98c9f30c7c2628c024c05d73397" dependencies = [ "wasm-bindgen", ] @@ -685,38 +742,48 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] -name = "lexical-core" -version = "0.7.6" +name = "libc" +version = "0.2.126" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836" + +[[package]] +name = "libloading" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe" +checksum = "efbc0f03f9a775e9f6aed295c6a1ba2253c5757a9e03d55c6caa46a681abcddd" dependencies = [ - "arrayvec", - "bitflags", "cfg-if", - "ryu", - "static_assertions", + "winapi", ] [[package]] -name = "libc" -version = "0.2.112" +name = "libz-sys" +version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b03d17f364a3a042d5e5d46b053bbbf82c92c9430c592dd4c064dc6ee997125" +checksum = "92e7e15d7610cce1d9752e137625f14e61a28cd45929b6e12e47b50fe154ee2e" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] [[package]] name = "lock_api" -version = "0.4.5" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109" +checksum = 
"327fa5b6a6940e4699ec49a9beae1ea4845c6bab9314e4f84ac68742139d8c53" dependencies = [ + "autocfg", "scopeguard", ] [[package]] name = "log" -version = "0.4.14" +version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", ] @@ -727,11 +794,20 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" +[[package]] +name = "matrixmultiply" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "add85d4dd35074e6fedc608f8c8f513a3548619a9024b751949ef0e8e45a4d84" +dependencies = [ + "rawpointer", +] + [[package]] name = "memchr" -version = "2.4.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "mime" @@ -739,53 +815,112 @@ version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "mio" -version = "0.7.14" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8067b404fe97c70829f082dec8bcf4f71225d7eaea1d8645349cb76fa06205cc" +checksum = "713d550d9b44d89174e066b7a6217ae06234c10cb47819a88290d2b353c31799" dependencies = [ "libc", "log", - "miow", - "ntapi", - "winapi", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys", ] [[package]] -name = "miow" 
-version = "0.3.7" +name = "native-tls" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" +checksum = "fd7e2f3618557f980e0b17e8856252eee3c97fa12c54dff0ca290fb6266ca4a9" dependencies = [ - "winapi", + "lazy_static", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "ndarray" +version = "0.15.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec23e6762830658d2b3d385a75aa212af2f67a4586d4442907144f3bb6a1ca8" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "rawpointer", +] + +[[package]] +name = "netcdf" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9668c493289156ed0dce9472ea33ab7a6f6dfe4565a8433bd103995f8053b467" +dependencies = [ + "bitflags", + "lazy_static", + "ndarray", + "netcdf-sys", +] + +[[package]] +name = "netcdf-src" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55951a3f2007bc40b0a35341393170a597d0820d53c2106d312d451615ab0cb7" +dependencies = [ + "cmake", + "hdf5-sys", +] + +[[package]] +name = "netcdf-sys" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1028362b1f88b5c4bdf3538b202b2c83ab3cc8a5537f6337ac147e6e36dc5f9" +dependencies = [ + "hdf5-sys", + "libz-sys", + "netcdf-src", ] [[package]] name = "nom" -version = "5.1.2" +version = "7.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af" +checksum = "a8903e5a29a317527874d0402f867152a3d21c908bb0b933e416c65e301d4c36" dependencies = [ - "lexical-core", "memchr", - "version_check", + "minimal-lexical", ] [[package]] -name = "ntapi" -version = "0.3.6" +name = 
"num-complex" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +checksum = "97fbc387afefefd5e9e39493299f3069e14a140dd34dc19b4c1c1a8fddb6a790" dependencies = [ - "winapi", + "num-traits", ] [[package]] name = "num-integer" -version = "0.1.44" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" dependencies = [ "autocfg", "num-traits", @@ -793,9 +928,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", ] @@ -812,19 +947,61 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.9.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" +checksum = "7709cef83f0c1f58f666e746a08b21e0085f7440fa6a29cc194d68aac97a4225" [[package]] -name = "os_str_bytes" -version = "6.0.0" +name = "openssl" +version = "0.10.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" +checksum = "fb81a6430ac911acb25fe5ac8f1d2af1b4ea8a4fdfda0f1ee4292af2e2d8eb0e" dependencies = [ - "memchr", + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", ] +[[package]] +name = "openssl-macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c" 
+dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "openssl-sys" +version = "0.9.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d5fd19fb3e0a8191c1e34935718976a3e70c112ab9a24af6d7cadccd9d90bc0" +dependencies = [ + "autocfg", + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "os_str_bytes" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "029d8d0b2f198229de29dca79676f2738ff952edf3fde542eb8bf94d8c21b435" + [[package]] name = "parking" version = "2.0.0" @@ -833,27 +1010,25 @@ checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72" [[package]] name = "parking_lot" -version = "0.11.2" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +checksum = "87f5ec2493a61ac0506c0f4199f99070cbe83857b0337006a30f3e6719b8ef58" dependencies = [ - "instant", "lock_api", "parking_lot_core", ] [[package]] name = "parking_lot_core" -version = "0.8.5" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" +checksum = "09a279cbf25cb0757810394fbc1e359949b59e348145c643a939a525692e6929" dependencies = [ "cfg-if", - "instant", "libc", "redox_syscall", "smallvec", - "winapi", + "windows-sys", ] [[package]] @@ -864,9 +1039,9 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" [[package]] name = "pin-project-lite" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e280fbe77cc62c91527259e9442153f4688736748d24660126286329742b4c6c" 
+checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" [[package]] name = "pin-utils" @@ -874,6 +1049,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae" + [[package]] name = "ppv-lite86" version = "0.2.16" @@ -906,18 +1087,18 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.36" +version = "1.0.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" +checksum = "c54b25569025b7fc9651de43004ae593a75ad88543b17178aa5e1b9c4f15f56f" dependencies = [ - "unicode-xid", + "unicode-ident", ] [[package]] name = "quote" -version = "1.0.14" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47aa80447ce4daf1717500037052af176af5d38cc3e571d9ec1c7353fc10c87d" +checksum = "a1feb54ed693b93a84e14094943b84b7c4eae204c512b7ccb95ab0c66d278ad1" dependencies = [ "proc-macro2", ] @@ -963,30 +1144,37 @@ dependencies = [ "rand_core", ] +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + [[package]] name = "redox_syscall" -version = "0.2.10" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +checksum = "62f25bc4c7e55e0b0b7a1d43fb893f4fa1361d0abe38b9ce4f323c2adfe6ef42" dependencies = [ "bitflags", ] [[package]] name = "redox_users" -version = "0.4.0" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" +checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" dependencies = [ - "getrandom 0.2.4", + "getrandom 0.2.6", "redox_syscall", + "thiserror", ] [[package]] name = "regex" -version = "1.5.4" +version = "1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +checksum = "d83f127d94bdbcda4c8cc2e50f6f84f4b611f69c902699ca385a39c3a75f9ff1" dependencies = [ "aho-corasick", "memchr", @@ -995,15 +1183,24 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.25" +version = "0.6.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" +checksum = "49b3de9ec5dc0a3417da371aab17d729997c15010e7fd24ff707773a33bddb64" + +[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi", +] [[package]] name = "reqwest" -version = "0.11.9" +version = "0.11.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f242f1488a539a79bac6dbe7c8609ae43b7914b7736210f239a37cccb32525" +checksum = "46a1f7aa4f35e5e8b4160449f51afc758f0ce6454315a9fa7d0d113e958c41eb" dependencies = [ "base64", "bytes", @@ -1014,29 +1211,34 @@ dependencies = [ "http", "http-body", "hyper", - "hyper-rustls", + "hyper-tls", "ipnet", "js-sys", "lazy_static", "log", "mime", + "native-tls", "percent-encoding", "pin-project-lite", "rustls", - "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", "tokio", - "tokio-rustls", + "tokio-native-tls", "url", "wasm-bindgen", "wasm-bindgen-futures", "web-sys", - "webpki-roots", "winreg", ] +[[package]] +name = "retain_mut" +version = "0.1.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4389f1d5789befaf6029ebd9f7dac4af7f7e3d61b69d4f30e2ac02b57e7712b0" + [[package]] name = "ring" version = "0.16.20" @@ -1054,9 +1256,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.20.4" +version = "0.20.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fbfeb8d0ddb84706bc597a5574ab8912817c52a397f819e5b614e2265206921" +checksum = "5aab8ee6c7097ed6057f43c187a62418d0c05a4bd5f18b3571db50ee0f9ce033" dependencies = [ "log", "ring", @@ -1064,20 +1266,11 @@ dependencies = [ "webpki", ] -[[package]] -name = "rustls-pemfile" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eebeaeb360c87bfb72e84abdb3447159c0eaececf1bef2aecd65a8be949d1c9" -dependencies = [ - "base64", -] - [[package]] name = "ryu" -version = "1.0.9" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" +checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695" [[package]] name = "same-file" @@ -1088,6 +1281,16 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "schannel" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88d6731146462ea25d9244b2ed5fd1d716d25c52e4d54aa4fb0f3c4e9854dbe2" +dependencies = [ + "lazy_static", + "windows-sys", +] + [[package]] name = "scopeguard" version = "1.1.0" @@ -1104,20 +1307,43 @@ dependencies = [ "untrusted", ] +[[package]] +name = "security-framework" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dc14f172faf8a0194a3aded622712b0de276821addc574fa54fc0a1167e10dc" +dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.6.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0160a13a177a45bfb43ce71c01580998474f556ad854dcbca936dd2841a5c556" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "serde" -version = "1.0.133" +version = "1.0.137" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97565067517b60e2d1ea8b268e59ce036de907ac523ad83a0475da04e818989a" +checksum = "61ea8d54c77f8315140a05f4c7237403bf38b72704d031543aa1d16abbf517d1" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.133" +version = "1.0.137" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed201699328568d8d08208fdd080e3ff594e6c422e438b6705905da01005d537" +checksum = "1f26faba0c3959972377d3b2d306ee9f71faee9714294e41bb777f83f88578be" dependencies = [ "proc-macro2", "quote", @@ -1126,11 +1352,11 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.75" +version = "1.0.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c059c05b48c5c0067d4b4b2b4f0732dd65feb52daf7e0ea09cd87e7dadc1af79" +checksum = "9b7ce2b32a1aed03c558dc61a5cd328f15aff2dbc17daad8fb8af04d2100e15c" dependencies = [ - "itoa 1.0.1", + "itoa", "ryu", "serde", ] @@ -1153,7 +1379,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" dependencies = [ "form_urlencoded", - "itoa 1.0.1", + "itoa", "ryu", "serde", ] @@ -1178,9 +1404,9 @@ dependencies = [ [[package]] name = "slab" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9def91fd1e018fe007022791f865d0ccc9b3a0d5001e01aabb8b40e46000afb5" +checksum = "eb703cfe953bccee95685111adeedb76fabe4e97549a58d16f03ea7b9367bb32" [[package]] name = "smallvec" @@ -1190,9 +1416,9 @@ checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" [[package]] name = "socket2" -version = 
"0.4.2" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dc90fe6c7be1a323296982db1836d1ea9e47b6839496dde9a541bc496df3516" +checksum = "66d72b759436ae32898a2af0a14218dbf55efde3feeb170eb623637db85ee1e0" dependencies = [ "libc", "winapi", @@ -1204,12 +1430,6 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - [[package]] name = "stderrlog" version = "0.5.1" @@ -1231,44 +1451,58 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" -version = "1.0.85" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a684ac3dcd8913827e18cd09a68384ee66c1de24157e3c556c9ab16d85695fb7" +checksum = "fbaf6116ab8924f39d52792136fb74fd60a80194cf1b1c6ffa6453eef1c3f942" dependencies = [ "proc-macro2", "quote", - "unicode-xid", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" +dependencies = [ + "cfg-if", + "fastrand", + "libc", + "redox_syscall", + "remove_dir_all", + "winapi", ] [[package]] name = "termcolor" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" +checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" dependencies = [ "winapi-util", ] [[package]] name = "textwrap" -version = "0.14.2" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0066c8d12af8b5acd21e00547c3797fde4e8677254a7ee429176ccebbe93dd80" +checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb" [[package]] name = "thiserror" -version = "1.0.30" +version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" +checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.30" +version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" +checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a" dependencies = [ "proc-macro2", "quote", @@ -1286,20 +1520,19 @@ dependencies = [ [[package]] name = "time" -version = "0.1.44" +version = "0.1.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" +checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" dependencies = [ "libc", - "wasi 0.10.0+wasi-snapshot-preview1", "winapi", ] [[package]] name = "tinyvec" -version = "1.5.1" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c1c1d5a42b6245520c249549ec267180beaffcc0615401ac8e31853d4b6d8d2" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" dependencies = [ "tinyvec_macros", ] @@ -1312,9 +1545,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.15.0" +version = "1.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbbf1c778ec206785635ce8ad57fe52b3009ae9e0c9f574a728f3049d3e55838" +checksum = "4903bf0427cf68dddd5aa6a93220756f8be0c34fcfa9f5e6191e103e15a31395" dependencies = [ "bytes", "libc", @@ -1325,6 +1558,7 @@ 
dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", + "socket2", "tokio-macros", "winapi", ] @@ -1341,35 +1575,34 @@ dependencies = [ ] [[package]] -name = "tokio-rustls" -version = "0.23.3" +name = "tokio-native-tls" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4151fda0cf2798550ad0b34bcfc9b9dcc2a9d2471c895c68f3a8818e54f2389e" +checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b" dependencies = [ - "rustls", + "native-tls", "tokio", - "webpki", ] [[package]] name = "tokio-util" -version = "0.6.9" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e99e1983e5d376cd8eb4b66604d2e99e79f5bd988c3055891dcd8c9e2604cc0" +checksum = "f988a1a1adc2fb21f9c12aa96441da33a1728193ae0b95d2be22dbd17fcb4e5c" dependencies = [ "bytes", "futures-core", "futures-sink", - "log", "pin-project-lite", "tokio", + "tracing", ] [[package]] name = "toml" -version = "0.5.8" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" +checksum = "8d82e1a7758622a465f8cee077614c73484dac5b836c02ff6a40d5d1010324d7" dependencies = [ "serde", ] @@ -1382,9 +1615,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" [[package]] name = "tracing" -version = "0.1.29" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "375a639232caf30edfc78e8d89b2d4c375515393e7af7e16f01cd96917fb2105" +checksum = "5d0ecdcb44a79f0fe9844f0c4f33a342cbcbb5117de8001e6ba0dc2351327d09" dependencies = [ "cfg-if", "pin-project-lite", @@ -1394,9 +1627,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.20" +version = "0.1.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e65ce065b4b5c53e73bb28912318cb8c9e9ad3921f1d669eb0e68b4c8143a2b" +checksum = 
"cc6b8ad3567499f98a1db7a752b07a7c8c7c7c34c332ec00effb2b0027974b7c" dependencies = [ "proc-macro2", "quote", @@ -1405,18 +1638,19 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.21" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f4ed65637b8390770814083d20756f87bfa2c21bf2f110babdc5438351746e4" +checksum = "f54c8ca710e81886d498c2fd3331b56c93aa248d49de2222ad2742247c60072f" dependencies = [ "lazy_static", + "valuable", ] [[package]] name = "tracing-log" -version = "0.1.2" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3" +checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922" dependencies = [ "lazy_static", "log", @@ -1445,9 +1679,15 @@ checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" [[package]] name = "unicode-bidi" -version = "0.3.7" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992" + +[[package]] +name = "unicode-ident" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a01404663e3db436ed2746d9fefef640d868edae3cceb81c3b8d5732fda678f" +checksum = "d22af068fba1eb5edcb4aea19d382b2a3deb4c8f9d475c589b6ada9e0fd493ee" [[package]] name = "unicode-normalization" @@ -1458,12 +1698,6 @@ dependencies = [ "tinyvec", ] -[[package]] -name = "unicode-xid" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" - [[package]] name = "untrusted" version = "0.7.1" @@ -1483,6 +1717,18 @@ dependencies = [ "serde", ] +[[package]] +name = "valuable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" @@ -1524,15 +1770,21 @@ checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" [[package]] name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" +version = "0.10.2+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.78" +version = "0.2.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "632f73e236b219150ea279196e54e610f5dbafa5d61786303d4da54f84e47fce" +checksum = "27370197c907c55e3f1a9fbe26f44e937fe6451368324e009cba39e139dc08ad" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -1540,9 +1792,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.78" +version = "0.2.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a317bf8f9fba2476b4b2c85ef4c4af8ff39c3c7f0cdfeed4f82c34a880aa837b" +checksum = "53e04185bfa3a779273da532f5025e33398409573f348985af9a1cbf3774d3f4" dependencies = [ "bumpalo", "lazy_static", @@ -1555,9 +1807,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.28" +version = "0.4.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e8d7523cb1f2a4c96c1317ca690031b714a51cc14e05f712446691f413f5d39" +checksum = 
"6f741de44b75e14c35df886aff5f1eb73aa114fa5d4d00dcd37b5e01259bf3b2" dependencies = [ "cfg-if", "js-sys", @@ -1567,9 +1819,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.78" +version = "0.2.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d56146e7c495528bf6587663bea13a8eb588d39b36b679d83972e1a2dbbdacf9" +checksum = "17cae7ff784d7e83a2fe7611cfe766ecf034111b49deb850a3dc7699c08251f5" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1577,9 +1829,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.78" +version = "0.2.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7803e0eea25835f8abdc585cd3021b3deb11543c6fe226dcd30b228857c5c5ab" +checksum = "99ec0dc7a4756fffc231aab1b9f2f578d23cd391390ab27f952ae0c9b3ece20b" dependencies = [ "proc-macro2", "quote", @@ -1590,15 +1842,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.78" +version = "0.2.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0237232789cf037d5480773fe568aac745bfe2afbc11a863e97901780a6b47cc" +checksum = "d554b7f530dee5964d9a9468d95c1f8b8acae4f282807e7d27d4b03099a46744" [[package]] name = "web-sys" -version = "0.3.55" +version = "0.3.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38eb105f1c59d9eaa6b5cdc92b859d85b926e82cb2e0945cd0c9259faa6fe9fb" +checksum = "7b17e741662c70c8bd24ac5c5b18de314a2c26c32bf8346ee1e6f53de919c283" dependencies = [ "js-sys", "wasm-bindgen", @@ -1614,15 +1866,6 @@ dependencies = [ "untrusted", ] -[[package]] -name = "webpki-roots" -version = "0.22.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "552ceb903e957524388c4d3475725ff2c8b7960922063af6ce53c9a43da07449" -dependencies = [ - "webpki", -] - [[package]] name = "winapi" version = "0.3.9" @@ -1654,20 +1897,64 @@ version = "0.4.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-sys" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2" +dependencies = [ + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47" + +[[package]] +name = "windows_i686_gnu" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6" + +[[package]] +name = "windows_i686_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" + [[package]] name = "winreg" -version = "0.7.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" +checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" dependencies = [ + "serde", "winapi", ] [[package]] name = "wiremock" -version = "0.5.10" +version = "0.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"70f7b70ff8dc8f85e456feb581ee05720f87a5bd4f868c018191d997994cabed" +checksum = "1b12f508bdca434a55d43614d26f02e6b3e98ebeecfbc5a1614e0a0c8bf3e315" dependencies = [ "assert-json-diff", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index ef86a2971e929326879b6b27159dc2f38d0f89a9..c9f785f37d12d25bbdaf02b7a85feccbf82793f5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,3 @@ [workspace] -members = ["freva-ingest", "freva"] +members = ["freva-ingest", "freva", "drs"] diff --git a/Makefile b/Makefile index 1ece9ac8bdf067da8d8c3939f2ae770199551767..84562dbc0325512191eb191ab5985578d441c389 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,11 @@ linux-build: -v $(PWD):/workspace \ -w /workspace \ rust:latest \ - cargo build --release + /bin/bash -c " \ + apt-get update -y && apt-get install -y \ + libssl-dev cmake && \ + cargo build --release \ + " musl-build: docker run \ @@ -29,7 +33,8 @@ musl-build: -w /workspace \ rust:latest \ /bin/bash -c " \ - apt-get update -y && apt-get install -y musl-tools && \ + apt-get update -y && apt-get install -y \ + musl-tools libssl-dev cmake && \ rustup target add x86_64-unknown-linux-musl && \ cargo build --release --target x86_64-unknown-linux-musl \ " diff --git a/README.md b/README.md index 48fef74b406f6cf47c0a9b43bedbea6e0957a5c8..479c26779b415852bf10fa132c896f1b42f4a628 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,28 @@ create](https://docs.rs/directories/latest/directories/). A table for each OS is directory it will look for `drs_conf.toml` and `evaluation_system.conf`. Examples of the config files are available in `./.docker`. -This can also be overridden with a `EVALUATION_SYSTEM_CONFIG_FILE` envvar (note that the value should be a directory -name, not a file name). +This can also be overridden with a `EVALUATION_SYSTEM_CONFIG_DIR` envvar. 
+ +A small example config (reformatted version of the file found in `.docker/drs_config.toml`): + +```toml +[cmip5_name] +root_dir = ".docker/data/cmip5" +drs_format = "cmip5" + +[observations] +root_dir = ".docker/data/obs" +drs_format = "custom" +parts_dir = [ + "project", "product", "institute", "model", "experiment", "time_frequency", "realm", + "cmor_table", "ensemble", "version", "variable" +] +parts_file_name = ["variable", "time_frequency", "experiment", "level", "version", "time"] +parts_time = "start_time-end_time" +[observations.defaults] +project = "observations" + +``` # Installation diff --git a/drs/Cargo.toml b/drs/Cargo.toml new file mode 100644 index 0000000000000000000000000000000000000000..e6da563032376aba22e43a29b3e7972cbced7e3f --- /dev/null +++ b/drs/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "drs" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +camino = { version = "1.0.5", features = ["serde1"] } +nom = "7.1" +thiserror = "1.0" +serde = { version = "1.0", features = ["derive"] } +tracing = "0.1" +chrono = "0.4" diff --git a/drs/src/cmip5.rs b/drs/src/cmip5.rs new file mode 100644 index 0000000000000000000000000000000000000000..67f37beba859f01ab8258e8606768d6184a57584 --- /dev/null +++ b/drs/src/cmip5.rs @@ -0,0 +1,511 @@ +//! <https://pcmdi.llnl.gov/mips/cmip5/docs/cmip5_data_reference_syntax.pdf> +//! See section 3.1 +//! +//! CMIP5 defines two different directory structures, an older CMOR structure and a newer structure which is what +//! should be used going forward. This module will map both into the same [`Cmip5`] representation. +//! +//! Old CMOR directory structure: +//! ```text +//! <activity>/<product>/<institute>/<model>/<experiment>/ +//! <frequency>/<modeling_realm>/<variable_name>/<ensemble_member> +//! ``` +//! New CMIP5/ESGF directory structure: +//! ```text +//! 
<activity>/<product>/<institute>/<model>/<experiment>/<frequency>/ +//! <modeling_realm>/<mip_table>/<ensemble_member>/<version>/ +//! <variable_name> +//! ``` +//! CMIP5 file name structure: +//! ```text +//! <variable_name>_<mip_table>_<model>_<experiment>_ +//! <ensemble_member>[_<temporal_subset>][_<geographical_info>] +//! ``` +//! CMIP5 file names have an alternative form for gridspec files: +//! ```text +//! gridspec_<modeling_realm>_fx_<model>_<experiment>_r0i0p0.nc +//! ``` +//! Currently those differences will be ignored because a file being a gridspec file or not does not seem to be relevant +//! to the rest of its metadata. +//! +//! Publication level dataset (see 3.4) +//! Datasets are assigned an ID with THREDDS catalogs ESGF nodes of the form: +//! ```text +//! <activity>.<product>.<institute>.<model>.<experiment>.<frequency>.<modeling_realm>. +//! <mip_tables>.<ensemble_member> +//! ``` +//! Each publication-level dataset _version_ will have the ID: +//! ```text +//! <activity>.<product>.<institute>.<model>.<experiment>.<frequency>.<modeling_realm>. +//! <mip_tables>.<ensemble_member>.<version> +//! ``` +//! The only difference between this and the dataset ID is the presence of `<version>`. +//! +//! Exceptions to the CMIP5 standard: +//! * This allows frequency values of `1hr` in addition to the normal values + +use std::str::FromStr; + +use camino::Utf8Path; +use chrono::NaiveDateTime; +use thiserror::Error; +use tracing::error; + +use super::parser::{parse_cmor, parse_esgf}; + +/// Error extracting data from a cmor path +#[derive(Debug, Error)] +#[error("error parsing CMOR path: {0}")] +pub struct InvalidCmorPathError(String); + +/// Error extracting data from an esgf path +#[derive(Debug, Error)] +#[error("error parsing ESGF path: {0}")] +pub struct InvalidEsgfPathError(String); + +/// Holds path and metadata pulled from a CMIP5 style path. 
+#[derive(Debug)] +pub struct Cmip5<'a> { + /// The path of the file relative to the root directory of the dataset meaning it contains all of and only the + /// elements relevant to CMIP5. + pub path: &'a Utf8Path, + /// The metadata extracted from the path + pub metadata: PathMetadata<'a>, +} + +impl<'a> Cmip5<'a> { + /// Extracts metadata from a CMOR style file path. It expects that the path will consist of only the parts relevant + /// to the DRS structure. For example, if a dataset starts at `/foo/data/cmip5/output...` then `path` must start + /// from `cmip5/output...`. + /// + /// Note: + /// * This will not verify that the file upholds various cross-field constraints of the spec (e.g. that `fx` + /// frequency has an ensemble value of `r0i0p0`). + pub fn from_cmor_path(path: &Utf8Path) -> Result<Cmip5, InvalidCmorPathError> { + let metadata = match parse_cmor(path.as_str()) { + Ok((_, metadata)) => metadata, + Err(e) => return Err(InvalidCmorPathError(e.to_string())), + }; + + Ok(Cmip5 { path, metadata }) + } + + /// Extracts metadata from an ESGF style file path. It expects that the path will consist of only the parts relevant + /// to the DRS structure. For example, if a dataset starts at `/foo/data/cmip5/output...` then `path` must start + /// from `cmip5/output...`. + /// + /// Note: + /// * This will not verify that the file upholds various cross-field constraints of the spec (e.g. that `fx` + /// frequency has an ensemble value of `r0i0p0`). + pub fn from_esgf_path(path: &Utf8Path) -> Result<Cmip5, InvalidEsgfPathError> { + let metadata = match parse_esgf(path.as_str()) { + Ok((_, metadata)) => metadata, + Err(e) => return Err(InvalidEsgfPathError(e.to_string())), + }; + + Ok(Cmip5 { path, metadata }) + } +} + +/// CMIP5 metadata encoded in the file path and file name of a file organized according to the CMIP5 DRS specification. 
+/// +/// Field documentation is partially copied from source document where relevant for a basic overview of the meaning of +/// the field and information on extracting and verifying its value. +#[derive(Debug, PartialEq, Eq)] +pub struct PathMetadata<'a> { + /// Identifies the model intercomparison activity or other data collection activity. + /// + /// There does not seem to be a definitive list of all possible values for this so no value checking will be done. + pub activity: &'a str, + /// Roughly, a description of what has been done to reach this data. From the spec: + /// + /// > For CMIP5, files will initially be designated as "output" or "unsolicited". Subsequently, data from the + /// > requested variable list will be assigned a version (see below) and placed in either "output1" or "output2." + /// + /// The spec says the allowed names are `output`, `output1`, `output2`, and `unsolicited` but later states that + /// other values may appear here but this is not part of the current DRS. Since current usage examples already use + /// other values, this will not check if the value is one of the stated possible values and will accept any string. + pub product: &'a str, + /// Identifies the institute responsible for the model results. For CMIP5 the institute name will be suggested by the + /// research group at the institute, subject to final authorization by PCMDI. May differ from the netcdf + /// `institute_id` value. + /// + /// There does not seem to be a definitive list of all possible values so no checking will be done. + pub institute: &'a str, + /// Identifies the model used. Subject to certain constraints imposed by PCMDI, the modeling group will assign this + /// name, which might include a version number (usually truncated to the nearest integer). May differ from the netcdf + /// attribute `model_id` value. + /// + /// There does not seem to be a definitive list of all possible values so no checking will be done.
+ pub model: &'a str, + /// Identifies either the experiment or both the experiment _family_ and the specific _type_ within that experiment + /// family. These experiment names are not freely chosen, but come from controlled vocabularies defined in the + /// Appendix 1.1 of the source document under the column labeled "Short Name of Experiment". + pub experiment: &'a str, + /// The interval between time samples in this dataset. See [`Frequency`] for more details. + pub frequency: Frequency, + /// Modeling component most relevant to this dataset. See [`ModelingRealm`] for more details. + pub modeling_realm: ModelingRealm, + /// This and the MIP table component identify the physical quantity and often imply something about the sampling + /// frequency and modeling realm. For CMIP5 the variable name and MIP table for requested output appear in the + /// `standard_output` spreadsheet in <https://pcmdi.llnl.gov/mips/cmip5/requirements.html>. + /// + /// Note that hyphens (-) are forbidden in CMIP5 variable names. Though later the document states that this is + /// merely recommended for programming language compatibility. This chooses to allow hyphens. + pub variable: &'a str, + /// Information about what variable conditions were applied to this dataset. See [`EnsembleMember`] for more + /// details. + pub ensemble: EnsembleMember, + /// For CMIP5, each MIP table contains fields sampled only at a single frequency. See also `variable`. + pub mip_table: &'a str, + /// The version number will be `v` followed by an integer, which uniquely identifies a particular version of a + /// publication-level dataset. For CMIP5 the version number is supposed to reflect the date of publication: for + /// example, `v20100105` for a version provided on the 5th January 2010. Software interpreting version numbers + /// should not, however, assume the integer has invariably been correctly encoded (e.g., sometimes a single digit + /// number might appear as in `v3`).
+ pub version: Option<&'a str>, + /// The time interval covered by this dataset. See [`TemporalSubset`] for more details. + pub temporal_subset: Option<TemporalSubset<'a>>, + /// The geographical region covered by this dataset. See [`GeographicalInfo`] for more details. + pub geographical_info: Option<GeographicalInfo<'a>>, +} + +/// Indicates the interval between individual time-samples in the atomic dataset. For CMIP5, the following are the +/// only options: `yr`, `mon`, `day`, `6hr`, `3hr`, `subhr`, `monClim`, and `fx`. +/// +/// This additionally allows values of `1hr` corresponding to 1 hourly data. +/// +/// Further information: <https://pcmdi.llnl.gov/mips/cmip5/requirements.html> +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Frequency { + /// Yearly frequency + Year, + /// Monthly frequency + Month, + /// Daily frequency + Day, + /// 6 hourly frequency + Hour6, + /// 3 hourly frequency + Hour3, + /// Hourly frequency. + /// + /// This value is not part of the CMIP5 DRS spec but is frequently used + Hour1, + // TODO: it would be nice to have more details on these 2 + /// Sub-hourly frequency + SubHour, + /// Climatological monthly mean frequency + ClimatologicalMonthlyMean, + /// This dataset does not have data that changes over time + Fixed, +} + +impl ToString for Frequency { + fn to_string(&self) -> String { + use Frequency::*; + let s = match self { + Year => "yr", + Month => "mon", + Day => "day", + Hour6 => "6hr", + Hour3 => "3hr", + Hour1 => "1hr", + SubHour => "subhr", + ClimatologicalMonthlyMean => "monclim", + Fixed => "fx", + }; + s.to_owned() + } +} + +/// Error parsing a string value into a frequency +#[derive(Debug, Error)] +#[error("invalid frequency value {given}")] +pub struct InvalidFrequencyError { + given: String, +} + +impl FromStr for Frequency { + type Err = InvalidFrequencyError; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + use Frequency::*; + let s = s.to_lowercase(); + match s.as_str() { + "yr" => Ok(Year), +
"mon" => Ok(Month), + "day" => Ok(Day), + "6hr" => Ok(Hour6), + "3hr" => Ok(Hour3), + "1hr" => Ok(Hour1), + "subhr" => Ok(SubHour), + "monclim" => Ok(ClimatologicalMonthlyMean), + "fx" => Ok(Fixed), + _ => Err(InvalidFrequencyError { given: s }), + } + } +} + +/// Indicates which high level modeling component is of particular relevance for the dataset. For CMIP5, the +/// permitted values are: `atmos`, `ocean`, `land`, `landIce`, `seaIce`, `aerosol`, `atmosChem`, and `ocnBgchem`. +/// Note that sometimes a variable will be equally (or almost equally relevant) to two or more "realms", in which +/// case the atomic dataset might be assigned to a primary "realm", but cross-referenced or aliased to the other +/// relevant "realms". +/// +/// Its `FromStr` is case insensitive so `landice`, `landIce`, etc. would all be considered `LandIce`. +/// +/// Further information: <https://pcmdi.llnl.gov/mips/cmip5/requirements.html> +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ModelingRealm { + /// Dataset applies to atmospheric realm + Atmosphere, + /// Dataset applies to ocean + Ocean, + /// Dataset applies to land + Land, + /// Dataset applies to land ice + LandIce, + /// Dataset applies to sea ice + SeaIce, + /// Dataset applies to aerosol + Aerosol, + /// Dataset applies to atmospheric chemicals + AtmosphereChemical, + /// Dataset applies to ocean biogeochemical + OceanBiogeochemical, +} + +impl ToString for ModelingRealm { + fn to_string(&self) -> String { + use ModelingRealm::*; + let s = match self { + Atmosphere => "atmos", + Ocean => "ocean", + Land => "land", + LandIce => "landice", + SeaIce => "seaice", + Aerosol => "aerosol", + AtmosphereChemical => "atmoschem", + OceanBiogeochemical => "ocnbgchem", + }; + s.to_owned() + } +} + +/// Error parsing modeling realm from a string +#[derive(Debug, Error)] +#[error("invalid modeling realm value {given}")] +pub struct InvalidModelingRealmError { + given: String, +} + +impl FromStr for ModelingRealm { + type Err =
InvalidModelingRealmError; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + use ModelingRealm::*; + let s = s.to_lowercase(); + match s.as_str() { + "atmos" => Ok(Atmosphere), + "ocean" => Ok(Ocean), + "land" => Ok(Land), + "landice" => Ok(LandIce), + "seaice" => Ok(SeaIce), + "aerosol" => Ok(Aerosol), + "atmoschem" => Ok(AtmosphereChemical), + "ocnbgchem" => Ok(OceanBiogeochemical), + _ => Err(InvalidModelingRealmError { given: s }), + } + } +} + +/// A tuple of 3 integers formatted as `r<N>i<M>p<L>` distinguishes among closely related simulations by a single +/// model. All three are required even if only a single simulation is performed. In CMIP5, time-independent +/// variables (i.e., those with frequency=`fx`) are not expected to differ across ensemble members, so for these M +/// should invariably be assigned the value zero (`i0`). The same holds true for the other numbers. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct EnsembleMember { + /// Used to distinguish between members of an ensemble that are generated by initializing a set of runs with + /// different, equally realistic initial conditions. + pub realization: u32, + /// Models used for forecasts that depend on the initial conditions might be initialized from observations using + /// different methods or different observational datasets. This value distinguishes between conditions + pub initialization: u32, + /// The "perturbed physics" numbers used to distinguish between closely-related model versions which are, as a + /// group, referred to as a perturbed physics ensemble where the different models use different sets of model + /// parameters. 
+ pub physics: u32, +} + +impl ToString for EnsembleMember { + fn to_string(&self) -> String { + format!( + "r{0}i{1}p{2}", + self.realization, self.initialization, self.physics + ) + } +} + +/// Error parsing ensemble member from a string +#[derive(Debug, Error)] +#[error("invalid ensemble member string: {reason}")] +pub struct InvalidEnsembleMember { + reason: String, +} + +impl FromStr for EnsembleMember { + type Err = InvalidEnsembleMember; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + let (_, e) = match super::parser::cmip5::parse_ensemble(s) { + Ok(res) => res, + Err(e) => { + return Err(InvalidEnsembleMember { + reason: e.to_string(), + }) + } + }; + + Ok(e) + } +} + +/// Time instants or periods will be represented by a construction of the form `N1-N2`, where N1 and N2 are of the +/// form `yyyy[MM[dd[hh[mm[ss]]]]][-suffix]`, where `yyyy`, `MM`, `dd`, `hh`, `mm`, and `ss` are integer year, +/// month, day, hour, minute, and second, respectively, and the precision with which time is expressed must +/// unambiguously resolve the interval between time-samples contained in the file or virtual file. If only a single +/// time instant is included in the dataset, N2 may normally be omitted, but for CMIP5 N2 is required and in this +/// case would be identical to N1. +/// +/// Note that the DRS does not explicitly specify the calendar type (e.g., Julian, Gregorian), but the calendar will +/// be indicated by one of the attributes in each netCDF file. This is omitted for variables that are +/// time-independent. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TemporalSubset<'a> { + /// Start of the time period + pub start: NaiveDateTime, + /// End of the time period + pub end: NaiveDateTime, + /// The optional `-suffix` can be included to indicate that the netCDF file contains a climatology (suffix = + /// `-clim`) or a single time mean, for example, over multiple years (suffix = `-avg`). 
Consider a file containing a + /// single time-average, based on daily samples for the two-week period from February 1, 1971 through February 14, + /// 1971. In this case the frequency for the dataset would be `day` (because the average is based on daily samples), + /// and the suffix would be `19710201-19710214-avg` + pub suffix: Option<&'a str>, +} + +/// The geographical indicator is always optional, but when present it should appear last in the extended path. This +/// indicator specifies geographical subsets described by bounding boxes (e.g. 20S to 20N and 0 to 180E) or by +/// named regions (e.g., `pacific-ocean`). The underscore character (`_`) is forbidden in the geographical +/// indicator. +/// +/// The DRS specification for this indicator is a string of the form `g-XXXX[-YYYY]`. The `g-` indicates that some +/// spatial selection or processing has been done (i.e., selection of a sub-global region and possibly spatial +/// averaging). The `XXXX`, which must not be omitted, is either a named region (with names from a specific +/// gazetteer, which is yet to be selected) or the bounds of a latitude-longitude rectangle (following the template +/// defined below). The `YYYY` is optional and indicates if and what sort of spatial averaging has been performed +/// and whether the average includes masking of certain areas within the region (e.g., masking of land areas). The +/// DRS currently includes a single named region: `global`, which is used to select data from the entire horizontal +/// domain. +/// +/// In the case of a bounding box, the bounds of the region should be specified following the template, +/// `latJHJJHHlonMZMMZZ` where J, JJ, M and MM are integers, indicating the southern, northern, western and eastern +/// edges of the bounding box, respectively. H and HH are restricted to `N` or `S` (indicating "north" or "south"), +/// and the Z and ZZ are restricted to `E` or `W` (indicating "east" or "west").
The longitude values should be in +/// the range from 0 to 180 (e.g., a box spanning 200 degrees of longitude could be specified by `10W170W`, but not +/// by `10W190E`, even though 170W and 190E are the same longitude). The latitude and longitude values should be +/// rounded to the nearest integer. Omission of the latitude range or the longitude range implies that data were +/// selected from the entire domain of the omitted dimension. (For example, `lat20S20N` implies all longitudes were +/// included.) Remember, however, that if `XXXX` designates a bounding box, then at least one of the dimensions must +/// appear. +/// +/// The `YYYY` string is of the form `[yyy]-[zzz]` where the hyphen should be omitted unless both `yyy` and `zzz` +/// are present. As options for `yyy`, the DRS currently includes `lnd` and `ocn`. The `lnd` suffix indicates that +/// only `land` locations are considered, and the `ocn` suffix indicates that only `ocean` locations (including sea +/// ice) are considered. As options for `zzz`, the DRS currently includes `zonalavg` and `areaavg`, which indicate +/// `zonal mean` and `area mean` respectively. +/// +/// Here are some examples of geographical indicators: +/// * `g-lat20S20Nlon170W130W` - a geographical subset defined by a bounding box (latitudes -20 to 20, and +/// longitudes -170 to -130, when rounded to the nearest integer) +/// * `g-global-ocn-areaavg` - an average over the world's oceans. +/// * `g-lat20S20N` - a geographical subset defined by a bounding box covering all longitudes +/// and extending from 20S to 20N. +/// * `g-lat20S20N-lnd-zonalavg` - a zonal average over tropical lands, covering all +/// longitudes.
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct GeographicalInfo<'a> { + /// The region of the file + pub region: Region<'a>, + /// How the data has been averaged across the region + pub averaging: Option<&'a str>, +} + +/// The different types of region definitions +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Region<'a> { + /// A region defined by a latitude-longitude bounding box + BoundingBox { + /// the latitude of the bounding box, both north and south bounds + lat: Option<&'a str>, + /// the longitude of the bounding box, both east and west bounds + lon: Option<&'a str>, + }, + /// A region defined by a single name, e.g. `global`. + Named(&'a str), +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn test_frequency_from_str() { + use Frequency::*; + let cases = vec![ + ("yr", Year), + ("mon", Month), + ("day", Day), + ("6hr", Hour6), + ("3hr", Hour3), + ("1hr", Hour1), + ("subhr", SubHour), + ("monClim", ClimatologicalMonthlyMean), + ("fx", Fixed), + ]; + for (s, expected) in cases.iter() { + let res = Frequency::from_str(s); + assert!(res.is_ok(), "unexpected error on from_str for '{s}'"); + assert_eq!( + res.unwrap(), + *expected, + "input '{s}' did not give expected result '{expected:?}'" + ); + } + } + + #[test] + fn test_modeling_realm_from_str() { + use ModelingRealm::*; + let cases = vec![ + ("atmos", Atmosphere), + ("ocean", Ocean), + ("land", Land), + ("landIce", LandIce), + ("seaIce", SeaIce), + ("aerosol", Aerosol), + ("atmosChem", AtmosphereChemical), + ("ocnBgchem", OceanBiogeochemical), + ]; + for (s, expected) in cases.iter() { + let res = ModelingRealm::from_str(s); + assert!(res.is_ok(), "unexpected error on from_str for '{s}'"); + assert_eq!( + res.unwrap(), + *expected, + "input '{s}' did not give expected result '{expected:?}'" + ); + } + } +} diff --git a/drs/src/cmip6.rs b/drs/src/cmip6.rs new file mode 100644 index 0000000000000000000000000000000000000000..d0183025ec70f51a2d8285b264347c2af5882455 --- /dev/null +++ 
b/drs/src/cmip6.rs @@ -0,0 +1,145 @@ +//! Implementation of the [CMIP6 +//! specification](https://docs.google.com/document/d/1h0r8RZr_f3-8egBMMh7aqLwy3snpD6_MrDz1q8n5XUk/edit) (see "File +//! name template:" and the preceding section) for extracting metadata from a file's path and name. +//! +//! Directory structure: +//! ```text +//! <mip_era>/<activity_id>/<institution_id>/<source_id>/<experiment_id>/ +//! <member_id>/<table_id>/<variable_id>/<grid_label>/<version>/<filename> +//! ``` +//! +//! Filename structure: +//! ```text +//! <variable_id>_<table_id>_<source_id>_<experiment_id>_<member_id>_ +//! <grid_label>[_<time_range>].nc +//! ``` +//! + +use camino::Utf8Path; +use thiserror::Error; + +use crate::{cmip5::TemporalSubset, parser::parse_cmip6}; + +/// The required value for the mip_era field of CMIP6 paths. +pub const MIP_ERA: &str = "CMIP6"; + +/// Error parsing cmip6 data from a path +#[derive(Debug, Error)] +#[error("invalid cmip6 path: {reason}")] +pub struct InvalidCmip6PathError { + reason: String, +} + +/// Holds data for a CMIP6 path +pub struct Cmip6<'a> { + /// The path of the file relative to the root directory of the dataset meaning it contains all of and only the + /// elements relevant to CMIP6. + pub path: &'a Utf8Path, + /// The metadata extracted from the path + pub metadata: PathMetadata<'a>, +} + +impl<'a> Cmip6<'a> { + /// Extracts metadata from a CMIP6 style file path. It expects that the path will consist of only the parts relevant + /// to the DRS structure. For example, if a dataset starts at `/foo/data/cmip6/output...` then `path` must start + /// from `cmip6/output...`. + /// + /// Note: + /// * This will not verify that the file upholds various cross-field constraints of the spec (e.g. that `fx` + /// frequency has an ensemble value of `r0i0p0`). 
+ pub fn from_path(path: &'a Utf8Path) -> Result<Cmip6, InvalidCmip6PathError> { + let metadata = match parse_cmip6(path.as_str()) { + Ok((_, metadata)) => metadata, + Err(e) => { + return Err(InvalidCmip6PathError { + reason: e.to_string(), + }) + } + }; + + Ok(Self { path, metadata }) + } +} + +/// Metadata related to files stored according to the CMIP6 standard. +/// +/// Fields when have controlled vocabularies that are not static will not have their values checked against the +/// CV list. +#[derive(Debug, PartialEq, Eq)] +pub struct PathMetadata<'a> { + /// Activity ID, equivalent to CMIP5's `activity`. + /// + /// Allowed values and their meanings are + /// [here](https://github.com/WCRP-CMIP/CMIP6_CVs/blob/master/CMIP6_activity_id.json). + pub activity_id: &'a str, + /// Institution ID, equivalent of CMIP5's `institution`. + /// + /// Allowed values and their meanings are + /// [here](https://github.com/WCRP-CMIP/CMIP6_CVs/blob/master/CMIP6_institution_id.json) + pub institution_id: &'a str, + /// Model identifier, equivalent of CMIP5's `model`. + /// + /// Allowed values and their meanings are + /// [here](https://github.com/WCRP-CMIP/CMIP6_CVs/blob/master/CMIP6_source_id.json). + pub source_id: &'a str, + /// Root experiment ID, similar to CMIP5's `experiment`. + /// + /// Allowed values and their meanings + /// [here](https://github.com/WCRP-CMIP/CMIP6_CVs/blob/master/CMIP6_experiment_id.json) + pub experiment_id: &'a str, + /// Information on the experiment and variant label (similar to ensemble member in other DRS specs). + pub member_id: MemberId<'a>, + /// Table identifier, equivalent of CMIP5's `table_id`. + /// + /// Allowed values [here](https://github.com/WCRP-CMIP/CMIP6_CVs/blob/master/CMIP6_table_id.json) + pub table_id: &'a str, + /// Name of the variable contained in this dataset. + pub variable_id: &'a str, + /// Grid label to identify if this data uses the model's native grid, a different grid (i.e. 
was regridded) or no + /// grid in the case of some form of averaging. + /// + /// Allowed values [here](https://github.com/WCRP-CMIP/CMIP6_CVs/blob/master/CMIP6_grid_label.json). + pub grid_label: &'a str, + /// Version for the data, according to the spec this will always have the form `vYYYYMMDD`. + pub version: &'a str, + /// The time range covered by this dataset + pub time_range: Option<TemporalSubset<'a>>, +} + +/// Compound field consisting of `[<sub_experiment_id>-]<variant_label>` +#[derive(Debug, PartialEq, Eq)] +pub struct MemberId<'a> { + /// Sub-experiment ID. Needed for CMIP6 hindcast and forecast experiments to indicate "start year". For other + /// experiments, this should be set to "none". + /// + /// Allowed values and meanings + /// [here](https://github.com/WCRP-CMIP/CMIP6_CVs/blob/master/CMIP6_sub_experiment_id.json) + pub sub_experiment_id: Option<&'a str>, + /// The variant label for this data + pub variant_label: VariantLabel, +} + +/// Information about the variant used to generate this data, equivalent to CMIP5's `ensemble`. +#[derive(Debug, PartialEq, Eq)] +pub struct VariantLabel { + /// Used to distinguish between members of an ensemble that differ in their initial conditions + pub realization: u32, + /// Used to distinguish simulations performed under the same conditions but with different initialization procedures + pub initialization: u32, + /// Used to distinguish closely-related model versions (e.g., as in a “perturbed physics†ensemble) or for the same + /// model run with slightly different parameterizations (e.g., of cloud physics). Model versions that are + /// substantially different from one another should be given a different source_id†(rather than simply assigning a + pub physics: u32, + /// Used to distinguish runs conforming to the protocol of a single CMIP6 experiment, but with different variants + /// of forcing applied. 
+ pub forcing: u32, +} + +impl ToString for VariantLabel { + fn to_string(&self) -> String { + format!( + "r{}i{}p{}f{}", + self.realization, self.initialization, self.physics, self.forcing + ) + } +} diff --git a/drs/src/cordex.rs b/drs/src/cordex.rs new file mode 100644 index 0000000000000000000000000000000000000000..2bdc31a70c08ed3d521eac40ec2ade2cfa665b05 --- /dev/null +++ b/drs/src/cordex.rs @@ -0,0 +1,216 @@ +//! Parsing and validation for files that conform to the CORDEX DRS standard. +//! +//! Source: <https://is-enes-data.github.io/cordex_archive_specifications.pdf> +//! See section 5.1 +//! +//! Cordex directory structure: +//! ```text +//! <activity>/<product>/<domain>/<institution>/ +//! <gcm_model_name>/<cmip5_experiement_name>/<cmip5_ensemble_member>/ +//! <rcm_model_name>/<rcm_version_id>/<frequency>/<variable_name>/[<version>/] +//! ``` +//! Cordex file name structure: +//! ```text +//! <variable_name>_<domain>_<gcm_model_name>_ +//! <cmip5_experiment_name>_<cmip5_ensemble_member>_ +//! <rcm_model_name>_<rcm_version_id>_<frequency>[_<start_time>-<end_time>].nc +//! ``` +//! +//! # Exceptions to the Cordex standard: +//! * According to the spec, `activity` and `product` must be `cordex` and `output` respectively. This allows +//! arbitrary values for these elements. +//! * There is no `version` element in the spec but this allows for an optional `version` at the end of the +//! path portion (before the filename). +//! * This allows frequency values of `1hr` in addition to the normal values + +use std::str::FromStr; + +use camino::Utf8Path; +use chrono::NaiveDateTime; +use thiserror::Error; + +use crate::{cmip5::EnsembleMember, parser::parse_cordex}; + +/// Error parsing a cordex style path +#[derive(Debug, Error)] +#[error("error parsing CORDEX path: {reason}")] +pub struct InvalidCordexPathError { + reason: String, +} + +/// Contains the path and metadata for a cordex-style DRS path. 
+#[derive(Debug)] +pub struct Cordex<'a> { + /// The path of the file relative to the root directory of the dataset meaning it contains all of and only the + /// elements relevant to CORDEX. + pub path: &'a Utf8Path, + /// The metadata extracted from the path + pub metadata: PathMetadata<'a>, +} + +impl<'a> Cordex<'a> { + /// Extracts metadata from a CORDEX style file path. It expects that the path will consist of only the parts relevant + /// to the DRS structure. For example, if a dataset starts at `/foo/data/cordex/output...` then `path` must start + /// from `cordex/output...`. + /// + /// Note: + /// * This will not verify that the file upholds various cross-field constraints of the spec (e.g. that `fx` + /// frequency has an ensemble value of `r0i0p0`). + pub fn from_path(path: &'a Utf8Path) -> Result<Self, InvalidCordexPathError> { + let metadata = match parse_cordex(path.as_str()) { + Ok((_, metadata)) => metadata, + Err(e) => { + return Err(InvalidCordexPathError { + reason: e.to_string(), + }) + } + }; + + Ok(Self { path, metadata }) + } +} + +/// Cordex metadata for a file path and name. +/// +/// Field documentation is mostly copied from the source document. Fields which have limited sets of value that are +/// updated remotely (e.g. institution) currently do not have those lists checked to ensure values are correct. This +/// may change later. The lists themselves are still linked for reference. +/// +/// Where relevant, name in attributes is referring to the key within the netcdf file's metadata for that field. +#[derive(Debug, PartialEq, Eq)] +pub struct PathMetadata<'a> { + /// Name of the activity for this dataset. In the spec this must be `cordex` but this is not enforced here. + pub activity: &'a str, + /// Name of the product of this dataset. In the spec this must be `output` but this is not enforced here. + pub product: &'a str, + /// The name assigned to each of the CORDEX regions and includes a flag for resolution. 
+ /// + /// Possible values come from the name column(?) of Tables 1 and 2 in the source document. + /// + /// Name in attributes: `CORDEX_domain` + pub domain: &'a str, + /// identifier for the institution that is responsible for the scientific aspects of the CORDEX simulation (RCM + /// configuration, experiments ...). + /// + /// Updated list of possible values: <http://is-enes-data.github.io/CORDEX_RCMs_ToU.txt>. + /// + /// Name in attributes: `institute_id` + pub institution: &'a str, + /// Identifier of the driving data. The name consists of an institute identifier and a model identifier. For + /// reanalysis driven runs these are ECMWF and a name for the reanalysis data (ERAINT). For runs driven by CMIP5 + /// model data these are the associated CMIP5 institute_id and the CMIP5 model_id. The two parts of the name are + /// separated by a `-` (dash). Note that dashes in either of the two parts are allowed. + /// + /// List of possible values: <http://is-enes-data.github.io/GCMModelName.txt> + /// + /// Name in attributes: `driving_model_id` + pub gcm_model_name: &'a str, + /// Either `"evaluation"` or the value of the CMIP5 experiment_id of the data used. + /// + /// Name in attributes: `driving_experiment_name` + pub cmip5_experiment_name: &'a str, + /// Identifies the ensemble member of the CMIP5 experiment that produced the forcing data. It has to have the same + /// value in CORDEX as in CMIP5. For evaluation runs it has to be r1i1p1. Invariant fields (frequency=fx) may have + /// the value r0i0p0 or that of the corresponding GCMEnsembleMember attribute. + /// + /// Name in attributes: `driving_model_ensemble_member` + pub cmip5_ensemble_member: EnsembleMember, + /// Identifier of the CORDEX RCM. It consists of the Institution identifier (see above) and a model acronym, + /// connected by a dash (e.g. DMI-HIRHAM5 or SMHI-RCA4). The CV of the RCMModelName has to be coordinated in the + /// worldwide CORDEX community. 
Segments of the model name are allowed to include `-` + /// + /// List of possible values: <http://is-enes-data.github.io/CORDEX_RCMs_ToU.txt> + /// + /// Name in attributes: `model_id` + pub rcm_model_name: &'a str, + /// Identifies reruns with perturbed parameters or smaller RCM release upgrades, i.e. equivalent simulations. Major + /// upgrades and improvements should be reflected in the RCMModelName. + /// + /// Name in attributes: `rcm_version_id` + pub rcm_version_id: &'a str, + /// The frequency of the data. See [`Frequency`] for more details. + pub frequency: Frequency, + /// The name of the target variable in the NetCDF files. Possible values are found in + /// <http://is-enes-data.github.io/CORDEX_variables_requirement_table.pdf> + pub variable_name: &'a str, + /// The version of the data. Generally either an incrementing number as this dataset is revised or a string indicating + /// the data when the data was generated. + /// + /// This is not in the spec file but exists in real usage. + pub version: Option<&'a str>, + /// Indicates the start time of the file content. + /// + /// The format is `YYYY[MM[DD[HH[MM]]]]`, i.e. the year is represented by 4 digits, while the month, day, hour, and + /// minutes are represented by exactly 2 digits, if they are present at all. + pub start_time: Option<NaiveDateTime>, + /// Indicates the end time of the file content. See [`Self::start_time`] for additional information. + pub end_time: Option<NaiveDateTime>, +} + +/// The output frequency indicator: `3hr`=3 hourly, `6hr`=6 hourly, `day`=daily, `mon`=monthly, `sem`=seasonal, and +/// `fx`=invariant fields. +/// +/// This additionally allows values of `1hr` corresponding to 1 hourly data. +/// +/// Name in attributes: `frequency` +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Frequency { + /// Hourly data frequency. + /// + /// This is not part of the spec but added because it's widely used. 
+ Hour1, + /// 3-hourly frequency + Hour3, + /// 6-hourly frequency + Hour6, + /// Daily frequency + Day, + /// Montly frequency + Month, + /// Seasonal frequency meaning the data is per season + Seasonal, + /// Fixed frequency meaning the dataset does not change over time + Fixed, +} + +impl ToString for Frequency { + fn to_string(&self) -> String { + use Frequency::*; + let s = match self { + Hour1 => "1hr", + Hour3 => "3hr", + Hour6 => "6hr", + Day => "day", + Month => "mon", + Seasonal => "sem", + Fixed => "fx", + }; + s.to_owned() + } +} + +/// Error parsing frequency from string +#[derive(Debug, Error)] +#[error("invalid frequency value {given}")] +pub struct InvalidFrequencyError { + given: String, +} + +impl FromStr for Frequency { + type Err = InvalidFrequencyError; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + use Frequency::*; + let s = s.to_lowercase(); + match s.as_str() { + "1hr" => Ok(Hour1), + "3hr" => Ok(Hour3), + "6hr" => Ok(Hour6), + "day" => Ok(Day), + "mon" => Ok(Month), + "sem" => Ok(Seasonal), + "fx" => Ok(Fixed), + _ => Err(InvalidFrequencyError { given: s }), + } + } +} diff --git a/drs/src/lib.rs b/drs/src/lib.rs new file mode 100644 index 0000000000000000000000000000000000000000..87ce330a1b3e7d17c7cd870dfbcbefdb533f963f --- /dev/null +++ b/drs/src/lib.rs @@ -0,0 +1,30 @@ +#![warn(missing_docs)] +//! This module handles extracting metadata from various DRS path specifications. +//! +//! It presents functions which take in DRS paths and returns objects containing the path given (to facilitate opening +//! the file later if needed) along with an object holding the metadata found in the path. Currently it does not provide +//! functionality to open the files and extract any metadata from the file's attribute data. +//! +//! # Example: +//! ``` +//! use camino::Utf8Path; +//! use drs::cmip5::Cmip5; +//! +//! let path = concat!( +//! "cmip5/output1/NOAA-GFDL/GFDL-CM3/1pctCO2/fx/ocean/fx/r0i0p0/v20120227/gridspec/", +//! 
"gridspec_ocean_fx_GFDL-CM3_1pctCO2_r0i0p0.nc" +//! ); +//! let cmip5 = Cmip5::from_esgf_path(path.into()).unwrap(); +//! assert_eq!(path, cmip5.path); +//! assert_eq!("NOAA-GFDL", cmip5.metadata.institute); +//! // ... +//! ``` +//! +//! This relies heavily on the [`nom`](https://crates.io/crates/nom) crate for parsing the paths which allows for this +//! to be fast with minimal allocations hence the `metadata` objects will reference parts of `path`. Any comparisons +//! which are case insensitive do allocate a temporary string as part of `to_lowercase`. + +pub mod cmip5; +pub mod cmip6; +pub mod cordex; +mod parser; diff --git a/drs/src/parser.rs b/drs/src/parser.rs new file mode 100644 index 0000000000000000000000000000000000000000..e088e924c086ac4e493b624cd4992653ed170e27 --- /dev/null +++ b/drs/src/parser.rs @@ -0,0 +1,17 @@ +//! Handles the nom parsing code for each of the different DRS specifications and puts commonly used functions in +//! [`common`]. +//! +//! Each of the functions this module provides expect to receive just the portion of the path relevant to their +//! respective DRS specification. +//! +//! Documentation about the different specifications can be found in the respective `drs::<spec>` modules as those are +//! public facing and are not be replicated within their `parser` counterparts. 
+ +pub(crate) mod cmip5; +mod cmip6; +mod common; +mod cordex; + +pub(crate) use cmip5::{parse_cmor, parse_esgf}; +pub(crate) use cmip6::parse_cmip6; +pub(crate) use cordex::parse_cordex; diff --git a/drs/src/parser/cmip5.rs b/drs/src/parser/cmip5.rs new file mode 100644 index 0000000000000000000000000000000000000000..be2d40f79ee6cbc576d8b473524e900dfbc0a5c1 --- /dev/null +++ b/drs/src/parser/cmip5.rs @@ -0,0 +1,748 @@ +use nom::branch::alt; +use nom::bytes::complete::tag; +use nom::character::complete::{alpha1, digit1, u32 as nom_u32}; +use nom::combinator::{eof, map_res, opt, recognize, verify}; +use nom::sequence::{pair, preceded, terminated, tuple}; +use nom::IResult; +use std::str::FromStr; + +use crate::cmip5::{ + EnsembleMember, Frequency, GeographicalInfo, ModelingRealm, PathMetadata, Region, + TemporalSubset, +}; + +use super::common::{ + name_segment, name_sep, parse_temporal_subset, parse_version, path_segment, path_sep, word, +}; + +/// Parses a string as a CMOR path. This assumes that the string it's given will be just the portion of a path that +/// is relevant to CMOR meaning any preceding directories have already been removed. 
+pub(crate) fn parse_cmor(path: &str) -> IResult<&str, PathMetadata> { + /* + <activity>/<product>/<institute>/<model>/<experiment>/ + <frequency>/<modeling_realm>/<variable_name>/<ensemble_member> + */ + let (i, activity) = path_segment(path)?; + let (i, product) = path_segment(i)?; + let (i, institute) = path_segment(i)?; + let (i, model) = path_segment(i)?; + let (i, experiment) = path_segment(i)?; + let (i, frequency) = terminated(parse_frequency, path_sep)(i)?; + let (i, modeling_realm) = terminated(parse_modeling_realm, path_sep)(i)?; + let (i, variable) = path_segment(i)?; + let (i, ensemble) = terminated(parse_ensemble, path_sep)(i)?; + /* + Now we're in the filename portion + <variable_name>_<mip_table>_<model>_<experiment>_ + <ensemble_member>[_<temporal_subset>][_<geographical_info>] + */ + let (i, _variable) = name_segment(i)?; + let (i, mip_table) = name_segment(i)?; + let (i, _model) = name_segment(i)?; + let (i, _experiment) = name_segment(i)?; + let (i, _ensemble) = parse_ensemble(i)?; + let (i, temporal_subset) = opt(parse_temporal_subset)(i)?; + let (i, geographical_info) = opt(parse_geographical_info)(i)?; + + // The specification explicitly has DRS files ending with `.nc` so assuming they're netCDF files (or willing to make + // themselves look like netCDF files), this verifies that. + let (i, _) = tag(".nc")(i)?; + // verify that we've reached the end of the input, if not, this file is not a valid cmor path + let (i, _) = eof(i)?; + + Ok(( + i, + PathMetadata { + activity, + product, + institute, + model, + experiment, + frequency, + modeling_realm, + variable, + ensemble, + mip_table, + temporal_subset, + geographical_info, + version: None, + }, + )) +} + +/// Parses a string as an ESGF path. This assumes that the string it's given will be just the portion of a path that +/// is relevant to ESGF meaning any preceding directories have already been removed. 
pub(crate) fn parse_esgf(path: &str) -> IResult<&str, PathMetadata> {
    // Directory portion:
    //   <activity>/<product>/<institute>/<model>/<experiment>/<frequency>/
    //   <modeling_realm>/<mip_table>/<ensemble_member>/<version>/
    //   <variable_name>
    let (rest, (activity, product, institute, model, experiment)) = tuple((
        path_segment,
        path_segment,
        path_segment,
        path_segment,
        path_segment,
    ))(path)?;
    let (rest, (frequency, modeling_realm)) = tuple((
        terminated(parse_frequency, path_sep),
        terminated(parse_modeling_realm, path_sep),
    ))(rest)?;
    let (rest, mip_table) = path_segment(rest)?;
    let (rest, (ensemble, version)) = tuple((
        terminated(parse_ensemble, path_sep),
        terminated(parse_version, path_sep),
    ))(rest)?;
    let (rest, variable) = path_segment(rest)?;

    // Gridspec files use their own filename layout, so that form is tried before the regular one.
    let (rest, (temporal_subset, geographical_info)) =
        alt((parse_esgf_gridspec_filename, parse_esgf_filename))(rest)?;

    // The specification explicitly has DRS files ending with `.nc`, and nothing may follow the extension;
    // otherwise this is not a valid esgf path.
    let (rest, _) = terminated(tag(".nc"), eof)(rest)?;

    let metadata = PathMetadata {
        activity,
        product,
        institute,
        model,
        experiment,
        frequency,
        modeling_realm,
        variable,
        ensemble,
        mip_table,
        temporal_subset,
        geographical_info,
        version: Some(version),
    };
    Ok((rest, metadata))
}

/// Parses the regular ESGF filename (without the `.nc` extension):
/// `<variable_name>_<mip_table>_<model>_<experiment>_<ensemble_member>[_<temporal_subset>][_<geographical_info>]`.
///
/// Only the optional temporal subset and geographical info are returned; the leading name parts are consumed
/// and discarded.
fn parse_esgf_filename(
    i: &str,
) -> IResult<&str, (Option<TemporalSubset>, Option<GeographicalInfo>)> {
    // The ensemble does not use `name_segment` because it could be the last part of the file name and may not
    // be followed by `_`.
    let (i, _leading) = tuple((
        name_segment,
        name_segment,
        name_segment,
        name_segment,
        word,
    ))(i)?;
    pair(opt(parse_temporal_subset), opt(parse_geographical_info))(i)
}

/// Parses a gridspec filename (without the `.nc` extension):
/// `gridspec_<modeling_realm>_fx_<model>_<experiment>_r0i0p0`.
///
/// This form carries neither a temporal subset nor geographical info, so both are returned as `None`.
fn parse_esgf_gridspec_filename(
    i: &str,
) -> IResult<&str, (Option<TemporalSubset>, Option<GeographicalInfo>)> {
    let (i, _parts) = tuple((
        terminated(tag("gridspec"), name_sep),
        terminated(parse_modeling_realm, name_sep),
        terminated(tag("fx"), name_sep),
        name_segment,
        name_segment,
        parse_ensemble,
    ))(i)?;
    Ok((i, (None, None)))
}

/// Parses a single `word` and converts it with [`Frequency::from_str`].
fn parse_frequency(i: &str) -> IResult<&str, Frequency> {
    map_res(word, Frequency::from_str)(i)
}

/// Parses a single `word` and converts it with [`ModelingRealm::from_str`].
fn parse_modeling_realm(i: &str) -> IResult<&str, ModelingRealm> {
    map_res(word, ModelingRealm::from_str)(i)
}

pub(crate) fn parse_ensemble(i: &str) -> IResult<&str, EnsembleMember> {
    let (i, (realization, method, perturbed_num)) = tuple((
preceded(tag("r"), nom_u32), + preceded(tag("i"), nom_u32), + preceded(tag("p"), nom_u32), + ))(i)?; + Ok(( + i, + EnsembleMember { + realization, + initialization: method, + physics: perturbed_num, + }, + )) +} + +fn parse_geographical_info(i: &str) -> IResult<&str, GeographicalInfo> { + let (i, _) = tag("g-")(i)?; + let (i, region) = parse_geographical_region(i)?; + let (i, averaging) = opt(preceded(tag("-"), word))(i)?; // averaging has a - so we use `word` here + + Ok((i, GeographicalInfo { region, averaging })) +} + +fn parse_geographical_region(i: &str) -> IResult<&str, Region> { + alt(( + parse_geographical_region_bounding_box, + parse_geographical_region_named, + ))(i) +} + +fn parse_geographical_region_bounding_box(i: &str) -> IResult<&str, Region> { + let (i, (lat, lon)) = verify( + pair(opt(parse_bounding_box_lat), opt(parse_bounding_box_lon)), + |(lat, lon)| lat.is_some() || lon.is_some(), + )(i)?; + Ok((i, Region::BoundingBox { lat, lon })) +} + +fn parse_geographical_region_named(i: &str) -> IResult<&str, Region> { + let (i, name) = alpha1(i)?; + Ok((i, Region::Named(name))) +} + +fn parse_bounding_box_lat(i: &str) -> IResult<&str, &str> { + let (i, _) = tag("lat")(i)?; + recognize(tuple((digit1, tag("S"), digit1, tag("N"))))(i) +} + +fn parse_bounding_box_lon(i: &str) -> IResult<&str, &str> { + let (i, _) = tag("lon")(i)?; + recognize(tuple((digit1, tag("W"), digit1, tag("E"))))(i) +} + +#[cfg(test)] +mod tests { + use chrono::NaiveDate; + + use crate::cmip5::TemporalSubset; + + use super::*; + + #[test] + fn test_frequency_parser() { + assert!(parse_frequency("foo").is_err()); + use Frequency::*; + let cases = vec![ + ("yr", Year), + ("mon", Month), + ("day", Day), + ("6hr", Hour6), + ("3hr", Hour3), + ("subhr", SubHour), + ("monClim", ClimatologicalMonthlyMean), + ("fx", Fixed), + ]; + for (s, expected) in cases.iter() { + let res = parse_frequency(s); + assert_eq!( + res, + Ok(("", *expected)), + "input '{s}' did not give expected result 
'{expected:?}'" + ); + } + } + + #[test] + fn test_parse_ensemble_member() { + assert_eq!( + Ok(( + "", + EnsembleMember { + realization: 1, + initialization: 2, + physics: 3 + } + )), + parse_ensemble("r1i2p3") + ); + } + + #[test] + fn test_parse_region() { + let cases = &[ + ("lat20S20Nlon20W20E", Some("20S20N"), Some("20W20E")), + ("lon10W30E", None, Some("10W30E")), + ]; + for (s, exp_lat, exp_lon) in cases.into_iter() { + let expected = Region::BoundingBox { + lat: exp_lat.clone(), + lon: exp_lon.clone(), + }; + assert_eq!( + Ok(("", expected.clone())), + parse_geographical_region(s), + "input '{s} did not give expected result '{expected:?}'", + ); + } + } + + #[test] + fn test_parse_geographical_info() { + let cases = &[ + ( + "g-lon10W30E", + Region::BoundingBox { + lat: None, + lon: Some("10W30E"), + }, + None, + ), + ( + "g-lat10S20N", + Region::BoundingBox { + lat: Some("10S20N"), + lon: None, + }, + None, + ), + ( + "g-lat10S190Nlon50W40E-avg", + Region::BoundingBox { + lat: Some("10S190N"), + lon: Some("50W40E"), + }, + Some("avg"), + ), + ("g-namedregion", Region::Named("namedregion"), None), + ( + "g-namedregion-some-avgfunc", + Region::Named("namedregion"), + Some("some-avgfunc"), + ), + ]; + for (s, region, suffix) in cases.into_iter() { + let expected = GeographicalInfo { + region: region.clone(), + averaging: *suffix, + }; + assert_eq!( + Ok(("", expected.clone())), + parse_geographical_info(s), + "input '{s} did not give expected result '{expected:?}'", + ); + } + } + + #[test] + fn test_full_esgf_paths() { + let cases = full_esgf_path_data(); + for (i, (input, expected)) in cases.into_iter().enumerate() { + let (remaining, output) = parse_esgf(input).expect("error parsing esgf path"); + assert_eq!( + remaining, "", + "expected no remaining data to parse on case {}", + i + ); + assert_eq!( + expected, output, + "output did not match expected on case {}", + i + ); + } + } + + #[test] + fn test_full_cmor_paths() { + let cases = 
full_cmor_path_data(); + for (i, (input, expected)) in cases.into_iter().enumerate() { + let (remaining, output) = parse_cmor(input).expect("error parsing cmor path"); + assert_eq!( + remaining, "", + "expected no remaining data to parse on case {}", + i + ); + assert_eq!( + expected, output, + "output did not match expected on case {}", + i + ); + } + } + + fn full_esgf_path_data() -> Vec<(&'static str, PathMetadata<'static>)> { + vec![ + ( + concat!( + "cmip5/output2/INM/inmcm4/esmHistorical/mon/land/Lmon/r1i1p1/v20110323/fVegSoil/", + "fVegSoil_Lmon_inmcm4_esmHistorical_r1i1p1_185001-200512.nc" + ), + PathMetadata { + activity: "cmip5", + product: "output2", + institute: "INM", + model: "inmcm4", + experiment: "esmHistorical", + frequency: Frequency::Month, + modeling_realm: ModelingRealm::Land, + variable: "fVegSoil", + ensemble: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + mip_table: "Lmon", + version: Some("v20110323"), + temporal_subset: Some(TemporalSubset { + start: NaiveDate::from_ymd(1850, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(2005, 12, 1).and_hms(0, 0, 0), + suffix: None, + }), + geographical_info: None, + }, + ), + ( + concat!( + "cmip5/output2/INM/inmcm4/historical/mon/land/Lmon/r1i1p1/v20110323/mrlso/", + "mrlso_Lmon_inmcm4_historical_r1i1p1_185001-200512.nc" + ), + PathMetadata { + activity: "cmip5", + product: "output2", + institute: "INM", + model: "inmcm4", + experiment: "historical", + frequency: Frequency::Month, + modeling_realm: ModelingRealm::Land, + ensemble: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + version: Some("v20110323"), + variable: "mrlso", + mip_table: "Lmon", + temporal_subset: Some(TemporalSubset { + start: NaiveDate::from_ymd(1850, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(2005, 12, 1).and_hms(0, 0, 0), + suffix: None, + }), + geographical_info: None, + }, + ), + ( + concat!( + 
"cmip5/output2/INM/inmcm4/esmrcp85/mon/land/Lmon/r1i1p1/v20110323/residualFrac/", + "residualFrac_Lmon_inmcm4_esmrcp85_r1i1p1_200601-210012.nc", + ), + PathMetadata { + activity: "cmip5", + product: "output2", + institute: "INM", + model: "inmcm4", + experiment: "esmrcp85", + frequency: Frequency::Month, + modeling_realm: ModelingRealm::Land, + ensemble: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + version: Some("v20110323"), + variable: "residualFrac", + mip_table: "Lmon", + temporal_subset: Some(TemporalSubset { + start: NaiveDate::from_ymd(2006, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(2100, 12, 1).and_hms(0, 0, 0), + suffix: None, + }), + geographical_info: None, + } + ), + ( + concat!( + "cmip5/output2/INM/inmcm4/piControl/mon/land/Lmon/r1i1p1/v20110323/fLuc/", + "fLuc_Lmon_inmcm4_piControl_r1i1p1_185001-234912.nc" + ), + PathMetadata { + activity: "cmip5", + product: "output2", + institute: "INM", + model: "inmcm4", + experiment: "piControl", + frequency: Frequency::Month, + modeling_realm: ModelingRealm::Land, + ensemble: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + version: Some("v20110323"), + variable: "fLuc", + mip_table: "Lmon", + temporal_subset: Some(TemporalSubset { + start: NaiveDate::from_ymd(1850, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(2349, 12, 1).and_hms(0, 0, 0), + suffix: None, + }), + geographical_info: None, + } + ), + ( + concat!( + "cmip5/output2/NASA-GISS/GISS-E2-H/historical/mon/land/Lmon/r5i1p1/v20120517/cSoil/", + "cSoil_Lmon_GISS-E2-H_historical_r5i1p1_185001-190012.nc", + ), + PathMetadata { + activity: "cmip5", + product: "output2", + institute: "NASA-GISS", + model: "GISS-E2-H", + experiment: "historical", + frequency: Frequency::Month, + modeling_realm: ModelingRealm::Land, + ensemble: EnsembleMember { + realization: 5, + initialization: 1, + physics: 1, + }, + version: Some("v20120517"), + variable: "cSoil", + mip_table: "Lmon", + 
temporal_subset: Some(TemporalSubset { + start: NaiveDate::from_ymd(1850, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(1900, 12, 1).and_hms(0, 0, 0), + suffix: None, + }), + geographical_info: None, + } + ), + ( + concat!( + "cmip5/output2/IPSL/IPSL-CM5A-LR/amip4K/mon/atmos/aero/r1i1p1/v20110429/ps/", + "ps_aero_IPSL-CM5A-LR_amip4K_r1i1p1_197901-200912.nc" + ), + PathMetadata { + activity: "cmip5", + product: "output2", + institute: "IPSL", + model: "IPSL-CM5A-LR", + experiment: "amip4K", + frequency: Frequency::Month, + modeling_realm: ModelingRealm::Atmosphere, + ensemble: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + version: Some("v20110429"), + variable: "ps", + mip_table: "aero", + temporal_subset: Some(TemporalSubset { + start: NaiveDate::from_ymd(1979, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(2009, 12, 1).and_hms(0, 0, 0), + suffix: None, + }), + geographical_info: None, + } + ), + ( + concat!( + "cmip5/output2/IPSL/IPSL-CM5A-LR/amip4K/mon/atmos/cfMon/r1i1p1/v20110429/ps/", + "ps_cfMon_IPSL-CM5A-LR_amip4K_r1i1p1_197901-200912.nc" + ), + PathMetadata { + activity: "cmip5", + product: "output2", + institute: "IPSL", + model: "IPSL-CM5A-LR", + experiment: "amip4K", + frequency: Frequency::Month, + modeling_realm: ModelingRealm::Atmosphere, + ensemble: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + version: Some("v20110429"), + variable: "ps", + mip_table: "cfMon", + temporal_subset: Some(TemporalSubset { + start: NaiveDate::from_ymd(1979, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(2009, 12, 1).and_hms(0, 0, 0), + suffix: None, + }), + geographical_info: None, + } + ), + ( + concat!( + "cmip5/output2/IPSL/IPSL-CM5A-LR/historical/mon/atmos/aero/r4i1p1/v20110406/ps/", + "ps_aero_IPSL-CM5A-LR_historical_r4i1p1_185001-200512.nc" + ), + PathMetadata { + activity: "cmip5", + product: "output2", + institute: "IPSL", + model: "IPSL-CM5A-LR", + experiment: "historical", + frequency: 
Frequency::Month, + modeling_realm: ModelingRealm::Atmosphere, + ensemble: EnsembleMember { + realization: 4, + initialization: 1, + physics: 1, + }, + version: Some("v20110406"), + variable: "ps", + mip_table: "aero", + temporal_subset: Some(TemporalSubset { + start: NaiveDate::from_ymd(1850, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(2005, 12, 1).and_hms(0, 0, 0), + suffix: None, + }), + geographical_info: None, + } + ), + ( + concat!( + "cmip5/output2/IPSL/IPSL-CM5A-LR/historical/mon/atmos/cfMon/r2i1p1/v20111119/ps/", + "ps_cfMon_IPSL-CM5A-LR_historical_r2i1p1_185001-200512.nc" + ), + PathMetadata { + activity: "cmip5", + product: "output2", + institute: "IPSL", + model: "IPSL-CM5A-LR", + experiment: "historical", + frequency: Frequency::Month, + modeling_realm: ModelingRealm::Atmosphere, + ensemble: EnsembleMember { + realization: 2, + initialization: 1, + physics: 1, + }, + version: Some("v20111119"), + variable: "ps", + mip_table: "cfMon", + temporal_subset: Some(TemporalSubset { + start: NaiveDate::from_ymd(1850, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(2005, 12, 1).and_hms(0, 0, 0), + suffix: None, + }), + geographical_info: None, + } + ), + ( + concat!( + "cmip5/output1/NOAA-GFDL/GFDL-CM3/1pctCO2/fx/ocean/fx/r0i0p0/v20120227/gridspec/", + "gridspec_ocean_fx_GFDL-CM3_1pctCO2_r0i0p0.nc", + ), + PathMetadata { + activity: "cmip5", + product: "output1", + institute: "NOAA-GFDL", + model: "GFDL-CM3", + experiment: "1pctCO2", + frequency: Frequency::Fixed, + modeling_realm: ModelingRealm::Ocean, + ensemble: EnsembleMember { + realization: 0, + initialization: 0, + physics: 0, + }, + version: Some("v20120227"), + variable: "gridspec", + mip_table: "fx", + temporal_subset: None, + geographical_info: None, + } + ), + ] + } + + fn full_cmor_path_data() -> Vec<(&'static str, PathMetadata<'static>)> { + vec![ + ( + concat!( + "user-b/historical/MPI-M/MPI-ESM-LR/historical/mon/atmos/tas/r1i1p1/", + 
"tas_Amon_MPI-ESM-LR_historical_r1i1p1_185001-200512.nc" + ), + PathMetadata { + activity: "user-b", + product: "historical", + institute: "MPI-M", + model: "MPI-ESM-LR", + experiment: "historical", + frequency: Frequency::Month, + modeling_realm: ModelingRealm::Atmosphere, + variable: "tas", + ensemble: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + mip_table: "Amon", + temporal_subset: Some(TemporalSubset { + start: NaiveDate::from_ymd(1850, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(2005, 12, 1).and_hms(0, 0, 0), + suffix: None, + }), + version: None, + geographical_info: None, + }, + ), + ( + concat!( + "user-b/ancestry/FUB/MPI-ESM-LR/decs4e1988/mon/atmos/sic/r9i1p1/", + "sic_mon_MPI-ESM-LR_decs4e1988_r9i1p1_1989010100-1993123118.nc" + ), + PathMetadata { + activity: "user-b", + product: "ancestry", + institute: "FUB", + model: "MPI-ESM-LR", + experiment: "decs4e1988", + frequency: Frequency::Month, + modeling_realm: ModelingRealm::Atmosphere, + variable: "sic", + ensemble: EnsembleMember { + realization: 9, + initialization: 1, + physics: 1, + }, + mip_table: "mon", + temporal_subset: Some(TemporalSubset { + start: NaiveDate::from_ymd(1989, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(1993, 12, 31).and_hms(18, 0, 0), + suffix: None, + }), + version: None, + geographical_info: None, + }, + ), + ( + concat!( + "user-b/novolc/MPI-M/MPI-ESM-LR/decadal1970/mon/atmos/tas/r10i1p1/", + "tas_Amon_MPI-ESM-LR_decadal1970_r10i1p1_197101-198012.nc", + ), + PathMetadata { + activity: "user-b", + product: "novolc", + institute: "MPI-M", + model: "MPI-ESM-LR", + experiment: "decadal1970", + frequency: Frequency::Month, + modeling_realm: ModelingRealm::Atmosphere, + variable: "tas", + ensemble: EnsembleMember { + realization: 10, + initialization: 1, + physics: 1, + }, + mip_table: "Amon", + temporal_subset: Some(TemporalSubset { + start: NaiveDate::from_ymd(1971, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(1980, 12, 
1).and_hms(0, 0, 0), + suffix: None, + }), + version: None, + geographical_info: None, + }, + ), + ] + } +} diff --git a/drs/src/parser/cmip6.rs b/drs/src/parser/cmip6.rs new file mode 100644 index 0000000000000000000000000000000000000000..db784dac63c772ad2d5e555b2e469d3a4165ae6c --- /dev/null +++ b/drs/src/parser/cmip6.rs @@ -0,0 +1,348 @@ +use nom::{ + branch::alt, + bytes::complete::{tag, take_while, take_while_m_n}, + character::complete::{alpha1, u32 as nom_u32}, + combinator::{eof, opt, recognize, verify}, + sequence::{preceded, terminated, tuple}, + IResult, +}; + +use crate::{ + cmip6::{MemberId, PathMetadata, VariantLabel, MIP_ERA}, + parser::common::{name_segment, parse_temporal_subset, parse_version, path_segment}, +}; + +use super::common::{name_sep, path_sep, word}; + +/// Parses a string as a CMIP6 path. This assumes that the string it's given will be just the portion of a path that +/// is relevant to CMIP6 meaning any preceding directories have already been removed. +pub(crate) fn parse_cmip6(path: &str) -> IResult<&str, PathMetadata> { + /* + <mip_era>/<activity_id>/<institution_id>/<source_id>/<experiment_id>/ + <member_id>/<table_id>/<variable_id>/<grid_label>/<version>/<filename> + */ + let (i, _mip_era) = terminated(parse_mip_era, path_sep)(path)?; + let (i, activity_id) = path_segment(i)?; + let (i, institution_id) = path_segment(i)?; + let (i, source_id) = path_segment(i)?; + let (i, experiment_id) = path_segment(i)?; + let (i, member_id) = terminated(parse_member_id, path_sep)(i)?; + let (i, table_id) = path_segment(i)?; // this may be enumable + let (i, variable_id) = path_segment(i)?; + let (i, grid_label) = terminated(parse_grid_label, path_sep)(i)?; + let (i, version) = terminated(parse_version, path_sep)(i)?; + /* + <variable_id>_<table_id>_<source_id>_<experiment_id>_<member_id>_ + <grid_label>[_<time_range>].nc + */ + let (i, _variable_id) = name_segment(i)?; + let (i, _table_id) = name_segment(i)?; + let (i, _source_id) = 
name_segment(i)?;
    let (i, _experiment_id) = name_segment(i)?;
    let (i, _member_id) = terminated(parse_member_id, name_sep)(i)?;
    let (i, _grid_label) = parse_grid_label(i)?;
    let (i, time_range) = opt(parse_temporal_subset)(i)?;

    let (i, _) = tag(".nc")(i)?;
    let (i, _) = eof(i)?;

    Ok((
        i,
        PathMetadata {
            activity_id,
            institution_id,
            source_id,
            experiment_id,
            member_id,
            table_id,
            variable_id,
            grid_label,
            version,
            time_range,
        },
    ))
}

/// Parses the leading `mip_era` segment, succeeding only when the word equals [`MIP_ERA`].
fn parse_mip_era(i: &str) -> IResult<&str, &str> {
    // get the segment and check that it is cmip6 which is required by the spec
    verify(word, |w: &str| w == MIP_ERA)(i)
}

/// Parses a member id: an optional `<sub_experiment_id>-` prefix followed by a variant label.
fn parse_member_id(i: &str) -> IResult<&str, MemberId> {
    // The sub experiment id is only accepted when it is immediately followed by `-`.
    let (i, sub_experiment_id) = opt(terminated(parse_sub_experiment_id, tag("-")))(i)?;
    let (i, variant_label) = parse_variant_label(i)?;

    Ok((
        i,
        MemberId {
            sub_experiment_id,
            variant_label,
        },
    ))
}

/// Parses a sub experiment id: either the literal `none` or an initialization year (`sYYYY`).
///
/// `recognize` yields the complete matched text, so for an initialization year the leading `s` is part of
/// the returned slice.
fn parse_sub_experiment_id(i: &str) -> IResult<&str, &str> {
    recognize(alt((tag("none"), parse_initialization_year)))(i)
}

/// Parses an initialization year: the letter `s` followed by exactly four ASCII digits.
///
/// Returns only the four digits; the `s` is consumed and discarded here.
fn parse_initialization_year(i: &str) -> IResult<&str, &str> {
    let (i, _) = tag("s")(i)?;
    // `char::is_numeric` would also accept non-ASCII numerics such as `²` or `½`, which can never be part of
    // a year, so restrict the match to `0-9`.
    take_while_m_n(4, 4, |c: char| c.is_ascii_digit())(i)
}

/// Parses a variant label of the form `r<N>i<N>p<N>f<N>` where every `<N>` is an unsigned 32 bit integer.
fn parse_variant_label(i: &str) -> IResult<&str, VariantLabel> {
    let (i, realization) = preceded(tag("r"), nom_u32)(i)?;
    let (i, initialization) = preceded(tag("i"), nom_u32)(i)?;
    let (i, physics) = preceded(tag("p"), nom_u32)(i)?;
    let (i, forcing) = preceded(tag("f"), nom_u32)(i)?;

    Ok((
        i,
        VariantLabel {
            realization,
            initialization,
            physics,
            forcing,
        },
    ))
}

/// Parses a grid label: `g` followed by `m`, a native grid suffix or a regridded suffix.
///
/// The complete label (including the leading `g`) is returned.
fn parse_grid_label(i: &str) -> IResult<&str, &str> {
    recognize(tuple((
        tag("g"),
        alt((tag("m"), parse_native_grid, parse_regridded)),
    )))(i)
}

/// Parses the native grid part of a grid label: `n` optionally followed by alphabetic characters.
fn parse_native_grid(i: &str) -> IResult<&str, &str> {
    // technically allows for multiple characters following despite that not being possible in the CV
    recognize(tuple((tag("n"), opt(alpha1))))(i)
}

fn parse_regridded(i: &str)
-> IResult<&str, &str> { + recognize(tuple((tag("r"), take_while(|c: char| c.is_alphanumeric()))))(i) +} + +#[cfg(test)] +mod tests { + use chrono::NaiveDate; + + use crate::cmip5::TemporalSubset; + + use super::*; + + #[test] + fn test_grid_label() { + let cases = vec!["gr", "gm", "gn", "gna", "gng", "gr1", "gr1z", "grz"]; + for (i, c) in cases.into_iter().enumerate() { + let res = parse_grid_label(c); + assert!(res.is_ok(), "error parsing grid label on case {}", i); + let (remaining, output) = res.unwrap(); + assert_eq!( + remaining, "", + "expected no remaining data to parse on case {}", + i + ); + assert_eq!(output, c, "output did not match expected on case {}", i); + } + } + + #[test] + fn test_full_paths() { + let cases = full_path_data(); + for (i, (input, expected)) in cases.into_iter().enumerate() { + let (remaining, output) = parse_cmip6(input).expect("error parsing path"); + assert_eq!( + remaining, "", + "expected no remaining data to parse on case {}", + i + ); + assert_eq!( + expected, output, + "output did not match expected on case {}", + i + ); + } + } + + fn full_path_data() -> Vec<(&'static str, PathMetadata<'static>)> { + vec![ + ( + concat!( + "CMIP6/CMIP/HAMMOZ-Consortium/MPI-ESM-1-2-HAM/1pctCO2/r1i1p1f1/Lmon/tsl/gn/v20190628/", + "tsl_Lmon_MPI-ESM-1-2-HAM_1pctCO2_r1i1p1f1_gn_191001-192912.nc" + ), + PathMetadata { + activity_id: "CMIP", + institution_id: "HAMMOZ-Consortium", + source_id: "MPI-ESM-1-2-HAM", + experiment_id: "1pctCO2", + member_id: MemberId { + sub_experiment_id: None, + variant_label: VariantLabel { + realization: 1, + initialization: 1, + physics: 1, + forcing: 1 + } + }, + table_id: "Lmon", + variable_id: "tsl", + grid_label: "gn", + version: "v20190628", + time_range: Some(TemporalSubset { + start: NaiveDate::from_ymd(1910, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(1929, 12, 1).and_hms(0, 0, 0), + suffix: None, + }), + }, + ), + ( + concat!( + 
"CMIP6/AerChemMIP/BCC/BCC-ESM1/hist-piAer/r1i1p1f1/AERmon/c2h6/gn/v20200511/", + "c2h6_AERmon_BCC-ESM1_hist-piAer_r1i1p1f1_gn_185001-201412.nc" + ), + PathMetadata { + activity_id: "AerChemMIP", + institution_id: "BCC", + source_id: "BCC-ESM1", + experiment_id: "hist-piAer", + member_id: MemberId { + sub_experiment_id: None, + variant_label: VariantLabel { + realization: 1, + initialization: 1, + physics: 1, + forcing: 1 + } + }, + table_id: "AERmon", + variable_id: "c2h6", + grid_label: "gn", + version: "v20200511", + time_range: Some(TemporalSubset { + start: NaiveDate::from_ymd(1850, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(2014, 12, 1).and_hms(0, 0, 0), + suffix: None, + }), + } + ), + ( + concat!( + "CMIP6/GeoMIP/CCCma/CanESM5/G1/r3i1p2f1/Ofx/areacello/gn/v20190429/", + "areacello_Ofx_CanESM5_G1_r3i1p2f1_gn.nc" + ), + PathMetadata { + activity_id: "GeoMIP", + institution_id: "CCCma", + source_id: "CanESM5", + experiment_id: "G1", + member_id: MemberId { + sub_experiment_id: None, + variant_label: VariantLabel { + realization: 3, + initialization: 1, + physics: 2, + forcing: 1 + } + }, + table_id: "Ofx", + variable_id: "areacello", + grid_label: "gn", + version: "v20190429", + time_range: None, + } + ), + ( + concat!( + "CMIP6/HighResMIP/NOAA-GFDL/GFDL-CM4C192/highresSST-future/r1i1p1f1/6hrPlevPt/psl/gr3/v20180701/", + "psl_6hrPlevPt_GFDL-CM4C192_highresSST-future_r1i1p1f1_gr3_203501010600-205101010000.nc" + ), + PathMetadata { + activity_id: "HighResMIP", + institution_id: "NOAA-GFDL", + source_id: "GFDL-CM4C192", + experiment_id: "highresSST-future", + member_id: MemberId { + sub_experiment_id: None, + variant_label: VariantLabel { + realization: 1, + initialization: 1, + physics: 1, + forcing: 1 + } + }, + table_id: "6hrPlevPt", + variable_id: "psl", + grid_label: "gr3", + version: "v20180701", + time_range: Some(TemporalSubset { + start: NaiveDate::from_ymd(2035, 1, 1).and_hms(6, 0, 0), + end: NaiveDate::from_ymd(2051, 1, 1).and_hms(0, 0, 0), + 
suffix: None, + }), + } + ), + ( + concat!( + "CMIP6/ScenarioMIP/UA/MCM-UA-1-0/ssp370/r1i1p1f2/SImon/sithick/gn/v20190731/", + "sithick_SImon_MCM-UA-1-0_ssp370_r1i1p1f2_gn_201501-210012.nc" + ), + PathMetadata { + activity_id: "ScenarioMIP", + institution_id: "UA", + source_id: "MCM-UA-1-0", + experiment_id: "ssp370", + member_id: MemberId { + sub_experiment_id: None, + variant_label: VariantLabel { + realization: 1, + initialization: 1, + physics: 1, + forcing: 2 + } + }, + table_id: "SImon", + variable_id: "sithick", + grid_label: "gn", + version: "v20190731", + time_range: Some(TemporalSubset { + start: NaiveDate::from_ymd(2015, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(2100, 12, 1).and_hms(0, 0, 0), + suffix: None, + }), + } + ), + ( + concat!( + "CMIP6/CDRMIP/NCAR/CESM2/1pctCO2-cdr/r1i1p1f1/Omon/intpn2/gr/v20190613/", + "intpn2_Omon_CESM2_1pctCO2-cdr_r1i1p1f1_gr_005001-009912.nc" + ), + PathMetadata { + activity_id: "CDRMIP", + institution_id: "NCAR", + source_id: "CESM2", + experiment_id: "1pctCO2-cdr", + member_id: MemberId { + sub_experiment_id: None, + variant_label: VariantLabel { + realization: 1, + initialization: 1, + physics: 1, + forcing: 1 + } + }, + table_id: "Omon", + variable_id: "intpn2", + grid_label: "gr", + version: "v20190613", + time_range: Some(TemporalSubset { + start: NaiveDate::from_ymd(50, 1, 1).and_hms(0, 0, 0), + end: NaiveDate::from_ymd(99, 12, 1).and_hms(0, 0, 0), + suffix: None, + }), + } + ) + ] + } +} diff --git a/drs/src/parser/common.rs b/drs/src/parser/common.rs new file mode 100644 index 0000000000000000000000000000000000000000..ff0ec6c40138d7f53a6e794459d628f5376d3956 --- /dev/null +++ b/drs/src/parser/common.rs @@ -0,0 +1,128 @@ +use chrono::{NaiveDate, NaiveDateTime}; +use nom::{ + bytes::complete::{tag, take_while1, take_while_m_n}, + character::complete::digit1, + combinator::{map_res, opt, recognize}, + sequence::{terminated, tuple}, + IResult, +}; + +use crate::cmip5::TemporalSubset; + +pub(super) fn 
parse_version(i: &str) -> IResult<&str, &str> { + recognize(tuple((tag("v"), digit1)))(i) +} + +pub(super) fn parse_temporal_subset(i: &str) -> IResult<&str, TemporalSubset> { + let (i, _) = name_sep(i)?; // drop the name separator + let (i, (start, _, end)) = tuple((parse_time, tag("-"), parse_time))(i)?; + let (i, suffix) = opt(word)(i)?; + + Ok((i, TemporalSubset { start, end, suffix })) +} + +pub(super) fn parse_time(i: &str) -> IResult<&str, NaiveDateTime> { + let (i, year) = parse_n_i32(4)(i)?; + let (i, month) = opt(parse_n_u32(2))(i)?; + let (i, day) = opt(parse_n_u32(2))(i)?; + let (i, hour) = opt(parse_n_u32(2))(i)?; + let (i, minute) = opt(parse_n_u32(2))(i)?; + let (i, second) = opt(parse_n_u32(2))(i)?; + + Ok(( + i, + NaiveDate::from_ymd(year, month.unwrap_or(1), day.unwrap_or(1)).and_hms( + hour.unwrap_or(0), + minute.unwrap_or(0), + second.unwrap_or(0), + ), + )) +} + +/// Parses a segment of the path when no special verification is required. +/// This is a shorthand for consuming both a section of the path and following `/` while discarding the `/`. +/// +/// # Examples +/// ```ignore +/// assert_eq!(segment("foo/"), Ok(("", "foo"))); +/// ``` +pub(super) fn path_segment(i: &str) -> IResult<&str, &str> { + terminated(word, path_sep)(i) +} + +pub(super) fn name_segment(i: &str) -> IResult<&str, &str> { + terminated(word, name_sep)(i) +} + +/// Parser for a mostly alphanumeric text segment. +/// Most DRS segments allow `a-zA-Z0-9\-` i.e. alphanumeric plus `-` so this parser will accept a full string of those. +pub(super) fn word(i: &str) -> IResult<&str, &str> { + take_while1(|c: char| c.is_alphanumeric() || c == '-')(i) +} + +/// Parser for all path separators. +/// +/// Allows for multiple consecutive separators though such a path would likely be invalid or incorrect anyway. +pub(super) fn path_sep(i: &str) -> IResult<&str, &str> { + take_while1(std::path::is_separator)(i) +} + +/// Parser for the separator used within the filename. 
+pub(super) fn name_sep(i: &str) -> IResult<&str, &str> { + tag("_")(i) +} + +pub(super) fn parse_n_u32<'a>(n: usize) -> impl FnMut(&'a str) -> IResult<&'a str, u32> { + map_res(take_while_m_n(n, n, |c: char| c.is_digit(10)), str::parse) +} + +pub(super) fn parse_n_i32<'a>(n: usize) -> impl FnMut(&'a str) -> IResult<&'a str, i32> { + map_res(take_while_m_n(n, n, |c: char| c.is_digit(10)), str::parse) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_path_segment() { + let i = "foo/bar/baz"; + let (i, foo) = path_segment(i).unwrap(); + assert_eq!("bar/baz", i); + assert_eq!("foo", foo); + } + + #[test] + fn test_name_segment() { + let i = "foo_bar_baz"; + let (i, foo) = name_segment(i).unwrap(); + assert_eq!("bar_baz", i); + assert_eq!("foo", foo); + } + + #[test] + fn test_parse_time() { + let cases = &[ + ("1901", (1901, 1, 1, 0, 0, 0)), + ("190102", (1901, 2, 1, 0, 0, 0)), + ("19010203", (1901, 2, 3, 0, 0, 0)), + ("1901020304", (1901, 2, 3, 4, 0, 0)), + ("190102030405", (1901, 2, 3, 4, 5, 0)), + ("19010203040506", (1901, 2, 3, 4, 5, 6)), + ("19011203040506", (1901, 12, 3, 4, 5, 6)), + ("19011203040526", (1901, 12, 3, 4, 5, 26)), + ]; + for (s, expected) in cases.iter() { + let res = parse_time(s); + assert_eq!( + Ok(( + "", + NaiveDate::from_ymd(expected.0, expected.1, expected.2) + .and_hms(expected.3, expected.4, expected.5) + )), + res, + "input '{s}' did not give expected result '{expected:?}'" + ); + } + } +} diff --git a/drs/src/parser/cordex.rs b/drs/src/parser/cordex.rs new file mode 100644 index 0000000000000000000000000000000000000000..9e16013ee875c6d0c077ce9dcb10cb15f8b9b0e3 --- /dev/null +++ b/drs/src/parser/cordex.rs @@ -0,0 +1,436 @@ +use std::str::FromStr; + +use chrono::NaiveDateTime; +use nom::{ + bytes::complete::tag, + combinator::{eof, map_res, opt}, + sequence::{preceded, terminated}, + IResult, +}; + +use crate::{ + cordex::{Frequency, PathMetadata}, + parser::{ + cmip5::parse_ensemble, + common::{name_segment, name_sep, 
parse_time, parse_version, path_sep}, + }, +}; + +use super::common::{path_segment, word}; + +/// Parses a string as a CORDEX path. This assumes that the string it's given will be just the portion of a path that +/// is relevant to CORDEX meaning any preceding directories have already been removed. +pub(crate) fn parse_cordex(path: &str) -> IResult<&str, PathMetadata> { + /* + <activity>/<product>/<domain>/<institution>/ + <gcm_model_name>/<cmip5_experiement_name>/<cmip5_ensemble_member>/ + <rcm_model_name>/<rcm_version_id>/<frequency>/<variable_name>/[<version>/] + */ + let (i, activity) = path_segment(path)?; + let (i, product) = path_segment(i)?; + let (i, domain) = path_segment(i)?; + let (i, institution) = path_segment(i)?; + let (i, gcm_model_name) = path_segment(i)?; + let (i, cmip5_experiment_name) = path_segment(i)?; + let (i, cmip5_ensemble_member) = terminated(parse_ensemble, path_sep)(i)?; + let (i, rcm_model_name) = path_segment(i)?; + let (i, rcm_version_id) = path_segment(i)?; + let (i, frequency) = terminated(parse_frequency, path_sep)(i)?; + let (i, variable_name) = path_segment(i)?; + let (i, version) = opt(terminated(parse_version, path_sep))(i)?; + + /* + <variable_name>_<domain>_<gcm_model_name>_ + <cmip5_experiment_name>_<cmip5_ensemble_member>_ + <rcm_model_name>_<rcm_version_id>_<frequency>[_<start_time>-<end_time>].nc + */ + let (i, _variable) = name_segment(i)?; + let (i, _domain) = name_segment(i)?; + let (i, _gcm_model_name) = name_segment(i)?; + let (i, _cmip5_experiment_name) = name_segment(i)?; + let (i, _cmip5_ensemble_member) = terminated(parse_ensemble, name_sep)(i)?; + let (i, _rcm_model_name) = name_segment(i)?; + let (i, _rcm_version_id) = name_segment(i)?; + // no `terminated path_sep` due to this being the last unless followed by times + let (i, _frequency) = parse_frequency(i)?; + let (i, times) = opt(preceded(name_sep, parse_start_end_time))(i)?; + + let (i, _) = tag(".nc")(i)?; + let (i, _) = eof(i)?; + + Ok(( + i, + 
PathMetadata { + activity, + product, + domain, + institution, + gcm_model_name, + cmip5_experiment_name, + cmip5_ensemble_member, + rcm_model_name, + rcm_version_id, + frequency, + variable_name, + version, + start_time: times.map(|(s, _e)| s), + end_time: times.map(|(_s, e)| e), + }, + )) +} + +fn parse_frequency(i: &str) -> IResult<&str, Frequency> { + map_res(word, Frequency::from_str)(i) +} + +fn parse_start_end_time(i: &str) -> IResult<&str, (NaiveDateTime, NaiveDateTime)> { + let (i, start) = parse_time(i)?; + let (i, _) = tag("-")(i)?; + let (i, end) = parse_time(i)?; + + Ok((i, (start, end))) +} + +#[cfg(test)] +mod tests { + use chrono::NaiveDate; + + use crate::cmip5::EnsembleMember; + + use super::*; + + #[test] + fn test_full_paths() { + let cases = full_path_data(); + for (i, (input, expected)) in cases.into_iter().enumerate() { + let (remaining, output) = parse_cordex(input).expect("error parsing esgf path"); + assert_eq!( + remaining, "", + "expected no remaining data to parse on case {}", + i + ); + assert_eq!( + expected, output, + "output did not match expected on case {}", + i + ); + } + } + + fn full_path_data() -> Vec<(&'static str, PathMetadata<'static>)> { + vec![ + ( + concat!( + "cordex/output/AFR-44/MOHC/ECMWF-ERAINT/evaluation/r1i1p1/MOHC-HadRM3P/v1/day/rsdt/v20131211/", + "rsdt_AFR-44_ECMWF-ERAINT_evaluation_r1i1p1_MOHC-HadRM3P_v1_day_19910101-19951231.nc" + ), + PathMetadata { + activity: "cordex", + product: "output", + domain: "AFR-44", + institution: "MOHC", + gcm_model_name: "ECMWF-ERAINT", + cmip5_experiment_name:"evaluation", + cmip5_ensemble_member: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + rcm_model_name: "MOHC-HadRM3P", + rcm_version_id: "v1", + frequency: Frequency::Day, + variable_name: "rsdt", + version: Some("v20131211"), + start_time: Some(NaiveDate::from_ymd(1991,1,1).and_hms(0, 0, 0)), + end_time: Some(NaiveDate::from_ymd(1995,12,31).and_hms(0, 0, 0)), + } + ), + ( + concat!( + 
"cordex/output/AFR-44/MOHC/ECMWF-ERAINT/evaluation/r1i1p1/MOHC-HadRM3P/v1/day/prw/v20131211/", + "prw_AFR-44_ECMWF-ERAINT_evaluation_r1i1p1_MOHC-HadRM3P_v1_day_19960101-20001231.nc" + ), + PathMetadata { + activity: "cordex", + product: "output", + domain: "AFR-44", + institution: "MOHC", + gcm_model_name: "ECMWF-ERAINT", + cmip5_experiment_name:"evaluation", + cmip5_ensemble_member: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + rcm_model_name: "MOHC-HadRM3P", + rcm_version_id: "v1", + frequency: Frequency::Day, + variable_name: "prw", + version: Some("v20131211"), + start_time: Some(NaiveDate::from_ymd(1996,1,1).and_hms(0, 0, 0)), + end_time: Some(NaiveDate::from_ymd(2000,12,31).and_hms(0, 0, 0)), + } + ), + ( + concat!( + "cordex/output/AFR-44/MOHC/ECMWF-ERAINT/evaluation/r1i1p1/MOHC-HadRM3P/v1/day/ta500/v20131211/", + "ta500_AFR-44_ECMWF-ERAINT_evaluation_r1i1p1_MOHC-HadRM3P_v1_day_20060101-20101231.nc" + ), + PathMetadata { + activity: "cordex", + product: "output", + domain: "AFR-44", + institution: "MOHC", + gcm_model_name: "ECMWF-ERAINT", + cmip5_experiment_name:"evaluation", + cmip5_ensemble_member: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + rcm_model_name: "MOHC-HadRM3P", + rcm_version_id: "v1", + frequency: Frequency::Day, + variable_name: "ta500", + version: Some("v20131211"), + start_time: Some(NaiveDate::from_ymd(2006,1,1).and_hms(0, 0, 0)), + end_time: Some(NaiveDate::from_ymd(2010,12,31).and_hms(0, 0, 0)), + } + ), + ( + concat!( + "cordex/output/AFR-44/MOHC/ECMWF-ERAINT/evaluation/r1i1p1/MOHC-HadRM3P/v1/day/tauu/v20131211/", + "tauu_AFR-44_ECMWF-ERAINT_evaluation_r1i1p1_MOHC-HadRM3P_v1_day_19910101-19951231.nc" + ), + PathMetadata { + activity: "cordex", + product: "output", + domain: "AFR-44", + institution: "MOHC", + gcm_model_name: "ECMWF-ERAINT", + cmip5_experiment_name:"evaluation", + cmip5_ensemble_member: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, 
+ rcm_model_name: "MOHC-HadRM3P", + rcm_version_id: "v1", + frequency: Frequency::Day, + variable_name: "tauu", + version: Some("v20131211"), + start_time: Some(NaiveDate::from_ymd(1991,1,1).and_hms(0, 0, 0)), + end_time: Some(NaiveDate::from_ymd(1995,12,31).and_hms(0, 0, 0)), + } + ), + ( + concat!( + "cordex/output/AFR-44/MOHC/ECMWF-ERAINT/evaluation/r1i1p1/MOHC-HadRM3P/v1/day/rlut/v20131211/", + "rlut_AFR-44_ECMWF-ERAINT_evaluation_r1i1p1_MOHC-HadRM3P_v1_day_20010101-20051231.nc" + ), + PathMetadata { + activity: "cordex", + product: "output", + domain: "AFR-44", + institution: "MOHC", + gcm_model_name: "ECMWF-ERAINT", + cmip5_experiment_name:"evaluation", + cmip5_ensemble_member: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + rcm_model_name: "MOHC-HadRM3P", + rcm_version_id: "v1", + frequency: Frequency::Day, + variable_name: "rlut", + version: Some("v20131211"), + start_time: Some(NaiveDate::from_ymd(2001,1,1).and_hms(0, 0, 0)), + end_time: Some(NaiveDate::from_ymd(2005,12,31).and_hms(0, 0, 0)), + } + ), + ( + concat!( + "cordex/output/AFR-44/MOHC/ECMWF-ERAINT/evaluation/r1i1p1/MOHC-HadRM3P/v1/day/ua500/v20131211/", + "ua500_AFR-44_ECMWF-ERAINT_evaluation_r1i1p1_MOHC-HadRM3P_v1_day_19960101-20001231.nc" + ), + PathMetadata { + activity: "cordex", + product: "output", + domain: "AFR-44", + institution: "MOHC", + gcm_model_name: "ECMWF-ERAINT", + cmip5_experiment_name:"evaluation", + cmip5_ensemble_member: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + rcm_model_name: "MOHC-HadRM3P", + rcm_version_id: "v1", + frequency: Frequency::Day, + variable_name: "ua500", + version: Some("v20131211"), + start_time: Some(NaiveDate::from_ymd(1996,1,1).and_hms(0, 0, 0)), + end_time: Some(NaiveDate::from_ymd(2000,12,31).and_hms(0, 0, 0)), + } + ), + ( + concat!( + "cordex/output/AFR-44/MOHC/ECMWF-ERAINT/evaluation/r1i1p1/MOHC-HadRM3P/v1/day/tauv/v20131211/", + 
"tauv_AFR-44_ECMWF-ERAINT_evaluation_r1i1p1_MOHC-HadRM3P_v1_day_20110101-20111130.nc" + ), + PathMetadata { + activity: "cordex", + product: "output", + domain: "AFR-44", + institution: "MOHC", + gcm_model_name: "ECMWF-ERAINT", + cmip5_experiment_name:"evaluation", + cmip5_ensemble_member: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + rcm_model_name: "MOHC-HadRM3P", + rcm_version_id: "v1", + frequency: Frequency::Day, + variable_name: "tauv", + version: Some("v20131211"), + start_time: Some(NaiveDate::from_ymd(2011,1,1).and_hms(0, 0, 0)), + end_time: Some(NaiveDate::from_ymd(2011,11,30).and_hms(0, 0, 0)), + } + ), + ( + concat!( + "cordex/output/AFR-44/MOHC/ECMWF-ERAINT/evaluation/r1i1p1/MOHC-HadRM3P/v1/day/hus850/v20131211/", + "hus850_AFR-44_ECMWF-ERAINT_evaluation_r1i1p1_MOHC-HadRM3P_v1_day_20110101-20111130.nc" + ), + PathMetadata { + activity: "cordex", + product: "output", + domain: "AFR-44", + institution: "MOHC", + gcm_model_name: "ECMWF-ERAINT", + cmip5_experiment_name:"evaluation", + cmip5_ensemble_member: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + rcm_model_name: "MOHC-HadRM3P", + rcm_version_id: "v1", + frequency: Frequency::Day, + variable_name: "hus850", + version: Some("v20131211"), + start_time: Some(NaiveDate::from_ymd(2011,1,1).and_hms(0, 0, 0)), + end_time: Some(NaiveDate::from_ymd(2011,11,30).and_hms(0, 0, 0)), + } + ), + ( + concat!( + "cordex/output/AFR-44/MOHC/ECMWF-ERAINT/evaluation/r1i1p1/MOHC-HadRM3P/v1/day/evspsbl/v20131211/", + "evspsbl_AFR-44_ECMWF-ERAINT_evaluation_r1i1p1_MOHC-HadRM3P_v1_day_20010101-20051231.nc" + ), + PathMetadata { + activity: "cordex", + product: "output", + domain: "AFR-44", + institution: "MOHC", + gcm_model_name: "ECMWF-ERAINT", + cmip5_experiment_name:"evaluation", + cmip5_ensemble_member: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + rcm_model_name: "MOHC-HadRM3P", + rcm_version_id: "v1", + frequency: 
Frequency::Day, + variable_name: "evspsbl", + version: Some("v20131211"), + start_time: Some(NaiveDate::from_ymd(2001,1,1).and_hms(0, 0, 0)), + end_time: Some(NaiveDate::from_ymd(2005,12,31).and_hms(0, 0, 0)), + } + ), + ( + concat!( + "cordex/output/AFR-44/MOHC/ECMWF-ERAINT/evaluation/r1i1p1/MOHC-HadRM3P/v1/day/va200/v20131211/", + "va200_AFR-44_ECMWF-ERAINT_evaluation_r1i1p1_MOHC-HadRM3P_v1_day_19900101-19901231.nc" + ), + PathMetadata { + activity: "cordex", + product: "output", + domain: "AFR-44", + institution: "MOHC", + gcm_model_name: "ECMWF-ERAINT", + cmip5_experiment_name:"evaluation", + cmip5_ensemble_member: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + rcm_model_name: "MOHC-HadRM3P", + rcm_version_id: "v1", + frequency: Frequency::Day, + variable_name: "va200", + version: Some("v20131211"), + start_time: Some(NaiveDate::from_ymd(1990,1,1).and_hms(0, 0, 0)), + end_time: Some(NaiveDate::from_ymd(1990,12,31).and_hms(0, 0, 0)), + } + ), + ( + concat!( + "cordex/output/AFR-44/MOHC/ECMWF-ERAINT/evaluation/r1i1p1/MOHC-HadRM3P/v1/day/sfcWindmax/v20131211/", + "sfcWindmax_AFR-44_ECMWF-ERAINT_evaluation_r1i1p1_MOHC-HadRM3P_v1_day_20110101-20111130.nc" + ), + PathMetadata { + activity: "cordex", + product: "output", + domain: "AFR-44", + institution: "MOHC", + gcm_model_name: "ECMWF-ERAINT", + cmip5_experiment_name:"evaluation", + cmip5_ensemble_member: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + rcm_model_name: "MOHC-HadRM3P", + rcm_version_id: "v1", + frequency: Frequency::Day, + variable_name: "sfcWindmax", + version: Some("v20131211"), + start_time: Some(NaiveDate::from_ymd(2011,1,1).and_hms(0, 0, 0)), + end_time: Some(NaiveDate::from_ymd(2011,11,30).and_hms(0, 0, 0)), + } + ), + ( + concat!( + "cordex/output/AFR-44/MOHC/ECMWF-ERAINT/evaluation/r1i1p1/MOHC-HadRM3P/v1/day/zmla/v20131211/", + "zmla_AFR-44_ECMWF-ERAINT_evaluation_r1i1p1_MOHC-HadRM3P_v1_day_20010101-20051231.nc" + ), + PathMetadata { 
+ activity: "cordex", + product: "output", + domain: "AFR-44", + institution: "MOHC", + gcm_model_name: "ECMWF-ERAINT", + cmip5_experiment_name:"evaluation", + cmip5_ensemble_member: EnsembleMember { + realization: 1, + initialization: 1, + physics: 1, + }, + rcm_model_name: "MOHC-HadRM3P", + rcm_version_id: "v1", + frequency: Frequency::Day, + variable_name: "zmla", + version: Some("v20131211"), + start_time: Some(NaiveDate::from_ymd(2001,1,1).and_hms(0, 0, 0)), + end_time: Some(NaiveDate::from_ymd(2005,12,31).and_hms(0, 0, 0)), + } + ) + ] + } +} diff --git a/freva-ingest/Cargo.toml b/freva-ingest/Cargo.toml index 098f74b8fe8f35540f12f6309ffbe00c5bccd269..d5b4f1d8b2016f3b672174520479c41ff9f17106 100644 --- a/freva-ingest/Cargo.toml +++ b/freva-ingest/Cargo.toml @@ -16,6 +16,7 @@ configparser = "3.0" tokio = { version = "1.15", features = ["full"] } tracing = "0.1" tracing-subscriber = "0.3" +toml = "0.5" directories = "4.0.1" chrono = "0.4" humantime = "2.1" diff --git a/freva-ingest/src/cli.rs b/freva-ingest/src/cli.rs index 5674706ce62fa5959f240dd519d1ff53c41e0229..56857e3bffdae77e8011f6a93ba11c3015434343 100644 --- a/freva-ingest/src/cli.rs +++ b/freva-ingest/src/cli.rs @@ -8,14 +8,14 @@ pub struct Cli { #[clap( short, parse(from_occurrences), - help = "log level to use, defaults to warning (0) up to trace (3+)." + help = "Log level to use, defaults to warning (0) up to trace (3+)." )] pub verbosity: usize, #[clap( long, - env = "EVALUATION_SYSTEM_CONFIG_FILE", - help = concat!("directory with freva config files. If not set, defaults to an OS specific default", + env = "EVALUATION_SYSTEM_CONFIG_DIR", + help = concat!("Directory with freva config files. If not set, defaults to an OS specific default", " e.g. on Linux ~/.config/freva") )] pub config_dir: Option<PathBuf>, @@ -26,9 +26,14 @@ pub struct Cli { #[derive(Debug, Parser)] pub struct IngestOpts { - #[clap(long)] - pub data_dir: PathBuf, + #[clap( + long, + help = concat!("Directory to ingest. 
If not present, all configured datasets are ingested.", + " Can be set to a directory within a dataset to only import data within that directory using the ", + "enclosing dataset for its configuration.") + )] + pub data_dir: Option<PathBuf>, - #[clap(long, help = "size of batches to send to solr", default_value = "50")] + #[clap(long, help = "Size of batches to send to solr", default_value = "1000")] pub batch_size: usize, } diff --git a/freva-ingest/src/config.rs b/freva-ingest/src/config.rs index 5b172788ccf1439246c9fdf683257be2c9e7b7f6..4c4dff819453cf0cffec52de5c818a068d5c93d4 100644 --- a/freva-ingest/src/config.rs +++ b/freva-ingest/src/config.rs @@ -1,9 +1,12 @@ +use std::fs::File; +use std::io::Read; use std::path::Path; use anyhow::anyhow; use anyhow::{Context, Result}; use configparser::ini::Ini; -use freva::drs::Config as DrsConfig; + +use freva::drs::metadata::ConfigBuilder; use log::debug; #[derive(Debug)] @@ -57,9 +60,13 @@ pub fn config<P: AsRef<Path>>(config_dir: P) -> Result<Config> { Ok(Config { solr }) } -pub fn drs_config<P: AsRef<Path>>(config_dir: P) -> Result<DrsConfig> { +pub fn drs_config<P: AsRef<Path>>(config_dir: P) -> Result<freva::drs::Config> { let config_file = config_dir.as_ref().join(DRS_CONFIG_NAME); debug!("loading drs config from {}", config_file.display()); - DrsConfig::from_toml(&config_file) - .with_context(|| format!("error reading config file {:?}", config_file)) + + let mut config_data = String::new(); + File::open(config_file)?.read_to_string(&mut config_data)?; + + let builder: ConfigBuilder = toml::from_str(&config_data)?; + builder.build().context("error building DRS config") } diff --git a/freva-ingest/src/main.rs b/freva-ingest/src/main.rs index 3faff21f5aeedac954227c8cdba188672635af70..3812fa9b444ec69c22bf7c800c4e109f703a0a2e 100644 --- a/freva-ingest/src/main.rs +++ b/freva-ingest/src/main.rs @@ -15,7 +15,7 @@ const APP_NAME: &str = "freva"; #[tokio::main] async fn main() -> Result<()> { - let args = 
Cli::try_parse()?; + let args = Cli::parse(); tracing_subscriber::fmt() .with_max_level(log_level(args.verbosity)) diff --git a/freva-ingest/src/solr.rs b/freva-ingest/src/solr.rs index cf6abcb16589407eed6a037a67736e56f9df9ecc..663161d39ded466b4ab7a65be8a8c57eb14806ca 100644 --- a/freva-ingest/src/solr.rs +++ b/freva-ingest/src/solr.rs @@ -11,7 +11,7 @@ pub async fn ingest(opts: &IngestOpts, conf: &Config, drs_conf: &DrsConfig) -> R let reports = freva::drs::ingest( &solr, drs_conf, - &opts.data_dir, + &opts.data_dir.as_deref(), opts.batch_size, Some(&conf.solr.core), None, @@ -19,7 +19,7 @@ pub async fn ingest(opts: &IngestOpts, conf: &Config, drs_conf: &DrsConfig) -> R .await .context("error ingesting data to solr")?; - for r in reports.structures.iter() { + for r in reports.datasets.iter() { let time = humantime::format_duration(r.time); println!( "[{}]\n\ @@ -27,7 +27,7 @@ pub async fn ingest(opts: &IngestOpts, conf: &Config, drs_conf: &DrsConfig) -> R skipped {}\n\ took {}\n\ ", - r.structure.activity(), + r.dataset.name(), r.sent, r.skipped, time, diff --git a/freva/Cargo.toml b/freva/Cargo.toml index 23545af1970b4a20de63b4f732a8be1039d4d82c..396e4cab73f4cdde244b67f02931409e84d0986d 100644 --- a/freva/Cargo.toml +++ b/freva/Cargo.toml @@ -7,14 +7,11 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +drs = { path = "../drs" } thiserror = "1.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" -reqwest = { version = "0.11", default_features = false, features = [ - "blocking", - "json", - "rustls-tls", -] } +reqwest = { version = "0.11", features = ["blocking", "json", "rustls"] } walkdir = "2" camino = { version = "1.0.5", features = ["serde1"] } toml = "0.5" @@ -22,7 +19,12 @@ tokio = { version = "1.15", features = ["full"] } tracing = "0.1" futures = "0.3" url = "2.2" +netcdf = { version = "0.7", features = ["static"] } +nom = "7.1" +chrono = "0.4" 
+lazy_static = "1.4.0" [dev-dependencies] env_logger = "0.9" wiremock = "0.5" +tracing-subscriber = "0.3" diff --git a/freva/src/drs.rs b/freva/src/drs.rs index 2a7f06deb66c8ffa5d5470760a63ae46f4b68c6d..5930a5ddfa894441be7bb0bb95a6bd1d7e3aa44f 100644 --- a/freva/src/drs.rs +++ b/freva/src/drs.rs @@ -5,9 +5,10 @@ mod ingest; pub mod metadata; +mod path; mod search; pub use ingest::ingest; pub use search::search; -pub use metadata::{Config, Error, Metadata, Structure}; +pub use metadata::{Config, Dataset, Error, Metadata}; diff --git a/freva/src/drs/ingest.rs b/freva/src/drs/ingest.rs index 4eb4f71fbf64de4c1f06a882a8fe393edccf6043..caadb5ef66651bdb9452120ddf8dbdb45abfda2a 100644 --- a/freva/src/drs/ingest.rs +++ b/freva/src/drs/ingest.rs @@ -1,6 +1,7 @@ //! Handles inserting DRS file metadata into Solr use std::io; +use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; use std::{collections::HashMap, path::Path}; @@ -12,13 +13,15 @@ use tokio::time::Instant; use tracing::{error, info, instrument, trace, warn}; use walkdir::{DirEntry, WalkDir}; -use crate::drs::Structure; +use crate::drs::Dataset; use crate::{ drs::{Config, Metadata}, solr::{Solr, SolrError}, }; use crate::{DEFAULT_FILES_COLLECTION, DEFAULT_LATEST_COLLECTION}; +use super::path::absolute; + /// Errors possible while ingesting data into solr #[derive(Debug, Error)] pub enum IngestError { @@ -28,23 +31,27 @@ pub enum IngestError { /// Error crawling the filesystem tree. Most likely a symlink loop or a broken file. See underlying error for more /// details. 
- #[error("error while walking the directory structure")] + #[error("error while walking the directory structure: {0}")] WalkDirectory(#[from] walkdir::Error), /// Error canonicalizing data dir path - #[error("error canonicalizing data-dir, may not exist")] - DataDirCanonicalization(#[from] io::Error), + #[error("error getting absolute path for data-dir")] + AbsolutePath(#[from] io::Error), + + /// Given data dir was not contained in any known dataset + #[error("Given data-dir was not contained in any known dataset")] + InvalidDataDir, } #[derive(Debug)] pub struct IngestReport { - pub structures: Vec<StructureReport>, + pub datasets: Vec<DatasetReport>, pub total_time: Duration, } #[derive(Debug)] -pub struct StructureReport { - pub structure: Structure, +pub struct DatasetReport { + pub dataset: Dataset, pub sent: u32, pub skipped: u32, pub time: Duration, @@ -52,54 +59,73 @@ pub struct StructureReport { /// Ingests metadata from filesystem into freva's solr /// -/// Spawns 2 tasks per [`Structure`] in the DRS config where `root_dir` is contained in (i.e. `starts_with`) `data_dir`. +/// Spawns 2 tasks per [`Dataset`] in the DRS config where `root_dir` is contained in (i.e. `starts_with`) `data_dir`. /// One task is the computation thread which processes the file names as they come in and it spawns the second task /// which is a blocking thread that uses `walkdir` synchronously and feeds the paths it finds into a channel. /// /// `files_collection` and `latest_collection` are optional. If `None`, then this will use Freva's defaults: /// [`DEFAULT_FILES_COLLECTION`] and [`DEFAULT_LATEST_COLLECTION`], respectively. 
-pub async fn ingest<P: AsRef<Path>>( +pub async fn ingest( solr: &Solr, drs_conf: &Config, - data_dir: P, + data_dir: &Option<&Path>, batch_size: usize, files_collection: Option<&str>, latest_collection: Option<&str>, ) -> Result<IngestReport, IngestError> { let mut handles = Vec::new(); - let data_dir = tokio::fs::canonicalize(data_dir.as_ref()).await?; let start_time = Instant::now(); - for structure in drs_conf.structures() { - if structure.root().starts_with(&data_dir) { - // clone for each thread to capture without worrying about lifetimes - let solr = solr.clone(); - let structure = structure.clone(); - let files_collection = files_collection - .unwrap_or(DEFAULT_FILES_COLLECTION) - .to_owned(); - let latest_collection = latest_collection - .unwrap_or(DEFAULT_LATEST_COLLECTION) - .to_owned(); - handles.push(tokio::spawn(async move { - ingest_structure( - solr, - structure, - batch_size, - &files_collection, - &latest_collection, - ) - .await - })); + // immediately canonicalize data-dir if given to ensure we always operate on the fully qualified form + let data_dir = match data_dir { + //Some(p) => Some(tokio::fs::canonicalize(p).await?), + Some(p) => Some(absolute(p)?), + None => None, + }; + + // If we were not given data-dir, ingest all datasets + // If we were given a data-dir then look for a dataset that contains it and only ingest from that dataset + let datasets: Vec<&Dataset> = match data_dir.as_ref() { + None => drs_conf.datasets().collect(), + Some(p) => { + let dataset = drs_conf.datasets().find(|d| p.starts_with(d.root())); + match dataset { + None => return Err(IngestError::InvalidDataDir), + Some(d) => vec![d], + } } + }; + + for dataset in datasets.into_iter() { + // clone for each thread to capture without worrying about lifetimes + let solr = solr.clone(); + let dataset = dataset.clone(); + let files_collection = files_collection + .unwrap_or(DEFAULT_FILES_COLLECTION) + .to_owned(); + let latest_collection = latest_collection + 
.unwrap_or(DEFAULT_LATEST_COLLECTION) + .to_owned(); + let data_dir = data_dir.as_ref().cloned(); + handles.push(tokio::spawn(async move { + ingest_dataset( + solr, + dataset, + data_dir, + batch_size, + &files_collection, + &latest_collection, + ) + .await + })); } let results = futures::future::join_all(handles).await; let (success, errors): (Vec<_>, Vec<_>) = results.into_iter().partition(|r| r.is_ok()); for err in errors.into_iter().map(|e| e.unwrap_err()) { - error!("error while processing a DRS structure: {}", err); + error!("error while processing a DRS dataset: {}", err); } let (results, errors): (Vec<_>, Vec<_>) = success @@ -108,32 +134,34 @@ pub async fn ingest<P: AsRef<Path>>( .partition(|r| r.is_ok()); for err in errors.into_iter().map(|e| e.unwrap_err()) { - error!("error while processing a DRS structure: {}", err); + error!("error while processing a DRS dataset: {}", err); } let reports = results.into_iter().map(|r| r.unwrap()).collect(); Ok(IngestReport { - structures: reports, + datasets: reports, total_time: start_time.elapsed(), }) } -/// Ingests a single structure from the DRS config. +/// Ingests a single dataset from the DRS config. /// -/// This handles crawling the structure's `root_dir` and ingesting any files it finds there are valid for the structure. -/// This inserts the files it finds into both files and latest collection within solr. +/// This handles crawling either the dataset's `root_dir` or the given `ingest_dir` and ingesting any files it finds +/// that are valid for the dataset. This inserts the files it finds into both files and latest collection within solr. /// +/// This expects that `ingest_dir` will be a fully qualified path if set. /// Invalid files, whatever the reason they're invalid, will be skipped. Failing to send to solr will result in this /// ending early with an error. Any files that were already sent to solr will not be cleaned up. 
-#[instrument(skip(solr, structure), fields(structure = structure.activity().as_str()))] -async fn ingest_structure( +#[instrument(skip_all, fields(dataset = dataset.name(), batch_size=batch_size))] +async fn ingest_dataset( solr: Solr, - structure: Structure, + dataset: Dataset, + ingest_dir: Option<PathBuf>, batch_size: usize, files_collection: &str, latest_collection: &str, -) -> Result<StructureReport, IngestError> { +) -> Result<DatasetReport, IngestError> { let mut file_buf = Vec::with_capacity(batch_size); let mut latest_buf = Vec::with_capacity(batch_size); @@ -141,17 +169,21 @@ async fn ingest_structure( let mut skipped = 0; let mut latest_versions: HashMap<String, String> = HashMap::new(); - let walkdir = WalkDir::new(structure.root()).follow_links(true); + let ingest_dir = match ingest_dir { + Some(p) => p, + None => dataset.root().as_std_path().to_owned(), + }; + let walkdir = WalkDir::new(ingest_dir).follow_links(true); let mut entries = walkdir_channel(walkdir); let start_time = Instant::now(); while let Some(entry) = entries.recv().await { if file_buf.len() == batch_size { - sent += flush(&solr, files_collection, &mut file_buf).await?; + sent += flush(&solr, &dataset, files_collection, &mut file_buf).await?; } if latest_buf.len() == batch_size { - flush(&solr, latest_collection, &mut latest_buf).await?; + flush(&solr, &dataset, latest_collection, &mut latest_buf).await?; } let entry = match entry { @@ -161,7 +193,9 @@ async fn ingest_structure( // other errors will still result in ingestion ending early let path = e.path().unwrap_or_else(|| Path::new("")).display(); if let Some(inner) = e.io_error() { - if inner.kind() == io::ErrorKind::NotFound { + if inner.kind() == io::ErrorKind::NotFound + || inner.kind() == io::ErrorKind::PermissionDenied + { warn!("found broken file, skipping: {path}"); skipped += 1; continue; @@ -176,7 +210,8 @@ async fn ingest_structure( Ok(p) => p, Err(e) => { warn!( - "not a valid drs file, has non UTF8 characters in 
path:\n{:?}", + "{} not a valid drs file, has non UTF8 characters in path:\n{:?}", + entry.path().display(), e ); skipped += 1; @@ -184,15 +219,15 @@ async fn ingest_structure( } }; - let f = match structure.metadata_from_path(&path) { + let f = match dataset.metadata_from_path(&path) { Ok(f) => f, Err(e) => { - warn!("not a valid drs file, skipping:\n{:#?}", e); + warn!("{} not a valid drs file, skipping:\n{:?}", path, e); skipped += 1; continue; } }; - trace!("found document to send to solr: {}", f.to_path()); + trace!("found document to send to solr: {}", f.path()); let f = Arc::new(f); if let Some(version) = f.version() { @@ -200,7 +235,7 @@ async fn ingest_structure( match latest_versions.get_mut(&id) { Some(latest) => { - if version >= latest.as_str() { + if version.as_str() >= latest.as_str() { *latest = version.to_owned(); latest_buf.push(f.clone()); } @@ -220,24 +255,26 @@ async fn ingest_structure( } if !file_buf.is_empty() { - sent += flush(&solr, files_collection, &mut file_buf).await?; + sent += flush(&solr, &dataset, files_collection, &mut file_buf).await?; } if !latest_buf.is_empty() { - flush(&solr, latest_collection, &mut latest_buf).await?; + flush(&solr, &dataset, latest_collection, &mut latest_buf).await?; } - Ok(StructureReport { - structure, + Ok(DatasetReport { + dataset, sent, skipped, time: start_time.elapsed(), }) } +#[instrument(skip_all, fields(collection=collection))] async fn flush( solr: &Solr, + dataset: &Dataset, collection: &str, - docs: &mut Vec<Arc<Metadata<'_>>>, + docs: &mut Vec<Arc<Metadata>>, ) -> Result<u32, IngestError> { info!( "flushing {} documents in buffer to collection {}", @@ -246,7 +283,15 @@ async fn flush( ); let sent = docs.len() as u32; // turn Vec<Rc<_>> in Vec<&_> - let doc_refs: Vec<_> = docs.iter().map(|d| d.as_ref()).collect(); + let doc_refs: Vec<_> = docs + .iter() + .map(|d| { + let mut m = d.to_solr_map(); + // inject the name of the dataset to the final map + m.insert("dataset", 
dataset.name().to_owned()); + m + }) + .collect(); solr.update(collection, &doc_refs).await?; docs.clear(); Ok(sent) diff --git a/freva/src/drs/metadata.rs b/freva/src/drs/metadata.rs index 359b7d4ac0410d379a956783b9871a493e4aa78d..3cb87fa3509083b88ede02ea4f3c914c1231438b 100644 --- a/freva/src/drs/metadata.rs +++ b/freva/src/drs/metadata.rs @@ -1,570 +1,469 @@ //! Implementation of the DRS specification for extracting metadata information from file paths. +mod config_builder; +pub mod custom; + +use core::fmt; use std::{ collections::HashMap, - fmt::{Display, Formatter}, - fs::File, - io::{self, Read}, - path::{Path, PathBuf}, + fmt::Formatter, + io, + path::{Path, StripPrefixError}, }; -use camino::Utf8PathBuf; -use serde::{ser::SerializeMap, Deserialize, Serialize}; +use camino::{Utf8Path, Utf8PathBuf}; +use drs::{ + cmip5::Cmip5, + cmip6::{Cmip6, MIP_ERA}, + cordex::Cordex, +}; +use netcdf::AttrValue; use thiserror::Error; -use tracing::{debug, error, warn}; +use tracing::{debug, error}; + +pub use config_builder::ConfigBuilder; +use custom::Custom; + +use crate::drs::path::absolute; /// Errors that can be returned by DRS functions #[derive(Debug, Error)] pub enum Error { - /// Error while interfacing with the DRS config file - #[error("error opening drs config file: {0}")] - OpenConfig(#[from] io::Error), - /// Error canonicalizing paths. - /// This probably means that the path doesn't exist, which `canonicalize` requires. 
- #[error("error canonicalizing path: {path:?}")] - CanonicalizePath { - /// path that failed to be canonicalized - path: PathBuf, - }, + /// This probably means that the current directory doesn't exist or the user does not have permission to access it + #[error("error getting absolute path: {0}")] + AbsolutePath(#[from] io::Error), - /// Error parsing DRS config file - #[error("error parsing drs config file: {0}")] - ParseConfig(#[from] toml::de::Error), - - /// Error where path is not valid for a given [`Structure`] - #[error("{path:?} is not a valid DRS file: {reason}")] - InvalidPath { - /// path that is not valid - path: PathBuf, - /// reason it was considered invalid - reason: String, - }, + /// Error where path is not valid for a given [`Dataset`] + #[error("Not a valid DRS file: {0}")] + InvalidPath(String), + + /// Path did not start with the relevant dataset's root_dir + #[error("Incorrect prefix for dataset: {0}")] + InvalidPrefix(#[from] StripPrefixError), /// Error that happens when given a path that isn't valid utf 8 #[error("path is not utf-8")] NonUtf8Path(#[from] camino::FromPathBufError), -} -/// newtype pattern around DRS activity string -#[derive(Debug, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] -pub struct Activity(pub String); + /// Error getting netcdf attributes + #[error(transparent)] + ExtractMetadata(#[from] ExtractMetadataError), -impl Activity { - /// Gets the Activity as a string. Convenience method for accessing its value as a string without using tuple - /// syntax. 
- pub fn as_str(&self) -> &str { - &self.0 - } -} + /// Path does not follow cmor standard + #[error(transparent)] + InvalidCmorPath(#[from] drs::cmip5::InvalidCmorPathError), -impl Display for Activity { - fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { - write!(f, "{}", self.0) - } + /// Path does not follow esgf standard + #[error(transparent)] + InvalidEsgfPath(#[from] drs::cmip5::InvalidEsgfPathError), + + /// Path does not follow cmip6's specification + #[error(transparent)] + InvalidCmip6Path(#[from] drs::cmip6::InvalidCmip6PathError), + + /// Path does not follow cordex's specification + #[error(transparent)] + InvalidCordexPath(#[from] drs::cordex::InvalidCordexPathError), + + /// Path does not follow defined custom specification + #[error(transparent)] + InvalidCustomPath(#[from] custom::Error), } -/// Metadata for DRS metadata derived from a file's path on the filesystem. -#[derive(Debug, Clone, Deserialize, PartialEq)] -pub struct Structure { - /// The base directory where all DRS files of this structure will be found. - root_dir: Utf8PathBuf, - /// The keys of the different path segments in the order they are expected to appear in the path. - parts_dir: Vec<String>, - /// Key ordering for creating the dataset identifier (See 3.4 of DRS spec) - parts_dataset: Vec<String>, - /// Key ordering for creating the versioned dataset identifier - parts_versioned_dataset: Option<Vec<String>>, - /// Key ordering from parts of the file (separated by `_`) - parts_file_name: Vec<String>, - /// Formatting of the time section of the file's name. - /// - /// Example: `start_time-end_time` - parts_time: String, - /// Default values used in constructing the dataset identifier if not present - defaults: HashMap<String, Utf8PathBuf>, - /// Activity name of this DRS structure. - /// - /// *Note*: this does not always match the key in the [`Config`] map. 
- data_type: Activity, +/// Which specification a DRS path follows and, if custom, additional required information to parse it +#[derive(Debug, Clone)] +pub enum Specification { + /// Path follows the CMIP5 CMOR format + Cmor, + /// Path follows the current CMIP5 ESGF format + Cmip5, + /// Path follows the CMIP6 format + Cmip6, + /// Path follows The CORDEX format + Cordex, + /// Path does not follow a support DRS specification but one that's defined in its [`custom::ComponentConfig`] + Custom(custom::ComponentConfig), } -impl Structure { - /// Performs a basic check of if the given path could possibly be valid for this structure. +impl Specification { + /// Dispatches a _relative_ path to the correct module to process the path based on which type of DRS spec it + /// follows. /// - /// This is not comprehensive in that there are still paths where this would return `true` but which are not valid - /// for this structure. For example, if a path has the same `root_dir` and the same _number_ of components in its - /// directory structure, it will pass this even if it doesn't have the right `filename` structure for this. - /// TODO: fix this if possible. - pub fn verify_path<P: AsRef<Path>>(&self, path: P) -> bool { - let path = path.as_ref(); - - // Prefix must match. If not, then this isn't a valid structure for this path. - let path = match path.strip_prefix(&self.root_dir) { - Ok(p) => p, - Err(_) => return false, + /// Relative path here means it starts from the beginning of the DRS portion of the path. For example if a + /// dataset's `root_dir` is `/foo/bar` and a file within that dataset is `/foo/bar/cmip5/some/other/things.nc` + /// then `path` here would be `cmip5/some/other/things.nc`. 
+ fn metadata_from_path(&self, root_dir: &Utf8Path, path: &Utf8Path) -> Result<Metadata, Error> { + use drs::cmip5::Cmip5 as drs_cmip5; + use drs::cmip6::Cmip6 as drs_cmip6; + use drs::cordex::Cordex as drs_cordex; + use Specification::*; + let m: Metadata = match self { + Cmor => drs_cmip5::from_cmor_path(path)?.into(), + Cmip5 => drs_cmip5::from_esgf_path(path)?.into(), + Cmip6 => { + let cmip = drs_cmip6::from_path(path)?; + metadata_from_cmip6(cmip, root_dir)? + } + Cordex => drs_cordex::from_path(path)?.into(), + Custom(c) => c.metadata_from_path(path)?.into(), }; - - let parts_len = path.iter().count(); - parts_len == self.parts_dir.len() - } - - /// Gets the root directory for this structure - pub fn root(&self) -> &Utf8PathBuf { - &self.root_dir + Ok(m) } +} - /// Gets the type of activity of this structure. - pub fn activity(&self) -> &Activity { - &self.data_type +impl fmt::Display for Specification { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + use Specification::*; + let s = match self { + Cmor => "CMOR", + Cmip5 => "CMIP5", + Cmip6 => "CMIP6", + Cordex => "CORDEX", + Custom(_) => "Custom", + }; + write!(f, "{}", s) } +} - /// Extracts [`Metadata`] for a file path of this structure - pub fn metadata_from_path<P: AsRef<Path>>(&self, path: P) -> Result<Metadata, Error> { - let path = Utf8PathBuf::try_from(path.as_ref().to_owned())?; - - if !self.verify_path(&path) { - return Err(Error::InvalidPath { - path: path.into(), - reason: "file path does not match expected structure".to_owned(), - }); - } - - let parts = path - .strip_prefix(self.root()) - .unwrap() // this is already done with `verify_path` so should be safe - .iter(); +/// A DRS dataset which describes the DRS specification used by the data within its `root_dir` and the human readable +/// name attached to associate it with an entry in the config file +#[derive(Debug, Clone)] +pub struct Dataset { + name: String, + root_dir: Utf8PathBuf, + drs_format: Specification, +} - let mut 
path_parts = HashMap::new(); +impl Dataset { + /// Gets the fully qualified root directory for this set of DRS files. + pub fn root(&self) -> &Utf8Path { + &self.root_dir + } - for (part, name) in parts.zip(self.parts_dir.iter()) { - path_parts.insert(name.clone(), Utf8PathBuf::from(part)); - } + /// Processes a path according to this dataset's `drs_format` and returns the normalized metadata + pub fn metadata_from_path(&self, path: &Utf8Path) -> Result<Metadata, Error> { + // in the normal flow of this code as currently written, it's shouldn't be possible for stripping the prefix to + // cause an error + let path = path.strip_prefix(&self.root_dir)?; - // split file name - // remove file extension before splitting - // if no extension, use the whole name - let name_no_extension = match path_parts["file_name"].file_stem() { - Some(n) => n, - None => path_parts["file_name"].as_str(), - }; - let mut file_name_parts: Vec<_> = - name_no_extension.split('_').map(|s| s.to_owned()).collect(); - - // TODO: I'm not sure why this is here or what it's trying to accomplish - if file_name_parts.len() == self.parts_file_name.len() - 1 - && file_name_parts.contains(&"fx".to_owned()) - { - // original pushed None which doesn't work here - file_name_parts.push("".to_owned()); - } - - for (i, name) in self.parts_file_name.iter().enumerate() { - if !path_parts.contains_key(name) { - let value = match file_name_parts.get(i) { - Some(v) => v, - None => { - return Err(Error::InvalidPath { - path: path.into(), - reason: "file name does not match expected structure".to_owned(), - }) - } - }; - path_parts.insert(name.clone(), Utf8PathBuf::from(value)); - } - } + let mut metadata = self.drs_format.metadata_from_path(&self.root_dir, path)?; + // put the root back into the path so this contains the entire fully qualified path + metadata.path = self.root_dir.join(metadata.path); + Ok(metadata) + } - Ok(Metadata::new(self, path_parts)) + /// Get the human readable name + pub fn name(&self) -> 
&str { + &self.name } } -/// Map of DRS Activity name to [`Structure`]. -/// Used to validate paths against the known DRS structures and construct [`Metadata`] objects. -#[derive(Debug, Deserialize)] +/// Map of DRS Activity name to [`Dataset`]. +/// Used to validate paths against the known DRS datasets and construct [`Metadata`] objects. +#[derive(Debug)] pub struct Config { - #[serde(flatten)] - map: HashMap<Activity, Structure>, + datasets: Vec<Dataset>, } impl Config { - /// Creates a [`Metadata`] from the given path based on the available structures + /// Creates a [`Metadata`] from the given path based on the available dataset pub fn metadata_from_path<P: AsRef<Path>>(&self, path: &P) -> Result<Metadata, Error> { - let path = path - .as_ref() - .canonicalize() - .map_err(|_e| Error::CanonicalizePath { - path: path.as_ref().to_owned(), - })?; + let path = absolute(path.as_ref())?; let path = Utf8PathBuf::try_from(path)?; - let activity = match self.structure_from_path(&path) { - Some(a) => a, - None => { - return Err(Error::InvalidPath { - path: path.into(), - reason: "no matching structure found".to_owned(), - }) - } - }; - let structure = &self.map[activity]; - debug!("found structure {} for path \n{}", activity.0, path); + let dataset = self + .dataset_from_path(&path) + .ok_or_else(|| Error::InvalidPath("no matching dataset found".to_owned()))?; + debug!("found dataset {} for path \n{}", dataset.name(), path); - structure.metadata_from_path(&path) + dataset.metadata_from_path(&path) } - /// Returns an iterator over the structures available in the config. - pub fn structures(&self) -> impl Iterator<Item = &Structure> { - self.map.values() + /// Returns an iterator over the datasets available in the config. + pub fn datasets(&self) -> impl Iterator<Item = &Dataset> { + self.datasets.iter() } - /// Constructs a new [`Config`] based on config file found at `filename`. 
- pub fn from_toml<P: AsRef<Path>>(filename: P) -> Result<Self, Error> { - let mut f = File::open(filename)?; - let mut buffer = String::new(); - f.read_to_string(&mut buffer)?; - - Self::try_from(&buffer) - } - - /// Searches for an [`Activity`] based on the given path - pub fn structure_from_path<P: AsRef<Path>>(&self, path: P) -> Option<&Activity> { - for (activity, structure) in self.map.iter() { - if structure.verify_path(&path) { - return Some(activity); - } - } - - None - } -} - -impl TryFrom<&str> for Config { - type Error = Error; - - fn try_from(value: &str) -> Result<Self, Self::Error> { - let mut s: Config = toml::from_str(value)?; - for (_, mut structure) in s.map.iter_mut() { - structure.root_dir = match structure.root_dir.canonicalize() { - Ok(root_dir) => Utf8PathBuf::try_from(root_dir)?, - Err(e) => { - warn!( - "error canonicalizing DRS root {}: {}", - structure.root_dir, e - ); - continue; - } - }; - } - Ok(s) + fn dataset_from_path(&self, path: &Utf8Path) -> Option<&Dataset> { + self.datasets.iter().find(|&s| s.root().starts_with(path)) } } -impl TryFrom<&String> for Config { - type Error = Error; - - fn try_from(value: &String) -> Result<Self, Self::Error> { - let mut s: Config = toml::from_str(value)?; - for (_, mut structure) in s.map.iter_mut() { - structure.root_dir = match structure.root_dir.canonicalize() { - Ok(root_dir) => Utf8PathBuf::try_from(root_dir)?, - Err(e) => { - warn!( - "error canonicalizing DRS root {}: {}", - structure.root_dir, e - ); - continue; - } - }; - } - Ok(s) - } -} - -/// Filepath that conforms to the corresponding [`Structure`]. -/// -/// Paths must be UTF-8. -#[derive(Debug, PartialEq)] -pub struct Metadata<'a> { - // in original, this is the string key rather than the actual structure but I think this can work here - structure: &'a Structure, - /// map of { part name => value } e.g. 
"institute" => "cpc" - /// We assume that all file paths will be valid utf-8 - parts: HashMap<String, Utf8PathBuf>, +/// [`Metadata`] is the normalized form of DRS metadata that Freva uses which has the same fields as [`Cmip5`] but with +/// no restrictions on what isn't a valid value e.g. `frequency` is not limited to the valid CMIP5 frequency values. +/// This allows us to map the different standards into something with CMIP5 keys though not necessarily with their +/// values. +#[derive(Debug)] +pub struct Metadata { + // I think it may be possible to avoid taking ownership here to reduce allocations but it's the simplest spot to + // have this happen for now + path: Utf8PathBuf, + activity: String, + product: String, + institute: String, + model: String, + experiment: String, + frequency: String, + modeling_realm: String, + mip_table: String, + ensemble: String, + variable: String, + version: Option<String>, } -impl<'a> Metadata<'a> { - fn new(structure: &'a Structure, parts: HashMap<String, Utf8PathBuf>) -> Self { - Metadata { structure, parts } +impl Metadata { + /// Gets the path relative path for this file + pub fn path(&self) -> &Utf8Path { + &self.path } - /// Builds a fully qualified path from for this file. - pub fn to_path(&self) -> Utf8PathBuf { - let parts_order = &self.structure.parts_dir; - let mut path = self.structure.root_dir.clone(); - for k in parts_order.iter() { - path = path.join(&self.parts[k]); - } - - path - } - - /// Returns if this file is supposed to be versioned. - pub fn versioned(&self) -> bool { - self.structure.parts_versioned_dataset.is_some() - } - - /// Returns the dataset identifier. + /// Constructs a CMIP5-like identifier for the dataset including the version. If there is no version, then `None` + /// is returned. /// - /// Ref: DRS spec 3.4 - pub fn to_identifier(&self) -> String { - // the original included a `versioned` param which would use `parts_versioned_dataset` - // instead of `parts_dataset`. 
Every reference, with one exception, within the eval - // system codebase explicitly set `versioned` to false and the only exception is a - // function which passes through a `versioned` param that it accepts but it itself - // does not appear to be called - // I'm not sure this version stuff is necessary - let parts = { - if let Some(parts) = &self.structure.parts_versioned_dataset { - parts - } else { - &self.structure.parts_dataset - } - }; - - let mut results = Vec::new(); - for k in parts.iter() { - if let Some(part) = self.parts.get(k) { - results.push(part.as_str()) - } else if let Some(part) = self.structure.defaults.get(k) { - results.push(part.as_str()); - } + /// This is CMIP5-like in that the CMIP5 spec doesn't include `variable` which this does. + pub fn to_identifier(&self) -> Option<String> { + if self.version.is_none() { + None + } else { + Some( + vec![ + self.activity.as_str(), + self.product.as_str(), + self.institute.as_str(), + self.model.as_str(), + self.experiment.as_str(), + self.frequency.as_str(), + self.modeling_realm.as_str(), + self.mip_table.as_str(), + self.ensemble.as_str(), + self.version().unwrap().as_str(), + self.variable.as_str(), + ] + .join("."), + ) } - - results.join(".") } - /// Returns the unversioned identifier i.e. ignores `parts_versioned_dataset` if - /// it exists. + /// Constructs a CMIP5-like identifier without version info. + /// + /// Like [`Self::to_identifier`], this differs slightly from CMIP5 by including `variable`. pub fn to_unversioned_identifier(&self) -> String { - let parts_keys = &self.structure.parts_dataset; - let mut results = Vec::with_capacity(parts_keys.len()); - for k in parts_keys.iter() { - if let Some(part) = self.parts.get(k) { - results.push(part.as_str()); - } else if let Some(part) = self.structure.defaults.get(k) { - results.push(part.as_str()); - } - // should this error if the key isn't present in either? 
- } - - results.join(".") + vec![ + self.activity.as_str(), + self.product.as_str(), + self.institute.as_str(), + self.model.as_str(), + self.experiment.as_str(), + self.frequency.as_str(), + self.modeling_realm.as_str(), + self.mip_table.as_str(), + self.ensemble.as_str(), + self.variable.as_str(), + ] + .join(".") } - /// Returns a reference to the version part of the file data if present. - pub fn version(&self) -> Option<&str> { - self.parts.get("version").map(|v| v.as_str()) + /// Getter for file version string + pub fn version(&self) -> Option<&String> { + self.version.as_ref() } -} - -impl<'a> Serialize for Metadata<'a> { - fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> - where - S: serde::Serializer, - { - let mut metadata = self.parts.clone(); - metadata.insert("file".to_owned(), self.to_path()); - if metadata.contains_key("version") { - // replace is only on Strings but not paths so this awkward little dance is necessary. - // Would be good to find a way to replace this. 
- let version_to_remove = format!("/{}/", metadata["version"].as_str()); - let file_no_version = metadata["file"].as_str().replace(&version_to_remove, "/"); - - metadata.insert( - "file_no_version".to_owned(), - Utf8PathBuf::from(file_no_version), - ); - } else { - metadata.insert("file_no_version".to_owned(), metadata["file"].to_owned()); - } - metadata.insert("dataset".to_owned(), (&self.structure.data_type.0).into()); - - // TODO: this doesn't add timestamp and creation_time which are added in solr_core.py - // instead of with the rest of the metadata dictionary creation - - let mut map = serializer.serialize_map(Some(metadata.len()))?; - for (k, v) in metadata { - map.serialize_entry(&k, &v)?; + /// Creates a map of the metadata information we want to send to solr + pub fn to_solr_map(&self) -> HashMap<&str, String> { + let mut m = HashMap::new(); + m.insert("file", self.path.to_string()); + m.insert("project", self.activity.to_owned()); + m.insert("product", self.product.to_owned()); + m.insert("institute", self.institute.to_owned()); + m.insert("model", self.model.to_owned()); + m.insert("experiment", self.experiment.to_owned()); + m.insert("time_frequency", self.frequency.to_string()); + m.insert("realm", self.modeling_realm.to_string()); + m.insert("cmor_table", self.mip_table.to_owned()); + m.insert("ensemble", self.ensemble.to_string()); + m.insert("variable", self.variable.to_string()); + if let Some(v) = &self.version { + m.insert("version", v.clone()); } - - map.end() + m } } -#[cfg(test)] -mod tests { - use super::*; - - // rust-analyzer's single test debug uses a different directory from most other things which messes up - // relative paths, so this helps us always keep paths relative to the root of the project rather than - // relative to this file's directory - const ROOT: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/.."); - - #[test] - fn test_single_deserialization() { - let config = load_single_config(); - - assert!(config - .map - 
.contains_key(&Activity("observations".to_owned()))); - assert_eq!( - config.map[&Activity("observations".to_owned())].data_type, - Activity("observations".to_owned()) - ); - } - - #[test] - fn test_single_file() { - let config = load_single_config(); - let path = single_file(); - - let res = config.metadata_from_path(&path); - assert!(res.is_ok()); - let file = res.unwrap(); - - let expected = Metadata { - structure: &config.map[&Activity("observations".to_owned())], - parts: [ - ("project", "observations"), - ("product", "grid"), - ("institute", "CPC"), - ("model", "CPC"), - ("experiment", "cmorph"), - ("time_frequency", "30min"), - ("realm", "atmos"), - ("cmor_table", "30min"), - ("ensemble", "r1i1p1"), - ("version", "v20210618"), - ("variable", "pr"), - ("level", "cmorph"), - ("time", "201609020000-201609020030"), - ( - "file_name", - "pr_30min_CPC_cmorph_r1i1p1_201609020000-201609020030.nc", - ), - ] - .into_iter() - .map(|(k, v)| (k.to_owned(), Utf8PathBuf::from(v))) - .collect(), - }; - - assert_eq!(expected, file); - } - - #[test] - fn test_identifier() { - let config = load_single_config(); - let path = single_file(); - - let res = config.metadata_from_path(&path); - assert!(res.is_ok()); - let file = res.unwrap(); - - let expected = "observations.grid.CPC.CPC.cmorph.30min.atmos.30min.r1i1p1.v20210618.pr"; - assert_eq!(expected, file.to_identifier()); - } - - #[test] - fn test_unversioned_identifier() { - let config = load_single_config(); - let path = single_file(); - - let res = config.metadata_from_path(&path); - assert!(res.is_ok()); - let file = res.unwrap(); - - let expected = "observations.grid.CPC.CPC.cmorph.30min.atmos.30min.r1i1p1.pr"; - assert_eq!(expected, file.to_unversioned_identifier()); +impl<'a> From<Cmip5<'a>> for Metadata { + fn from(cmip5: Cmip5<'a>) -> Self { + let m = &cmip5.metadata; + Self { + path: cmip5.path.to_owned(), + activity: m.activity.to_owned(), + product: m.product.to_owned(), + institute: m.institute.to_owned(), + 
model: m.model.to_owned(), + experiment: m.experiment.to_owned(), + frequency: m.frequency.to_string(), + modeling_realm: m.modeling_realm.to_string(), + mip_table: m.mip_table.to_owned(), + ensemble: m.ensemble.to_string(), + variable: m.variable.to_owned(), + version: m.version.map(|v| v.to_owned()), + } } +} - #[test] - fn test_nonexistant_file() { - let config = load_single_config(); - let path = "this/file/doesnt/exist.txt"; - - let result = config.metadata_from_path(&path); - assert!( - match result.unwrap_err() { - Error::CanonicalizePath { path: _ } => true, - _ => false, - }, - "incorrect error type" +impl<'a> From<Cordex<'a>> for Metadata { + fn from(cordex: Cordex<'a>) -> Self { + let m = &cordex.metadata; + let model = format!( + "{}-{}-{}", + m.gcm_model_name, m.rcm_model_name, m.rcm_version_id ); + let mip_table = "atmos".to_owned(); // all cordex data is atmos + Self { + path: cordex.path.to_owned(), + activity: m.activity.to_owned(), + product: m.domain.to_owned(), + institute: m.institution.to_owned(), + model, + experiment: m.cmip5_experiment_name.to_owned(), + frequency: m.frequency.to_string(), + modeling_realm: mip_table.clone(), + variable: m.variable_name.to_owned(), + ensemble: m.cmip5_ensemble_member.to_string(), + mip_table, + version: m.version.map(|v| v.to_owned()), + } } +} - #[test] - fn test_incorrect_file_name_parts() { - // this isn't necessarily correct behavior but it is a case to be aware of - - let mut config = load_single_config(); - let path = single_file(); - - // append another segment to the structure's file name parts so the `file_from_path` fails - let observations = config - .map - .get_mut(&Activity("observations".to_owned())) - .unwrap(); - observations.parts_file_name.push("foo".to_owned()); - - let res = config.metadata_from_path(&path); - assert!(res.is_err()); - assert!(match res.unwrap_err() { - Error::InvalidPath { path: _, reason: _ } => true, - _ => false, - }); +impl<'a> From<Custom<'a>> for Metadata { + fn 
from(custom: Custom<'a>) -> Self { + Self { + path: custom.path.to_owned(), + activity: custom.activity.to_owned(), + product: custom.product.to_owned(), + institute: custom.institute.to_owned(), + model: custom.model.to_owned(), + experiment: custom.experiment.to_owned(), + frequency: custom.frequency.to_owned(), + modeling_realm: custom.modeling_realm.to_owned(), + mip_table: custom.mip_table.to_owned(), + ensemble: custom.ensemble.to_owned(), + variable: custom.variable.to_owned(), + version: custom.version.map(|v| v.to_owned()), + } } +} - fn load_single_config() -> Config { - let data = single_config(); - Config::try_from(&data).unwrap() - } +/// Error getting netcdf attributes from a file +#[derive(Debug, Error)] +pub enum ExtractMetadataError { + /// Error occurred while opening the file. It's not entirely clear which exact error variants are possible when + /// opening a netcdf file. + #[error("error extracting necessary metadata from netcdf file")] + OpenFileError(#[from] netcdf::error::Error), + + /// Given attribute key was not found + #[error("missing attribute {0}")] + MissingAttribute(String), + + /// Attribute value for given key was not the expected data type. The type names are meant to be human-readable for + /// debugging so may not exactly match existing types in code. 
+ #[error("invalid attribute data for {key} expected {expected} found {found}")] + InvalidAttributeData { + /// Attribute key + key: String, + /// The name type that was expected (arbitrary string) + expected: String, + /// The name type that was found (arbitrary string) + found: String, + }, +} - fn single_config() -> String { - format!( - r#" - [observations] - root_dir = "{ROOT}/.docker/data" - parts_dir = [ - "project", "product", "institute", "model", "experiment", "time_frequency", "realm", - "cmor_table", "ensemble", "version", "variable", "file_name" - ] - parts_dataset = [ - "project", "product", "institute", "model", "experiment", "time_frequency", "realm", - "cmor_table", "ensemble", "", "variable" - ] - parts_versioned_dataset = [ - "project", "product", "institute", "model", "experiment", "time_frequency", "realm", - "cmor_table", "ensemble", "version", "variable" - ] - parts_file_name = ["variable", "time_frequency", "experiment", "level", "version", "time"] - parts_time = "start_time-end_time" - data_type = "observations" - - [observations.defaults] - project = "observations" - "# - ) - } +/// Transforms a [`Cmip6`] object into [`Metadata`]. This is handled differently from the others because CMIP6's path +/// does not contain all the data [`Metadata`] needs. To get the rest we need to either use mapping tables maintained +/// separately from the DRS spec or, as we do here, open the files up and pull the data from their attributes. +fn metadata_from_cmip6(cmip: Cmip6, root_dir: &Utf8Path) -> Result<Metadata, ExtractMetadataError> { + // this netcdf library has an internal global mutex meaning it doesn't play well with multithreaded code. + // It's safe but will slow things down. 
I think this will be mostly ok since data of one type will already be + // mostly single threaded but the mutex may not play nicely with tokio + let f = netcdf::open(root_dir.join(cmip.path))?; + + let frequency_attr = f + .attribute("frequency") + .ok_or_else(|| ExtractMetadataError::MissingAttribute("frequency".to_string()))? + .value()?; + + let frequency = match frequency_attr { + AttrValue::Str(s) => s, + val => { + return Err(ExtractMetadataError::InvalidAttributeData { + key: "frequency".to_owned(), + expected: "str".to_owned(), + found: attrvalue_to_str(&val).to_owned(), + }) + } + }; + + let realm_attr = f + .attribute("realm") + .ok_or_else(|| ExtractMetadataError::MissingAttribute("table_id".to_owned()))? + .value()?; + let realm = match realm_attr { + AttrValue::Str(s) => s, + val => { + return Err(ExtractMetadataError::InvalidAttributeData { + key: "realm".to_owned(), + expected: "str".to_owned(), + found: attrvalue_to_str(&val).to_owned(), + }); + } + }; + + let m = &cmip.metadata; + Ok(Metadata { + path: cmip.path.to_owned(), + activity: MIP_ERA.to_owned(), + product: m.activity_id.to_owned(), + institute: m.institution_id.to_owned(), + model: m.source_id.to_owned(), + experiment: m.experiment_id.to_owned(), + ensemble: m.member_id.variant_label.to_string(), + variable: m.variable_id.to_owned(), + version: Some(m.version.to_owned()), + mip_table: m.table_id.to_owned(), + modeling_realm: realm, + frequency, + }) +} - fn single_file() -> Utf8PathBuf { - Utf8PathBuf::from( - format!( - concat!( - "{root}/.docker/data/observations/grid/CPC/CPC/cmorph/30min/atmos/30min/r1i1p1/v20210618/pr", - "/pr_30min_CPC_cmorph_r1i1p1_201609020000-201609020030.nc" - ), - root=ROOT, - ) - ) +/// Gets a printable type name for each attrvalue for error handling. +/// +/// Could probably be replaced by `std::any::type_name_of_val` whenever it's stabilized, +/// [tracking issue](https://github.com/rust-lang/rust/issues/66359). 
+fn attrvalue_to_str(a: &AttrValue) -> &'static str { + use AttrValue::*; + match *a { + Uchar(_) => "u8", + Uchars(_) => "Vec<u8>", + Schar(_) => "i8", + Schars(_) => "Vec<i8>", + Ushort(_) => "u16", + Ushorts(_) => "Vec<u16>", + Short(_) => "i16", + Shorts(_) => "Vec<i16>", + Uint(_) => "u32", + Uints(_) => "Vec<u32>", + Int(_) => "i32", + Ints(_) => "Vec<i32>", + Ulonglong(_) => "u64", + Ulonglongs(_) => "Vec<u64>", + Longlong(_) => "i64", + Longlongs(_) => "Vec<i64>", + Float(_) => "f32", + Floats(_) => "Vec<f32>", + Double(_) => "f64", + Doubles(_) => "Vec<f64>", + Str(_) => "str", + Strs(_) => "Vec<str>", } } diff --git a/freva/src/drs/metadata/config_builder.rs b/freva/src/drs/metadata/config_builder.rs new file mode 100644 index 0000000000000000000000000000000000000000..40e9199c8b45dc6f4a96e49e1184b73c3492d4fa --- /dev/null +++ b/freva/src/drs/metadata/config_builder.rs @@ -0,0 +1,132 @@ +//! This handles the conversion between the external DRS config format and the internal representation we use while +//! working the filesystem. + +use std::collections::HashMap; + +use camino::Utf8PathBuf; +use serde::Deserialize; +use thiserror::Error; + +use crate::drs::path::absolute; + +use super::{custom::ComponentConfig, Config, Dataset, Specification}; + +/// Deserializable builder for a [`Config`]. Allows the config file to be simpler and friendlier without forcing our +/// internal representation to be identical. +#[derive(Debug, Deserialize)] +pub struct ConfigBuilder { + #[serde(flatten)] + inner: HashMap<String, DatasetBuilder>, +} + +/// Contains all the elements necessary to create a dataset. Most will have just `root_dir` and `drs_format` but a +/// custom format would require some of the additional elements. 
+#[derive(Debug, Deserialize)] +struct DatasetBuilder { + root_dir: Utf8PathBuf, + drs_format: Format, + #[serde(default)] + parts_dir: Option<Vec<String>>, + #[serde(default)] + parts_file_name: Option<Vec<String>>, + // currently unused but it was present in the old configs so I carried it over here + #[serde(default)] + parts_time: Option<String>, + #[serde(default)] + defaults: Option<HashMap<String, String>>, +} + +/// Like [`Specification`] but without additional information in `Custom` since at when we're building the config we +/// won't know we need it yet. +#[derive(Debug, Deserialize)] +enum Format { + #[serde(rename = "cmor")] + Cmor, + #[serde(rename = "cmip5")] + Cmip5, + #[serde(rename = "cmip6")] + Cmip6, + #[serde(rename = "cordex")] + Cordex, + #[serde(rename = "custom")] + Custom, +} + +#[derive(Debug, Error)] +pub enum BuildError { + #[error("`{0}` value is missing for custom format")] + MissingRequiredField(String), + #[error("error converting relative root_dir to fully qualified: {0}")] + AbsolutePath(#[from] std::io::Error), + #[error("root_dir is not valid UTF8")] + VerifyPathUtf8(#[from] camino::FromPathBufError), +} + +impl ConfigBuilder { + /// Builds and verifies a [`Config`] from the information in `self`. 
+ pub fn build(self) -> Result<Config, BuildError> { + let mut datasets: Vec<Dataset> = Vec::with_capacity(self.inner.len()); + for (name, s) in self.inner.into_iter() { + let dataset = s.build(name)?; + datasets.push(dataset); + } + Ok(Config { datasets }) + } +} + +impl DatasetBuilder { + fn build(self, name: String) -> Result<Dataset, BuildError> { + use Format::*; + let drs_format = match self.drs_format { + Cmip5 => Specification::Cmip5, + Cmor => Specification::Cmor, + Cmip6 => Specification::Cmip6, + Cordex => Specification::Cordex, + Custom => { + let conf = self.build_component_config()?; + Specification::Custom(conf) + } + }; + + // fully qualify the path name and expand things like `~` then convert back to a `Utf8PathBuf` + let root_dir = absolute(self.root_dir.as_std_path())?; + let root_dir = Utf8PathBuf::try_from(root_dir)?; + + Ok(Dataset { + name, + root_dir, + drs_format, + }) + } + + fn build_component_config(&self) -> Result<ComponentConfig, BuildError> { + // This is cloning all the fields which I don't love but taking ownership would be a problem for `build`. + // This is only done a handful of times anyway so it should be ok. + let parts_dir = self + .parts_dir + .clone() + .ok_or_else(|| BuildError::MissingRequiredField("parts_dir".to_owned()))?; + + let parts_file_name = self + .parts_file_name + .clone() + .ok_or_else(|| BuildError::MissingRequiredField("parts_file_name".to_owned()))?; + + let parts_time = self + .parts_time + .clone() + .unwrap_or_else(|| "start-end".to_owned()); + + let defaults = match &self.defaults { + Some(m) => m.clone(), + None => HashMap::new(), + }; + + Ok(ComponentConfig { + parts_dir, + parts_file_name, + parts_time, + defaults, + }) + } +} diff --git a/freva/src/drs/metadata/custom.rs b/freva/src/drs/metadata/custom.rs new file mode 100644 index 0000000000000000000000000000000000000000..85dce1ad53981f42b49dde964506befac6c4cff3 --- /dev/null +++ b/freva/src/drs/metadata/custom.rs @@ -0,0 +1,266 @@ +//! 
This is an escape hatch for files/datasets that don't have a path or name that conforms to an implemented DRS +//! specification. This will require more configuration in the config file to describe how to turn the path and name +//! into all the necessary information for Freva. +//! +//! This requires the creation of a [`ComponentConfig`] to describe where in the path or file name certain values are. +//! Component names aside from the required ones are allowed for clarity when setting up or changing the config but +//! will be ignored when converted to a [`Custom`] object. + +use std::collections::{HashMap, HashSet}; + +use camino::Utf8Path; +use lazy_static::lazy_static; +use thiserror::Error; + +/// Possible errors when extracting metadata from a path +#[derive(Debug, Error)] +pub enum Error { + /// The path did not have the same number of elements as there are in [`ComponentConfig::parts_dir`] + #[error("incorrect number of path parts: has {0}, expected {1}")] + MismatchPathParts(usize, usize), + /// The file name did not have the same number of elements as there are in [`ComponentConfig::parts_file_name`] + #[error("incorrect number of file name parts: has {0}, expected {1}")] + MismatchFilenameParts(usize, usize), + /// The path did not have a file name + #[error("no file name found, this may be a directory")] + NoFilename, + /// The path was missing some required values + #[error("missing required values: needs {0:?}")] + MissingValues(Vec<String>), +} + +lazy_static! { + // these are different from the struct field names for backwards compatibility. 
The struct field names match the + // cmip5 spec they're derived from but these are the names that were previously used in Freva + static ref REQUIRED: HashSet<&'static str> = HashSet::from([ + "project", + "product", + "institute", + "model", + "experiment", + "time_frequency", + "realm", + "cmor_table", + "ensemble", + "variable", + ]); +} + +/// Config which describes a custom DRS-like file naming scheme to ingest into Freva's metadata format. +/// +/// Expected part names are: +/// ```text +/// activity +/// product +/// institute +/// model +/// experiment +/// modeling_realm +/// mip_table +/// ensemble +/// variable +/// ``` +/// +/// The following names are optional and will only be looked for if present in the config: +/// ```text +/// version +/// ``` +/// +/// Parts can be located in either the path (`/` delineated) or in the filename (`_` delineated). They can also be +/// pressent multiple times in both path and filename, if that is the case, the last instance will be the final value +/// and all previous ones are discarded. This mapping is very simple so more advanced processing like splitting or +/// joining sections is not supported. +/// +/// All part names are accepted directly as strings and will have no verification for correctness or conformity to any +/// existing DRS specification in terms of allowed values. Additional names are also allowed but are ignored. This can +/// be used to mark out parts of the path/name that aren't used for DRS data. +#[derive(Debug, Clone)] +pub struct ComponentConfig { + /// The keys of the different path segments in the order they are expected to appear in the path. This does not + /// include the file name which is always at the end so no `filename` part is necessary. + pub parts_dir: Vec<String>, + /// Key ordering from parts of the file name + pub parts_file_name: Vec<String>, + /// Formatting of the time section of the file's name. 
+ /// + /// Example: `start_time-end_time` + // I don't know if this will be used by it existed in the previous version + #[allow(dead_code)] + pub parts_time: String, + /// Default values used to fill in any missing values for the equivalent key + pub defaults: HashMap<String, String>, +} + +impl ComponentConfig { + pub(crate) fn metadata_from_path<'path, 'config: 'path>( + &'config self, + path: &'path Utf8Path, + ) -> Result<Custom<'path>, Error> { + let parts: Vec<&str> = path.iter().collect(); + // + 1 because parts_dir does not include the filename + if parts.len() != self.parts_dir.len() + 1 { + return Err(Error::MismatchPathParts(parts.len(), self.parts_dir.len())); + } + // remove file extension if present + let filename = path.file_stem().ok_or(Error::NoFilename)?; + + let filename_parts: Vec<&str> = filename.split('_').collect(); + if filename_parts.len() != self.parts_file_name.len() { + return Err(Error::MismatchFilenameParts( + filename_parts.len(), + self.parts_file_name.len(), + )); + } + + let mut components = HashMap::new(); + // defaults first so they're overwritten if present + for (name, value) in self.defaults.iter() { + components.insert(name.as_str(), value.as_ref()); + } + + for (name, value) in self.parts_dir.iter().zip(parts) { + components.insert(name.as_str(), value); + } + + for (name, value) in self.parts_file_name.iter().zip(filename_parts) { + components.insert(name.as_str(), value); + } + + Custom::new(path, components) + } +} + +/// Filepath that conforms to the corresponding [`ComponentConfig`]. +/// +/// This is essentially the same as [`super::Metadata`] except the data isn't owned to conform more to the convention set +/// by the other DRS specs. +/// +/// Paths must be UTF-8. 
+#[derive(Debug, PartialEq, Eq)] +pub struct Custom<'a> { + pub(crate) path: &'a Utf8Path, + pub(crate) activity: &'a str, + pub(crate) product: &'a str, + pub(crate) institute: &'a str, + pub(crate) model: &'a str, + pub(crate) experiment: &'a str, + pub(crate) frequency: &'a str, + pub(crate) modeling_realm: &'a str, + pub(crate) variable: &'a str, + pub(crate) ensemble: &'a str, + pub(crate) mip_table: &'a str, + pub(crate) version: Option<&'a str>, +} + +impl<'a> Custom<'a> { + fn new(path: &'a Utf8Path, components: HashMap<&str, &'a str>) -> Result<Self, Error> { + let key_set = components.keys().copied().collect(); + + let missing: Vec<_> = REQUIRED + .difference(&key_set) + .map(|&m| m.to_owned()) + .collect(); + if !missing.is_empty() { + return Err(Error::MissingValues(Vec::from_iter(missing))); + } + + Ok(Self { + path, + activity: components["project"], + product: components["product"], + institute: components["institute"], + model: components["model"], + experiment: components["experiment"], + frequency: components["time_frequency"], + modeling_realm: components["realm"], + variable: components["variable"], + ensemble: components["ensemble"], + mip_table: components["cmor_table"], + version: components.get("version").copied(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn owned_vec(v: Vec<&str>) -> Vec<String> { + v.into_iter().map(|s| s.to_owned()).collect() + } + + #[test] + fn test_from_normal_path() { + let path = Utf8Path::new("act/prod/inst/mod/exp/freq/v1/var_modrel_mip_ens.nc"); + let config = ComponentConfig { + parts_dir: owned_vec(vec![ + "project", + "product", + "institute", + "model", + "experiment", + "time_frequency", + "version", + ]), + parts_file_name: owned_vec(vec!["variable", "realm", "cmor_table", "ensemble"]), + parts_time: "".to_owned(), + defaults: HashMap::new(), + }; + + let custom = config.metadata_from_path(path).unwrap(); + assert_eq!( + Custom { + path, + activity: "act", + product: "prod", + institute: 
"inst", + model: "mod", + experiment: "exp", + frequency: "freq", + modeling_realm: "modrel", + variable: "var", + ensemble: "ens", + mip_table: "mip", + version: Some("v1"), + }, + custom + ); + } + + #[test] + fn test_optional_version() { + let path = Utf8Path::new("act/prod/inst/mod/exp/freq/var_modrel_mip_ens.nc"); + let config = ComponentConfig { + parts_dir: owned_vec(vec![ + "project", + "product", + "institute", + "model", + "experiment", + "time_frequency", + ]), + parts_file_name: owned_vec(vec!["variable", "realm", "cmor_table", "ensemble"]), + parts_time: "".to_owned(), + defaults: HashMap::new(), + }; + + let custom = config.metadata_from_path(path).unwrap(); + assert_eq!( + Custom { + path, + activity: "act", + product: "prod", + institute: "inst", + model: "mod", + experiment: "exp", + frequency: "freq", + modeling_realm: "modrel", + variable: "var", + ensemble: "ens", + mip_table: "mip", + version: None, + }, + custom + ); + } +} diff --git a/freva/src/drs/path.rs b/freva/src/drs/path.rs new file mode 100644 index 0000000000000000000000000000000000000000..93c32092134dbe628131b2ced1289fa12326464c --- /dev/null +++ b/freva/src/drs/path.rs @@ -0,0 +1,48 @@ +use std::{ + env::current_dir, + io, + path::{Component, Path, PathBuf}, +}; + +/// Resolves `.` and `..` in a path to a fully qualified path without resolving symlinks. This also will check if the +/// path is already fully qualified and if not, will prepend the path with the current directory (using +/// [`std::env::current_dir`]). 
+/// +/// The majority of this function is copied from +/// [cargo](https://github.com/rust-lang/cargo/blob/6d6dd9d9be9c91390da620adf43581619c2fa90e/crates/cargo-util/src/paths.rs#L81) +/// +/// There is currently an unstable function that does some of what this tries to do which is worth keeping an eye on: +/// <https://github.com/rust-lang/rust/issues/92750> +pub(super) fn absolute(path: &Path) -> Result<PathBuf, io::Error> { + let path = if !path.has_root() { + let root = current_dir()?; + root.join(path) + } else { + path.to_owned() + }; + + let mut components = path.components().peekable(); + let mut ret = if let Some(c @ Component::Prefix(..)) = components.peek().cloned() { + components.next(); + PathBuf::from(c.as_os_str()) + } else { + PathBuf::new() + }; + + for component in components { + match component { + Component::Prefix(..) => unreachable!(), + Component::RootDir => { + ret.push(component.as_os_str()); + } + Component::CurDir => {} + Component::ParentDir => { + ret.pop(); + } + Component::Normal(c) => { + ret.push(c); + } + } + } + Ok(ret) +} diff --git a/freva/src/drs/search.rs b/freva/src/drs/search.rs index da65d3fdf29008028041b3af69a8fe66e64cecff..fa3fd1cea1e8fd8258f22abf23e8b231ba3d3ed7 100644 --- a/freva/src/drs/search.rs +++ b/freva/src/drs/search.rs @@ -21,7 +21,7 @@ pub enum Error { } /// Searches Solr for drs file metadata. 
-pub async fn search<'d>(drs_conf: &'d Config, solr: &Solr) -> Result<Vec<Metadata<'d>>, Error> { +pub async fn search(drs_conf: &Config, solr: &Solr) -> Result<Vec<Metadata>, Error> { let facets = HashMap::new(); let mut start = 0; diff --git a/freva/src/solr.rs b/freva/src/solr.rs index 2610e188704477e41b65c707edfb2bfe4c9f0adc..3355f9dcd53579fb3ef5c5b58152d55b103fd203 100644 --- a/freva/src/solr.rs +++ b/freva/src/solr.rs @@ -8,8 +8,6 @@ use serde::Deserialize; use thiserror::Error; use tracing::debug; -use crate::drs::Metadata; - const DEFAULT_PORT: u16 = 8983; const DEFAULT_PROTOCOL: &str = "http"; @@ -111,10 +109,10 @@ impl Solr { } /// Uploads `documents` into `collection` in Solr. - pub async fn update<'a>( + pub async fn update( &self, collection: &str, - documents: &[&Metadata<'a>], + documents: &[HashMap<&str, String>], ) -> Result<(), SolrError> { let url = self.url(collection, "update/json/docs")?; debug!("{}", url); diff --git a/freva/tests/common/mod.rs b/freva/tests/common/mod.rs index 3a9b8e2f042eb67fd57983c681fb1230fb93f042..dd775dc8eccad0879de998c3cb670e7aca7342aa 100644 --- a/freva/tests/common/mod.rs +++ b/freva/tests/common/mod.rs @@ -1,11 +1,20 @@ +use freva::drs::metadata::ConfigBuilder; use freva::drs::Config; use freva::solr::Solr; +use std::sync::Once; use wiremock::MockServer; +static INIT: Once = Once::new(); + pub const REPO_ROOT: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/.."); pub fn log_init() { - env_logger::init(); + // ensure that this is only run once, running multiple integration tests in a batch will result in all but the first + // crashing. 
+ // individual tests still must call this so if they are run separately, they will initialize the logger + INIT.call_once(|| { + tracing_subscriber::fmt::init(); + }); } pub async fn solr_server() -> (MockServer, Solr) { @@ -24,26 +33,21 @@ pub fn test_config() -> Config { let string = format!( r#" [observations] - root_dir = "{REPO_ROOT}/.docker/data" + root_dir = "{REPO_ROOT}/.docker/data/obs" + drs_format = "custom" parts_dir = [ - "project", "product", "institute", "model", "experiment", "time_frequency", "realm", - "cmor_table", "ensemble", "version", "variable", "file_name" - ] - parts_dataset = [ - "project", "product", "institute", "model", "experiment", "time_frequency", "realm", - "cmor_table", "ensemble", "", "variable" - ] - parts_versioned_dataset = [ "project", "product", "institute", "model", "experiment", "time_frequency", "realm", "cmor_table", "ensemble", "version", "variable" ] parts_file_name = ["variable", "time_frequency", "experiment", "level", "version", "time"] parts_time = "start_time-end_time" - data_type = "observations" - [observations.defaults] - project = "observations" + project = "observations" + [cmip5_name] + root_dir = "{REPO_ROOT}/.docker/data/cmip5" + drs_format = "cmip5" "#, ); - Config::try_from(&string).unwrap() + let builder: ConfigBuilder = toml::from_str(&string).unwrap(); + builder.build().unwrap() } diff --git a/freva/tests/test_ingest.rs b/freva/tests/test_ingest.rs index f202df228b69bbbf0aa40eedeb0aa11b76921598..af8c6c5f7fb42175b0551375224eb409a60c363e 100644 --- a/freva/tests/test_ingest.rs +++ b/freva/tests/test_ingest.rs @@ -1,5 +1,6 @@ +use std::{collections::HashMap, path::Path}; + use camino::Utf8PathBuf; -use freva::drs::metadata::Activity; use wiremock::{ matchers::{method, path}, Mock, ResponseTemplate, @@ -22,19 +23,74 @@ async fn test_ingest() { .mount(&mock) .await; - let config_dir = Utf8PathBuf::from(common::REPO_ROOT).join(".docker"); - let data_dir = config_dir.join("data"); let drs_config = 
common::test_config(); + let data_dir: Option<&Path> = None; let result = freva::drs::ingest(&solr, &drs_config, &data_dir, 10, None, None).await; assert!(result.is_ok()); let report = result.unwrap(); - assert_eq!(1, report.structures.len()); - let structure_report = &report.structures[0]; + let expected_results: HashMap<&str, (u32, u32)> = + HashMap::from_iter([("observations", (24, 0)), ("cmip5_name", (10, 0))].into_iter()); + + assert_eq!( + expected_results.len(), + report.datasets.len(), + "report did not have expected number of dataset reports" + ); + for dataset_report in report.datasets.iter() { + let name = dataset_report.dataset.name(); + let (sent, skipped) = expected_results.get(name).expect("unexpected dataset name"); + // can't assert_eq the whole struct because it includes a duration which is non-deterministic + assert_eq!( + *sent, dataset_report.sent, + "unexpected sent number for dataset {}", + name + ); + assert_eq!( + *skipped, dataset_report.skipped, + "unexpected skipped number for dataset {}", + name + ); + } +} + +#[tokio::test] +async fn test_ingest_subset() { + common::log_init(); + let (mock, solr) = common::solr_server().await; + Mock::given(method("POST")) + .and(path("/solr/files/update/json/docs")) + .respond_with(ResponseTemplate::new(200)) + .mount(&mock) + .await; + Mock::given(method("POST")) + .and(path("/solr/latest/update/json/docs")) + .respond_with(ResponseTemplate::new(200)) + .mount(&mock) + .await; + + let config_dir = Utf8PathBuf::from(common::REPO_ROOT).join(".docker"); + let data_dir = config_dir.join("data/cmip5/cmip5/output2/INM"); + let drs_config = common::test_config(); + + let result = freva::drs::ingest( + &solr, + &drs_config, + &Some(data_dir.as_std_path()), + 10, + None, + None, + ) + .await; + assert!(result.is_ok()); + let report = result.unwrap(); + assert_eq!( - &Activity("observations".to_owned()), - structure_report.structure.activity() + 1, + report.datasets.len(), + "unexpected number of dataset 
reports" ); - assert_eq!(24, structure_report.sent); - assert_eq!(0, structure_report.skipped); + + let cmip5_results = &report.datasets[0]; + assert_eq!(4, cmip5_results.sent); }