diff --git a/drs/src/cmip5.rs b/drs/src/cmip5.rs index 96ab4ab0f4eb41b46e6a636dd04eac96e44fd0f1..dd7e242e134e6dc12283be6cf4ed0b6d906948a3 100644 --- a/drs/src/cmip5.rs +++ b/drs/src/cmip5.rs @@ -52,40 +52,54 @@ use tracing::error; use super::parser::{parse_cmor, parse_esgf}; +/// Error extracting data from a cmor path #[derive(Debug, Error)] -pub enum Error { - #[error("error parsing CMOR path: {0}")] - InvalidCmorPath(String), - #[error("error parsing ESGF path: {0}")] - InvalidEsgfPath(String), -} +#[error("error parsing CMOR path: {0}")] +pub struct InvalidCmorPathError(String); + +/// Error extracting data from an esgf path +#[derive(Debug, Error)] +#[error("error parsing ESGF path: {0}")] +pub struct InvalidEsgfPathError(String); +/// Holds path and metadata pulled from a CMIP5 style path. #[derive(Debug)] pub struct Cmip5<'a> { + /// The path of the file relative to the root directory of the dataset meaning it contains all of and only the + /// elements relevant to CMIP5. pub path: &'a Utf8Path, + /// The metadata extracted from the path pub metadata: PathMetadata<'a>, } impl<'a> Cmip5<'a> { - /// Extracts metadata from a CMOR style file path. + /// Extracts metadata from a CMOR style file path. It expects that the path will consist of only the parts relevant + /// to the DRS structure. For example, if a dataset starts at `/foo/data/cmip5/output...` then `path` must start + /// from `cmip5/output...`. /// /// Note: /// * This will not verify that the file upholds various invariants of the CMIP spec (e.g. that `fx` frequency has /// an ensemble value of `r0i0p0`). 
- /// * The path must start from the beginning of the parts relevant to the CMOR data - pub fn from_cmor_path(path: &Utf8Path) -> Result<Cmip5, Error> { + pub fn from_cmor_path(path: &Utf8Path) -> Result<Cmip5, InvalidCmorPathError> { let metadata = match parse_cmor(path.as_str()) { Ok((_, metadata)) => metadata, - Err(e) => return Err(Error::InvalidCmorPath(e.to_string())), + Err(e) => return Err(InvalidCmorPathError(e.to_string())), }; Ok(Cmip5 { path, metadata }) } - pub fn from_esgf_path(path: &Utf8Path) -> Result<Cmip5, Error> { + /// Extracts metadata from an ESGF style file path. It expects that the path will consist of only the parts relevant + /// to the DRS structure. For example, if a dataset starts at `/foo/data/cmip5/output...` then `path` must start + /// from `cmip5/output...`. + /// + /// Note: + /// * This will not verify that the file upholds various invariants of the CMIP spec (e.g. that `fx` frequency has + /// an ensemble value of `r0i0p0`). + pub fn from_esgf_path(path: &Utf8Path) -> Result<Cmip5, InvalidEsgfPathError> { let metadata = match parse_esgf(path.as_str()) { Ok((_, metadata)) => metadata, - Err(e) => return Err(Error::InvalidCmorPath(e.to_string())), + Err(e) => return Err(InvalidEsgfPathError(e.to_string())), }; Ok(Cmip5 { path, metadata }) @@ -127,7 +141,9 @@ pub struct PathMetadata<'a> { /// family. These experiment names are not freely chosen, but come from controlled vocabularies defined in the /// Appendix 1.1 of the source document under the column labeled “Short Name of Experiment” pub experiment: &'a str, + /// The interval between time samples in this dataset. See [`Frequency`] for more details. pub frequency: Frequency, + /// Modeling component most relevant to this dataset. See [`ModelingRealm`] for more details. pub modeling_realm: ModelingRealm, /// This and the MIP table component identify the physical quantity and often imply something about the sampling /// frequency and modeling realm. 
For CMIP5 the variable anme and MIP table for requested output appear in the @@ -136,6 +152,8 @@ pub struct PathMetadata<'a> { /// Note that hyphens (-) are forbidden in CMIP5 variable names. Though later the document states that this is /// merely recommended for programming language compatibility. This chooses to allow hyphens. pub variable: &'a str, + /// Information about what variable conditions were applied to this dataset. See [`EnsembleMember`] for more + /// details. pub ensemble: EnsembleMember, /// For CMIP5, each MIP table contains fields sample only at a single frequency. See also `variable_name`. pub mip_table: &'a str, @@ -145,62 +163,9 @@ pub struct PathMetadata<'a> { /// should not, however, assume the integer has invariably been correctly encoded (e.g., sometimes a single digit /// number might appear as in `v3`). pub version: Option<&'a str>, - /// Time instants or periods will be represented by a construction of the form `N1-N2`, where N1 and N2 are of the - /// form `yyyy[MM[dd[hh[mm[ss]]]]][-suffix]`, where `yyyy`, `MM`, `dd`, `hh`, `mm`, and `ss` are integer year, - /// month, day, hour, minute, and second, respectively, and the precision with which time is expressed must - /// unambiguously resolve the interval between time-samples contained in the file or virtual file. If only a single - /// time instant is included in the dataset, N2 may normally be omitted, but for CMIP5 N2 is required and in this - /// case would be identical to N1. - /// - /// The optional `-suffix` can be included to indicate that the netCDF file contains a climatology (suffix = - /// `-clim`) or a single time mean, for example, over multiple years (suffix = `-avg`). consider a file containing a - /// single time-average, based on daily samples for the two-week period from February 1, 1971 through February 14, - /// 1971. 
In this case the frequency for the dataset would be `day` (because the average is based on daily samples), - /// and the suffix would be `19710201-19710214-avg` - /// - /// Note that the DRS does not explicitly specify the calendar type (e.g., Julian, Gregorian), but the calendar will - /// be indicated by one of the attributes in each netCDF file. This is omitted for variables that are - /// time-independent. + /// The time interval covered by this dataset. See [`TemporalSubset`] for more details. pub temporal_subset: Option<TemporalSubset<'a>>, - /// The geographical indicator is always optional, but when present it should appear last in the extended path. This - /// indicator specifies geographical subsets described by bounding boxes (e.g. 20S to 20N and 0 to 180E) or by - /// named regions (e.g., 'pacific-ocean`). The underscore character (`_`) is forbidden in the geographical - /// indicator. - /// - /// The DRS specification for this indicator is a string of the form `g-XXXX[-YYYY].` The `g-` indicates that some - /// spatial selection or processing has been done (i.e., selection of a sub-global region and possibly spatial - /// averaging). The `XXXX`, which must not be omitted, is either a named region (with names from a specific - /// gazetteer, which is yet to be selected) or the bounds of a latitude-longitude rectangle (following the template - /// defined below). The `YYYY` is optional and indicates if and what sort of spatial averaging has been performed - /// and whether the average includes masking of certain areas within the region (e.g., masking of land areas). The - /// DRS currently includes a single named region: `global`, which is used to select data from the entire horizontal - /// domain. 
- /// - /// In the case of a bounding box, the bounds of the region should be specified following the template, - /// `latJHJJHHlonMZMMZZ` where J, JJ, M and MM are integers, indicating the southern, northern, western and eastern - /// edges of the bounding box, respectively. H and HH are restricted to `N` or `S` (indicating "north" or "south"), - /// and the Z and ZZ are restricted to `E` or `W` (indicating "east" or "west"). The longitude values should be in - /// the range from 0 to 180 (e.g., a box spanning 200 degrees of longitude could be specified by `10W170W`, but not - /// by `10W190E`, even though 170W and 190E are the same longitude). The latitude and longitude values should be - /// rounded to the nearest integer. Omission of the latitude range or the longitude range implies that data were - /// selected from the entire domain of the omitted dimension. (For example, `lat20S20N` implies all longitudes were - /// included.) Remember, however, that if `XXXX` designates a bounding box, then at least one of the dimensions must - /// appear. - /// - /// The `YYYY` string is of the form `[yyy]-[zzz]` where the hyphen should be omitted unless both `yyy` and `zzz` - /// are present. As options for `yyy`, the DRS currently includes `lnd` and `ocn`. The `lnd` suffix indicates that - /// only `land` locations are considered, and the `ocn` suffix indicates that only `ocean` locations (including sea - /// ice) are considered. As options for `zzz`, the DRS currently includes `zonalavg` and `areaavg`, which indicate - /// `zonal mean` and `area mean` respectively. - /// - /// Here are some examples of geographical indicators: - /// * `g-lat20S20Nlon170W130W”`– a geographical subset defined by a bounding box (latitudes -20 to 20, and - /// longitudes -170 to -130, when rounded to the nearest integer) - /// * `g-global-ocn-areaavg` – an average over the world’s oceans. 
- /// * `g-lat20S20N` – a geographical subset defined by a bounding box covering all longitudes - /// and extending from 20S to 20N. - /// * `g-lat20S20N-lnd-zonalavg` – a zonal average over tropical lands, covering all - /// longitudes. + /// The geographical region covered by this dataset. See [`GeographicalInfo`] for more details. pub geographical_info: Option<GeographicalInfo<'a>>, } @@ -212,14 +177,26 @@ pub struct PathMetadata<'a> { /// Further information: <https://pcmdi.llnl.gov/mips/cmip5/requirements.html> #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Frequency { + /// Yearly frequency Year, + /// Monthly frequency Month, + /// Daily frequency Day, + /// 6 hourly frequency Hour6, + /// 3 hourly frequency Hour3, + /// Hourly frequency. + /// + /// This value is not part of the CMIP5 DRS spec but is frequently used Hour1, + // TODO: it would be nice to have more details on these 2 + /// Sub-hourly frequency SubHour, + /// Climatological monthly mean frequency ClimatologicalMonthlyMean, + /// This dataset does not have data that changes over time Fixed, } @@ -241,6 +218,7 @@ impl ToString for Frequency { } } +/// Error parsing a string value into a frequency #[derive(Debug, Error)] #[error("invalid frequency value {given}")] pub struct InvalidFrequencyError { @@ -279,14 +257,22 @@ impl FromStr for Frequency { /// Further information: <https://pcmdi.llnl.gov/mips/cmip5/requirements.html> #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ModelingRealm { + /// Dataset applies to atmospheric realm Atmosphere, + /// Dataset applies to ocean Ocean, + /// Dataset applies to land Land, + /// Dataset applies to land ice LandIce, + /// Dataset applies to sea ice SeaIce, + /// Dataset applies to aerosol Aerosol, + /// Dataset applies to atmospheric chemistry AtmosphereChemical, - OceanGeochemical, + /// Dataset applies to ocean biogeochemistry + OceanBiogeochemical, } impl ToString for ModelingRealm { @@ -300,12 +286,13 @@ impl ToString for ModelingRealm { 
SeaIce => "seaice", Aerosol => "aerosol", AtmosphereChemical => "atmoschem", - OceanGeochemical => "ocnbgchem", + OceanBiogeochemical => "ocnbgchem", }; s.to_owned() } } +/// Error parsing modeling realm from a string #[derive(Debug, Error)] #[error("invalid modeling realm value {given}")] pub struct InvalidModelingRealmError { @@ -326,7 +313,7 @@ impl FromStr for ModelingRealm { "seaice" => Ok(SeaIce), "aerosol" => Ok(Aerosol), "atmoschem" => Ok(AtmosphereChemical), - "ocnbgchem" => Ok(OceanGeochemical), + "ocnbgchem" => Ok(OceanBiogeochemical), _ => Err(InvalidModelingRealmError { given: s }), } } @@ -338,8 +325,15 @@ impl FromStr for ModelingRealm { /// should invariably be assigned the value zero (`i0`). The same holds true for the other numbers. #[derive(Debug, Clone, PartialEq, Eq)] pub struct EnsembleMember { + /// Used to distinguish between members of an ensemble that are generated by initializing a set of runs with + /// different, equally realistic initial conditions. pub realization: u64, + /// Models used for forecasts that depend on the initial conditions might be initialized from observations using + /// different methods or different observational datasets. This value distinguishes between conditions pub initialization: u64, + /// The "perturbed physics" numbers used to distinguish between closely-related model versions which are, as a + /// group, referred to as a perturbed physics ensemble where the different models use different sets of model + /// parameters. 
pub physics: u64, } @@ -352,6 +346,7 @@ impl ToString for EnsembleMember { } } +/// Error parsing ensemble member from a string #[derive(Debug, Error)] #[error("invalid ensemble member string: {reason}")] pub struct InvalidEnsembleMember { @@ -375,25 +370,88 @@ impl FromStr for EnsembleMember { } } +/// Time instants or periods will be represented by a construction of the form `N1-N2`, where N1 and N2 are of the +/// form `yyyy[MM[dd[hh[mm[ss]]]]][-suffix]`, where `yyyy`, `MM`, `dd`, `hh`, `mm`, and `ss` are integer year, +/// month, day, hour, minute, and second, respectively, and the precision with which time is expressed must +/// unambiguously resolve the interval between time-samples contained in the file or virtual file. If only a single +/// time instant is included in the dataset, N2 may normally be omitted, but for CMIP5 N2 is required and in this +/// case would be identical to N1. +/// +/// Note that the DRS does not explicitly specify the calendar type (e.g., Julian, Gregorian), but the calendar will +/// be indicated by one of the attributes in each netCDF file. This is omitted for variables that are +/// time-independent. #[derive(Debug, Clone, PartialEq, Eq)] pub struct TemporalSubset<'a> { + /// Start of the time period pub start: NaiveDateTime, + /// End of the time period pub end: NaiveDateTime, + /// The optional `-suffix` can be included to indicate that the netCDF file contains a climatology (suffix = + /// `-clim`) or a single time mean, for example, over multiple years (suffix = `-avg`). Consider a file containing a + /// single time-average, based on daily samples for the two-week period from February 1, 1971 through February 14, + /// 1971. In this case the frequency for the dataset would be `day` (because the average is based on daily samples), + /// and the suffix would be `19710201-19710214-avg` pub suffix: Option<&'a str>, } +/// The geographical indicator is always optional, but when present it should appear last in the extended path. 
This +/// indicator specifies geographical subsets described by bounding boxes (e.g. 20S to 20N and 0 to 180E) or by +/// named regions (e.g., `pacific-ocean`). The underscore character (`_`) is forbidden in the geographical +/// indicator. +/// +/// The DRS specification for this indicator is a string of the form `g-XXXX[-YYYY]`. The `g-` indicates that some +/// spatial selection or processing has been done (i.e., selection of a sub-global region and possibly spatial +/// averaging). The `XXXX`, which must not be omitted, is either a named region (with names from a specific +/// gazetteer, which is yet to be selected) or the bounds of a latitude-longitude rectangle (following the template +/// defined below). The `YYYY` is optional and indicates if and what sort of spatial averaging has been performed +/// and whether the average includes masking of certain areas within the region (e.g., masking of land areas). The +/// DRS currently includes a single named region: `global`, which is used to select data from the entire horizontal +/// domain. +/// +/// In the case of a bounding box, the bounds of the region should be specified following the template, +/// `latJHJJHHlonMZMMZZ` where J, JJ, M and MM are integers, indicating the southern, northern, western and eastern +/// edges of the bounding box, respectively. H and HH are restricted to `N` or `S` (indicating "north" or "south"), +/// and the Z and ZZ are restricted to `E` or `W` (indicating "east" or "west"). The longitude values should be in +/// the range from 0 to 180 (e.g., a box spanning 200 degrees of longitude could be specified by `10W170W`, but not +/// by `10W190E`, even though 170W and 190E are the same longitude). The latitude and longitude values should be +/// rounded to the nearest integer. Omission of the latitude range or the longitude range implies that data were +/// selected from the entire domain of the omitted dimension. (For example, `lat20S20N` implies all longitudes were +/// included.) 
Remember, however, that if `XXXX` designates a bounding box, then at least one of the dimensions must +/// appear. +/// +/// The `YYYY` string is of the form `[yyy]-[zzz]` where the hyphen should be omitted unless both `yyy` and `zzz` +/// are present. As options for `yyy`, the DRS currently includes `lnd` and `ocn`. The `lnd` suffix indicates that +/// only `land` locations are considered, and the `ocn` suffix indicates that only `ocean` locations (including sea +/// ice) are considered. As options for `zzz`, the DRS currently includes `zonalavg` and `areaavg`, which indicate +/// `zonal mean` and `area mean` respectively. +/// +/// Here are some examples of geographical indicators: +/// * `g-lat20S20Nlon170W130W` – a geographical subset defined by a bounding box (latitudes -20 to 20, and +/// longitudes -170 to -130, when rounded to the nearest integer) +/// * `g-global-ocn-areaavg` – an average over the world’s oceans. +/// * `g-lat20S20N` – a geographical subset defined by a bounding box covering all longitudes +/// and extending from 20S to 20N. +/// * `g-lat20S20N-lnd-zonalavg` – a zonal average over tropical lands, covering all +/// longitudes. #[derive(Debug, Clone, PartialEq, Eq)] pub struct GeographicalInfo<'a> { + /// The region of the file pub region: Region<'a>, + /// How the data has been averaged across the region pub averaging: Option<&'a str>, } +/// The different types of region definitions #[derive(Debug, Clone, PartialEq, Eq)] pub enum Region<'a> { + /// A region defined by a latitude-longitude bounding box BoundingBox { + /// the latitude of the bounding box, both north and south bounds lat: Option<&'a str>, + /// the longitude of the bounding box, both east and west bounds lon: Option<&'a str>, }, + /// A region defined by a single name, e.g. `global`. 
Named(&'a str), } @@ -438,7 +496,7 @@ mod tests { ("seaIce", SeaIce), ("aerosol", Aerosol), ("atmosChem", AtmosphereChemical), - ("ocnBgchem", OceanGeochemical), + ("ocnBgchem", OceanBiogeochemical), ]; for (s, expected) in cases.iter() { let res = ModelingRealm::from_str(s); diff --git a/drs/src/lib.rs b/drs/src/lib.rs index 5cbd6e1c45c08794146904fb98c6782cb6add68c..cfc24afd8bd838df2d1364d14c71df2f3c1f67d7 100644 --- a/drs/src/lib.rs +++ b/drs/src/lib.rs @@ -1,3 +1,4 @@ +#[warn(missing_docs)] pub mod cmip5; pub mod cmip6; pub mod cordex; diff --git a/freva/src/drs/metadata.rs b/freva/src/drs/metadata.rs index d2569d4e85d2d58ae07bb968cb2347d3d6203adf..009f4b7d0338c1a5464cd8c372d40e524977d659 100644 --- a/freva/src/drs/metadata.rs +++ b/freva/src/drs/metadata.rs @@ -45,9 +45,13 @@ pub enum Error { #[error(transparent)] ExtractMetadata(#[from] ExtractMetadataError), - /// Path does not follow cmip5's specification + /// Path does not follow cmor standard #[error(transparent)] - InvalidCmip5Path(#[from] drs::cmip5::Error), + InvalidCmorPath(#[from] drs::cmip5::InvalidCmorPathError), + + /// Path does not follow esgf standard + #[error(transparent)] + InvalidEsgfPath(#[from] drs::cmip5::InvalidEsgfPathError), /// Path does not follow cmip6's specification #[error(transparent)]