Ghost User
--- a/drs/src/cmip5.rs

+ 129

− 71
+++ b/drs/src/cmip5.rs

+ 129

− 71
 @@ -52,40 +52,54 @@ use tracing::error;

 use super::parser::{parse_cmor, parse_esgf};

+/// Error extracting data from a CMOR path
 #[derive(Debug, Error)]
-pub enum Error {
-    #[error("error parsing CMOR path: {0}")]
-    InvalidCmorPath(String),
-    #[error("error parsing ESGF path: {0}")]
-    InvalidEsgfPath(String),
-}
+#[error("error parsing CMOR path: {0}")]
+pub struct InvalidCmorPathError(String);
+
+/// Error extracting data from an ESGF path
+#[derive(Debug, Error)]
+#[error("error parsing ESGF path: {0}")]
+pub struct InvalidEsgfPathError(String);

+/// Holds path and metadata pulled from a CMIP5 style path.
 #[derive(Debug)]
 pub struct Cmip5<'a> {
+    /// The path of the file relative to the root directory of the dataset, meaning it contains all of, and only, the
+    /// elements relevant to CMIP5.
    pub path: &'a Utf8Path,
+    /// The metadata extracted from the path
    pub metadata: PathMetadata<'a>,
 }

 impl<'a> Cmip5<'a> {
-    /// Extracts metadata from a CMOR style file path.
+    /// Extracts metadata from a CMOR style file path. It expects that the path will consist of only the parts relevant
+    /// to the DRS structure. For example, if a dataset starts at `/foo/data/cmip5/output...` then `path` must start
+    /// from `cmip5/output...`.
    ///
    /// Note:
    /// * This will not verify that the file upholds various invariants of the CMIP spec (e.g. that `fx` frequency has
    ///     an ensemble value of `r0i0p0`).
-    /// * The path must start from the beginning of the parts relevant to the CMOR data
-    pub fn from_cmor_path(path: &Utf8Path) -> Result<Cmip5, Error> {
+    pub fn from_cmor_path(path: &Utf8Path) -> Result<Cmip5, InvalidCmorPathError> {
        let metadata = match parse_cmor(path.as_str()) {
            Ok((_, metadata)) => metadata,
-            Err(e) => return Err(Error::InvalidCmorPath(e.to_string())),
+            Err(e) => return Err(InvalidCmorPathError(e.to_string())),
        };

        Ok(Cmip5 { path, metadata })
    }

-    pub fn from_esgf_path(path: &Utf8Path) -> Result<Cmip5, Error> {
+    /// Extracts metadata from an ESGF style file path. It expects that the path will consist of only the parts relevant
+    /// to the DRS structure. For example, if a dataset starts at `/foo/data/cmip5/output...` then `path` must start
+    /// from `cmip5/output...`.
+    ///
+    /// Note:
+    /// * This will not verify that the file upholds various invariants of the CMIP spec (e.g. that `fx` frequency has
+    ///     an ensemble value of `r0i0p0`).
+    pub fn from_esgf_path(path: &Utf8Path) -> Result<Cmip5, InvalidEsgfPathError> {
        let metadata = match parse_esgf(path.as_str()) {
            Ok((_, metadata)) => metadata,
-            Err(e) => return Err(Error::InvalidCmorPath(e.to_string())),
+            Err(e) => return Err(InvalidEsgfPathError(e.to_string())),
        };

        Ok(Cmip5 { path, metadata })
 @@ -127,7 +141,9 @@ pub struct PathMetadata<'a> {
    /// family. These experiment names are not freely chosen, but come from controlled vocabularies defined in the
    /// Appendix 1.1 of the source document under the column labeled “Short Name of Experiment”
    pub experiment: &'a str,
+    /// The interval between time samples in this dataset. See [`Frequency`] for more details.
    pub frequency: Frequency,
+    /// Modeling component most relevant to this dataset. See [`ModelingRealm`] for more details.
    pub modeling_realm: ModelingRealm,
    /// This and the MIP table component identify the physical quantity and often imply something about the sampling
    /// frequency and modeling realm. For CMIP5 the variable anme and MIP table for requested output appear in the
 @@ -136,6 +152,8 @@ pub struct PathMetadata<'a> {
    /// Note that hyphens (-) are forbidden in CMIP5 variable names. Though later the document states that this is
    /// merely recommended for programming language compatibility. This chooses to allow hyphens.
    pub variable: &'a str,
+    /// Information about which variable conditions were applied to this dataset. See [`EnsembleMember`] for more
+    /// details.
    pub ensemble: EnsembleMember,
    /// For CMIP5, each MIP table contains fields sample only at a single frequency. See also `variable_name`.
    pub mip_table: &'a str,
 @@ -145,62 +163,9 @@ pub struct PathMetadata<'a> {
    /// should not, however, assume the integer has invariably been correctly encoded (e.g., sometimes a single digit
    /// number might appear as in `v3`).
    pub version: Option<&'a str>,
-    /// Time instants or periods will be represented by a construction of the form `N1-N2`, where N1 and N2 are of the
-    /// form `yyyy[MM[dd[hh[mm[ss]]]]][-suffix]`, where `yyyy`, `MM`, `dd`, `hh`, `mm`, and `ss` are integer year,
-    /// month, day, hour, minute, and second, respectively, and the precision with which time is expressed must
-    /// unambiguously resolve the interval between time-samples contained in the file or virtual file. If only a single
-    /// time instant is included in the dataset, N2 may normally be omitted, but for CMIP5 N2 is required and in this
-    /// case would be identical to N1.
-    ///
-    /// The optional `-suffix` can be included to indicate that the netCDF file contains a climatology (suffix =
-    /// `-clim`) or a single time mean, for example, over multiple years (suffix = `-avg`). consider a file containing a
-    /// single time-average, based on daily samples for the two-week period from February 1, 1971 through February 14,
-    /// 1971. In this case the frequency for the dataset would be `day` (because the average is based on daily samples),
-    /// and the suffix would be `19710201-19710214-avg`
-    ///
-    /// Note that the DRS does not explicitly specify the calendar type (e.g., Julian, Gregorian), but the calendar will
-    /// be indicated by one of the attributes in each netCDF file. This is omitted for variables that are
-    /// time-independent.
+    /// The time interval covered by this dataset. See [`TemporalSubset`] for more details.
    pub temporal_subset: Option<TemporalSubset<'a>>,
-    /// The geographical indicator is always optional, but when present it should appear last in the extended path. This
-    /// indicator specifies geographical subsets described by bounding boxes (e.g.  20S to 20N and 0 to 180E) or by
-    /// named regions (e.g., 'pacific-ocean`). The underscore character (`_`) is forbidden in the geographical
-    /// indicator.
-    ///
-    /// The DRS specification for this indicator is a string of the form `g-XXXX[-YYYY].` The `g-` indicates that some
-    /// spatial selection or processing has been done (i.e., selection of a sub-global region and possibly spatial
-    /// averaging).  The `XXXX`, which must not be omitted, is either a named region (with names from a specific
-    /// gazetteer, which is yet to be selected) or the bounds of a latitude-longitude rectangle (following the template
-    /// defined below). The `YYYY` is optional and indicates if and what sort of spatial averaging has been performed
-    /// and whether the average includes masking of certain areas within the region (e.g., masking of land areas). The
-    /// DRS currently includes a single named region: `global`, which is used to select data from the entire horizontal
-    /// domain.
-    ///
-    /// In the case of a bounding box, the bounds of the region should be specified following the template,
-    /// `latJHJJHHlonMZMMZZ` where J, JJ, M and MM are integers, indicating the southern, northern, western and eastern
-    /// edges of the bounding box, respectively. H and HH are restricted to `N` or `S` (indicating "north" or "south"),
-    /// and the Z and ZZ are restricted to `E` or `W` (indicating "east" or "west"). The longitude values should be in
-    /// the range from 0 to 180 (e.g., a box spanning 200 degrees of longitude could be specified by `10W170W`, but not
-    /// by `10W190E`, even though 170W and 190E are the same longitude). The latitude and longitude values should be
-    /// rounded to the nearest integer. Omission of the latitude range or the longitude range implies that data were
-    /// selected from the entire domain of the omitted dimension. (For example, `lat20S20N` implies all longitudes were
-    /// included.) Remember, however, that if `XXXX` designates a bounding box, then at least one of the dimensions must
-    /// appear.
-    ///
-    /// The `YYYY` string is of the form `[yyy]-[zzz]` where the hyphen should be omitted unless both `yyy` and `zzz`
-    /// are present. As options for `yyy`, the DRS currently includes `lnd` and `ocn`.  The `lnd` suffix indicates that
-    /// only `land` locations are considered, and the `ocn` suffix indicates that only `ocean` locations (including sea
-    /// ice) are considered. As options for `zzz`, the DRS currently includes `zonalavg` and `areaavg`, which indicate
-    /// `zonal mean` and `area mean` respectively.
-    ///
-    /// Here are some examples of geographical indicators:
-    /// * `g-lat20S20Nlon170W130W”`– a geographical subset defined by a bounding box (latitudes -20 to 20, and
-    ///     longitudes -170 to -130, when rounded to the nearest integer)
-    /// * `g-global-ocn-areaavg` – an average over the world’s oceans.
-    /// * `g-lat20S20N` – a geographical subset defined by a bounding box covering all longitudes
-    ///     and extending from 20S to 20N.
-    /// * `g-lat20S20N-lnd-zonalavg` – a zonal average over tropical lands, covering all
-    ///     longitudes.
+    /// The geographical region covered by this dataset. See [`GeographicalInfo`] for more details.
    pub geographical_info: Option<GeographicalInfo<'a>>,
 }

 @@ -212,14 +177,26 @@ pub struct PathMetadata<'a> {
 /// Further information: <https://pcmdi.llnl.gov/mips/cmip5/requirements.html>
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum Frequency {
+    /// Yearly frequency
    Year,
+    /// Monthly frequency
    Month,
+    /// Daily frequency
    Day,
+    /// 6 hourly frequency
    Hour6,
+    /// 3 hourly frequency
    Hour3,
+    /// Hourly frequency.
+    ///
+    /// This value is not part of the CMIP5 DRS spec but is frequently used.
    Hour1,
+    // TODO: it would be nice to have more details on these 2
+    /// Sub-hourly frequency
    SubHour,
+    /// Climatological monthly mean frequency
    ClimatologicalMonthlyMean,
+    /// This dataset does not have data that changes over time
    Fixed,
 }

 @@ -241,6 +218,7 @@ impl ToString for Frequency {
    }
 }

+/// Error parsing a string value into a frequency
 #[derive(Debug, Error)]
 #[error("invalid frequency value {given}")]
 pub struct InvalidFrequencyError {
 @@ -279,14 +257,22 @@ impl FromStr for Frequency {
 /// Further information: <https://pcmdi.llnl.gov/mips/cmip5/requirements.html>
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum ModelingRealm {
+    /// Dataset applies to atmospheric realm
    Atmosphere,
+    /// Dataset applies to ocean
    Ocean,
+    /// Dataset applies to land
    Land,
+    /// Dataset applies to land ice
    LandIce,
+    /// Dataset applies to sea ice
    SeaIce,
+    /// Dataset applies to aerosol
    Aerosol,
+    /// Dataset applies to atmospheric chemistry
    AtmosphereChemical,
-    OceanGeochemical,
+    /// Dataset applies to ocean biogeochemistry
+    OceanBiogeochemical,
 }

 impl ToString for ModelingRealm {
 @@ -300,12 +286,13 @@ impl ToString for ModelingRealm {
            SeaIce => "seaice",
            Aerosol => "aerosol",
            AtmosphereChemical => "atmoschem",
-            OceanGeochemical => "ocnbgchem",
+            OceanBiogeochemical => "ocnbgchem",
        };
        s.to_owned()
    }
 }

+/// Error parsing modeling realm from a string
 #[derive(Debug, Error)]
 #[error("invalid modeling realm value {given}")]
 pub struct InvalidModelingRealmError {
 @@ -326,7 +313,7 @@ impl FromStr for ModelingRealm {
            "seaice" => Ok(SeaIce),
            "aerosol" => Ok(Aerosol),
            "atmoschem" => Ok(AtmosphereChemical),
-            "ocnbgchem" => Ok(OceanGeochemical),
+            "ocnbgchem" => Ok(OceanBiogeochemical),
            _ => Err(InvalidModelingRealmError { given: s }),
        }
    }
 @@ -338,8 +325,15 @@ impl FromStr for ModelingRealm {
 /// should invariably be assigned the value zero (`i0`). The same holds true for the other numbers.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct EnsembleMember {
+    /// Used to distinguish between members of an ensemble that are generated by initializing a set of runs with
+    /// different, equally realistic initial conditions.
    pub realization: u64,
+    /// Models used for forecasts that depend on the initial conditions might be initialized from observations using
+    /// different methods or different observational datasets. This value distinguishes between them.
    pub initialization: u64,
+    /// The "perturbed physics" numbers used to distinguish between closely-related model versions which are, as a
+    /// group, referred to as a perturbed physics ensemble where the different models use different sets of model
+    /// parameters.
    pub physics: u64,
 }

 @@ -352,6 +346,7 @@ impl ToString for EnsembleMember {
    }
 }

+/// Error parsing ensemble member from a string
 #[derive(Debug, Error)]
 #[error("invalid ensemble member string: {reason}")]
 pub struct InvalidEnsembleMember {
 @@ -375,25 +370,88 @@ impl FromStr for EnsembleMember {
    }
 }

+/// Time instants or periods will be represented by a construction of the form `N1-N2`, where N1 and N2 are of the
+/// form `yyyy[MM[dd[hh[mm[ss]]]]][-suffix]`, where `yyyy`, `MM`, `dd`, `hh`, `mm`, and `ss` are integer year,
+/// month, day, hour, minute, and second, respectively, and the precision with which time is expressed must
+/// unambiguously resolve the interval between time-samples contained in the file or virtual file. If only a single
+/// time instant is included in the dataset, N2 may normally be omitted, but for CMIP5 N2 is required and in this
+/// case would be identical to N1.
+///
+/// Note that the DRS does not explicitly specify the calendar type (e.g., Julian, Gregorian), but the calendar will
+/// be indicated by one of the attributes in each netCDF file. This is omitted for variables that are
+/// time-independent.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct TemporalSubset<'a> {
+    /// Start of the time period
    pub start: NaiveDateTime,
+    /// End of the time period
    pub end: NaiveDateTime,
+    /// The optional `-suffix` can be included to indicate that the netCDF file contains a climatology (suffix =
+    /// `-clim`) or a single time mean, for example, over multiple years (suffix = `-avg`). Consider a file containing a
+    /// single time-average, based on daily samples for the two-week period from February 1, 1971 through February 14,
+    /// 1971. In this case the frequency for the dataset would be `day` (because the average is based on daily samples),
+    /// and the full temporal subset would be `19710201-19710214-avg`.
    pub suffix: Option<&'a str>,
 }

+/// The geographical indicator is always optional, but when present it should appear last in the extended path. This
+/// indicator specifies geographical subsets described by bounding boxes (e.g.  20S to 20N and 0 to 180E) or by
+/// named regions (e.g., `pacific-ocean`). The underscore character (`_`) is forbidden in the geographical
+/// indicator.
+///
+/// The DRS specification for this indicator is a string of the form `g-XXXX[-YYYY].` The `g-` indicates that some
+/// spatial selection or processing has been done (i.e., selection of a sub-global region and possibly spatial
+/// averaging).  The `XXXX`, which must not be omitted, is either a named region (with names from a specific
+/// gazetteer, which is yet to be selected) or the bounds of a latitude-longitude rectangle (following the template
+/// defined below). The `YYYY` is optional and indicates if and what sort of spatial averaging has been performed
+/// and whether the average includes masking of certain areas within the region (e.g., masking of land areas). The
+/// DRS currently includes a single named region: `global`, which is used to select data from the entire horizontal
+/// domain.
+///
+/// In the case of a bounding box, the bounds of the region should be specified following the template,
+/// `latJHJJHHlonMZMMZZ` where J, JJ, M and MM are integers, indicating the southern, northern, western and eastern
+/// edges of the bounding box, respectively. H and HH are restricted to `N` or `S` (indicating "north" or "south"),
+/// and the Z and ZZ are restricted to `E` or `W` (indicating "east" or "west"). The longitude values should be in
+/// the range from 0 to 180 (e.g., a box spanning 200 degrees of longitude could be specified by `10W170W`, but not
+/// by `10W190E`, even though 170W and 190E are the same longitude). The latitude and longitude values should be
+/// rounded to the nearest integer. Omission of the latitude range or the longitude range implies that data were
+/// selected from the entire domain of the omitted dimension. (For example, `lat20S20N` implies all longitudes were
+/// included.) Remember, however, that if `XXXX` designates a bounding box, then at least one of the dimensions must
+/// appear.
+///
+/// The `YYYY` string is of the form `[yyy]-[zzz]` where the hyphen should be omitted unless both `yyy` and `zzz`
+/// are present. As options for `yyy`, the DRS currently includes `lnd` and `ocn`.  The `lnd` suffix indicates that
+/// only `land` locations are considered, and the `ocn` suffix indicates that only `ocean` locations (including sea
+/// ice) are considered. As options for `zzz`, the DRS currently includes `zonalavg` and `areaavg`, which indicate
+/// `zonal mean` and `area mean` respectively.
+///
+/// Here are some examples of geographical indicators:
+/// * `g-lat20S20Nlon170W130W` – a geographical subset defined by a bounding box (latitudes -20 to 20, and
+///     longitudes -170 to -130, when rounded to the nearest integer)
+/// * `g-global-ocn-areaavg` – an average over the world’s oceans.
+/// * `g-lat20S20N` – a geographical subset defined by a bounding box covering all longitudes
+///     and extending from 20S to 20N.
+/// * `g-lat20S20N-lnd-zonalavg` – a zonal average over tropical lands, covering all
+///     longitudes.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct GeographicalInfo<'a> {
+    /// The region of the file
    pub region: Region<'a>,
+    /// How the data has been averaged across the region
    pub averaging: Option<&'a str>,
 }

+/// The different types of region definitions
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum Region<'a> {
+    /// A region defined by a latitude-longitude bounding box
    BoundingBox {
+        /// the latitude of the bounding box, both north and south bounds
        lat: Option<&'a str>,
+        /// the longitude of the bounding box, both east and west bounds
        lon: Option<&'a str>,
    },
+    /// A region defined by a single name, e.g. `global`.
    Named(&'a str),
 }

 @@ -438,7 +496,7 @@ mod tests {
            ("seaIce", SeaIce),
            ("aerosol", Aerosol),
            ("atmosChem", AtmosphereChemical),
-            ("ocnBgchem", OceanGeochemical),
+            ("ocnBgchem", OceanBiogeochemical),
        ];
        for (s, expected) in cases.iter() {
            let res = ModelingRealm::from_str(s);