diff --git a/tests/test_validations.ipynb b/tests/test_validations.ipynb index cda0016e82c050befe432e9f88cb3708cd551aa3..816555bf5d2bea2d1f6de42f364b8b31c8f2ca5b 100644 --- a/tests/test_validations.ipynb +++ b/tests/test_validations.ipynb @@ -57,8 +57,14 @@ " return zarr_dset.set_coords(coords)\n", "\n", "def get_encoding(zarr_dset):\n", - " encoding = {var:zarr_dset[var].encoding \n", - " for var in list(zarr_dset.variables.keys())}\n", + " encoding = {\n", + " var:{\n", + " key : zarr_dset[var].encoding[key]\n", + " for key in zarr_dset[var].encoding.keys()\n", + " if key not in ['chunks', 'preferred_chunks', 'compressor', 'filters']\n", + " }\n", + " for var in list(zarr_dset.variables.keys())\n", + " }\n", " for var in zarr_dset.data_vars:\n", " encoding[var].update(dict(zlib=True, complevel=1))\n", " for var in zarr_dset.coords:\n", diff --git a/tzis/tzis.py b/tzis/tzis.py index 16c8c0bb43ae37feb565d987195cafed8c2b4f13..fc84843a2d9b0d5e5f3e618a259b593ebd7f54bf 100644 --- a/tzis/tzis.py +++ b/tzis/tzis.py @@ -106,16 +106,31 @@ class Tzis: # if open_kwargs: self.mf_dset = self.open_mf_dataset( - mf_dset, self.varname, keep_attrs=True, xarray_kwargs=xarray_kwargs,**open_kwargs + mf_dset, self.varname, xarray_kwargs=xarray_kwargs,**open_kwargs ) else: self.mf_dset = self.open_mf_dataset( - mf_dset, self.varname, keep_attrs=True, xarray_kwargs=xarray_kwargs + mf_dset, self.varname, xarray_kwargs=xarray_kwargs ) self.recent_chunk=None def get_storestring_and_options(self,store): + """ToDo: Check if string is required. If swift is the protocol, use aiohttp_retry client to + overcome writing errors. + + Parameters + ---------- + store : `fsspec` mapping. + + Returns + ------- + storestring + The root of the mapping as str. + storage_options + Backend options for to_zarr depending on the protocol as dictionary. 
+ """ + storage_options=None storestring=store if self.store.protocol == "swift": @@ -125,6 +140,19 @@ class Tzis: return storestring, storage_options def init_verify_provenance(self, reset=False): + """ + Initializes a provenance document if reset is True or self.provenance is None. + + Parameters + ---------- + reset : If True, provenance is recreated. Default is False. + + Returns + ------- + status + True if provenance was newly created, False if not. + """ + if self.provenance and reset: if self.verbose: print( @@ -136,7 +164,38 @@ class Tzis: return True return False - def open_mf_dataset(self, mf, varname, chunkdim=None, target_mb=None, keep_attrs=True, xarray_kwargs=None): + def open_mf_dataset(self, mf, varname, chunkdim=None, target_mb=None, xarray_kwargs=None): + """ + Opens the dataset with xarray's `open_mfdataset` + - with optimized chunks by estimating sizes with a test file `mf[0]` if `chunkdim` and `target_mb` are set. + - with OPEN_MFDATASET_KWARGS and `xarray_kwargs` if provided + + It saves the original size of the source if available for estimating compression ratio. + It initializes the provenance. + It collects conflicting attributes and saves them as a new attribute. + It sets a new `tracking_id` and appends to the history attribute. + + Parameters + ---------- + mf : list or str + mf is converted to a list and used as the first argument of `xarray.open_mfdataset`. + varname: str + varname is the variable which is used for rechunking and which should be written to the target storage. + chunkdim=None : str + chunkdim is the chunk dimension used for rechunking. Only set this in combination with target_mb. + target_mb=None : int + target_mb is the desired size of one chunk in the target storage in megabytes. + Only set this in combination with chunkdim. + xarray_kwargs=None : dict + xarray_kwargs are unpacked within `open_mfdataset`. + + + Returns + ------- + Dataset or None + The xarray dataset with optimized chunk setting and attributes. 
+ """ + # if type(mf) != list and type(mf) != str : # raise ValueError("Dataset '{0}' must either be a string or a list of strings") @@ -234,6 +293,28 @@ class Tzis: return None def get_conflict_attrs(self, mf, mf_dset, xarray_kwargs): + """ + Collects attributes which conflict within all single dsets in `mf`. + It opens all elements of `mf` with `xarray.open_dataset` and collects + attributes in a dictionary and their values in lists. + + Parameters + ---------- + mf : list or str + mf is converted to a list and used as the first argument of `xarray.open_mfdataset`. + mf_dset: Dataset + `mf_dset` is the `xarray` object returned by `xarray.open_mfdataset` which does not include + the conflicting attributes. + xarray_kwargs : dict + xarray_kwargs are unpacked within `open_mfdataset`. + + + Returns + ------- + Dict + All conflicting attributes and their values. + """ + conflict_attrs = {} # try: maxdigits = len(str(len(mf))) @@ -284,6 +365,27 @@ class Tzis: return return_varname def open_store(self, os_url, container, os_name): + """ + Opens the store for `container` at `os_url` (presumably the target object store; confirm against the method body). + At least `os_url` and `container` must be specified. + + Parameters + ---------- + os_url : + os_url is required; a ValueError is raised if it is not set (falsy). + container : + container is required; a ValueError is raised if it is not set (falsy). + os_name : + os_name is optional; it is only evaluated if it is set. + + Raises + ------ + ValueError + If `os_url` or `container` is not specified. + + ToDo: Add a Returns section once the return value is confirmed. + ToDo: Describe how `os_name` is used. + """ if not all([os_url,container]): raise ValueError(f"Specify at least os_url and container for open_store") if os_name: