diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 835e3839ed6fcef89284f51b73c91f645511b223..86ea82140133b3d89876252316a133f374df4444 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -48,13 +48,21 @@ test_39: script: - /tmp/test/bin/python -m pytest -vv +test_310: + << : *py_test + before_script: + - conda create -q -p /tmp/test python=3.10 pip dask -y + - /tmp/test/bin/python -m pip install -e .[test] + script: + - /tmp/test/bin/python -m pytest -vv + pages: stage: report needs: [] tags: - conda before_script: - - conda create -c conda-forge -q -p /tmp/test python=3.10 pip dask -y + - conda create -c conda-forge -q -p /tmp/test python=3.11 pip dask hdf5 -y - /tmp/test/bin/python -m pip install -e .[test] script: - /tmp/test/bin/coverage run -m pytest diff --git a/src/rechunk_data/__init__.py b/src/rechunk_data/__init__.py index 2f849bf6a69335fb95d75c534d8a81fc45e1b579..99f4c459fed44449b34a4f2da00708e462fe2b18 100644 --- a/src/rechunk_data/__init__.py +++ b/src/rechunk_data/__init__.py @@ -10,7 +10,7 @@ from ._rechunk import ( logger, ) -__version__ = "2208.0.1" +__version__ = "2301.0.0" PROGRAM_NAME = "rechunk-data" @@ -48,7 +48,7 @@ def parse_args(argv: Optional[List[str]]) -> argparse.Namespace: "--netcdf-engine", help=("The netcdf engine used to create the new netcdf file."), choices=("h5netcdf", "netcdf4"), - default="h5netcdf", + default="netcdf4", type=str, ) parser.add_argument( diff --git a/src/rechunk_data/_rechunk.py b/src/rechunk_data/_rechunk.py index 18ab7eb8ac4f079839f5c805611cf123abfba413..26022cdf0d32ba7302eccfe22e28ad9da2490291 100644 --- a/src/rechunk_data/_rechunk.py +++ b/src/rechunk_data/_rechunk.py @@ -118,12 +118,13 @@ def _rechunk_dataset( encoding[data_var] = { str(k): v for k, v in dset[var].encoding.items() if str(k) in _keywords } - encoding[data_var]["chunksizes"] = new_chunks + if engine != "netcdf4" or encoding[data_var].get("contiguous", False) is False: + encoding[data_var]["chunksizes"] = new_chunks return dset, encoding def rechunk_dataset( - dset: xr.Dataset, engine: Literal["h5netcdf", "netcdf4"] = "h5netcdf" + dset: xr.Dataset, engine: Literal["h5netcdf", "netcdf4"] = "netcdf4" ) -> xr.Dataset: """Rechunk a xarray dataset. @@ -131,7 +132,7 @@ def rechunk_dataset( ---------- dset: xarray.Dataset Input dataset that is going to be rechunked - engine: str, default: h5netcdf + engine: str, default: netcdf4 The netcdf engine used to create the new netcdf file. Returns @@ -145,7 +146,7 @@ def rechunk_dataset( def rechunk_netcdf_file( input_path: os.PathLike, output_path: Optional[os.PathLike] = None, - engine: Literal["h5netcdf", "netcdf4"] = "h5netcdf", + engine: Literal["h5netcdf", "netcdf4"] = "netcdf4", ) -> None: """Rechunk netcdf files. @@ -158,7 +159,7 @@ def rechunk_netcdf_file( Output file/directory of the chunked netcdf file(s). Note: If ``input`` is a directory output should be a directory. If None given (default) the ``input`` is overidden. - engine: str, default: h5netcdf + engine: str, default: netcdf4 The netcdf engine used to create the new netcdf file. """ input_path = Path(input_path).expanduser().absolute() @@ -173,16 +174,21 @@ def rechunk_netcdf_file( output_file = Path(output_path) output_file.parent.mkdir(exist_ok=True, parents=True) try: - with xr.open_mfdataset(str(input_file), parallel=True) as nc_data: + with xr.open_mfdataset( + str(input_file), decode_cf=True, parallel=True + ) as nc_data: new_data, encoding = _rechunk_dataset(nc_data, engine) if encoding: logger.debug( - "Loading data into memory (%s).", format_bytes(new_data.nbytes) + "Loading data into memory (%s).", + format_bytes(new_data.nbytes), ) new_data = new_data.load() except Exception as error: logger.error( - "Error while processing file %s: %s", str(input_file), str(error) + "Error while processing file %s: %s", + str(input_file), + str(error), ) continue _save_dataset(