diff --git a/README.md b/README.md index 1970ebd3425eca956c1ec3cec4addaa4f40ac968..2576f05edece31f5073b37b86dae6d027eea5fa4 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,16 @@ pip install (--user) https://gitlab.dkrz.de/ch1187/rechunk-data/-/archive/2206.0 User the `--user` flag if you do not have super user rights and are not using `anaconda`, `pipenv` or `virtual env` ## Usage -Basic usage: +### Using the python module + +```python +from rechunk_data import rechunk_dataset +import xarray as xr +dset = xr.open_mfdataset("/data/*", parallel=True, combine="by_coords") +new_data = rechunk_dataset(dset) +``` + +### Using the command line interface: ```bash rechunk-data --help diff --git a/src/rechunk_data/__init__.py b/src/rechunk_data/__init__.py index 88420cb5162e735772bbe9d86b15d57c8b42e0ee..bbdf815b88f134b42d2f0e1ec1930f75bb51ba76 100644 --- a/src/rechunk_data/__init__.py +++ b/src/rechunk_data/__init__.py @@ -6,13 +6,11 @@ from pathlib import Path from typing import List, Optional from ._rechunk import ( rechunk_netcdf_file, - _search_for_nc_files, - _save_dataset, - _rechunk_dataset, + rechunk_dataset, logger, ) -__version__ = "2206.0.2" +__version__ = "2206.0.3" PROGRAM_NAME = "rechunk-data" diff --git a/src/rechunk_data/__init__.pyi b/src/rechunk_data/__init__.pyi index df953653c39e6ac7a5fc5a404af62092947907ff..5817e93c573aa8079f6121fb4d0db590e0070a2d 100644 --- a/src/rechunk_data/__init__.pyi +++ b/src/rechunk_data/__init__.pyi @@ -11,6 +11,7 @@ def _save_dataset( dset: xr.Dataset, file_name: Path, encoding: Dict[str, Any], engine: str ) -> None: ... def _rechunk_dataset(dset: xr.Dataset) -> Tuple[xr.Dataset, Dict[str, Any]]: ... +def rechunk_dataset(dset: xr.Dataset) -> xr.Dataset: ...
def rechunk_netcdf_file( input_path: os.PathLike, output_path: Optional[os.PathLike] = ..., diff --git a/src/rechunk_data/_rechunk.py b/src/rechunk_data/_rechunk.py index 7350800424ab638f2b6caa0e26292a25f12a4128..f591d4b4880bfa5563def6f93990aa2fe4d5a544 100644 --- a/src/rechunk_data/_rechunk.py +++ b/src/rechunk_data/_rechunk.py @@ -90,6 +90,22 @@ def _rechunk_dataset(dset: xr.Dataset) -> Tuple[xr.Dataset, Dict[str, Any]]: return dset, encoding +def rechunk_dataset(dset: xr.Dataset) -> xr.Dataset: + """Rechunk an xarray dataset. + + Parameters + ---------- + dset: xarray.Dataset + Input dataset that is going to be rechunked + + Returns + ------- + xarray.Dataset: rechunked dataset + """ + data, _ = _rechunk_dataset(dset) + return data + + def rechunk_netcdf_file( input_path: os.PathLike, output_path: Optional[os.PathLike] = None, diff --git a/src/rechunk_data/tests/test_rechunk_netcdf.py b/src/rechunk_data/tests/test_rechunk_netcdf.py index 7ec0d8abcf116538c52c9c087646207d78b2477f..fd7da6c98cadb429d3bdc568fc43c28ee2117245 100644 --- a/src/rechunk_data/tests/test_rechunk_netcdf.py +++ b/src/rechunk_data/tests/test_rechunk_netcdf.py @@ -4,7 +4,8 @@ from tempfile import NamedTemporaryFile, TemporaryDirectory from pathlib import Path import dask -from rechunk_data import rechunk_netcdf_file, _save_dataset +from rechunk_data import rechunk_netcdf_file, rechunk_dataset +from rechunk_data._rechunk import _save_dataset def test_rechunk_data_dir_with_overwrite(data_dir: Path) -> None: @@ -38,6 +39,13 @@ def test_rechunk_single_data_file(data_file: Path) -> None: assert Path(temp_file.name).exists() +def test_rechunk_dataset(small_chunk_data) -> None: + """Test rechunking an xarray dataset.""" + with dask.config.set({"array.chunk-size": "1MiB"}): + new_data = rechunk_dataset(small_chunk_data) + assert list(new_data.data_vars) == list(small_chunk_data.data_vars) + + def test_wrong_or_format(small_chunk_data, caplog) -> None: """Testing wrong file format.""" caplog.clear() diff
--git a/src/rechunk_data/tests/test_rechunking.py b/src/rechunk_data/tests/test_rechunking.py index 6b77c31ac5fc42ad68581c80720e5088c454ba3f..a8b3d7c6d9791eec1b0c1536a7ec4eb1974a8223 100644 --- a/src/rechunk_data/tests/test_rechunking.py +++ b/src/rechunk_data/tests/test_rechunking.py @@ -2,7 +2,7 @@ import xarray as xr -from rechunk_data import _rechunk_dataset +from rechunk_data._rechunk import _rechunk_dataset def test_rechunking_small_data( diff --git a/src/rechunk_data/tests/test_search_files.py b/src/rechunk_data/tests/test_search_files.py index 4a5439ca88b8860c48b67246c166e57eaeb4b9ad..cff79c039c46fbbc7c61bbd8b470b37320d27965 100644 --- a/src/rechunk_data/tests/test_search_files.py +++ b/src/rechunk_data/tests/test_search_files.py @@ -1,7 +1,7 @@ """Unit tests for searching for files.""" from pathlib import Path -from rechunk_data import _search_for_nc_files +from rechunk_data._rechunk import _search_for_nc_files def test_search_directory(data_dir: Path) -> None: