diff --git a/README.md b/README.md index 385782d918039f1e43d39bbac07b1e62ee8ed279..b75b5d68a901f69260102416628a46fbf95af2b1 100644 --- a/README.md +++ b/README.md @@ -29,20 +29,22 @@ new_data = rechunk_dataset(dset) ```bash rechunk-data --help -usage: rechunk-data [-h] [--output OUTPUT] [--netcdf-engine {h5netcdf,netcdf4}] [-v] [-V] input +usage: rechunk-data [-h] [--output OUTPUT] [--netcdf-engine {h5netcdf,netcdf4}] [--skip-cf-convention] [-v] [-V] input Rechunk input netcdf data to optimal chunk-size. approx. 126 MB per chunk positional arguments: input Input file/directory. If a directory is given all ``.nc`` files in all sub directories will be processed -optional arguments: +options: -h, --help show this help message and exit - --output OUTPUT Output file/directory of the chunked netcdf file(s). Note: If ``input`` is a directory output should be a - directory. If None given (default) the ``input`` is overidden. (default: None) + --output OUTPUT Output file/directory of the chunked netcdf file(s). + Note: If ``input`` is a directory output should be a directory. + If None given (default) the ``input`` is overidden. (default: None) --netcdf-engine {h5netcdf,netcdf4} - The netcdf engine used to create the new netcdf file. (default: h5netcdf) - -v + The netcdf engine used to create the new netcdf file. (default: netcdf4) + --skip-cf-convention Do not assume data variables follow CF conventions. 
(default: False) + -v Increase verbosity (default: 0) -V, --version show program's version number and exit ``` diff --git a/src/rechunk_data/__init__.py b/src/rechunk_data/__init__.py index 99f4c459fed44449b34a4f2da00708e462fe2b18..7f8dcdd8848c7063e1f527b061d66ea1988ca913 100644 --- a/src/rechunk_data/__init__.py +++ b/src/rechunk_data/__init__.py @@ -10,7 +10,7 @@ from ._rechunk import ( logger, ) -__version__ = "2301.0.0" +__version__ = "2309.0.0" PROGRAM_NAME = "rechunk-data" @@ -51,10 +51,17 @@ def parse_args(argv: Optional[List[str]]) -> argparse.Namespace: default="netcdf4", type=str, ) + parser.add_argument( + "--skip-cf-convention", + help="Do not assume data variables follow CF conventions.", + action="store_true", + default=False, + ) parser.add_argument( "-v", action="count", default=0, + help="Increase verbosity", ) parser.add_argument( "-V", @@ -63,11 +70,16 @@ def parse_args(argv: Optional[List[str]]) -> argparse.Namespace: version=f"%(prog)s {__version__}", ) args = parser.parse_args(argv) - logger.setLevel(max(logging.ERROR - (10 + args.v * 10), 0)) + logger.setLevel(max(logging.ERROR - (10 + args.v * 10), 10)) return args def cli(argv: Optional[List[str]] = None) -> None: """Command line interface calling the rechunking method.""" args = parse_args(argv) - rechunk_netcdf_file(args.input, args.output, engine=args.netcdf_engine) + rechunk_netcdf_file( + args.input, + args.output, + engine=args.netcdf_engine, + decode_cf=args.skip_cf_convention is False, + ) diff --git a/src/rechunk_data/_rechunk.py b/src/rechunk_data/_rechunk.py index 26022cdf0d32ba7302eccfe22e28ad9da2490291..311ed2508fb36b78904887a398e13d08c410904c 100644 --- a/src/rechunk_data/_rechunk.py +++ b/src/rechunk_data/_rechunk.py @@ -92,7 +92,11 @@ def _rechunk_dataset( ) from error for data_var in dset.data_vars: var = str(data_var) - if not isinstance(dset[var].data, Array): + if ( + not isinstance(dset[var].data, Array) + or "bnds" in var + or "rotated_pole" in var + ): 
logger.debug("Skipping rechunking variable %s", var) continue logger.debug("Rechunking variable %s", var) @@ -146,6 +150,7 @@ def rechunk_dataset( def rechunk_netcdf_file( input_path: os.PathLike, output_path: Optional[os.PathLike] = None, + decode_cf: bool = True, engine: Literal["h5netcdf", "netcdf4"] = "netcdf4", ) -> None: """Rechunk netcdf files. @@ -159,6 +164,9 @@ def rechunk_netcdf_file( Output file/directory of the chunked netcdf file(s). Note: If ``input`` is a directory output should be a directory. If None given (default) the ``input`` is overidden. + decode_cf: bool, default: True + Whether to decode these variables, assuming they were saved according + to CF conventions. engine: str, default: netcdf4 The netcdf engine used to create the new netcdf file. """ @@ -175,7 +183,9 @@ def rechunk_netcdf_file( output_file.parent.mkdir(exist_ok=True, parents=True) try: with xr.open_mfdataset( - str(input_file), decode_cf=True, parallel=True + str(input_file), + parallel=True, + decode_cf=decode_cf, ) as nc_data: new_data, encoding = _rechunk_dataset(nc_data, engine) if encoding: diff --git a/src/rechunk_data/tests/test_rechunk_netcdf.py b/src/rechunk_data/tests/test_rechunk_netcdf.py index 81d02ff2e1b0dd3f11e74df07ac54a1e93ac8fa9..3544d3d6138bbc1c9b4204c9b206ac88b41708b2 100644 --- a/src/rechunk_data/tests/test_rechunk_netcdf.py +++ b/src/rechunk_data/tests/test_rechunk_netcdf.py @@ -67,6 +67,5 @@ def test_wrong_or_format(small_chunk_data, caplog) -> None: def test_wrong_engine(small_chunk_data) -> None: - with pytest.raises(ValueError): rechunk_dataset(small_chunk_data, engine="foo")