diff --git a/README.md b/README.md index 385782d918039f1e43d39bbac07b1e62ee8ed279..b75b5d68a901f69260102416628a46fbf95af2b1 100644 --- a/README.md +++ b/README.md @@ -29,20 +29,22 @@ new_data = rechunk_dataset(dset) ```bash rechunk-data --help -usage: rechunk-data [-h] [--output OUTPUT] [--netcdf-engine {h5netcdf,netcdf4}] [-v] [-V] input +usage: rechunk-data [-h] [--output OUTPUT] [--netcdf-engine {h5netcdf,netcdf4}] [--skip-cf-convention] [-v] [-V] input Rechunk input netcdf data to optimal chunk-size. approx. 126 MB per chunk positional arguments: input Input file/directory. If a directory is given all ``.nc`` files in all sub directories will be processed -optional arguments: +options: -h, --help show this help message and exit - --output OUTPUT Output file/directory of the chunked netcdf file(s). Note: If ``input`` is a directory output should be a - directory. If None given (default) the ``input`` is overidden. (default: None) + --output OUTPUT Output file/directory of the chunked netcdf file(s). + Note: If ``input`` is a directory output should be a directory. + If None given (default) the ``input`` is overidden. (default: None) --netcdf-engine {h5netcdf,netcdf4} - The netcdf engine used to create the new netcdf file. (default: h5netcdf) - -v + The netcdf engine used to create the new netcdf file. (default: netcdf4) + --skip-cf-convention Do not assume data variables follow CF conventions. 
(default: False) + -v Increase verbosity (default: 0) + -V, --version show program's version number and exit ``` diff --git a/src/rechunk_data/__init__.py b/src/rechunk_data/__init__.py index 99f4c459fed44449b34a4f2da00708e462fe2b18..2d6cc7b9a2f0fc4ed6ea82f3f9fff9f2c6b6d782 100644 --- a/src/rechunk_data/__init__.py +++ b/src/rechunk_data/__init__.py @@ -51,10 +51,17 @@ def parse_args(argv: Optional[List[str]]) -> argparse.Namespace: default="netcdf4", type=str, ) + parser.add_argument( + "--skip-cf-convention", + help="Do not assume data variables follow CF conventions.", + action="store_true", + default=False, + ) parser.add_argument( "-v", action="count", default=0, + help="Increase verbosity", ) parser.add_argument( "-V", @@ -63,11 +70,16 @@ def parse_args(argv: Optional[List[str]]) -> argparse.Namespace: version=f"%(prog)s {__version__}", ) args = parser.parse_args(argv) - logger.setLevel(max(logging.ERROR - (10 + args.v * 10), 0)) + logger.setLevel(max(logging.ERROR - (10 + args.v * 10), 10)) return args def cli(argv: Optional[List[str]] = None) -> None: """Command line interface calling the rechunking method.""" args = parse_args(argv) - rechunk_netcdf_file(args.input, args.output, engine=args.netcdf_engine) + rechunk_netcdf_file( + args.input, + args.output, + engine=args.netcdf_engine, + decode_cf=args.skip_cf_convention is False, + ) diff --git a/src/rechunk_data/_rechunk.py b/src/rechunk_data/_rechunk.py index 26022cdf0d32ba7302eccfe22e28ad9da2490291..8d104f5908e38bd31f2d4bb0979f3bf33630433e 100644 --- a/src/rechunk_data/_rechunk.py +++ b/src/rechunk_data/_rechunk.py @@ -92,13 +92,21 @@ def _rechunk_dataset( ) from error for data_var in dset.data_vars: var = str(data_var) - if not isinstance(dset[var].data, Array): + if ( + not isinstance(dset[var].data, Array) + or "bnds" in var + or "rotated_pole" in var + ): logger.debug("Skipping rechunking variable %s", var) continue logger.debug("Rechunking variable %s", var) chunks: Dict[int, Optional[str]] = 
{} for i, dim in enumerate(map(str, dset[var].dims)): - if "lon" in dim.lower() or "lat" in dim.lower() or "bnds" in dim.lower(): + if ( + "lon" in dim.lower() + or "lat" in dim.lower() + or "bnds" in dim.lower() + ): chunks[i] = None else: chunks[i] = "auto" @@ -116,9 +124,14 @@ def _rechunk_dataset( ) logger.debug("Settings encoding of variable %s", var) encoding[data_var] = { - str(k): v for k, v in dset[var].encoding.items() if str(k) in _keywords + str(k): v + for k, v in dset[var].encoding.items() + if str(k) in _keywords } - if engine != "netcdf4" or encoding[data_var].get("contiguous", False) is False: + if ( + engine != "netcdf4" + or encoding[data_var].get("contiguous", False) is False + ): encoding[data_var]["chunksizes"] = new_chunks return dset, encoding @@ -146,6 +159,7 @@ def rechunk_dataset( def rechunk_netcdf_file( input_path: os.PathLike, output_path: Optional[os.PathLike] = None, + decode_cf: bool = True, engine: Literal["h5netcdf", "netcdf4"] = "netcdf4", ) -> None: """Rechunk netcdf files. @@ -159,6 +173,9 @@ def rechunk_netcdf_file( Output file/directory of the chunked netcdf file(s). Note: If ``input`` is a directory output should be a directory. If None given (default) the ``input`` is overidden. + decode_cf: bool, default: True + Whether to decode these variables, assuming they were saved according + to CF conventions. engine: str, default: netcdf4 The netcdf engine used to create the new netcdf file. """ @@ -175,7 +192,10 @@ def rechunk_netcdf_file( output_file.parent.mkdir(exist_ok=True, parents=True) try: with xr.open_mfdataset( - str(input_file), decode_cf=True, parallel=True + str(input_file), + parallel=True, + decode_cf=decode_cf, + inline_array=True, ) as nc_data: new_data, encoding = _rechunk_dataset(nc_data, engine) if encoding: