Skip to content
Snippets Groups Projects
Commit 536bfc3f authored by Martin Bergemann's avatar Martin Bergemann :speech_balloon:
Browse files

Check make cf conventions configurable

parent 3d787859
No related branches found
No related tags found
1 merge request!14Check make cf conventions configurable
Pipeline #46070 failed
...@@ -29,20 +29,22 @@ new_data = rechunk_dataset(dset) ...@@ -29,20 +29,22 @@ new_data = rechunk_dataset(dset)
```bash ```bash
rechunk-data --help rechunk-data --help
usage: rechunk-data [-h] [--output OUTPUT] [--netcdf-engine {h5netcdf,netcdf4}] [-v] [-V] input usage: rechunk-data [-h] [--output OUTPUT] [--netcdf-engine {h5netcdf,netcdf4}] [--skip-cf-convention] [-v] [-V] input
Rechunk input netcdf data to optimal chunk-size. approx. 126 MB per chunk Rechunk input netcdf data to optimal chunk-size. approx. 126 MB per chunk
positional arguments: positional arguments:
input Input file/directory. If a directory is given all ``.nc`` files in all sub directories will be processed input Input file/directory. If a directory is given all ``.nc`` files in all sub directories will be processed
optional arguments: options:
-h, --help show this help message and exit -h, --help show this help message and exit
--output OUTPUT Output file/directory of the chunked netcdf file(s). Note: If ``input`` is a directory output should be a --output OUTPUT Output file/directory of the chunked netcdf file(s).
directory. If None given (default) the ``input`` is overidden. (default: None) Note: If ``input`` is a directory output should be a directory.
If None given (default) the ``input`` is overidden. (default: None)
--netcdf-engine {h5netcdf,netcdf4} --netcdf-engine {h5netcdf,netcdf4}
The netcdf engine used to create the new netcdf file. (default: h5netcdf) The netcdf engine used to create the new netcdf file. (default: netcdf4)
-v --skip-cf-convention Do not assume assume data variables follow CF conventions. (default: False)
-v Increase verbosity (default: 0)
-V, --version show program's version number and exit -V, --version show program's version number and exit
``` ```
......
...@@ -51,10 +51,17 @@ def parse_args(argv: Optional[List[str]]) -> argparse.Namespace: ...@@ -51,10 +51,17 @@ def parse_args(argv: Optional[List[str]]) -> argparse.Namespace:
default="netcdf4", default="netcdf4",
type=str, type=str,
) )
parser.add_argument(
"--skip-cf-convention",
help="Do not assume assume data variables follow CF conventions.",
action="store_true",
default=False,
)
parser.add_argument( parser.add_argument(
"-v", "-v",
action="count", action="count",
default=0, default=0,
help="Increase verbosity",
) )
parser.add_argument( parser.add_argument(
"-V", "-V",
...@@ -63,11 +70,16 @@ def parse_args(argv: Optional[List[str]]) -> argparse.Namespace: ...@@ -63,11 +70,16 @@ def parse_args(argv: Optional[List[str]]) -> argparse.Namespace:
version=f"%(prog)s {__version__}", version=f"%(prog)s {__version__}",
) )
args = parser.parse_args(argv) args = parser.parse_args(argv)
logger.setLevel(max(logging.ERROR - (10 + args.v * 10), 0)) logger.setLevel(max(logging.ERROR - (10 + args.v * 10), 10))
return args return args
def cli(argv: Optional[List[str]] = None) -> None: def cli(argv: Optional[List[str]] = None) -> None:
"""Command line interface calling the rechunking method.""" """Command line interface calling the rechunking method."""
args = parse_args(argv) args = parse_args(argv)
rechunk_netcdf_file(args.input, args.output, engine=args.netcdf_engine) rechunk_netcdf_file(
args.input,
args.output,
engine=args.netcdf_engine,
decode_cf=args.skip_cf_convention is False,
)
...@@ -92,13 +92,21 @@ def _rechunk_dataset( ...@@ -92,13 +92,21 @@ def _rechunk_dataset(
) from error ) from error
for data_var in dset.data_vars: for data_var in dset.data_vars:
var = str(data_var) var = str(data_var)
if not isinstance(dset[var].data, Array): if (
not isinstance(dset[var].data, Array)
or "bnds" in var
or "rotated_pole" in var
):
logger.debug("Skipping rechunking variable %s", var) logger.debug("Skipping rechunking variable %s", var)
continue continue
logger.debug("Rechunking variable %s", var) logger.debug("Rechunking variable %s", var)
chunks: Dict[int, Optional[str]] = {} chunks: Dict[int, Optional[str]] = {}
for i, dim in enumerate(map(str, dset[var].dims)): for i, dim in enumerate(map(str, dset[var].dims)):
if "lon" in dim.lower() or "lat" in dim.lower() or "bnds" in dim.lower(): if (
"lon" in dim.lower()
or "lat" in dim.lower()
or "bnds" in dim.lower()
):
chunks[i] = None chunks[i] = None
else: else:
chunks[i] = "auto" chunks[i] = "auto"
...@@ -116,9 +124,14 @@ def _rechunk_dataset( ...@@ -116,9 +124,14 @@ def _rechunk_dataset(
) )
logger.debug("Settings encoding of variable %s", var) logger.debug("Settings encoding of variable %s", var)
encoding[data_var] = { encoding[data_var] = {
str(k): v for k, v in dset[var].encoding.items() if str(k) in _keywords str(k): v
for k, v in dset[var].encoding.items()
if str(k) in _keywords
} }
if engine != "netcdf4" or encoding[data_var].get("contiguous", False) is False: if (
engine != "netcdf4"
or encoding[data_var].get("contiguous", False) is False
):
encoding[data_var]["chunksizes"] = new_chunks encoding[data_var]["chunksizes"] = new_chunks
return dset, encoding return dset, encoding
...@@ -146,6 +159,7 @@ def rechunk_dataset( ...@@ -146,6 +159,7 @@ def rechunk_dataset(
def rechunk_netcdf_file( def rechunk_netcdf_file(
input_path: os.PathLike, input_path: os.PathLike,
output_path: Optional[os.PathLike] = None, output_path: Optional[os.PathLike] = None,
decode_cf: bool = True,
engine: Literal["h5netcdf", "netcdf4"] = "netcdf4", engine: Literal["h5netcdf", "netcdf4"] = "netcdf4",
) -> None: ) -> None:
"""Rechunk netcdf files. """Rechunk netcdf files.
...@@ -159,6 +173,9 @@ def rechunk_netcdf_file( ...@@ -159,6 +173,9 @@ def rechunk_netcdf_file(
Output file/directory of the chunked netcdf file(s). Note: If ``input`` Output file/directory of the chunked netcdf file(s). Note: If ``input``
is a directory output should be a directory. If None given (default) is a directory output should be a directory. If None given (default)
the ``input`` is overridden. the ``input`` is overridden.
decode_cf: bool, default: True
Whether to decode the variables of the input file(s), assuming they were
saved according to CF conventions.
engine: str, default: netcdf4 engine: str, default: netcdf4
The netcdf engine used to create the new netcdf file. The netcdf engine used to create the new netcdf file.
""" """
...@@ -175,7 +192,10 @@ def rechunk_netcdf_file( ...@@ -175,7 +192,10 @@ def rechunk_netcdf_file(
output_file.parent.mkdir(exist_ok=True, parents=True) output_file.parent.mkdir(exist_ok=True, parents=True)
try: try:
with xr.open_mfdataset( with xr.open_mfdataset(
str(input_file), decode_cf=True, parallel=True str(input_file),
parallel=True,
decode_cf=decode_cf,
inline_array=True,
) as nc_data: ) as nc_data:
new_data, encoding = _rechunk_dataset(nc_data, engine) new_data, encoding = _rechunk_dataset(nc_data, engine)
if encoding: if encoding:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment