Commit 536bfc3f authored by Martin Bergemann

Check make cf conventions configurable

parent 3d787859
1 merge request: !14 Check make cf conventions configurable
Pipeline #46070 failed
@@ -29,20 +29,22 @@ new_data = rechunk_dataset(dset)
```bash
rechunk-data --help
-usage: rechunk-data [-h] [--output OUTPUT] [--netcdf-engine {h5netcdf,netcdf4}] [-v] [-V] input
+usage: rechunk-data [-h] [--output OUTPUT] [--netcdf-engine {h5netcdf,netcdf4}] [--skip-cf-convention] [-v] [-V] input
 Rechunk input netcdf data to optimal chunk-size. approx. 126 MB per chunk
 positional arguments:
   input                 Input file/directory. If a directory is given all ``.nc`` files in all sub directories will be processed
-optional arguments:
+options:
   -h, --help            show this help message and exit
-  --output OUTPUT       Output file/directory of the chunked netcdf file(s). Note: If ``input`` is a directory output should be a
-                        directory. If None given (default) the ``input`` is overridden. (default: None)
+  --output OUTPUT       Output file/directory of the chunked netcdf file(s).
+                        Note: If ``input`` is a directory output should be a directory.
+                        If None given (default) the ``input`` is overridden. (default: None)
   --netcdf-engine {h5netcdf,netcdf4}
-                        The netcdf engine used to create the new netcdf file. (default: h5netcdf)
-  -v
+                        The netcdf engine used to create the new netcdf file. (default: netcdf4)
+  --skip-cf-convention  Do not assume data variables follow CF conventions. (default: False)
+  -v                    Increase verbosity (default: 0)
   -V, --version         show program's version number and exit
```
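The new flag can be exercised end to end from Python through the ``cli`` entry point changed further below; a minimal sketch, assuming the package imports as ``rechunk_data`` and using placeholder file names:

```python
# Minimal sketch: drive the console script programmatically.
# Assumptions: the package imports as ``rechunk_data`` and ``input.nc`` exists;
# the flags themselves are the ones documented in the help text above.
from rechunk_data import cli

cli(["--skip-cf-convention", "--output", "rechunked", "-v", "input.nc"])
```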
@@ -51,10 +51,17 @@ def parse_args(argv: Optional[List[str]]) -> argparse.Namespace:
         default="netcdf4",
         type=str,
     )
+    parser.add_argument(
+        "--skip-cf-convention",
+        help="Do not assume data variables follow CF conventions.",
+        action="store_true",
+        default=False,
+    )
     parser.add_argument(
         "-v",
         action="count",
         default=0,
+        help="Increase verbosity",
     )
     parser.add_argument(
         "-V",
@@ -63,11 +70,16 @@ def parse_args(argv: Optional[List[str]]) -> argparse.Namespace:
         version=f"%(prog)s {__version__}",
     )
     args = parser.parse_args(argv)
-    logger.setLevel(max(logging.ERROR - (10 + args.v * 10), 0))
+    logger.setLevel(max(logging.ERROR - (10 + args.v * 10), 10))
     return args


 def cli(argv: Optional[List[str]] = None) -> None:
     """Command line interface calling the rechunking method."""
     args = parse_args(argv)
-    rechunk_netcdf_file(args.input, args.output, engine=args.netcdf_engine)
+    rechunk_netcdf_file(
+        args.input,
+        args.output,
+        engine=args.netcdf_engine,
+        decode_cf=args.skip_cf_convention is False,
+    )
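Two details of the wiring above are easy to miss: ``decode_cf`` is simply the negation of the new flag, and the verbosity floor moves from 0 to 10 because a logger level of 0 is ``logging.NOTSET``, which would make the logger fall back to its parent's level instead of staying at DEBUG. A small sketch of the resulting mapping:

```python
# Sketch of the verbosity-to-log-level mapping used in parse_args above.
import logging

for v in range(4):  # number of ``-v`` flags given on the command line
    level = max(logging.ERROR - (10 + v * 10), 10)
    print(v, logging.getLevelName(level))
# 0 WARNING
# 1 INFO
# 2 DEBUG
# 3 DEBUG  (clamped; the old floor of 0 would have produced NOTSET here)
```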
@@ -92,13 +92,21 @@ def _rechunk_dataset(
         ) from error
     for data_var in dset.data_vars:
         var = str(data_var)
-        if not isinstance(dset[var].data, Array):
+        if (
+            not isinstance(dset[var].data, Array)
+            or "bnds" in var
+            or "rotated_pole" in var
+        ):
             logger.debug("Skipping rechunking variable %s", var)
             continue
         logger.debug("Rechunking variable %s", var)
         chunks: Dict[int, Optional[str]] = {}
         for i, dim in enumerate(map(str, dset[var].dims)):
-            if "lon" in dim.lower() or "lat" in dim.lower() or "bnds" in dim.lower():
+            if (
+                "lon" in dim.lower()
+                or "lat" in dim.lower()
+                or "bnds" in dim.lower()
+            ):
                 chunks[i] = None
             else:
                 chunks[i] = "auto"
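Read on its own, the dimension loop above is a small classifier: latitude, longitude and bounds axes keep their existing chunking, everything else is handed to dask's automatic sizing. A self-contained sketch with a hypothetical helper name (``_chunk_spec`` is not part of the module):

```python
# Hypothetical helper reproducing the dimension heuristic from the hunk above.
from typing import Dict, Optional, Tuple


def _chunk_spec(dims: Tuple[str, ...]) -> Dict[int, Optional[str]]:
    chunks: Dict[int, Optional[str]] = {}
    for i, dim in enumerate(dims):
        if "lon" in dim.lower() or "lat" in dim.lower() or "bnds" in dim.lower():
            chunks[i] = None  # leave horizontal/bounds axes untouched
        else:
            chunks[i] = "auto"  # let dask pick the size (~126 MB target)
    return chunks


print(_chunk_spec(("time", "lat", "lon")))  # {0: 'auto', 1: None, 2: None}
```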
@@ -116,9 +124,14 @@
         )
         logger.debug("Setting encoding of variable %s", var)
         encoding[data_var] = {
-            str(k): v for k, v in dset[var].encoding.items() if str(k) in _keywords
+            str(k): v
+            for k, v in dset[var].encoding.items()
+            if str(k) in _keywords
         }
-        if engine != "netcdf4" or encoding[data_var].get("contiguous", False) is False:
+        if (
+            engine != "netcdf4"
+            or encoding[data_var].get("contiguous", False) is False
+        ):
             encoding[data_var]["chunksizes"] = new_chunks
     return dset, encoding
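The ``contiguous`` guard exists because, with the ``netcdf4`` engine, a variable stored contiguously cannot carry explicit ``chunksizes``, while other engines always chunk. A minimal sketch with a hypothetical helper name (``_maybe_set_chunksizes`` is not part of the module):

```python
# Hypothetical helper isolating the chunksizes guard from the hunk above.
from typing import Any, Dict, Tuple


def _maybe_set_chunksizes(
    encoding: Dict[str, Any], engine: str, new_chunks: Tuple[int, ...]
) -> Dict[str, Any]:
    if engine != "netcdf4" or encoding.get("contiguous", False) is False:
        encoding["chunksizes"] = new_chunks
    return encoding


print(_maybe_set_chunksizes({}, "netcdf4", (1, 96, 192)))
# {'chunksizes': (1, 96, 192)}
print(_maybe_set_chunksizes({"contiguous": True}, "netcdf4", (1, 96, 192)))
# {'contiguous': True}  -- contiguous netcdf4 variables get no chunksizes
```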
@@ -146,6 +159,7 @@ def rechunk_dataset(
 def rechunk_netcdf_file(
     input_path: os.PathLike,
     output_path: Optional[os.PathLike] = None,
+    decode_cf: bool = True,
     engine: Literal["h5netcdf", "netcdf4"] = "netcdf4",
 ) -> None:
     """Rechunk netcdf files.
@@ -159,6 +173,9 @@
         Output file/directory of the chunked netcdf file(s). Note: If ``input``
         is a directory output should be a directory. If None given (default)
         the ``input`` is overridden.
+    decode_cf: bool, default: True
+        Whether to decode the data variables, assuming they were saved
+        according to CF conventions.
     engine: str, default: netcdf4
         The netcdf engine used to create the new netcdf file.
     """
@@ -175,7 +192,10 @@
         output_file.parent.mkdir(exist_ok=True, parents=True)
         try:
             with xr.open_mfdataset(
-                str(input_file), decode_cf=True, parallel=True
+                str(input_file),
+                parallel=True,
+                decode_cf=decode_cf,
+                inline_array=True,
             ) as nc_data:
                 new_data, encoding = _rechunk_dataset(nc_data, engine)
                 if encoding:
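This ``open_mfdataset`` call is where the new ``decode_cf`` argument reaches xarray. The same step can be reproduced interactively through the public ``rechunk_dataset`` helper referenced in the README hunk at the top; a sketch with an assumed import path and a placeholder file name:

```python
# Sketch: open without CF decoding and rechunk in memory.
import xarray as xr

from rechunk_data import rechunk_dataset  # import path is an assumption

with xr.open_mfdataset("input.nc", decode_cf=False, parallel=True) as dset:
    new_data = rechunk_dataset(dset)
```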