diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a5970e5b9d668aedfe18f97009be2c7efebe72d0..6f41b7d29bebd84c55abd831327cf212f0d331f9 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -80,7 +80,7 @@ hiopy-levante:
   script:
     - |
       module load git
-      /sw/spack-levante/python-3.9.9-fwvsvi/bin/python -m venv venv
+      /work/bk1414/m301120/sw-spack-common/python-3.11.2-sk474k/bin/python -m venv venv
       . venv/bin/activate
       ICON_DIR=`pwd -P`/icon
       (
diff --git a/README.md b/README.md
index 99ab095e191b6b032f7885ebf67f31c62c33f986..49183eb999848b6343a976c874a6afa21fe706bf 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ python -m pip install git+https://gitlab.dkrz.de/nils/coyote.git
 ## Installation with ICON on levante
 
 ```bash
-/sw/spack-levante/python-3.9.9-fwvsvi/bin/python -m venv ./venv --prompt icon
+/home/m/m301120/sw/spack-levante/python-3.11.2-sk474k/bin/python -m venv ./venv --prompt icon
 . ./venv/bin/activate
 git clone --recursive git@gitlab.dkrz.de:icon/icon.git icon
 pushd icon
diff --git a/apps/hiopy/_zarr_utils.py b/apps/hiopy/_zarr_utils.py
index 39c6cde8986a9a865809bcca53a46e3a6481e03e..401027436abd869f1748dc4cf61d0819933697b6 100644
--- a/apps/hiopy/_zarr_utils.py
+++ b/apps/hiopy/_zarr_utils.py
@@ -2,9 +2,13 @@ import zarr
 
 
 def get_var_group(v):
-    root = zarr.Group(store=v.store)
-    last_slash_idx = v.name.rindex("/")
-    return root[v.name[:last_slash_idx]]
+    if not hasattr(v, "root"):
+        v.root = zarr.open(v.store)
+    group_path = "/".join(v.path.split("/")[:-1])
+    if group_path == "":
+        return v.root
+    else:
+        return v.root[group_path]
 
 
 def get_time_axis(v):
@@ -18,3 +22,10 @@ def get_time_axis(v):
         return time_axis
     else:
         return None
+
+
+def get_var_parent_group(v):
+    var_group = get_var_group(v)
+    parent_var_path = var_group.attrs["hiopy::parent"]
+    parent_group = v.root[parent_var_path]
+    return parent_group
diff --git a/apps/hiopy/configure/configure.py b/apps/hiopy/configure/configure.py
index d2902c771b7a4a13aeae6cb41770788812343af8..de186ac9745c272882d938e7bee714c46b571a74 100755
--- a/apps/hiopy/configure/configure.py
+++ b/apps/hiopy/configure/configure.py
@@ -31,7 +31,8 @@ def _collect_groups(dataset):
 
 def add_height(dataset, name, n):
     for g in _collect_groups(dataset):
-        height = g.create_dataset(name, data=np.arange(n))
+        height = g.create_array(name, fill_value=None, dtype=np.int64, shape=(n,))
+        height[:] = np.arange(n)
         height.attrs["_ARRAY_DIMENSIONS"] = [name]
         height.attrs["axis"] = "Z"
         height.attrs["long_name"] = "generalized_height"
@@ -51,18 +52,19 @@ def add_variable(
     frac_mask=None,
     yac_name=None,
     attributes=None,
+    chunks_per_shard=None,
     **kwargs,
 ):
     for g in _collect_groups(dataset):
         taxis_tuple = tuple() if taxis is None else (taxis,)
         ntime = tuple() if taxis is None else (g[taxis].shape[0],)
-        grid_mapping_name = g.crs.attrs["grid_mapping_name"]
+        grid_mapping_name = g["crs"].attrs["grid_mapping_name"]
         spatial_attr = "point" if (grid_mapping_name == "point_cloud") else "cell"
         crs_len = 0
         if grid_mapping_name == "healpix":
-            crs_len = healpy.nside2npix(g.crs.attrs["healpix_nside"])
+            crs_len = healpy.nside2npix(g["crs"].attrs["healpix_nside"])
         elif grid_mapping_name == "point_cloud":
-            lon_coord, lat_coord = g.crs.attrs["coordinates"].split(" ")
+            lon_coord, lat_coord = g["crs"].attrs["coordinates"].split(" ")
             assert lon_coord in g and lat_coord in g
             assert g[lon_coord].shape[0] == g[lat_coord].shape[0]
             crs_len = g[lat_coord].shape[0]
@@ -70,18 +72,32 @@ def add_variable(
             raise Exception("Unknown crs.")
 
         _attributes = attributes or {}
+
         if zaxis is None:
            shape = (*ntime, crs_len)
-            _chunk_shape = np.minimum(chunk_shape, shape) if chunk_shape is not None else None
             _attributes["_ARRAY_DIMENSIONS"] = (*taxis_tuple, spatial_attr)
         else:
             nheight = g[zaxis].shape[0]
             shape = (*ntime, nheight, crs_len)
-            _chunk_shape = np.minimum(chunk_shape, shape) if chunk_shape is not None else None
             _attributes["_ARRAY_DIMENSIONS"] = (*taxis_tuple, zaxis, spatial_attr)
+
         _attributes["grid_mapping"] = "crs"
-        v = g.create_dataset(
-            name, shape=shape, dtype=np.float32, fill_value=np.nan, chunks=_chunk_shape, **kwargs
+        _chunk_shape = "auto"
+        _shard_shape = None
+
+        if chunk_shape is not None:
+            _chunk_shape = tuple(np.minimum(chunk_shape, shape))
+            if chunks_per_shard is not None:
+                _shard_shape = tuple(i * chunks_per_shard for i in _chunk_shape)
+
+        v = g.create_array(
+            name,
+            shape=shape,
+            dtype=np.float32,
+            fill_value=np.nan,
+            chunks=_chunk_shape,
+            shards=_shard_shape,
+            **kwargs,
         )
 
         # TODO: Use a generic name instead of hiopy such that it represents arbitrary grid too
diff --git a/apps/hiopy/configure/create_dataset.py b/apps/hiopy/configure/create_dataset.py
index 36e336e860beb49bb55003c60c3d395ba3f897ca..dd84adf6711165449852b0cab25d1c42056102ac 100644
--- a/apps/hiopy/configure/create_dataset.py
+++ b/apps/hiopy/configure/create_dataset.py
@@ -1,41 +1,112 @@
 #!/usr/bin/env python3
 
 import numpy as np
+import zarr
 
 
-def add_coordinates(dataset, coordinates, coord_names=("lon", "lat")):
-    # TODO: update create_dataset() calls to adhrer to zarr 3.0 recommendations
-    crs = dataset.create_dataset("crs", data=np.array([np.nan], dtype=np.float32))
+def add_coordinates(
+    dataset: zarr.Group,
+    coordinates: list[tuple[float, float]],
+    coord_names: tuple[str, str] = ("lon", "lat"),
+) -> None:
+    """
+    Add longitude and latitude coordinates to the specified Zarr dataset.
+
+    Parameters
+    ----------
+    dataset : zarr.Group
+        The Zarr group where the coordinates will be added.
+    coordinates : list[tuple[float, float]]
+        A list of tuples containing the (latitude, longitude) values for each point.
+    coord_names : tuple[str, str], optional
+        The names to use for the longitude and latitude arrays. Defaults to ("lon", "lat").
+
+    Notes
+    -----
+    This function creates two new arrays in the dataset: `coord_names[0]` for longitude and
+    `coord_names[1]` for latitude. The `crs` array is also created, with its attributes set
+    to indicate that it's a "point_cloud" coordinate reference system.
+
+    Example: add_coordinates(dataset, [(10.2, 45.3), (20.4, 50.5)]), where each
+    pair is (lat, lon).
+    """
+
+    crs = dataset.create_array(name="crs", dtype=np.float32, shape=(1,))
     crs.attrs["_ARRAY_DIMENSIONS"] = ("crs",)
     crs.attrs["grid_mapping_name"] = "point_cloud"
     crs.attrs["coordinates"] = f"{coord_names[0]} {coord_names[1]}"
 
     lat_list, lon_list = zip(*coordinates)
-    lon = dataset.create_dataset(coord_names[0], data=np.array(lon_list))
+    lon = dataset.create_array(name=coord_names[0], dtype=np.float32, shape=(len(coordinates),))
+    lon[:] = np.array(lon_list)
     lon.attrs["_ARRAY_DIMENSIONS"] = [coord_names[0]]
     lon.attrs["long_name"] = "longitude"
     lon.attrs["units"] = "degree"
     lon.attrs["standard_name"] = "grid_longitude"
 
-    lat = dataset.create_dataset(coord_names[1], data=np.array(lat_list))
+    lat = dataset.create_array(name=coord_names[1], dtype=np.float32, shape=(len(coordinates),))
+    lat[:] = np.array(lat_list)
     lat.attrs["_ARRAY_DIMENSIONS"] = [coord_names[1]]
     lat.attrs["long_name"] = "latitude"
     lat.attrs["units"] = "degree"
     lat.attrs["standard_name"] = "grid_latitude"
 
 
-def add_healpix_grid(dataset, order):
-    crs = dataset.create_dataset("crs", data=np.array([np.nan], dtype=np.float32), shape=(1,))
+def add_healpix_grid(dataset: zarr.Group, order: int):
+    """
+    Add a HEALPix grid to the specified Zarr dataset.
+
+    Parameters
+    ----------
+    dataset : zarr.Group
+        The Zarr group in which the HEALPix `crs` array will be created.
+    order : int
+        The order of the HEALPix grid; the corresponding NSIDE is 2**order.
+
+    Notes
+    -----
+    The HEALPix grid is stored as a single array named "crs" in the dataset, with
+    the healpix_nside and healpix_order attributes set accordingly.
+    No values are added to it.
+    """
+    crs = dataset.create_array(name="crs", dtype=np.float32, shape=(1,))
     crs.attrs["_ARRAY_DIMENSIONS"] = ("crs",)
     crs.attrs["grid_mapping_name"] = "healpix"
     crs.attrs["healpix_nside"] = 2**order
     crs.attrs["healpix_order"] = "nest"
 
 
-def add_healpix_hierarchy(dataset, order, prefix="healpix_"):
-    for o in range(order + 1):
-        zg = dataset.create_group(f"{prefix}{o}")
+def add_healpix_hierarchy(
+    dataset: zarr.Group,
+    order: int,
+    nr_of_coarsenings: int = 4,
+    prefix: str = "healpix_",
+) -> None:
+    """
+    Add a hierarchical structure to the specified Zarr dataset for a given HEALPix order.
+
+    This function creates a group hierarchy in which each level represents a specific
+    resolution of the HEALPix grid. The `add_healpix_grid` function is used to create the
+    actual grid arrays within each group.
+
+    Parameters
+    ----------
+    dataset : zarr.Group
+        The Zarr group where the hierarchy will be added.
+    order : int
+        The finest (maximum) HEALPix order in the hierarchy.
+    nr_of_coarsenings : int
+        The number of hierarchy levels to create, from `order` down to
+        `order - nr_of_coarsenings + 1`.
+    prefix : str, optional
+        The prefix to use for naming each group. Defaults to "healpix_".
+
+    Notes
+    -----
+    The `hiopy::parent` attribute links each group to its parent (the next finer
+    level) in the hierarchy, allowing for efficient navigation.
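+
+    Examples
+    --------
+    A minimal sketch (assuming an in-memory Zarr store; not part of the original code):
+
+    >>> import zarr
+    >>> root = zarr.open_group(store=zarr.storage.MemoryStore(), mode="w")
+    >>> add_healpix_hierarchy(root, order=7, nr_of_coarsenings=3)
+    >>> sorted(root.group_keys())
+    ['healpix_5', 'healpix_6', 'healpix_7']
+    >>> root["healpix_6"].attrs["hiopy::parent"]
+    'healpix_7'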
+ """ + for o in range(order, order - nr_of_coarsenings, -1): + zg = dataset.create_group(name=f"{prefix}{o}") add_healpix_grid(zg, o) if o < order: zg.attrs["hiopy::parent"] = f"{prefix}{o+1}" diff --git a/apps/hiopy/tests/CMakeLists.txt b/apps/hiopy/tests/CMakeLists.txt index 0d1eb8cb232047f2b35181d9a539370d016c7750..6f96015b69f4fc71c99f513153dc457df019bf22 100644 --- a/apps/hiopy/tests/CMakeLists.txt +++ b/apps/hiopy/tests/CMakeLists.txt @@ -31,6 +31,7 @@ add_test( NAME hiopy.check_hierarchy COMMAND "/usr/bin/env" "python3" "${CMAKE_CURRENT_SOURCE_DIR}/check_hierarchy.py" "simple_dataset.zarr" ) + set_tests_properties(hiopy.check_hierarchy PROPERTIES FIXTURES_REQUIRED simple_dataset.zarr ENVIRONMENT "PYTHONPATH=${CMAKE_BINARY_DIR}/python:${CMAKE_SOURCE_DIR}/apps:$ENV{PYTHONPATH}" diff --git a/apps/hiopy/tests/check_hierarchy.py b/apps/hiopy/tests/check_hierarchy.py index 4b6905af1799870eeaac243acb73630036023368..fbfbcb365cfd8827376e7fb007596cce17a2dcd4 100755 --- a/apps/hiopy/tests/check_hierarchy.py +++ b/apps/hiopy/tests/check_hierarchy.py @@ -21,7 +21,9 @@ def check_interpolation(source_var, target_var): def check_metadata(var): - assert "hiopy::copy_metadata" not in var.attrs + assert ( + "hiopy::copy_metadata" not in var.attrs + ), f"Attributes of {var.name} {var.attrs.asdict()} is not cleaned" def _collect_groups(dataset, parent=None): diff --git a/apps/hiopy/worker.py b/apps/hiopy/worker.py index a9f4e43e8076ad755f284d98309a30519970c8bd..ebf7419141e40d0b7c5de03863e0d2bb4902ef27 100755 --- a/apps/hiopy/worker.py +++ b/apps/hiopy/worker.py @@ -1,12 +1,4 @@ #!/usr/bin/env python3 - -import json -import logging -from argparse import ArgumentParser -from itertools import chain, groupby - -import numpy as np -import zarr from coyote import ( Coyote, ensure_enddef, @@ -21,9 +13,17 @@ from coyote import ( from ._data_handler import DataHandler from ._distribute_work import distribute_work from ._grids import def_grid, grid_id -from ._zarr_utils import get_time_axis, get_var_group +from ._zarr_utils import get_var_group, get_var_parent_group, get_time_axis from .loco import LocoServer +from argparse import ArgumentParser +from itertools import chain, groupby + +import json +import logging +import numpy as np +import zarr + def main(): parser = ArgumentParser() @@ -67,7 +67,7 @@ def main(): assert len(args.datasets) == 1, "Loco only supports reading from 1 dataset" loco_store = zarr.MemoryStore() zarr.copy_store(args.datasets[0].store, loco_store) - zarr.convenience.consolidate_metadata(loco_store) + zarr.consolidate_metadata(loco_store) loco_server = LocoServer(loco_store, args.loco_host, args.loco_port) args.datasets = [zarr.open(store=loco_store)] @@ -79,15 +79,15 @@ def main(): ) # find all variables considered to be written in the input datasets: - def collect_data_vars(group): + def collect_data_vars(group, root): for _name, item in group.arrays(): if "hiopy::enable" in item.attrs and item.attrs["hiopy::enable"]: + item.root = root yield item for _name, item in group.groups(): - item.parent = group - yield from collect_data_vars(item) + yield from collect_data_vars(item, root) - all_data_vars = list(chain(*[collect_data_vars(z) for z in args.datasets])) + all_data_vars = list(chain(*[collect_data_vars(z, z) for z in args.datasets])) logging.info(f"Found {len(all_data_vars)} variables") if len(all_data_vars) == 0: raise RuntimeError("No variables found by the hiopy worker.") @@ -115,13 +115,13 @@ def main(): gid: Coyote(f"{args.process_group}_{gid}") for gid, data_vars, chunk_slice in 
     }
 
+    data_handlers = []
+
     for gid, data_vars, chunk_slice in my_data_vars:
         coyote = coyote_instances[gid]
         # all vars in data_vars define the same grid
         def_grid(coyote, data_vars[0], chunk_slice)
-        data_handlers = []
-
         for v in data_vars:
             # compute timestep
             var_group = get_var_group(v)
@@ -131,10 +131,11 @@ def main():
             # compute time start index
             t0 = (
                 np.datetime64(start_datetime())
-                - np.datetime64(var_group.time.attrs["units"][len("seconds since ") :])
+                - np.datetime64(var_group["time"].attrs["units"][len("seconds since ") :])
             ) / np.timedelta64(1, "s") + dt
-            t0_idx = np.searchsorted(var_group.time, t0)
-            assert var_group.time[t0_idx] == t0, "start_datetime not found in time axis"
+            t0_idx = np.searchsorted(var_group["time"], t0)
+            assert var_group["time"][t0_idx] == t0, f"start_datetime {t0} not found in time " \
+                f"axis; index {t0_idx} has value {var_group['time'][t0_idx]}"
 
             # see YAC_REDUCTION_TIME_NONE etc. (TODO: pass constants through coyote)
             time_methods2yac = {"point": 0, "sum": 1, "mean": 2, "min": 3, "max": 4}
@@ -145,8 +146,8 @@ def main():
                 src_comp, src_grid = v.attrs["hiopy::yac_source"]
             else:
                 assert "hiopy::parent" in var_group.attrs, f"No source for field {v.name} specified"
-                parent_var_path = var_group.attrs["hiopy::parent"] + "/" + src_name
-                source_var = zarr.Group(store=v.store)[parent_var_path]
+                parent_group = get_var_parent_group(v)
+                source_var = parent_group[v.basename]
                 src_name = source_var.name
                 source_var_gid = grid_id(source_var)
                 src_comp = src_grid = f"{args.process_group}_{source_var_gid}"
@@ -186,14 +187,13 @@ def main():
     )
 
     def get_source_triple(v):
-        group = get_var_group(v)
-        src_field = v.attrs.get("hiopy::src_name", v.basename)
-        if "hiopy::parent" in group.attrs:
-            parent_var_path = group.attrs["hiopy::parent"] + "/" + src_field
-            source_var = zarr.Group(store=v.store)[parent_var_path]
-            return get_source_triple(source_var)
+        var_group = get_var_group(v)
+        if "hiopy::parent" in var_group.attrs:
+            pgroup = get_var_parent_group(v)
+            return get_source_triple(pgroup[v.basename])
         elif "hiopy::yac_source" in v.attrs:
             src_comp, src_grid = v.attrs["hiopy::yac_source"]
+            src_field = v.attrs.get("hiopy::src_name", v.basename)
             return src_comp, src_grid, src_field
         else:
             raise RuntimeError("Invalid attributes: " + str(dict(v.attrs)))
@@ -204,13 +204,16 @@ def main():
         if "hiopy::copy_metadata" in v.attrs:
             comp, grid, field = get_source_triple(v)
             md_str = get_field_metadata(comp, grid, field)
-            print(md_str)
             metadata = json.loads(md_str)
-            print(metadata)
+            logging.debug(f"Found metadata for {field}: {metadata}")
             for key, value in metadata["cf"].items():
                 v.attrs[key] = value
             del v.attrs["hiopy::copy_metadata"]  # copy only once
 
+    # re-consolidate the newly updated metadata
+    for ds in args.datasets:
+        zarr.consolidate_metadata(ds.store)
+
     run()
 
     for dh in data_handlers:
diff --git a/ci/Dockerfile b/ci/Dockerfile
index fc086792c5a6a6fea56309dcfce30967ae21dd08..22bb9f70b31e0845e165d1b8bc0eb5aeb2ba2ecb 100644
--- a/ci/Dockerfile
+++ b/ci/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.9-slim
+FROM python:3.11-slim
 LABEL maintainer="dreier@dkrz.de"
 
 #Remove some warnings
@@ -23,7 +23,7 @@ RUN apt-get update \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 
-RUN pip install --no-cache-dir numpy mpi4py matplotlib cython healpy aiohttp zarr
+RUN pip install --no-cache-dir numpy mpi4py matplotlib cython healpy aiohttp zarr rich
 
 # install yaxt
 RUN curl -s -L https://gitlab.dkrz.de/dkrz-sw/yaxt/-/archive/release-0.11.3/yaxt-release-0.11.3.tar.gz | \
diff --git a/pyproject.toml b/pyproject.toml
index a8f24220ff29cc1e19262aac7925bc8d980dc897..2d27f6422bd65a755d1743936043ae9f6d297aa8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,10 +8,11 @@ version = "0.0.1"
 dependencies = [
     "numpy",
     "pybind11",
-    "zarr<3.0",
+    "zarr>=3.0.6",
     "healpy",
     "aiohttp",
-    "regex_engine"
+    "regex_engine",
+    "rich"
 ]
 
 [tool.scikit-build]
@@ -43,7 +44,5 @@ lint.select = [
     "B",
     # flake8-simplify
     "SIM",
-    # isort
-    "I",
 ]
 line-length = 100 # accomodate any libc++ motivated requirements of over 80 characters
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 01b1586255b459bf7135b583c6bc9a5ca9114c54..f9301910976fc6ef9f87180fa06405a183e73785 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -3,5 +3,6 @@ wheel
 ruff
 pre-commit
 healpy
-zarr<3.0
+zarr>=3.0.6
 aiohttp
+rich
diff --git a/scripts/setup_devenv/build_dependencies.sh b/scripts/setup_devenv/build_dependencies.sh
index a6b50a8173578dadd9348309b2309c94587d3c92..33a60222ba012e719f1aeec167c21f2d9a47f4ed 100755
--- a/scripts/setup_devenv/build_dependencies.sh
+++ b/scripts/setup_devenv/build_dependencies.sh
@@ -220,9 +220,9 @@ function install_yac {
 
 function install_all {
-    echo "========================"
-    echo "== building HEALPIX & Co"
-    check_and_install healpix_cxx
+    # echo "========================"
+    # echo "== building HEALPIX & Co"
+    # check_and_install healpix_cxx
     echo "========================"
     echo "== building YAC & Co"
     check_and_install yac
diff --git a/scripts/setup_devenv/levante_omp412.sh b/scripts/setup_devenv/levante_omp412.sh
index 0a10e26a61fc711c9718b63e1f6da88891b24806..92054b2280297b0721956f254ba482a59390b96f 100755
--- a/scripts/setup_devenv/levante_omp412.sh
+++ b/scripts/setup_devenv/levante_omp412.sh
@@ -36,7 +36,7 @@ INSTALL_PATH=$BUILD_PATH/install
 mkdir -p "$BUILD_PATH"
 pushd "$BUILD_PATH"
 
-eval `spack load --sh python@3.10.10%gcc@=11.2.0`
+eval `spack load --sh python@3.11.2%gcc@=11.2.0`
 
 # recommended to use a compute node for the build process with > 8 threads
 THREADS=64
@@ -61,7 +61,7 @@ echo "=== Building coyote ==="
 CC="${CC}" CXX="${CXX}" FC="${FC}" cmake $ABSOLUTE_coyote_ROOT -DCMAKE_PREFIX_PATH=$INSTALL_PATH -DCMAKE_BUILD_TYPE=Debug
 cmake --build . -j $THREADS
 
-cp $BUILD_PATH/python/coyote.*.so $VENV_PATH/lib/python3.10/site-packages/
+cp $BUILD_PATH/python/coyote.*.so $VENV_PATH/lib/python3.11/site-packages/
 
 export PYTHONPATH=${BUILD_PATH}/python:${ABSOLUTE_coyote_ROOT}/apps
 echo $PYTHONPATH
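A minimal sketch of how the new `chunk_shape`/`chunks_per_shard` logic in `add_variable()` translates into zarr v3 chunks and shards, assuming `zarr>=3.0.6`; the store, array name, and sizes below are illustrative assumptions, not part of this change:

```python
import numpy as np
import zarr

# Hypothetical sizes: an order-6 healpix grid (12 * 4**6 cells) with 24 time steps.
shape = (24, 12 * 4**6)
chunk_shape = (4, 4**6)

# Element-wise clip of the requested chunk shape to the array shape,
# mirroring the computation in configure.py.
_chunk_shape = tuple(np.minimum(chunk_shape, shape))

# Each shard bundles chunks_per_shard chunks along every dimension, so one
# storage object holds several chunks (zarr v3 sharding).
chunks_per_shard = 2
_shard_shape = tuple(i * chunks_per_shard for i in _chunk_shape)

root = zarr.open_group(store=zarr.storage.MemoryStore(), mode="w")
v = root.create_array(
    "ta",  # illustrative variable name
    shape=shape,
    dtype=np.float32,
    fill_value=np.nan,
    chunks=_chunk_shape,
    shards=_shard_shape,
)
print(v.chunks, v.shards)  # (4, 4096) (8, 8192)
```

The shard shape must be an integer multiple of the chunk shape, which the per-dimension multiplication guarantees; leaving `shards=None` (the default when `chunks_per_shard` is not given) writes one object per chunk as before.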