diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c80026a8aa91e23441739292e1b33b81f2fa8fe6..f273c433d879e31d9246041ca00c21c76d401467 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -24,6 +24,7 @@ build: tags: - roadrunner artifacts: + when: always paths: - docs/build/html expire_in: 5min diff --git a/README.md b/README.md index ca0af452806c86d7743cbd29cb349f534fdfffb0..afe5db0428da3730e72a5b368d0928ff59a98e42 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Using Notebooks for Model Data Analysis +( Find our sphinx built [here](https://mipdata.gitlab-pages.dkrz.de/tutorials-and-use-cases/) and the latest sphinx build from a branch [here](https://mipdata.gitlab-pages.dkrz.de/-/tutorials-and-use-cases/-/jobs/30069/artifacts/docs/build/html/index.html). ) + Welcome to the DKRZ tutorials and use cases repository! In the "notebooks" folder here you can find [Jupyter](https://jupyter.org/) notebooks with coding examples showing how to use Big Data and High-Performance Computing software with direct access to the [DKRZ data pool](https://www.dkrz.de/up/services/data-management/cmip-data-pool). Find more information on how to get a DKRZ account: diff --git a/docs/source/conf.py b/docs/source/conf.py index c924e9a689713a044e73a03d6c001f0f3e83a38a..32b2b835e92c93fcd4e2e0cf8db12258be9e21b3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -32,19 +32,25 @@ release = '0.1' # ones. extensions = [ "nbsphinx" +# "myst_nb" ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] nbsphinx_allow_errors = True +#execution_allow_errors = True #nbsphinx_execute = 'never' jupyter_execute_notebooks = "on" +#jupyter_execute_notebooks = "cache" +nbsphinx_kernel_name = 'anaconda3_bleeding' +nbsphinx_timeout = -1 +#exection_timeout = -1 # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [] +exclude_patterns = ['_build', '**.ipynb_checkpoints'] # -- Options for HTML output ------------------------------------------------- diff --git a/docs/source/tutorial_esmvaltool.ipynb b/docs/source/tutorial_esmvaltool.ipynb index 81f9e9ca0ce40b5c40aac9a6c6eb13dfb8374c3f..44762ac66bfb7239cee2ee5f310034c49291aeb5 120000 --- a/docs/source/tutorial_esmvaltool.ipynb +++ b/docs/source/tutorial_esmvaltool.ipynb @@ -1 +1 @@ -../../notebooks/demo/tutorial_esmvaltool.ipynb \ No newline at end of file +../../notebooks/./demo/tutorial_esmvaltool.ipynb \ No newline at end of file diff --git a/docs/source/tutorial_intake.ipynb b/docs/source/tutorial_intake.ipynb new file mode 120000 index 0000000000000000000000000000000000000000..2a79fdf1861aee88f16beeb845a3b364ea77bd81 --- /dev/null +++ b/docs/source/tutorial_intake.ipynb @@ -0,0 +1 @@ +../../notebooks/./demo/tutorial_intake.ipynb \ No newline at end of file diff --git a/docs/source/tutorials.rst b/docs/source/tutorials.rst index 27748d62a7b6569f37337f218c75591bea5155b8..a947a4e7d330e999c01e18dd05dac4ae64617523 100644 --- a/docs/source/tutorials.rst +++ b/docs/source/tutorials.rst @@ -10,3 +10,4 @@ Tutorials :maxdepth: 1 tutorial_esmvaltool.ipynb + tutorial_intake.ipynb diff --git a/docs/source/use-case_advanced_summer_days_intake_xarray_cmip6.ipynb b/docs/source/use-case_advanced_summer_days_intake_xarray_cmip6.ipynb index dd716e7fe7a2f902e92e48725d835daab37e1aee..a6640c552e6b34f7874fbe73a1d57065323b0ebd 120000 --- a/docs/source/use-case_advanced_summer_days_intake_xarray_cmip6.ipynb +++ b/docs/source/use-case_advanced_summer_days_intake_xarray_cmip6.ipynb @@ -1 +1 @@ -../../notebooks/demo/use-case_advanced_summer_days_intake_xarray_cmip6.ipynb \ No newline at end of file +../../notebooks/./demo/use-case_advanced_summer_days_intake_xarray_cmip6.ipynb \ No newline at end of file diff --git 
a/docs/source/use-case_calculate-frost-days_intake-xarray_cmip6.ipynb b/docs/source/use-case_calculate-frost-days_intake-xarray_cmip6.ipynb new file mode 120000 index 0000000000000000000000000000000000000000..d7561c5db5d0b3ecca39a6c1e7f8d2dfc36c3fe1 --- /dev/null +++ b/docs/source/use-case_calculate-frost-days_intake-xarray_cmip6.ipynb @@ -0,0 +1 @@ +../../notebooks/./demo/use-case_calculate-frost-days_intake-xarray_cmip6.ipynb \ No newline at end of file diff --git a/docs/source/use-case_frost_days_intake_xarray_cmip6.ipynb b/docs/source/use-case_frost_days_intake_xarray_cmip6.ipynb deleted file mode 120000 index 90ecde6152a7bdfc5a7aa09c130c3e775ce2c533..0000000000000000000000000000000000000000 --- a/docs/source/use-case_frost_days_intake_xarray_cmip6.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../notebooks/demo/use-case_frost_days_intake_xarray_cmip6.ipynb \ No newline at end of file diff --git a/docs/source/use-case_multimodel-comparison_xarray-cdo_cmip6.ipynb b/docs/source/use-case_multimodel-comparison_xarray-cdo_cmip6.ipynb new file mode 120000 index 0000000000000000000000000000000000000000..28fb12c888bf2384369f6ec1ee62e7c145f79c28 --- /dev/null +++ b/docs/source/use-case_multimodel-comparison_xarray-cdo_cmip6.ipynb @@ -0,0 +1 @@ +../../notebooks/./demo/use-case_multimodel-comparison_xarray-cdo_cmip6.ipynb \ No newline at end of file diff --git a/docs/source/use-case_multimodel_comparison_xarray_cdo_cmip6.ipynb b/docs/source/use-case_multimodel_comparison_xarray_cdo_cmip6.ipynb deleted file mode 120000 index 39ca45d74c0696145462f93335beb71a886c9728..0000000000000000000000000000000000000000 --- a/docs/source/use-case_multimodel_comparison_xarray_cdo_cmip6.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../notebooks/demo/use-case_multimodel_comparison_xarray_cdo_cmip6.ipynb \ No newline at end of file diff --git a/docs/source/use-case_plot-unstructured_psyplot_cmip6.ipynb b/docs/source/use-case_plot-unstructured_psyplot_cmip6.ipynb new file mode 120000 index 
0000000000000000000000000000000000000000..cc052fb98a1f4ad741e5d437512772f4414cac5b --- /dev/null +++ b/docs/source/use-case_plot-unstructured_psyplot_cmip6.ipynb @@ -0,0 +1 @@ +../../notebooks/./demo/use-case_plot-unstructured_psyplot_cmip6.ipynb \ No newline at end of file diff --git a/docs/source/use-case_simple-vis_xarray-matplotlib_cmip6.ipynb b/docs/source/use-case_simple-vis_xarray-matplotlib_cmip6.ipynb index 11bc0486ea3d777cf00826fa02cce495aaaf98bd..7a5686e8ec75021409998bd3a3e37462e211291e 120000 --- a/docs/source/use-case_simple-vis_xarray-matplotlib_cmip6.ipynb +++ b/docs/source/use-case_simple-vis_xarray-matplotlib_cmip6.ipynb @@ -1 +1 @@ -../../notebooks/demo/use-case_simple-vis_xarray-matplotlib_cmip6.ipynb \ No newline at end of file +../../notebooks/./demo/use-case_simple-vis_xarray-matplotlib_cmip6.ipynb \ No newline at end of file diff --git a/docs/source/use-cases.rst b/docs/source/use-cases.rst index 94779f1af0f9d68f63b3f3c6a6c8ad6ac0e4a85d..a45d4f29d630fe04e0996e55181e12dbf2ceb998 100644 --- a/docs/source/use-cases.rst +++ b/docs/source/use-cases.rst @@ -9,9 +9,10 @@ Use-cases .. 
toctree:: :maxdepth: 1 - use-case_simple-vis_xarray-matplotlib_cmip6.ipynb - use-case_frost_days_intake_xarray_cmip6.ipynb - use-case_multimodel_comparison_xarray_cdo_cmip6.ipynb use-case_advanced_summer_days_intake_xarray_cmip6.ipynb + use-case_calculate-frost-days_intake-xarray_cmip6.ipynb + use-case_multimodel-comparison_xarray-cdo_cmip6.ipynb + use-case_plot-unstructured_psyplot_cmip6.ipynb + use-case_simple-vis_xarray-matplotlib_cmip6.ipynb use-case_global-yearly-mean-anomaly_xarray-hvplot_cmip6.ipynb use-case_convert-nc-to-tiff_rioxarray-xesmf_cmip.ipynb diff --git a/environment.yml b/environment.yml index 52cee5982e8f5cfac221b1793cd435e84b8eea7a..f5420f54dd6f29906a0590c436ad058488471c4c 100644 --- a/environment.yml +++ b/environment.yml @@ -3,7 +3,7 @@ name: nbdemo channels: - conda-forge dependencies: - - python>=3.7 + - python=3.7 #from psyplot - pip - esmpy - ipykernel @@ -15,7 +15,7 @@ dependencies: - netcdf4 - numpy - pandas - - matplotlib + - matplotlib<3.1 #from psyplot - cartopy - scipy - cdo @@ -27,12 +27,21 @@ dependencies: - pytest - nbval - nbsphinx - - nbconvert=5.6.1 + - nbconvert=5.6.1 #from envkernel - sphinx - esmvalcore - requests - aiohttp - - rioxarray +#psyplot + - psy-maps + - psyplot +# +#for psyplot on centos: +#sudo ln -s /usr/lib64/libc.so.6 /usr/lib64/libc.musl-x86_64.so.1 +# +#intake pangeo + - gcsfs + - rioxarray # for raster data - pip: - envkernel - - xESMF + - xESMF #for raster data diff --git a/notebooks/demo/tutorial_intake.ipynb b/notebooks/demo/tutorial_intake.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..ffaf266261c234e97a89f9de170d9e40a7c5a9d8 --- /dev/null +++ b/notebooks/demo/tutorial_intake.ipynb @@ -0,0 +1,590 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tutorial on how to find and access data with `intake-esm` ESM-collections\n", + "\n", + "We follow here the guidance presented by `intake-esm` on its 
[repository](https://intake-esm.readthedocs.io/en/latest/user-guide/cmip6-tutorial.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Motivation of intake-esm\n", + "\n", + "> Simulations of the Earth’s climate and weather generate huge amounts of data. These data are often persisted on different storages in a variety of formats (netCDF, zarr, etc...). Finding, investigating, loading these data assets into compute-ready data containers costs time and effort. The data user needs to know what data sets are available, the attributes describing each data set, before loading a specific data set and analyzing it.\n", + "\n", + "> `Intake-esm` addresses these issues by providing necessary functionality for **searching, discovering, data access and data loading**." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Features of intake and intake-esm\n", + "\n", + "Intake is a generic **cataloging system** for listing data sources. As a plugin, `intake-esm` is built on top of `intake`, `pandas`, and `xarray` and configures `intake` such that it is able to also **load** ESM data.\n", + "\n", + "- display catalogs as clearly structured tables inside jupyter notebooks for easy investigation\n", + "- browse through the catalog and select your data without being on the pool file system\n", + "- open climate data in an analysis ready dictionary of `xarray` datasets\n", + "\n", + "⇨ `intake-esm` reduces the data access and data preparation tasks on analysists side" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial, we load a CMIP6 catalog which contains all data from the pool on DKRZ's mistral disk storage.\n", + "CMIP6 is the 6th phase of the Coupled Model Intercomparison Project and builds the data base used in the IPCC AR6.\n", + "The CMIP6 catalog contains all data that is published or replicated at the ESGF node at DKRZ." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preparation\n", + "First of all, we need to import the required packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import intake" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Open and browse through catalogs and collections\n", + "\n", + "We begin with using only `intake` functions for catalogs. Afterwards, we continue with concrete `intake-esm` utilites.\n", + "\n", + "`intake` **opens** catalogs for data sources given in `yaml` format. These contain information about plugins and sources required for accessing and loading the data. The command is `open_catalog`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dkrz_cdp=intake.open_catalog(\"https://swift.dkrz.de/v1/dkrz_a44962e3ba914c309a7421573a6949a6/intake-esm/dkrz_data-pool_cloudcatalog.yaml\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can look into the catalog with `print` and `list`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(dkrz_cdp.yaml())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The variable `dkrz_cdp` is short for DKRZ's CMIP Data Pool. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "list(dkrz_cdp)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Over the time, many collections of data have been created. `dkrz_cdp` is a **Master** catalog prepared to keep an overview of all other collections. `list` shows all **ESM collections** (Earth System Model) for different data sources which are available at DKRZ. 
An **ESM collection** will be opened with `intake-esm`.\n", + "\n", + "Let's have a look into a master catalog of [Pangeo](https://pangeo.io/):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pangeo=intake.open_catalog(\"https://raw.githubusercontent.com/pangeo-data/pangeo-datastore/master/intake-catalogs/master.yaml\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pangeo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "list(pangeo)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "While DKRZ's master catalog has one sublevel, Pangeo's is a nested one. We can access another `yaml` catalog which is also a **parent** catalog by simply:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pangeo.climate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pangeo's ESM collections are one level deeper in the catalog tree:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "list(pangeo.climate)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In contrast to these ESM collections in `pangeo.climate`, the DKRZ ESM-Collections follow a name template:\n", + "\n", + "`dkrz_${MIP}_${store}_${fileaccess}[_${catalog_store}`\n", + "\n", + "where\n", + "- **MIP** is the *model intercomparison project* and one of `cmip6`, `cmip5`, `cordex`, `era5` or `mpi-ge`. \n", + "- **store** is the data store and can be one of:\n", + " - `disk`: DKRZ holds a lot of data on a consortial disk space on the file system of the High Performance Computer (HPC) where it is accessible for every HPC user. If you use this ESM Collection, you have to work on the HPC if you want to load the data. 
Browsing and discovering will work independently from your work station.\n", + " - `swift`: A small subset is transferred into DKRZ's cloud in order to test the performance. swift` is DKRZ's cloud storage.\n", + "- **fileaccess** means also *file format* and is one of:\n", + " - `netcdf`: All projects define a data standard based on `netCDF`. The original data format is, up to CMIP6, netCDF.\n", + " - `zarr`: We experiment with the zarr format and provide a small subset in the cloud.\n", + " - `opendap`: opendap ESM collections point at the same catalog as the one with `fileaccess = netcdf` but configures `intake-esm` such that it opens the data via the opendap protocol and the `opendap_url`. Therefore, the data can be loaded from remote.\n", + "- **catalog_store** is a suffix named *fromcloud* which indicates that the source of the ESM collection is loaded from cloud. At DKRZ, we also provide the catalogs on filesystem where we do not need and do not use this suffix.\n", + "\n", + "Each Collection has unique features which serves different use cases." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The role of `intake-esm`\n", + "\n", + "We now look into a ESM Collection which is opened by the plugin `intake-esm`.\n", + "\n", + "> An ESM (Earth System Model) collection file is a `JSON` file that conforms to the ESM Collection Specification. When provided a link/path to an esm collection file, intake-esm establishes a link to a database (`CSV` file) that contains data assets locations and associated metadata (i.e., which experiment, model, the come from).\n", + "\n", + "Since the data base of the CMIP6 ESM Collection is about 350MB in compressed format, it takes up to a minute to load the catalog." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "esm_col=dkrz_cdp.dkrz_cmip6_disk_netcdf_fromcloud" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "esm_col" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "esm_col.df=esm_col.df[~(esm_col.df[\"time_range\"].isnull()) & (~esm_col.df[\"table_id\"].str.contains(\"fx\"))]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`intake-esm` gives us an overview over the content of the ESM collection. The ESM collection is a data base described by specific attributes which are technically columns. Each project data standard is the basis for the columns and used to parse information given by the path and file names.\n", + "\n", + "The pure display of `esm_col` shows us the number of unique values in each column. Since each `path` refers to one file, we can conclude that the DKRZ-CMIP6 ESM Collection contains **5.47 Mio Files** in July 2021." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The data base is loaded into an underlying `panda`s dataframe which we can access with `col.df`. `col.df.head()` displays the first rows of the table:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "esm_col.df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can find out details about `esm_col` with the object's attributes. `esm_col.esmcol_data` contains all information given in the `JSON` file which is a bit unclear. We can also focus on some specific attributes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#esm_col.esmcol_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "print(\"What is this catalog about? \\n\" + esm_col.esmcol_data[\"description\"])\n", + "#\n", + "print(\"The link to the data base: \"+ esm_col.esmcol_data[\"catalog_file\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Advanced: To find out how many datasets are available, we can use pandas functions (drop columns that are irrelevant for a dataset, drop the duplicates, keep one):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cat = esm_col.df.drop(['path','time_range', 'opendap_url'],1).drop_duplicates(keep=\"first\")\n", + "print(len(cat))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Browse through the data of the ESM collection" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Most of the time, we want to set more than one attribute for a search. Therefore, we define a query `dict`ionary and use the `search` function of the `esm_col` object. In the following case, we look for temperature at surface in monthly resolution for 3 different experiments:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = dict(\n", + " variable_id=\"tas\",\n", + " table_id=\"Amon\",\n", + " experiment_id=[\"piControl\", \"historical\", \"ssp370\"])\n", + "# piControl = pre-industrial control, simulation to represent a stable climate from 1850 for >100 years.\n", + "# historical = historical Simulation, 1850-2014\n", + "# ssp370 = Shared Socioeconomic Pathways (SSPs) are scenarios of projected socioeconomic global changes. 
Simulation covers 2015-2100\n", + "cat = esm_col.search(**query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cat" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We could also use *Wildcards*. For example, in order to find out which ESMs of the institution *MPI-M* have produced data for our subset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cat.search(source_id=\"MPI-ES*\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can find out which models have submitted data for at least one of them by:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cat.unique([\"source_id\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we instead look for the models that have submitted data for ALL experiments, we use the `require_all_on` keyword argument:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cat = esm_col.search(require_all_on=[\"source_id\"], **query)\n", + "cat.unique([\"source_id\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that only the combination of a `variable_id` and a `table_id` is unique in CMIP6. If you search for `tas` in all tables, you will find many entries more:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = dict(\n", + " variable_id=\"tas\",\n", + "# table_id=\"Amon\",\n", + " experiment_id=[\"piControl\", \"historical\", \"ssp370\"])\n", + "cat = esm_col.search(**query)\n", + "cat.unique([\"table_id\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Be careful when you search for specific time slices. 
Each frequency is connected with an individual name template
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "xr_dict = cat.to_dataset_dict(cdf_kwargs={\"chunks\":{\"time\":1}})\n", + "xr_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Intake` was able to aggregate many files into only one dataset:\n", + "- The `time_range` column was used to **concat** data along the `time` dimension\n", + "- The `member_id` column was used to generate a new dimension\n", + "\n", + "The underlying `dask` package will only load the data into memory if needed.\n", + "We can get the `xarray` dataset with python commands:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "xr_dset = xr_dict[list(xr_dict.keys())[0]]\n", + "xr_dset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Pangeo's data store\n", + "\n", + "Let's have a look into Pangeo's ESM Collection as well. This is accessible via cloud from everywhere - you only need internet to load data. We use the same `query` as in the example before." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pangeo_cmip6=pangeo.climate.cmip6_gcs\n", + "cat = pangeo_cmip6.search(**query)\n", + "cat" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are differences between the collections because\n", + "- Pangeo provides files in *consolidated*, `zarr` formatted datasets which correspond to `zstore` entries in the catalog instead of `path`s or `opendap_url`s. 
\n", + "- The `zarr` datasets are already aggregated over time so there is no need for a `time_range` column\n", + "\n", + "If we now open the data with `intake`, we have to specify keyword arguments as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dset_dict = cat.to_dataset_dict(\n", + " zarr_kwargs={\"consolidated\": True, \"decode_times\": True, \"use_cftime\": True}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dset_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`dset_dict` and `xr_dict` are the same. You succesfully did the intake tutorial!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- You can also do another [CMIP6 tutorial](https://intake-esm.readthedocs.io/en/latest/user-guide/cmip6-tutorial.html) from the official intake page." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:root] *", + "language": "python", + "name": "anaconda3_bleeding" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/notebooks/demo/use-case_advanced_summer_days_intake_xarray_cmip6.ipynb b/notebooks/demo/use-case_advanced_summer_days_intake_xarray_cmip6.ipynb index 3f36d2a60bb62e7fc218cec6eff38734af914e23..eda407a1de4ac2dbff62859758773b4af8203cca 100644 --- a/notebooks/demo/use-case_advanced_summer_days_intake_xarray_cmip6.ipynb +++ b/notebooks/demo/use-case_advanced_summer_days_intake_xarray_cmip6.ipynb @@ -169,11 +169,13 @@ "metadata": {}, "outputs": [], "source": [ - "# Path to catalog descriptor on the DKRZ server\n", - "col_url = 
\"/pool/data/Catalogs/mistral-cmip6.json\"\n", + "# Path to master catalog on the DKRZ server\n", + "col_url = \"https://swift.dkrz.de/v1/dkrz_a44962e3ba914c309a7421573a6949a6/intake-esm/dkrz_data-pool_cloudcatalog.yaml\"\n", + "parent_col=intake.open_catalog(col_url)\n", + "list(parent_col)\n", "\n", "# Open the catalog with the intake package and name it \"col\" as short for \"collection\"\n", - "col = intake.open_esm_datastore(col_url)" + "col=parent_col[\"dkrz_cmip6_disk_netcdf_fromcloud\"]" ] }, { @@ -423,7 +425,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:root] *", + "display_name": "mambaenv (conda, /home/dkrz/k204210/mambaenv)", "language": "python", "name": "anaconda3_bleeding" }, @@ -437,9 +439,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/notebooks/demo/use-case_frost_days_intake_xarray_cmip6.ipynb b/notebooks/demo/use-case_calculate-frost-days_intake-xarray_cmip6.ipynb similarity index 100% rename from notebooks/demo/use-case_frost_days_intake_xarray_cmip6.ipynb rename to notebooks/demo/use-case_calculate-frost-days_intake-xarray_cmip6.ipynb diff --git a/notebooks/demo/use-case_global-yearly-mean-anomaly_xarray-hvplot_cmip6.ipynb b/notebooks/demo/use-case_global-yearly-mean-anomaly_xarray-hvplot_cmip6.ipynb index 3593d348d246ea5fb22c6b20646101bfe9fee821..ae576d9cad9c8577ca3dd20f37a3305d78e8b86a 100644 --- a/notebooks/demo/use-case_global-yearly-mean-anomaly_xarray-hvplot_cmip6.ipynb +++ b/notebooks/demo/use-case_global-yearly-mean-anomaly_xarray-hvplot_cmip6.ipynb @@ -96,8 +96,18 @@ "metadata": {}, "outputs": [], "source": [ - "col_url = \"https://swift.dkrz.de/v1/dkrz_a44962e3ba914c309a7421573a6949a6/intake-esm/dkrz-cmip-data-pool-opendap.json\"\n", - "col = intake.open_esm_datastore(col_url)" + "col_url = 
\"https://swift.dkrz.de/v1/dkrz_a44962e3ba914c309a7421573a6949a6/intake-esm/dkrz_data-pool_cloudcatalog.yaml\"\n", + "parent_col=intake.open_catalog(col_url)\n", + "list(parent_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "col=parent_col[\"dkrz_cmip6_disk_netcdf_fromcloud\"]" ] }, { @@ -136,7 +146,7 @@ "query = dict(\n", " variable_id=variable_id,\n", " table_id=\"Amon\",\n", - " experiment_id=[\"historical\", \"ssp245\", \"ssp585\"],\n", + " experiment_id=[\"historical\", \"ssp585\"], # we have excluded \"ssp245\" from the list because it would take 15min to finish the nb\n", " source_id=[\"MPI-ESM1-2-HR\", \"AWI-CM-1-1-MR\"],\n", ")\n", "cat = col.search(**query)" @@ -173,10 +183,19 @@ "metadata": {}, "outputs": [], "source": [ - "xr_dset_dict = cat.to_dataset_dict()\n", + "xr_dset_dict = cat.to_dataset_dict(cdf_kwargs={\"chunks\":{\"time\":1}})\n", "print(xr_dset_dict.keys())" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "xr_dset_dict['ScenarioMIP.DWD.MPI-ESM1-2-HR.ssp585.Amon.gn']" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -323,7 +342,7 @@ "metadata": {}, "outputs": [], "source": [ - "hvplot.save(plot, \"/home/dkrz/k204210/globalmean-yearlymean-tas.html\")" + "#hvplot.save(plot, \"globalmean-yearlymean-tas.html\")" ] }, { @@ -355,13 +374,20 @@ "source": [ "We acknowledge the CMIP community for providing the climate model data, retained and globally distributed in the framework of the ESGF. 
The CMIP data of this study were replicated and made available for this study by the DKRZ.â€" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:root] *", "language": "python", - "name": "anaconda3_bleeding" + "name": "conda-root-py" }, "language_info": { "codemirror_mode": { diff --git a/notebooks/demo/use-case_multimodel_comparison_xarray_cdo_cmip6.ipynb b/notebooks/demo/use-case_multimodel-comparison_xarray-cdo_cmip6.ipynb similarity index 100% rename from notebooks/demo/use-case_multimodel_comparison_xarray_cdo_cmip6.ipynb rename to notebooks/demo/use-case_multimodel-comparison_xarray-cdo_cmip6.ipynb diff --git a/notebooks/demo/use-case_plot-unstructured_psyplot_cmip6.ipynb b/notebooks/demo/use-case_plot-unstructured_psyplot_cmip6.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..b4c5a553e0e2556f25aa87aca756d2c7a747282e --- /dev/null +++ b/notebooks/demo/use-case_plot-unstructured_psyplot_cmip6.ipynb @@ -0,0 +1,304 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plot ESM data on *unstructured* grids with `psyplot`\n", + "\n", + "This notebook introduces you to the `mapplot` function of the package `psyplot` and its plugin `psy_maps`.\n", + "It is suitable to plot maps from data on unstructured grids like the ones from ICON and FESOM.\n", + "\n", + "We therefore search for the corresponding data in the CMIP6 data pool with intake-esm.\n", + "Afterwards, we open a file with `xarray` and configure the opened xarray dataset as well as psyplot for a map plot.\n", + "\n", + "This Jupyter notebook is meant to run in the [Jupyterhub](https://jupyterhub.dkrz.de/hub/login?next=%2Fhub%2Fhome) server of the German Climate Computing Center [DKRZ](https://www.dkrz.de/). The DKRZ hosts the CMIP data pool including 4 petabytes of CMIP6 data. 
Please, choose the Python 3 unstable kernel on the Kernel tab above, it contains all the common geoscience packages. See more information on how to run Jupyter notebooks at DKRZ [here](https://www.dkrz.de/up/systems/mistral/programming/jupyter-notebook).\n", + "\n", + "Running this Jupyter notebook on your premises, which is also known as [client-side](https://en.wikipedia.org/wiki/Client-side) computing, will require that you install the necessary packages\n", + "`intake`, `xarray`, `matplotlib`, `psyplot`, `psy_maps`\n", + "\n", + "and either download the data or use the `opendap_url` column of the intake catalog if available." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Learning Objectives\n", + "\n", + "- How to access data on an *unstructured* grid from the DKRZ CMIP data pool with `intake-esm`\n", + "- How to subset data with `xarray`\n", + "- How to visualize the results with `matplotlib`, `psyplot` and `psy_maps`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import psyplot.project as psy\n", + "import matplotlib as mpl\n", + "import xarray as xr\n", + "import intake" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We open a swift catalog from dkrz cloud which is accessible without authentication."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "col_url = \"https://swift.dkrz.de/v1/dkrz_a44962e3ba914c309a7421573a6949a6/intake-esm/dkrz_data-pool_cloudcatalog.yaml\"\n", + "parent_col=intake.open_catalog(col_url)\n", + "list(parent_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "col=parent_col[\"dkrz_cmip6_disk_netcdf_fromcloud\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we aim at plotting the **Sea Surface Temperature** `tos` of the upper boundary of the liquid ocean, including temperatures below sea-ice and floating ice shelves from the earth system model **AWI-CM-1-1-MR**.\n", + "We therefore search for `tos` in the catalog for monthly frequency. We only use one realization `r1i1p1f1` of one experiment only." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tos=col.search(source_id=\"AWI-CM-1-1-MR\",\n", + " experiment_id=\"ssp370\",\n", + " variable_id=\"tos\",\n", + " table_id=\"Omon\",\n", + " member_id=\"r1i1p1f1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tos.df[\"path\"].to_list()[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now open the file on the mistral lustre file system. Note that if you work remotely, you could try to use `opendap_url` instead of `path`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dset = xr.open_dataset(tos.df[\"path\"].to_list()[0],\n", + " decode_cf=True,\n", + " chunks={\"time\":1},\n", + " lock=False)\n", + "dset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to make `tos` plottable, we set the following configuration.\n", + "- The `CDI_grid_type` is a keyword for `psyplot`. It must match the *grid type* of the source model.\n", + "- Coordinates are not fully recognized by `xarray` so that we have to add some manually (version from Dec 2020)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dset[\"tos\"][\"CDI_grid_type\"]=\"unstructured\"\n", + "coordlist=[\"vertices_latitude\", \"vertices_longitude\", \"lat_bnds\", \"lon_bnds\"]\n", + "dset=dset.set_coords([coord for coord in dset.data_vars if coord in coordlist])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following is based on the [psyplot example](https://psyplot.readthedocs.io/projects/psy-maps/en/latest/examples/example_ugrid.html#gallery-examples-example-ugrid-ipynb). We set a resoltion for the land sea mask `lsm` and a color map via `cmap`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "psy.rcParams['plotter.maps.xgrid'] = False\n", + "psy.rcParams['plotter.maps.ygrid'] = False\n", + "mpl.rcParams['figure.figsize'] = [10, 8.]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_unstructured():\n", + " iconplot11=psy.plot.mapplot(\n", + " dset, name=\"tos\", cmap='rainbow',\n", + " clabel=dset[\"tos\"].description,\n", + " stock_img=True, lsm='50m')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now do the same with a smaller subset to highlight the fine resolution and the structure of the AWI ocean model FESOM.\n", + "We first *subset* the data because otherwise plotting takes too long. We choose indices of dimensions with the `xarray` function `isel`. We select a slice of two time steps and focus on a region Ireland. We have to save the data to an intermediate file `test.nc` because otherwise we receive an error." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dset2 = dset.isel(time=slice(1,2)).where( (dset.lon > -10. ) &\n", + " (dset.lon < 50. ) &\n", + " (dset.lat > 40. ) &\n", + " (dset.lat < 70. 
), drop=True).drop(\"time_bnds\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dset2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dset2.to_netcdf(\"test.nc\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dset2.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dset=xr.open_dataset(\"test.nc\",\n", + " decode_cf=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dset[\"tos\"][\"CDI_grid_type\"]=\"unstructured\"\n", + "coordlist=[\"vertices_latitude\", \"vertices_longitude\", \"lat_bnds\", \"lon_bnds\"]\n", + "dset=dset.set_coords([coord for coord in dset.data_vars if coord in coordlist])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "psy.plot.mapplot(\n", + " dset, name=\"tos\", cmap='rainbow',\n", + " lonlatbox='Ireland',\n", + " clabel=dset[\"tos\"].description,\n", + " stock_img=True,\n", + " lsm='50m',\n", + " datagrid=dict(c='b', lw=0.2)).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dset.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Used data\n", + "\n", + "- [Semmler et al., 2019: AWI AWI-CM1.1MR model output prepared for CMIP6](https://doi.org/10.22033/ESGF/CMIP6.2803)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We acknowledge the CMIP community for providing the climate model data, retained and globally distributed in the framework of the ESGF. 
The CMIP data of this study were replicated and made available for this study by the DKRZ.”" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mambaenv (conda, /home/dkrz/k204210/mambaenv)", + "language": "python", + "name": "anaconda3_bleeding" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}