Commit c924bc86 authored by Marco Kulüke's avatar Marco Kulüke
Browse files

Merge branch 'develop' into 'master'

Develop

Closes #15

See merge request mipdata/tutorials-and-use-cases!10
parents a009dc57 6a89b3f9
......@@ -2,3 +2,5 @@
*.nc
*.grb
*.grib
notebooks/dask-worker-space/*
*png
This diff is collapsed.
......@@ -197,7 +197,7 @@
"source": [
"# Store the name of the model we chose in a variable named \"climate_model\"\n",
"\n",
"climate_model = \"MPI-ESM1-2-LR\" # here we choose Max-Plack Institute's Earth Sytem Model in high resolution\n",
"climate_model = \"MPI-ESM1-2-HR\" # here we choose Max-Plack Institute's Earth Sytem Model in high resolution\n",
"\n",
"# This is how we tell intake what data we want\n",
"\n",
......@@ -258,7 +258,7 @@
"outputs": [],
"source": [
"# Select the file that contains the year we selected in the drop down menu above, e.g. 2015\n",
"selected_file = query_result_df_m[(year_box.value >= query_result_df[\"start_year\"]) & (\n",
"selected_file = query_result_df[(year_box.value >= query_result_df[\"start_year\"]) & (\n",
" year_box.value <= query_result_df[\"end_year\"])]\n",
"\n",
"# Path of the file that contains the selected year \n",
......
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# How to use Dask for Climate Data Processing?\n",
"\n",
"This tutorial requires the skills you have learned in the summer days tutorial (provide link).\n",
"\n",
"1. What is Dask?\n",
" 1. Parallelism\n",
" 1. Overview\n",
" 1. `dask.delayed`\n",
"2. Process climate data with dask?\n",
"3. Common mistakes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. What is Dask?\n",
"Dask is an open source library for parallel computing written in Python. It is used to process larger-than memory datasets (e.g. large climate data sets). All information can be found here: https://docs.dask.org"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1A. Parallelism\n",
"- use Maria's metaphor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1B. Dask Overview"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1C. `dask.delayed`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let us start with an easy example"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from dask.distributed import Client\n",
"\n",
"client = Client(n_workers=4)\n",
"\n",
"client"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from dask import delayed\n",
"import time"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Not Parallel"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"\n",
"def inc(x):\n",
" time.sleep(0.5)\n",
" return x + 1\n",
"\n",
"def double(x):\n",
" time.sleep(0.5)\n",
" return 2 * x\n",
"\n",
"def add(x, y):\n",
" time.sleep(0.5)\n",
" return x + y\n",
"\n",
"data = list(range(4))\n",
"\n",
"output = []\n",
"for x in data:\n",
" a = inc(x)\n",
" b = double(x)\n",
" c = add(a, b)\n",
" output.append(c)\n",
"\n",
"total = sum(output)\n",
"total"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Parallel"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@delayed\n",
"def inc(x):\n",
"    time.sleep(0.5)\n",
"    return x + 1\n",
"\n",
"@delayed\n",
"def double(x):\n",
"    time.sleep(0.5)\n",
"    return 2 * x\n",
"\n",
"@delayed\n",
"def add(x, y):\n",
"    time.sleep(0.5)\n",
"    return x + y\n",
"\n",
"data = list(range(4))\n",
"\n",
"output = []\n",
"for x in data:\n",
"    a = inc(x)\n",
"    b = double(x)\n",
"    c = add(a, b)\n",
"    output.append(c)\n",
"\n",
"# summing a list of Delayed objects yields a single Delayed object\n",
"total_delayed = sum(output)\n",
"#%time total_delayed.compute()\n",
"total_delayed.visualize()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"total_delayed.visualize()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"client.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Looks good in theory, right? Now, let us apply our knowledge to climate model data."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Process Climate Data with Dask\n",
"- load with intake\n",
"- think about processing\n",
"- compare conventional with dask"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import intake\n",
"import xarray as xr\n",
"from mpl_toolkits.basemap import Basemap, cm\n",
"import matplotlib.pyplot as plt\n",
"from netCDF4 import Dataset as open_ncfile\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Path to catalog descriptor on the DKRZ server\n",
"col_url = \"/work/ik1017/Catalogs/mistral-cmip6.json\"\n",
"\n",
"# Open the catalog with the intake package and name it \"col\" as short for \"collection\"\n",
"col = intake.open_esm_datastore(col_url)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Store the name of the model we chose in a variable named \"climate_model\"\n",
"\n",
"climate_model = \"MPI-ESM1-2-HR\" # here we choose Max-Planck Institute's Earth System Model in high resolution\n",
"\n",
"# This is how we tell intake what data we want\n",
"\n",
"query = dict(\n",
"    source_id = climate_model, # the model chosen above\n",
"    variable_id = \"tasmax\", # temperature at surface, maximum\n",
"    table_id = \"day\", # daily maximum\n",
"    experiment_id = \"historical\", # the historical experiment\n",
"    member_id = \"r10i1p1f1\", # \"r\" realization, \"i\" initialization, \"p\" physics, \"f\" forcing\n",
"    time_range =\"20100101-20141231\",\n",
")\n",
"\n",
"# Intake looks for the query we just defined in the catalog of the CMIP6 data pool at DKRZ\n",
"cat = col.search(**query)\n",
"\n",
"# Show query results\n",
"cat.df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ds = xr.open_dataset(cat.df['path'][0])\n",
"ds"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"var = ds.variables['tasmax'][0,:,:]\n",
"lat = ds.variables['lat'][:]\n",
"lon = ds.variables['lon'][:]\n",
"\n",
"#-- create figure and axes instances\n",
"dpi = 100\n",
"fig = plt.figure(figsize=(1100/dpi, 1100/dpi), dpi=dpi)\n",
"ax = fig.add_axes([0.1,0.1,0.8,0.9])\n",
"\n",
"#-- create map\n",
"map = Basemap(projection='cyl',llcrnrlat= -90.,urcrnrlat= 90.,\\\n",
"              resolution='c',  llcrnrlon=-180.,urcrnrlon=180.)\n",
"\n",
"#-- draw coastlines, state and country boundaries, edge of map\n",
"map.drawcoastlines()\n",
"map.drawstates()\n",
"map.drawcountries()\n",
"\n",
"#-- create and draw meridians and parallels grid lines\n",
"map.drawparallels(np.arange( -90., 90.,30.),labels=[1,0,0,0],fontsize=10)\n",
"map.drawmeridians(np.arange(-180.,180.,30.),labels=[0,0,0,1],fontsize=10)\n",
"\n",
"#-- convert latitude/longitude values to plot x/y values\n",
"x, y = map(*np.meshgrid(lon,lat))\n",
"\n",
"#-- contour levels\n",
"clevs = np.arange(210,320,5)\n",
"\n",
"#-- draw filled contours\n",
"cnplot = map.contourf(x,y,var,clevs,cmap=plt.cm.jet)\n",
"\n",
"#-- add colorbar\n",
"cbar = map.colorbar(cnplot,location='bottom',pad=\"10%\")      #-- pad: distance between map and colorbar\n",
"cbar.set_label('deg K')                                      #-- add colorbar title string\n",
"\n",
"#-- add plot title\n",
"plt.title('Temperature')\n",
"\n",
"#-- display on screen\n",
"#plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# <font color='darkred'>Now use Gradient Operator in parallel</font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# <font color='darkgreen'>Take home message</font>\n",
"\n",
"Parallelism brings extra complexity and often it is not necessary for your problems. Before using Dask you may want to try alternatives:\n",
"- use better algorithms or data structures\n",
"- better file formats\n",
"- compiled code\n",
"- sampling\n",
"- profile your code"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 unstable (using the module python3/unstable)",
"language": "python",
"name": "python3_unstable"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Plot Earth System Model data on unstructured grids with psyplot\n",
"\n",
"This notebook introduces you to the `mapplot` function of the package `psyplot`.\n",
"It is suitable to plot maps from data on unstructured grids like the ones from ICON and FESOM.\n",
"\n",
"We therefore search for the corresponding data in the CMIP6 data pool with intake-esm.\n",
"Afterwards, we open a file with `xarray` and configure the opened xarray dataset as well as psyplot for a map plot.\n",
"\n",
"In the end, we discuss the functions of `psyplot.project.plot.mapplot`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import psyplot.project as psy\n",
"import matplotlib as mpl\n",
"import xarray as xr\n",
"import intake"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We open a swift catalog from dkrz cloud which is accessible without authentication."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"col_url = \"https://swift.dkrz.de/v1/dkrz_a44962e3ba914c309a7421573a6949a6/intake-esm/mistral-cmip6.json\"\n",
"col = intake.open_esm_datastore(col_url)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this example, we aim at plotting the Sea Surface Temperature of the upper boundary of the liquid ocean, including temperatures below sea-ice and floating ice shelves from AWI.\n",
"We therefore search for `tos` in the catalog for monthly frequency. We use 1 realization of 1 experiment only."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tos=col.search(source_id=\"AWI-CM-1-1-MR\",\n",
" experiment_id=\"ssp370\",\n",
" variable_id=\"tos\",\n",
" table_id=\"Omon\",\n",
" member_id=\"r1i1p1f1\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now open the file on the mistral file system."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dset = xr.open_dataset(tos.df[\"path\"].to_list()[0])\n",
"#dset = xr.open_mfdataset(tos.df[\"path\"].to_list())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In order to make `tos` plottable, we set the following configuration.\n",
"- The `CDI_grid_type` is a keyword for `psyplot`.\n",
"- Coordinates are not fully recognized by `xarray` so that we have to add some manually (version from Dec 2020)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dset[\"tos\"][\"CDI_grid_type\"]=\"unstructured\"\n",
"coordlist=[\"vertices_latitude\", \"vertices_longitude\", \"lat_bnds\", \"lon_bnds\"]\n",
"dset=dset.set_coords([coord for coord in dset.data_vars if coord in coordlist])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is based on the example from:\n",
"https://psyplot.readthedocs.io/projects/psy-maps/en/latest/examples/example_ugrid.html#gallery-examples-example-ugrid-ipynb"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"psy.rcParams['plotter.maps.xgrid'] = False\n",
"psy.rcParams['plotter.maps.ygrid'] = False\n",
"mpl.rcParams['figure.figsize'] = [10, 8.]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iconplot11=psy.plot.mapplot(\n",
" dset, name=\"tos\", cmap='rainbow',\n",
" clabel=dset[\"tos\"].description,\n",
" stock_img=True, lsm='50m')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now do the same with a smaller subset to see the fine resolution of the AWI ocean model FESOM.\n",
"The subsetting is required because the plotting takes too long otherwise."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dset2 = dset.isel(time=slice(1,3)).where( (dset.lon > -10. ) &\n",
" (dset.lon < 50. ) &\n",
" (dset.lat > 40. ) &\n",
" (dset.lat < 70. ), drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dset2.to_netcdf(\"/home/dkrz/k204210/test.nc\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dset2=xr.open_dataset(\"/home/dkrz/k204210/test.nc\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dset2[\"tos\"][\"CDI_grid_type\"]=\"unstructured\"\n",
"coordlist=[\"vertices_latitude\", \"vertices_longitude\", \"lat_bnds\", \"lon_bnds\"]\n",
"dset2=dset2.set_coords([coord for coord in dset2.data_vars if coord in coordlist])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"iconplot12=psy.plot.mapplot(\n",
"    dset2, name=\"tos\", cmap='rainbow',\n",
"    lonlatbox='Ireland',\n",
"    clabel=dset2[\"tos\"].description,\n",
"    stock_img=True,\n",
"    lsm='50m',\n",
"    datagrid=dict(c='b', lw=0.2))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 unstable (using the module python3/unstable)",
"language": "python",
"name": "python3_unstable"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment