Commit 3f105b67 authored by Anderson Banihirwe's avatar Anderson Banihirwe

Add glade catalogs

parent 6d6ef486
......@@ -107,4 +107,5 @@ _build/
notes/
worker-*
slurm-*
dask-worker*
\ No newline at end of file
dask-worker*
*.txt
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import fnmatch\n",
"import dask.dataframe as dd\n",
"from intake.source.utils import reverse_format\n",
"import os\n",
"import re\n",
"import subprocess\n",
"from tqdm.auto import tqdm\n",
"from pathlib import Path\n",
"import shutil"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create text file containing all files available"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def get_file_list(persist_path):\n",
"    \"\"\"Write one <activity>.txt per top-level CMIP6 directory, listing every .nc file under it.\"\"\"\n",
"    root = Path(\"/glade/collections/cmip/CMIP6\")\n",
"    dirs = [x for x in root.iterdir() if x.is_dir()]\n",
"    for directory in tqdm(dirs):\n",
"        print(directory)\n",
"        stem = directory.stem\n",
"        # Context manager guarantees the listing file is closed even if find fails.\n",
"        with open(f\"{persist_path}/{stem}.txt\", \"w\") as f:\n",
"            cmd = [\"find\", \"-L\", directory.as_posix(), \"-name\", \"*.nc\"]\n",
"            # subprocess.run waits for completion; discard stderr rather than\n",
"            # leaving an unread PIPE, which can deadlock if find emits many\n",
"            # permission/broken-link errors on this large tree.\n",
"            subprocess.run(cmd, stderr=subprocess.DEVNULL, stdout=f)\n",
"    "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"persist_path = \"./CMIP6_filelist\"\n",
"#get_file_list(persist_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## Extract attributes of a file using information from the CMIP6 DRS.\n",
"\n",
"\n",
"References\n",
" 1. CMIP6 DRS: http://goo.gl/v1drZl\n",
" 2. Controlled Vocabularies (CVs) for use in CMIP6:\n",
" https://github.com/WCRP-CMIP/CMIP6_CVs\n",
" \n",
" \n",
"Directory structure =\n",
"```<mip_era>/\n",
" <activity_id>/\n",
" <institution_id>/\n",
" <source_id>/\n",
" <experiment_id>/\n",
" <member_id>/\n",
" <table_id>/\n",
" <variable_id>/\n",
" <grid_label>/\n",
" <version>\n",
"```\n",
"file name =\n",
"```<variable_id>_<table_id>_<source_id>_<experiment_id >_<member_id>_<grid_label>[_<time_range>].nc```\n",
"For time-invariant fields, the last segment (time_range) above is omitted.\n",
"Example when there is no sub-experiment: `tas_Amon_GFDL-CM4_historical_r1i1p1f1_gn_196001-199912.nc`\n",
"Example with a sub-experiment: `pr_day_CNRM-CM6-1_dcppA-hindcast_s1960-r2i1p1f1_gn_198001-198412.nc`\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"ename": "OSError",
"evalue": "./CMIP6_filelist/*.txt resolved to no files",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-a5a13cd998d0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{persist_path}/*.txt\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"path\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/glade/work/abanihi/softwares/miniconda3/envs/analysis/lib/python3.7/site-packages/dask/dataframe/io/csv.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(urlpath, blocksize, collection, lineterminator, compression, sample, enforce, assume_missing, storage_options, include_path_column, **kwargs)\u001b[0m\n\u001b[1;32m 580\u001b[0m \u001b[0mstorage_options\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstorage_options\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 581\u001b[0m \u001b[0minclude_path_column\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minclude_path_column\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 582\u001b[0;31m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 583\u001b[0m )\n\u001b[1;32m 584\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/glade/work/abanihi/softwares/miniconda3/envs/analysis/lib/python3.7/site-packages/dask/dataframe/io/csv.py\u001b[0m in \u001b[0;36mread_pandas\u001b[0;34m(reader, urlpath, blocksize, collection, lineterminator, compression, sample, enforce, assume_missing, storage_options, include_path_column, **kwargs)\u001b[0m\n\u001b[1;32m 407\u001b[0m \u001b[0mcompression\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcompression\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 408\u001b[0m \u001b[0minclude_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minclude_path_column\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 409\u001b[0;31m \u001b[0;34m**\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstorage_options\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 410\u001b[0m )\n\u001b[1;32m 411\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/glade/work/abanihi/softwares/miniconda3/envs/analysis/lib/python3.7/site-packages/dask/bytes/core.py\u001b[0m in \u001b[0;36mread_bytes\u001b[0;34m(urlpath, delimiter, not_zero, blocksize, sample, compression, include_path, **kwargs)\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 97\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpaths\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 98\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mIOError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"%s resolved to no files\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0murlpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 99\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mblocksize\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mOSError\u001b[0m: ./CMIP6_filelist/*.txt resolved to no files"
]
}
],
"source": [
"df = dd.read_csv(f\"{persist_path}/*.txt\", header=None).compute()\n",
"df.columns = [\"path\"]\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def _reverse_filename_format(file_basename, filename_template=None, gridspec_template=None):\n",
"    \"\"\"\n",
"    Uses intake's ``reverse_format`` utility to reverse the string method format.\n",
"    Given format_string and resolved_string, find arguments\n",
"    that would give format_string.format(arguments) == resolved_string.\n",
"    Tries the time-ranged template first, then the gridspec (time-invariant)\n",
"    template; returns {} when neither matches.\n",
"    \"\"\"\n",
"    try:\n",
"        return reverse_format(filename_template, file_basename)\n",
"    except ValueError:\n",
"        try:\n",
"            return reverse_format(gridspec_template, file_basename)\n",
"        # A bare ``except:`` would also swallow KeyboardInterrupt/SystemExit;\n",
"        # Exception is broad enough for any parse failure here.\n",
"        except Exception:\n",
"            print(\n",
"                f'Failed to parse file: {file_basename} using patterns: {filename_template} and {gridspec_template}'\n",
"            )\n",
"            return {}\n",
"    \n",
"def _extract_attr_with_regex(input_str, regex, strip_chars=None):\n",
"    \"\"\"Return the longest case-insensitive match of ``regex`` in ``input_str``, or None.\"\"\"\n",
"    pattern = re.compile(regex, re.IGNORECASE)\n",
"    match = re.findall(pattern, input_str)\n",
"    if match:\n",
"        # Prefer the longest candidate, e.g. a full vYYYYMMDD over a short vN.\n",
"        match = max(match, key=len)\n",
"        if strip_chars:\n",
"            match = match.strip(strip_chars)\n",
"        else:\n",
"            match = match.strip()\n",
"        return match\n",
"    else:\n",
"        return None\n",
"\n",
"\n",
"# Skip the */files/* and */latest/* symlink trees: they duplicate the\n",
"# versioned paths and would create duplicate catalog entries.\n",
"exclude_patterns = ['*/files/*', '*/latest/*']\n",
"def _filter_func(path):\n",
"    \"\"\"True when ``path`` matches none of the exclude patterns.\"\"\"\n",
"    return not any(\n",
"        fnmatch.fnmatch(path, pat=exclude_pattern) for exclude_pattern in exclude_patterns\n",
"    )\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"files = df.path.tolist()\n",
"filelist = list(filter(_filter_func, files))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(filelist)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_attrs(filepath):\n",
"    \"\"\"Parse a CMIP6 DRS file path into a dict of catalog attributes (plus 'path').\"\"\"\n",
"    basename = os.path.basename(filepath)\n",
"    filename_template = '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}_{time_range}.nc'\n",
"\n",
"    # Time-invariant (gridspec/fx) files omit the trailing time_range segment.\n",
"    gridspec_template = (\n",
"        '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}.nc'\n",
"    )\n",
"    \n",
"    f = _reverse_filename_format(\n",
"        basename, filename_template=filename_template, gridspec_template=gridspec_template\n",
"    )\n",
"\n",
"    fileparts = {}\n",
"    fileparts.update(f)\n",
"    # Directory layout per DRS:\n",
"    # .../<activity_id>/<institution_id>/<source_id>/.../<variable_id>/<grid_label>/<version>\n",
"    parent = os.path.dirname(filepath).strip('/')\n",
"    parent_split = parent.split(f\"/{fileparts['source_id']}/\")\n",
"    part_1 = parent_split[0].strip('/').split('/')\n",
"    grid_label = parent.split(f\"/{fileparts['variable_id']}/\")[1].strip('/').split('/')[0]\n",
"    fileparts['grid_label'] = grid_label\n",
"    fileparts['activity_id'] = part_1[-2]\n",
"    fileparts['institution_id'] = part_1[-1]\n",
"    # Version directory is vYYYYMMDD (or a bare vN); default to 'v0' when absent.\n",
"    version_regex = r'v\\d{4}\\d{2}\\d{2}|v\\d{1}'\n",
"    version = _extract_attr_with_regex(parent, regex=version_regex) or 'v0'\n",
"    fileparts['version'] = version\n",
"    fileparts['path'] = filepath\n",
"    return fileparts "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"entries = list(map(get_attrs, filelist))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"entries[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(entries)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(entries)\n",
"df = df.drop_duplicates(subset=['path'], keep='last').reset_index(drop=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Separate Decadal Predictions from the rest of activities\n",
"\n",
"- Decadal prediction catalog requires additional columns (`start_year`)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dcpp = df[df.activity_id==\"DCPP\"].copy().reset_index(drop=True)\n",
"rest = df[~(df.activity_id==\"DCPP\")].copy().reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"columns = [\"activity_id\", \"institution_id\", \"source_id\", \"experiment_id\", \"member_id\", \"table_id\", \"variable_id\",\n",
" \"grid_label\", \"version\", \"time_range\", \"path\"]\n",
"rest = rest[columns]\n",
"rest.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rest.to_csv(\"/glade/collections/cmip/catalog/glade-cmip6.csv.gz\", compression=\"gzip\", index=False)\n",
"rest.to_csv(\"../catalogs/glade-cmip6.csv.gz\", compression=\"gzip\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dcpp.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# DCPP member_ids look like 's1960-r2i1p1f1' (see sub-experiment example above):\n",
"# split off the 's<year>' prefix into start_year and keep the bare ripf label.\n",
"dcpp[\"start_year\"] = dcpp.member_id.map(lambda x: x.split(\"-\")[0][1:])\n",
"dcpp[\"member_id\"] = dcpp[\"member_id\"].map(lambda x: x.split(\"-\")[-1])\n",
"dcpp.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"columns = [\"activity_id\", \"institution_id\", \"source_id\", \"experiment_id\", \"member_id\", \"start_year\", \"table_id\", \"variable_id\",\n",
" \"grid_label\", \"version\", \"time_range\", \"path\"]\n",
"dcpp = dcpp[columns]\n",
"dcpp.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dcpp.to_csv(\"/glade/collections/cmip/catalog/glade-cmip6-dcpp.csv.gz\", compression=\"gzip\", index=False)\n",
"dcpp.to_csv(\"../catalogs/glade-cmip6-dcpp.csv.gz\", compression=\"gzip\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"file_extension": ".py",
"kernelspec": {
"display_name": "Python [conda env:analysis]",
"language": "python",
"name": "conda-env-analysis-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": 3
},
"nbformat": 4,
"nbformat_minor": 4
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment