Commit 9d8d141d authored by Anderson Banihirwe's avatar Anderson Banihirwe
Browse files

update glade catalog

parent 2f718c6e
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import fnmatch\n",
"import dask.dataframe as dd\n",
"from intake.source.utils import reverse_format\n",
"import os\n",
"import re\n",
"import subprocess\n",
"from tqdm.auto import tqdm\n",
"from pathlib import Path\n",
"import shutil\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create text file containing all files available"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def get_file_list(persist_path):\n",
" root = Path(\"/glade/collections/cmip/cmip5/\")\n",
" p_path = Path(persist_path)\n",
" p_path.mkdir(exist_ok=True)\n",
" dirs = [x for x in root.iterdir() if x.is_dir()]\n",
" for directory in tqdm(dirs):\n",
" print(directory)\n",
" stem = directory.stem\n",
" f = open(f\"{persist_path}/{stem}.txt\", \"w\")\n",
" cmd = [\"find\", \"-L\", directory.as_posix(), \"-name\", \"*.nc\"]\n",
" p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=f)\n",
" p.wait()\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"persist_path = \"./CMIP5_filelist\"\n",
"#get_file_list(persist_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## Extract attributes of a file using information from CMI6 DRS.\n",
"\n",
"\n",
"References\n",
" 1. CMIP6 DRS: http://goo.gl/v1drZl\n",
" 2. Controlled Vocabularies (CVs) for use in CMIP6:\n",
" https://github.com/WCRP-CMIP/CMIP6_CVs\n",
" \n",
" \n",
"Directory structure =\n",
"```<mip_era>/\n",
" <activity_id>/\n",
" <institution_id>/\n",
" <source_id>/\n",
" <experiment_id>/\n",
" <member_id>/\n",
" <table_id>/\n",
" <variable_id>/\n",
" <grid_label>/\n",
" <version>\n",
"```\n",
"file name =\n",
"```<variable_id>_<table_id>_<source_id>_<experiment_id >_<member_id>_<grid_label>[_<time_range>].nc```\n",
"For time-invariant fields, the last segment (time_range) above is omitted.\n",
"Example when there is no sub-experiment: `tas_Amon_GFDL-CM4_historical_r1i1p1f1_gn_196001-199912.nc`\n",
"Example with a sub-experiment: `pr_day_CNRM-CM6-1_dcppA-hindcast_s1960-r2i1p1f1_gn_198001-198412.nc`\n"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/glade/collections/cmip/cmip5/output1/CNRM-CERFACS\n",
"/glade/collections/cmip/cmip5/output1/UNSW\n",
"/glade/collections/cmip/cmip5/output1/BCC\n",
"/glade/collections/cmip/cmip5/output1/NCAR\n",
"/glade/collections/cmip/cmip5/output1/FIO\n",
"/glade/collections/cmip/cmip5/output1/MOHC\n",
"/glade/collections/cmip/cmip5/output1/CSIRO-BOM\n",
"/glade/collections/cmip/cmip5/output1/NOAA-GFDL\n",
"/glade/collections/cmip/cmip5/output1/NIMR-KMA\n",
"/glade/collections/cmip/cmip5/output1/NASA-GISS\n",
"/glade/collections/cmip/cmip5/output1/CSIRO-QCCCE\n",
"/glade/collections/cmip/cmip5/output1/CCCma\n",
"/glade/collections/cmip/cmip5/output1/INPE\n",
"/glade/collections/cmip/cmip5/output1/LASG-CESS\n",
"/glade/collections/cmip/cmip5/output1/INM\n",
"/glade/collections/cmip/cmip5/output1/ICHEC\n",
"/glade/collections/cmip/cmip5/output1/NSF-DOE-NCAR\n",
"/glade/collections/cmip/cmip5/output1/CMCC\n",
"/glade/collections/cmip/cmip5/output1/BNU\n",
"/glade/collections/cmip/cmip5/output1/LASG-IAP\n",
"/glade/collections/cmip/cmip5/output1/MRI\n",
"/glade/collections/cmip/cmip5/output1/MIROC\n",
"/glade/collections/cmip/cmip5/output1/IPSL\n",
"/glade/collections/cmip/cmip5/output1/NCC\n",
"/glade/collections/cmip/cmip5/output1/MPI-M\n"
]
}
],
"source": [
"a = Path(\"/glade/collections/cmip/cmip5/output1/\")\n",
"a\n",
"for d in a.iterdir():\n",
" print(d)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['output1', 'output2', 'output']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"activity_ids = list(Path(persist_path).rglob(\"*.txt\"))\n",
"activity_ids = [activity_id.stem for activity_id in activity_ids]\n",
"activity_ids"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>path</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>/glade/collections/cmip/cmip5/output/CCCma/Can...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>/glade/collections/cmip/cmip5/output/CCCma/Can...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>/glade/collections/cmip/cmip5/output/CCCma/Can...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>/glade/collections/cmip/cmip5/output/CCCma/Can...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>/glade/collections/cmip/cmip5/output/CCCma/Can...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" path\n",
"0 /glade/collections/cmip/cmip5/output/CCCma/Can...\n",
"1 /glade/collections/cmip/cmip5/output/CCCma/Can...\n",
"2 /glade/collections/cmip/cmip5/output/CCCma/Can...\n",
"3 /glade/collections/cmip/cmip5/output/CCCma/Can...\n",
"4 /glade/collections/cmip/cmip5/output/CCCma/Can..."
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = dd.read_csv(f\"{persist_path}/*.txt\", header=None).compute()\n",
"df.columns = [\"path\"]\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"927318"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(df)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def _reverse_filename_format(file_basename, filename_template=None, gridspec_template=None):\n",
" \"\"\"\n",
" Uses intake's ``reverse_format`` utility to reverse the string method format.\n",
" Given format_string and resolved_string, find arguments\n",
" that would give format_string.format(arguments) == resolved_string\n",
" \"\"\"\n",
" try:\n",
" return reverse_format(filename_template, file_basename)\n",
" except ValueError:\n",
" try:\n",
" return reverse_format(gridspec_template, file_basename)\n",
" except:\n",
" print(\n",
" f'Failed to parse file: {file_basename} using patterns: {filename_template} and {gridspec_template}'\n",
" )\n",
" return {}\n",
" \n",
"def _extract_attr_with_regex(input_str, regex, strip_chars=None):\n",
" pattern = re.compile(regex, re.IGNORECASE)\n",
" match = re.findall(pattern, input_str)\n",
" if match:\n",
" match = max(match, key=len)\n",
" if strip_chars:\n",
" match = match.strip(strip_chars)\n",
"\n",
" else:\n",
" match = match.strip()\n",
"\n",
" return match\n",
"\n",
" else:\n",
" return None\n",
" \n",
"\n",
"exclude_patterns = ['*/files/*', '*/latest/*']\n",
"def _filter_func(path):\n",
" return not any(\n",
" fnmatch.fnmatch(path, pat=exclude_pattern) for exclude_pattern in exclude_patterns\n",
" )\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.68 s, sys: 94 ms, total: 3.77 s\n",
"Wall time: 3.76 s\n"
]
}
],
"source": [
"%%time\n",
"files = df.path.tolist()\n",
"filelist = list(filter(_filter_func, files))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"629942"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(filelist)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# def get_attrs(filepath):\n",
"# basename = os.path.basename(filepath)\n",
"# dirname = os.path.dirname(filepath)\n",
"# filename_template = '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}_{time_range}.nc'\n",
"\n",
"# gridspec_template = (\n",
"# '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}.nc'\n",
"# )\n",
" \n",
"# f = _reverse_filename_format(\n",
"# basename, filename_template=filename_template, gridspec_template=gridspec_template\n",
"# )\n",
"\n",
"# fileparts = {}\n",
"# fileparts.update(f)\n",
"# parent = os.path.dirname(filepath).strip('/')\n",
"# parent_split = parent.split(f\"/{fileparts['source_id']}/\")\n",
"# part_1 = parent_split[0].strip('/').split('/')\n",
"# grid_label = parent.split(f\"/{fileparts['variable_id']}/\")[1].strip('/').split('/')[0]\n",
"# fileparts['grid_label'] = grid_label\n",
"# fileparts['activity_id'] = part_1[-2]\n",
"# fileparts['institution_id'] = part_1[-1]\n",
"# version_regex = r'v\\d{4}\\d{2}\\d{2}|v\\d{1}'\n",
"# version = _extract_attr_with_regex(parent, regex=version_regex) or 'v0'\n",
"# fileparts['version'] = version\n",
"# fileparts['path'] = filepath\n",
"# return fileparts \n",
"\n",
"def get_attrs(filepath):\n",
" \"\"\" Extract attributes of a file using information from CMIP5 DRS.\n",
" Notes\n",
" -----\n",
" Reference:\n",
" - CMIP5 DRS: https://pcmdi.llnl.gov/mips/cmip5/docs/cmip5_data_reference_syntax.pdf?id=27\n",
" \"\"\"\n",
"\n",
" fileparts = {}\n",
"\n",
" freq_regex = r'/3hr/|/6hr/|/day/|/fx/|/mon/|/monClim/|/subhr/|/yr/'\n",
" realm_regex = r'aerosol|atmos|land|landIce|ocean|ocnBgchem|seaIce'\n",
" version_regex = r'v\\d{4}\\d{2}\\d{2}|v\\d{1}'\n",
"\n",
" file_basename = os.path.basename(filepath)\n",
" fileparts['path'] = filepath\n",
"\n",
" filename_template = (\n",
" '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}_{temporal_subset}.nc'\n",
" )\n",
" gridspec_template = '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}.nc'\n",
" f = _reverse_filename_format(\n",
" file_basename, filename_template=filename_template, gridspec_template=gridspec_template\n",
" )\n",
" fileparts.update(f)\n",
"\n",
" frequency = _extract_attr_with_regex(\n",
" filepath, regex=freq_regex, strip_chars='/'\n",
" )\n",
" realm = _extract_attr_with_regex(filepath, regex=realm_regex)\n",
" version = _extract_attr_with_regex(filepath, regex=version_regex) or 'v0'\n",
" fileparts['frequency'] = frequency\n",
" fileparts['modeling_realm'] = realm\n",
" fileparts['version'] = version\n",
"\n",
" return fileparts"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'path': '/glade/collections/cmip/cmip5/output/CCCma/CanCM4/historical/mon/atmos/r10i1p1/v20130331/hfls/hfls_Amon_CanCM4_historical_r10i1p1_196101-200512.nc',\n",
" 'variable': 'hfls',\n",
" 'mip_table': 'Amon',\n",
" 'model': 'CanCM4',\n",
" 'experiment': 'historical',\n",
" 'ensemble_member': 'r10i1p1',\n",
" 'temporal_subset': '196101-200512',\n",
" 'frequency': 'mon',\n",
" 'modeling_realm': 'atmos',\n",
" 'version': 'v20130331'}"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_attrs(filelist[0])"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 28.6 s, sys: 639 ms, total: 29.2 s\n",
"Wall time: 29.3 s\n"
]
}
],
"source": [
"%%time\n",
"entries = list(map(get_attrs, filelist))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'path': '/glade/collections/cmip/cmip5/output/CCCma/CanCM4/historical/mon/atmos/r9i1p1/v20130331/tas/tas_Amon_CanCM4_historical_r9i1p1_196101-200512.nc',\n",
" 'variable': 'tas',\n",
" 'mip_table': 'Amon',\n",
" 'model': 'CanCM4',\n",
" 'experiment': 'historical',\n",
" 'ensemble_member': 'r9i1p1',\n",
" 'temporal_subset': '196101-200512',\n",
" 'frequency': 'mon',\n",
" 'modeling_realm': 'atmos',\n",
" 'version': 'v20130331'}"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"entries[10]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"629942"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(entries)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>path</th>\n",
" <th>variable</th>\n",
" <th>mip_table</th>\n",
" <th>model</th>\n",
" <th>experiment</th>\n",
" <th>ensemble_member</th>\n",
" <th>temporal_subset</th>\n",
" <th>frequency</th>\n",
" <th>modeling_realm</th>\n",
" <th>version</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>/glade/collections/cmip/cmip5/output/CCCma/Can...</td>\n",
" <td>hfls</td>\n",
" <td>Amon</td>\n",
" <td>CanCM4</td>\n",
" <td>historical</td>\n",
" <td>r10i1p1</td>\n",
" <td>196101-200512</td>\n",
" <td>mon</td>\n",
" <td>atmos</td>\n",
" <td>v20130331</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>/glade/collections/cmip/cmip5/output/CCCma/Can...</td>\n",
" <td>tas</td>\n",
" <td>Amon</td>\n",
" <td>CanCM4</td>\n",
" <td>historical</td>\n",
" <td>r10i1p1</td>\n",
" <td>196101-200512</td>\n",
" <td>mon</td>\n",
" <td>atmos</td>\n",
" <td>v20130331</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>/glade/collections/cmip/cmip5/output/CCCma/Can...</td>\n",
" <td>pr</td>\n",
" <td>Amon</td>\n",
" <td>CanCM4</td>\n",
" <td>historical</td>\n",
" <td>r10i1p1</td>\n",
" <td>196101-200512</td>\n",
" <td>mon</td>\n",
" <td>atmos</td>\n",
" <td>v20130331</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>/glade/collections/cmip/cmip5/output/CCCma/Can...</td>\n",
" <td>hfls</td>\n",
" <td>Amon</td>\n",
" <td>CanCM4</td>\n",
" <td>historical</td>\n",
" <td>r1i1p1</td>\n",
" <td>196101-200512</td>\n",
" <td>mon</td>\n",
" <td>atmos</td>\n",
" <td>v20130331</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>/glade/collections/cmip/cmip5/output/CCCma/Can...</td>\n",
" <td>tas</td>\n",
" <td>Amon</td>\n",
" <td>CanCM4</td>\n",
" <td>historical</td>\n",
" <td>r1i1p1</td>\n",
" <td>196101-200512</td>\n",
" <td>mon</td>\n",
" <td>atmos</td>\n",
" <td>v20130331</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" path variable mip_table \\\n",
"0 /glade/collections/cmip/cmip5/output/CCCma/Can... hfls Amon \n",
"1 /glade/collections/cmip/cmip5/output/CCCma/Can... tas Amon \n",
"2 /glade/collections/cmip/cmip5/output/CCCma/Can... pr Amon \n",
"3 /glade/collections/cmip/cmip5/output/CCCma/Can... hfls Amon \n",
"4 /glade/collections/cmip/cmip5/output/CCCma/Can... tas Amon \n",
"\n",
" model experiment ensemble_member temporal_subset frequency \\\n",
"0 CanCM4 historical r10i1p1 196101-200512 mon \n",
"1 CanCM4 historical r10i1p1 196101-200512 mon \n",
"2 CanCM4 historical r10i1p1 196101-200512 mon \n",
"3 CanCM4 historical r1i1p1 196101-200512 mon \n",
"4 CanCM4 historical r1i1p1 196101-200512 mon \n",
"\n",
" modeling_realm version \n",
"0 atmos v20130331 \n",
"1 atmos v20130331 \n",
"2 atmos v20130331 \n",
"3 atmos v20130331 \n",
"4 atmos v20130331 "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(entries)\n",