def get_file_list(persist_path, root="/glade/collections/cmip/cmip5/"):
    """Write one text file of netCDF paths per top-level directory of ``root``.

    For every directory directly under ``root``, ``find -L <dir> -name "*.nc"``
    is run and its standard output is saved to ``<persist_path>/<dir stem>.txt``.

    Parameters
    ----------
    persist_path : str or pathlib.Path
        Directory that receives the listings; created if it does not exist.
    root : str or pathlib.Path, optional
        Directory whose immediate sub-directories are scanned. Defaults to the
        CMIP5 collection path this notebook was written for.
    """
    root = Path(root)
    p_path = Path(persist_path)
    p_path.mkdir(parents=True, exist_ok=True)
    dirs = [x for x in root.iterdir() if x.is_dir()]
    for directory in tqdm(dirs):
        print(directory)
        listing = p_path / f"{directory.stem}.txt"
        cmd = ["find", "-L", directory.as_posix(), "-name", "*.nc"]
        # The context manager closes the listing even if `find` cannot be started.
        # stderr is discarded rather than piped: a pipe that nobody reads can fill
        # up (e.g. many "Permission denied" messages) and block `find` forever.
        with listing.open("w") as f:
            subprocess.run(cmd, stdout=f, stderr=subprocess.DEVNULL, check=False)
Controlled Vocabularies (CVs) for use in CMIP6:\n", + " https://github.com/WCRP-CMIP/CMIP6_CVs\n", + " \n", + " \n", + "Directory structure =\n", + "```<mip_era>/\n", + " <activity_id>/\n", + " <institution_id>/\n", + " <source_id>/\n", + " <experiment_id>/\n", + " <member_id>/\n", + " <table_id>/\n", + " <variable_id>/\n", + " <grid_label>/\n", + " <version>\n", + "```\n", + "file name =\n", + "```<variable_id>_<table_id>_<source_id>_<experiment_id>_<member_id>_<grid_label>[_<time_range>].nc```\n", + "For time-invariant fields, the last segment (time_range) above is omitted.\n", + "Example when there is no sub-experiment: `tas_Amon_GFDL-CM4_historical_r1i1p1f1_gn_196001-199912.nc`\n", + "Example with a sub-experiment: `pr_day_CNRM-CM6-1_dcppA-hindcast_s1960-r2i1p1f1_gn_198001-198412.nc`\n" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/glade/collections/cmip/cmip5/output1/CNRM-CERFACS\n", + "/glade/collections/cmip/cmip5/output1/UNSW\n", + "/glade/collections/cmip/cmip5/output1/BCC\n", + "/glade/collections/cmip/cmip5/output1/NCAR\n", + "/glade/collections/cmip/cmip5/output1/FIO\n", + "/glade/collections/cmip/cmip5/output1/MOHC\n", + "/glade/collections/cmip/cmip5/output1/CSIRO-BOM\n", + "/glade/collections/cmip/cmip5/output1/NOAA-GFDL\n", + "/glade/collections/cmip/cmip5/output1/NIMR-KMA\n", + "/glade/collections/cmip/cmip5/output1/NASA-GISS\n", + "/glade/collections/cmip/cmip5/output1/CSIRO-QCCCE\n", + "/glade/collections/cmip/cmip5/output1/CCCma\n", + "/glade/collections/cmip/cmip5/output1/INPE\n", + "/glade/collections/cmip/cmip5/output1/LASG-CESS\n", + "/glade/collections/cmip/cmip5/output1/INM\n", + "/glade/collections/cmip/cmip5/output1/ICHEC\n", + "/glade/collections/cmip/cmip5/output1/NSF-DOE-NCAR\n", + "/glade/collections/cmip/cmip5/output1/CMCC\n", + "/glade/collections/cmip/cmip5/output1/BNU\n", + "/glade/collections/cmip/cmip5/output1/LASG-IAP\n", + "/glade/collections/cmip/cmip5/output1/MRI\n", + "/glade/collections/cmip/cmip5/output1/MIROC\n", + "/glade/collections/cmip/cmip5/output1/IPSL\n", +
# Exploratory check: print every entry found directly under the CMIP5
# "output1" directory (the recorded output lists one directory per institute).
a = Path("/glade/collections/cmip/cmip5/output1/")
a  # bare expression that is not the cell's last statement, so nothing is displayed
for d in a.iterdir():
    print(d)

# Collect the stem of every *.txt listing found under persist_path (searched
# recursively). The recorded output is ['output1', 'output2', 'output'].
# NOTE(review): the name `activity_ids` looks carried over from the CMIP6
# builder; for CMIP5 these stems are top-level output directories — confirm
# the intended term before relying on the name.
activity_ids = list(Path(persist_path).rglob("*.txt"))
activity_ids = [activity_id.stem for activity_id in activity_ids]
activity_ids
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
path
0/glade/collections/cmip/cmip5/output/CCCma/Can...
1/glade/collections/cmip/cmip5/output/CCCma/Can...
2/glade/collections/cmip/cmip5/output/CCCma/Can...
3/glade/collections/cmip/cmip5/output/CCCma/Can...
4/glade/collections/cmip/cmip5/output/CCCma/Can...
\n", + "
def _reverse_filename_format(file_basename, filename_template=None, gridspec_template=None):
    """Recover the template fields of a file name with intake's ``reverse_format``.

    ``reverse_format(format_string, resolved_string)`` finds the arguments for
    which ``format_string.format(**arguments) == resolved_string``.

    Parameters
    ----------
    file_basename : str
        File name (no directory part) to parse.
    filename_template : str, optional
        Template tried first.
    gridspec_template : str, optional
        Fallback template, tried only when the first attempt raises ``ValueError``.

    Returns
    -------
    dict
        Parsed fields, or an empty dict when the fallback attempt fails too
        (a message naming the file and both templates is printed in that case).
    """
    try:
        return reverse_format(filename_template, file_basename)
    except ValueError:
        try:
            return reverse_format(gridspec_template, file_basename)
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
        # SystemExit still stop a long-running map over the whole file list.
        except Exception:
            print(
                f'Failed to parse file: {file_basename} using patterns: {filename_template} and {gridspec_template}'
            )
            return {}
return match\n", + "\n", + " else:\n", + " return None\n", + " \n", + "\n", + "exclude_patterns = ['*/files/*', '*/latest/*']\n", + "def _filter_func(path):\n", + " return not any(\n", + " fnmatch.fnmatch(path, pat=exclude_pattern) for exclude_pattern in exclude_patterns\n", + " )\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 3.68 s, sys: 94 ms, total: 3.77 s\n", + "Wall time: 3.76 s\n" + ] + } + ], + "source": [ + "%%time\n", + "files = df.path.tolist()\n", + "filelist = list(filter(_filter_func, files))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "629942" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(filelist)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# def get_attrs(filepath):\n", + "# basename = os.path.basename(filepath)\n", + "# dirname = os.path.dirname(filepath)\n", + "# filename_template = '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}_{time_range}.nc'\n", + "\n", + "# gridspec_template = (\n", + "# '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}.nc'\n", + "# )\n", + " \n", + "# f = _reverse_filename_format(\n", + "# basename, filename_template=filename_template, gridspec_template=gridspec_template\n", + "# )\n", + "\n", + "# fileparts = {}\n", + "# fileparts.update(f)\n", + "# parent = os.path.dirname(filepath).strip('/')\n", + "# parent_split = parent.split(f\"/{fileparts['source_id']}/\")\n", + "# part_1 = parent_split[0].strip('/').split('/')\n", + "# grid_label = parent.split(f\"/{fileparts['variable_id']}/\")[1].strip('/').split('/')[0]\n", + "# fileparts['grid_label'] = grid_label\n", + "# fileparts['activity_id'] = 
def get_attrs(filepath):
    """Extract catalog attributes of a file using information from the CMIP5 DRS.

    Parameters
    ----------
    filepath : str
        Full path of a netCDF file.

    Returns
    -------
    dict
        Always contains ``path``, ``frequency``, ``modeling_realm`` and
        ``version``. ``frequency`` and ``modeling_realm`` are ``None`` when no
        matching directory is found; ``version`` falls back to ``'v0'``.
        The fields parsed from the file name (``variable``, ``mip_table``,
        ``model``, ``experiment``, ``ensemble_member`` and, when present,
        ``temporal_subset``) are added only when the name matches one of the
        two templates below.

    Notes
    -----
    Reference:
    - CMIP5 DRS: https://pcmdi.llnl.gov/mips/cmip5/docs/cmip5_data_reference_syntax.pdf?id=27
    """

    fileparts = {}

    freq_regex = r'/3hr/|/6hr/|/day/|/fx/|/mon/|/monClim/|/subhr/|/yr/'
    # Each realm is anchored to a whole path component, like the frequencies
    # above. Unanchored, 'land' matched the start of 'landIce' (alternatives are
    # tried left to right), and realm names embedded in other components
    # (e.g. an experiment called 'sstClimAerosol') could win on length.
    realm_regex = r'/aerosol/|/atmos/|/land/|/landIce/|/ocean/|/ocnBgchem/|/seaIce/'
    version_regex = r'v\d{4}\d{2}\d{2}|v\d{1}'

    file_basename = os.path.basename(filepath)
    fileparts['path'] = filepath

    filename_template = (
        '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}_{temporal_subset}.nc'
    )
    # Files without a time range carry no trailing temporal-subset segment.
    gridspec_template = '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}.nc'
    f = _reverse_filename_format(
        file_basename, filename_template=filename_template, gridspec_template=gridspec_template
    )
    fileparts.update(f)

    frequency = _extract_attr_with_regex(
        filepath, regex=freq_regex, strip_chars='/'
    )
    realm = _extract_attr_with_regex(filepath, regex=realm_regex, strip_chars='/')
    version = _extract_attr_with_regex(filepath, regex=version_regex) or 'v0'
    fileparts['frequency'] = frequency
    fileparts['modeling_realm'] = realm
    fileparts['version'] = version

    return fileparts
'/glade/collections/cmip/cmip5/output/CCCma/CanCM4/historical/mon/atmos/r10i1p1/v20130331/hfls/hfls_Amon_CanCM4_historical_r10i1p1_196101-200512.nc',\n", + " 'variable': 'hfls',\n", + " 'mip_table': 'Amon',\n", + " 'model': 'CanCM4',\n", + " 'experiment': 'historical',\n", + " 'ensemble_member': 'r10i1p1',\n", + " 'temporal_subset': '196101-200512',\n", + " 'frequency': 'mon',\n", + " 'modeling_realm': 'atmos',\n", + " 'version': 'v20130331'}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_attrs(filelist[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 28.6 s, sys: 639 ms, total: 29.2 s\n", + "Wall time: 29.3 s\n" + ] + } + ], + "source": [ + "%%time\n", + "entries = list(map(get_attrs, filelist))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'path': '/glade/collections/cmip/cmip5/output/CCCma/CanCM4/historical/mon/atmos/r9i1p1/v20130331/tas/tas_Amon_CanCM4_historical_r9i1p1_196101-200512.nc',\n", + " 'variable': 'tas',\n", + " 'mip_table': 'Amon',\n", + " 'model': 'CanCM4',\n", + " 'experiment': 'historical',\n", + " 'ensemble_member': 'r9i1p1',\n", + " 'temporal_subset': '196101-200512',\n", + " 'frequency': 'mon',\n", + " 'modeling_realm': 'atmos',\n", + " 'version': 'v20130331'}" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "entries[10]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "629942" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(entries)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pathvariablemip_tablemodelexperimentensemble_membertemporal_subsetfrequencymodeling_realmversion
0/glade/collections/cmip/cmip5/output/CCCma/Can...hflsAmonCanCM4historicalr10i1p1196101-200512monatmosv20130331
1/glade/collections/cmip/cmip5/output/CCCma/Can...tasAmonCanCM4historicalr10i1p1196101-200512monatmosv20130331
2/glade/collections/cmip/cmip5/output/CCCma/Can...prAmonCanCM4historicalr10i1p1196101-200512monatmosv20130331
3/glade/collections/cmip/cmip5/output/CCCma/Can...hflsAmonCanCM4historicalr1i1p1196101-200512monatmosv20130331
4/glade/collections/cmip/cmip5/output/CCCma/Can...tasAmonCanCM4historicalr1i1p1196101-200512monatmosv20130331
\n", + "
" + ], + "text/plain": [ + " path variable mip_table \\\n", + "0 /glade/collections/cmip/cmip5/output/CCCma/Can... hfls Amon \n", + "1 /glade/collections/cmip/cmip5/output/CCCma/Can... tas Amon \n", + "2 /glade/collections/cmip/cmip5/output/CCCma/Can... pr Amon \n", + "3 /glade/collections/cmip/cmip5/output/CCCma/Can... hfls Amon \n", + "4 /glade/collections/cmip/cmip5/output/CCCma/Can... tas Amon \n", + "\n", + " model experiment ensemble_member temporal_subset frequency \\\n", + "0 CanCM4 historical r10i1p1 196101-200512 mon \n", + "1 CanCM4 historical r10i1p1 196101-200512 mon \n", + "2 CanCM4 historical r10i1p1 196101-200512 mon \n", + "3 CanCM4 historical r1i1p1 196101-200512 mon \n", + "4 CanCM4 historical r1i1p1 196101-200512 mon \n", + "\n", + " modeling_realm version \n", + "0 atmos v20130331 \n", + "1 atmos v20130331 \n", + "2 atmos v20130331 \n", + "3 atmos v20130331 \n", + "4 atmos v20130331 " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame(entries)\n", + "df = df.drop_duplicates(subset=['path'], keep='last').reset_index(drop=True)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "629942" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'DataFrame' object has no attribute 'activity_id'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Some entries are 
invalid\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0minvalids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m~\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mactivity_id\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mactivity_ids\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;31m#df = df[df.activity_id.isin(activity_ids)]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/glade/work/abanihi/softwares/miniconda3/envs/analysis/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 5177\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5178\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5179\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5180\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5181\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m 
# The cells that used to follow were copied from the CMIP6 builder: they read
# `activity_id` / `member_id` columns that get_attrs never creates (the recorded
# output shows the resulting AttributeError) and wrote to the CMIP6 catalog files.
# They are replaced by an export of the columns this notebook actually builds.

# Rows whose file name matched neither template have no file-name attributes:
# get_attrs only fills path / frequency / modeling_realm / version for them.
invalids = df[df["variable"].isna()]
invalids

# Keep only the rows that were parsed successfully.
catalog = df[df["variable"].notna()].reset_index(drop=True)
len(catalog)

# Fix the column order of the catalog.
columns = ["model", "experiment", "frequency", "modeling_realm", "mip_table",
           "ensemble_member", "variable", "temporal_subset", "version", "path"]
catalog = catalog[columns]
catalog.head()

# NOTE(review): the file name mirrors the CMIP6 catalog's naming scheme —
# confirm the intended CMIP5 catalog path before running.
catalog.to_csv("../catalogs/glade-cmip5.csv.gz", compression="gzip", index=False)
"display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/glade/collections/cmip/CMIP6/HighResMIP\n", - "/glade/collections/cmip/CMIP6/ScenarioMIP\n", - "/glade/collections/cmip/CMIP6/AerChemMIP\n", - "/glade/collections/cmip/CMIP6/OMIP\n", - "/glade/collections/cmip/CMIP6/C4MIP\n", - "/glade/collections/cmip/CMIP6/DCPP\n", - "/glade/collections/cmip/CMIP6/CMIP\n", - "/glade/collections/cmip/CMIP6/CFMIP\n", - "/glade/collections/cmip/CMIP6/LUMIP\n", - "/glade/collections/cmip/CMIP6/PAMIP\n", - "/glade/collections/cmip/CMIP6/DAMIP\n", - "/glade/collections/cmip/CMIP6/LS3MIP\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "persist_path = \"./CMIP6_filelist\"\n", - "get_file_list(persist_path)" + "#get_file_list(persist_path)" ] }, { @@ -126,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -141,12 +107,13 @@ " 'HighResMIP',\n", " 'DCPP',\n", " 'AerChemMIP',\n", + " 'PMIP',\n", " 'DAMIP',\n", " 'PAMIP',\n", " 'ScenarioMIP']" ] }, - "execution_count": 4, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -159,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -220,7 +187,7 @@ "4 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B..." 
] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -238,21 +205,22 @@ "\n", "- 10/13/2019: 1,027,617 \n", "- 10/15/2019: 1,113,227\n", - "- 10/16/2019: 1,129,214" + "- 10/16/2019: 1,129,214\n", + "- 10/16/2019: 1,138,743" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "1129214" + "1138743" ] }, - "execution_count": 6, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -263,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -318,15 +286,15 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 4.16 s, sys: 38.2 ms, total: 4.2 s\n", - "Wall time: 4.2 s\n" + "CPU times: user 3.72 s, sys: 0 ns, total: 3.72 s\n", + "Wall time: 3.71 s\n" ] } ], @@ -338,16 +306,16 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "711508" + "722392" ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -358,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -393,15 +361,15 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 18.5 s, sys: 816 ms, total: 19.3 s\n", - "Wall time: 19.3 s\n" + "CPU times: user 18 s, sys: 851 ms, total: 18.8 s\n", + "Wall time: 18.8 s\n" ] } ], @@ -412,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -431,7 +399,7 @@ " 'path': 
'/glade/collections/cmip/CMIP6/AerChemMIP/BCC/BCC-ESM1/ssp370/r2i1p1f1/day/pr/gn/v20190702/pr/pr_day_BCC-ESM1_ssp370_r2i1p1f1_gn_20150101-20551231.nc'}" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -442,16 +410,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "711508" + "722392" ] }, - "execution_count": 14, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -462,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -597,7 +565,7 @@ "4 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... " ] }, - "execution_count": 15, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -609,16 +577,16 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "711508" + "722392" ] }, - "execution_count": 17, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -629,7 +597,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -668,7 +636,77 @@ " \n", " \n", " \n", - " 582271\n", + " 583241\n", + " basin\n", + " Ofx\n", + " abrupt-4xCO2\n", + " NorESM2-LM\n", + " r1i1p1f1\n", + " gn\n", + " NaN\n", + " NCC\n", + " NorESM2-LM\n", + " v20190815\n", + " /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...\n", + " \n", + " \n", + " 583243\n", + " deptho\n", + " Ofx\n", + " abrupt-4xCO2\n", + " NorESM2-LM\n", + " r1i1p1f1\n", + " gn\n", + " NaN\n", + " NCC\n", + " NorESM2-LM\n", + " v20190815\n", + " /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...\n", + " \n", + " \n", + " 583244\n", + " volcello\n", + " Ofx\n", + " abrupt-4xCO2\n", + " NorESM2-LM\n", + " r1i1p1f1\n", + " gr\n", + " NaN\n", + " NCC\n", + " NorESM2-LM\n", + " v20190815\n", + " 
/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...\n", + " \n", + " \n", + " 583531\n", + " deptho\n", + " Ofx\n", + " historical\n", + " NorESM2-LM\n", + " r1i1p1f1\n", + " gn\n", + " NaN\n", + " NCC\n", + " NorESM2-LM\n", + " v20190815\n", + " /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...\n", + " \n", + " \n", + " 584584\n", + " sftof\n", + " Ofx\n", + " piControl\n", + " NorESM2-LM\n", + " r1i1p1f1\n", + " gn\n", + " NaN\n", + " NCC\n", + " NorESM2-LM\n", + " v20190815\n", + " /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...\n", + " \n", + " \n", + " 584587\n", " deptho\n", " Ofx\n", " piControl\n", @@ -682,7 +720,63 @@ " /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...\n", " \n", " \n", - " 583805\n", + " 586036\n", + " sftof\n", + " Ofx\n", + " 1pctCO2\n", + " NorESM2-LM\n", + " r1i1p1f1\n", + " gn\n", + " NaN\n", + " NCC\n", + " NorESM2-LM\n", + " v20190815\n", + " /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...\n", + " \n", + " \n", + " 586039\n", + " basin\n", + " Ofx\n", + " 1pctCO2\n", + " NorESM2-LM\n", + " r1i1p1f1\n", + " gn\n", + " NaN\n", + " NCC\n", + " NorESM2-LM\n", + " v20190815\n", + " /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...\n", + " \n", + " \n", + " 586041\n", + " deptho\n", + " Ofx\n", + " 1pctCO2\n", + " NorESM2-LM\n", + " r1i1p1f1\n", + " gn\n", + " NaN\n", + " NCC\n", + " NorESM2-LM\n", + " v20190815\n", + " /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...\n", + " \n", + " \n", + " 586042\n", + " volcello\n", + " Ofx\n", + " 1pctCO2\n", + " NorESM2-LM\n", + " r1i1p1f1\n", + " gr\n", + " NaN\n", + " NCC\n", + " NorESM2-LM\n", + " v20190815\n", + " /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...\n", + " \n", + " \n", + " 586189\n", " thetao\n", " Omon\n", " PCMDI-test-1-0\n", @@ -695,25 +789,120 @@ " v20190926\n", " /glade/collections/cmip/CMIP6/CMIP/FIO-QLNM/FI...\n", " \n", + " \n", + " 588176\n", + " sftof\n", + " Ofx\n", + " hist-GHG\n", + " NorESM2-LM\n", + " r1i1p1f1\n", + " gn\n", + " NaN\n", + " NCC\n", + " 
NorESM2-LM\n", + " v20190815\n", + " /glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM...\n", + " \n", + " \n", + " 588177\n", + " areacello\n", + " Ofx\n", + " hist-GHG\n", + " NorESM2-LM\n", + " r1i1p1f1\n", + " gn\n", + " NaN\n", + " NCC\n", + " NorESM2-LM\n", + " v20190815\n", + " /glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM...\n", + " \n", + " \n", + " 588179\n", + " deptho\n", + " Ofx\n", + " hist-GHG\n", + " NorESM2-LM\n", + " r1i1p1f1\n", + " gn\n", + " NaN\n", + " NCC\n", + " NorESM2-LM\n", + " v20190815\n", + " /glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM...\n", + " \n", + " \n", + " 588181\n", + " volcello\n", + " Ofx\n", + " hist-GHG\n", + " NorESM2-LM\n", + " r1i1p1f1\n", + " gr\n", + " NaN\n", + " NCC\n", + " NorESM2-LM\n", + " v20190815\n", + " /glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM...\n", + " \n", " \n", "\n", "" ], "text/plain": [ " variable_id table_id source_id experiment_id member_id \\\n", - "582271 deptho Ofx piControl NorESM2-LM r1i1p1f1 \n", - "583805 thetao Omon PCMDI-test-1-0 piControl-withism r3i1p1f1 \n", + "583241 basin Ofx abrupt-4xCO2 NorESM2-LM r1i1p1f1 \n", + "583243 deptho Ofx abrupt-4xCO2 NorESM2-LM r1i1p1f1 \n", + "583244 volcello Ofx abrupt-4xCO2 NorESM2-LM r1i1p1f1 \n", + "583531 deptho Ofx historical NorESM2-LM r1i1p1f1 \n", + "584584 sftof Ofx piControl NorESM2-LM r1i1p1f1 \n", + "584587 deptho Ofx piControl NorESM2-LM r1i1p1f1 \n", + "586036 sftof Ofx 1pctCO2 NorESM2-LM r1i1p1f1 \n", + "586039 basin Ofx 1pctCO2 NorESM2-LM r1i1p1f1 \n", + "586041 deptho Ofx 1pctCO2 NorESM2-LM r1i1p1f1 \n", + "586042 volcello Ofx 1pctCO2 NorESM2-LM r1i1p1f1 \n", + "586189 thetao Omon PCMDI-test-1-0 piControl-withism r3i1p1f1 \n", + "588176 sftof Ofx hist-GHG NorESM2-LM r1i1p1f1 \n", + "588177 areacello Ofx hist-GHG NorESM2-LM r1i1p1f1 \n", + "588179 deptho Ofx hist-GHG NorESM2-LM r1i1p1f1 \n", + "588181 volcello Ofx hist-GHG NorESM2-LM r1i1p1f1 \n", "\n", " grid_label time_range activity_id institution_id version \\\n", - "582271 gn 
NaN NCC NorESM2-LM v20190815 \n", - "583805 gn 016201-016201 v20190926 thetao v20190926 \n", + "583241 gn NaN NCC NorESM2-LM v20190815 \n", + "583243 gn NaN NCC NorESM2-LM v20190815 \n", + "583244 gr NaN NCC NorESM2-LM v20190815 \n", + "583531 gn NaN NCC NorESM2-LM v20190815 \n", + "584584 gn NaN NCC NorESM2-LM v20190815 \n", + "584587 gn NaN NCC NorESM2-LM v20190815 \n", + "586036 gn NaN NCC NorESM2-LM v20190815 \n", + "586039 gn NaN NCC NorESM2-LM v20190815 \n", + "586041 gn NaN NCC NorESM2-LM v20190815 \n", + "586042 gr NaN NCC NorESM2-LM v20190815 \n", + "586189 gn 016201-016201 v20190926 thetao v20190926 \n", + "588176 gn NaN NCC NorESM2-LM v20190815 \n", + "588177 gn NaN NCC NorESM2-LM v20190815 \n", + "588179 gn NaN NCC NorESM2-LM v20190815 \n", + "588181 gr NaN NCC NorESM2-LM v20190815 \n", "\n", " path \n", - "582271 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", - "583805 /glade/collections/cmip/CMIP6/CMIP/FIO-QLNM/FI... " + "583241 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", + "583243 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", + "583244 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", + "583531 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", + "584584 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", + "584587 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", + "586036 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", + "586039 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", + "586041 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", + "586042 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", + "586189 /glade/collections/cmip/CMIP6/CMIP/FIO-QLNM/FI... \n", + "588176 /glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM... \n", + "588177 /glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM... \n", + "588179 /glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM... \n", + "588181 /glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM... 
" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -727,34 +916,43 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/piControl/r1i1p1f1/Ofx/deptho/gn/v20190815/deptho/deptho_Ofx_piControl_NorESM2-LM_r1i1p1f1_gn.nc'" + "['/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/abrupt-4xCO2/r1i1p1f1/Ofx/basin/gn/v20190815/basin/basin_Ofx_abrupt-4xCO2_NorESM2-LM_r1i1p1f1_gn.nc',\n", + " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/abrupt-4xCO2/r1i1p1f1/Ofx/deptho/gn/v20190815/deptho/deptho_Ofx_abrupt-4xCO2_NorESM2-LM_r1i1p1f1_gn.nc',\n", + " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/abrupt-4xCO2/r1i1p1f1/Ofx/volcello/gr/v20190815/volcello/volcello_Ofx_abrupt-4xCO2_NorESM2-LM_r1i1p1f1_gr.nc',\n", + " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/historical/r1i1p1f1/Ofx/deptho/gn/v20190815/deptho/deptho_Ofx_historical_NorESM2-LM_r1i1p1f1_gn.nc',\n", + " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/piControl/r1i1p1f1/Ofx/sftof/gn/v20190815/sftof/sftof_Ofx_piControl_NorESM2-LM_r1i1p1f1_gn.nc',\n", + " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/piControl/r1i1p1f1/Ofx/deptho/gn/v20190815/deptho/deptho_Ofx_piControl_NorESM2-LM_r1i1p1f1_gn.nc',\n", + " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/1pctCO2/r1i1p1f1/Ofx/sftof/gn/v20190815/sftof/sftof_Ofx_1pctCO2_NorESM2-LM_r1i1p1f1_gn.nc',\n", + " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/1pctCO2/r1i1p1f1/Ofx/basin/gn/v20190815/basin/basin_Ofx_1pctCO2_NorESM2-LM_r1i1p1f1_gn.nc',\n", + " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/1pctCO2/r1i1p1f1/Ofx/deptho/gn/v20190815/deptho/deptho_Ofx_1pctCO2_NorESM2-LM_r1i1p1f1_gn.nc',\n", + " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/1pctCO2/r1i1p1f1/Ofx/volcello/gr/v20190815/volcello/volcello_Ofx_1pctCO2_NorESM2-LM_r1i1p1f1_gr.nc',\n", 
+ " '/glade/collections/cmip/CMIP6/CMIP/FIO-QLNM/FIO-ESM-2-0/piControl/r1i1p1f1/Omon/thetao/gn/v20190926/thetao/thetao_Omon_PCMDI-test-1-0_piControl-withism_r3i1p1f1_gn_016201-016201.nc',\n", + " '/glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM2-LM/hist-GHG/r1i1p1f1/Ofx/sftof/gn/v20190815/sftof/sftof_Ofx_hist-GHG_NorESM2-LM_r1i1p1f1_gn.nc',\n", + " '/glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM2-LM/hist-GHG/r1i1p1f1/Ofx/areacello/gn/v20190815/areacello/areacello_Ofx_hist-GHG_NorESM2-LM_r1i1p1f1_gn.nc',\n", + " '/glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM2-LM/hist-GHG/r1i1p1f1/Ofx/deptho/gn/v20190815/deptho/deptho_Ofx_hist-GHG_NorESM2-LM_r1i1p1f1_gn.nc',\n", + " '/glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM2-LM/hist-GHG/r1i1p1f1/Ofx/volcello/gr/v20190815/volcello/volcello_Ofx_hist-GHG_NorESM2-LM_r1i1p1f1_gr.nc']" ] }, - "execution_count": 21, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "invalids.iloc[0].path\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ + "invalids.path.tolist()\n", + "\n", "## Keep latest version" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -764,15 +962,15 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3min 59s, sys: 1.85 s, total: 4min\n", - "Wall time: 3min 59s\n" + "CPU times: user 3min 36s, sys: 2.86 s, total: 3min 39s\n", + "Wall time: 3min 37s\n" ] } ], @@ -786,16 +984,16 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "23587" + "23653" ] }, - "execution_count": 24, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -806,16 +1004,16 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "metadata": {}, "outputs": 
[ { "data": { "text/plain": [ - "711506" + "722377" ] }, - "execution_count": 25, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -826,16 +1024,16 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "687919" + "698724" ] }, - "execution_count": 26, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -847,7 +1045,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -988,7 +1186,7 @@ "4 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... NaN " ] }, - "execution_count": 27, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1001,7 +1199,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1142,7 +1340,7 @@ "4 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... " ] }, - "execution_count": 28, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1156,7 +1354,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ diff --git a/catalogs/glade-cmip6.csv.gz b/catalogs/glade-cmip6.csv.gz index 67027e8de4e40fa3bbfd5ccec8f810a1eb4cb569..7fa1ba90dec970671f3401816296ba5a4d473bc0 100644 Binary files a/catalogs/glade-cmip6.csv.gz and b/catalogs/glade-cmip6.csv.gz differ