{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import fnmatch\n", "import dask.dataframe as dd\n", "from intake.source.utils import reverse_format\n", "import os\n", "import re\n", "import subprocess\n", "from tqdm.auto import tqdm\n", "from pathlib import Path\n", "import shutil\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create text file containing all files available" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def get_file_list(persist_path):\n", " root = Path(\"/glade/collections/cmip/cmip5/\")\n", " p_path = Path(persist_path)\n", " p_path.mkdir(exist_ok=True)\n", " dirs = [x for x in root.iterdir() if x.is_dir()]\n", " for directory in tqdm(dirs):\n", " print(directory)\n", " stem = directory.stem\n", " f = open(f\"{persist_path}/{stem}.txt\", \"w\")\n", " cmd = [\"find\", \"-L\", directory.as_posix(), \"-name\", \"*.nc\"]\n", " p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=f)\n", " p.wait()\n", " " ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "persist_path = \"./CMIP5_filelist\"\n", "#get_file_list(persist_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "## Extract attributes of a file using information from CMI6 DRS.\n", "\n", "\n", "References\n", " 1. CMIP6 DRS: http://goo.gl/v1drZl\n", " 2. Controlled Vocabularies (CVs) for use in CMIP6:\n", " https://github.com/WCRP-CMIP/CMIP6_CVs\n", " \n", " \n", "Directory structure =\n", "```/\n", " /\n", " /\n", " /\n", " /\n", " /\n", " /\n", " /\n", " /\n", " \n", "```\n", "file name =\n", "```_____[_].nc```\n", "For time-invariant fields, the last segment (time_range) above is omitted.\n", "Example when there is no sub-experiment: `tas_Amon_GFDL-CM4_historical_r1i1p1f1_gn_196001-199912.nc`\n", "Example with a sub-experiment: `pr_day_CNRM-CM6-1_dcppA-hindcast_s1960-r2i1p1f1_gn_198001-198412.nc`\n" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/glade/collections/cmip/cmip5/output1/CNRM-CERFACS\n", "/glade/collections/cmip/cmip5/output1/UNSW\n", "/glade/collections/cmip/cmip5/output1/BCC\n", "/glade/collections/cmip/cmip5/output1/NCAR\n", "/glade/collections/cmip/cmip5/output1/FIO\n", "/glade/collections/cmip/cmip5/output1/MOHC\n", "/glade/collections/cmip/cmip5/output1/CSIRO-BOM\n", "/glade/collections/cmip/cmip5/output1/NOAA-GFDL\n", "/glade/collections/cmip/cmip5/output1/NIMR-KMA\n", "/glade/collections/cmip/cmip5/output1/NASA-GISS\n", "/glade/collections/cmip/cmip5/output1/CSIRO-QCCCE\n", "/glade/collections/cmip/cmip5/output1/CCCma\n", "/glade/collections/cmip/cmip5/output1/INPE\n", "/glade/collections/cmip/cmip5/output1/LASG-CESS\n", "/glade/collections/cmip/cmip5/output1/INM\n", "/glade/collections/cmip/cmip5/output1/ICHEC\n", "/glade/collections/cmip/cmip5/output1/NSF-DOE-NCAR\n", "/glade/collections/cmip/cmip5/output1/CMCC\n", "/glade/collections/cmip/cmip5/output1/BNU\n", "/glade/collections/cmip/cmip5/output1/LASG-IAP\n", "/glade/collections/cmip/cmip5/output1/MRI\n", "/glade/collections/cmip/cmip5/output1/MIROC\n", "/glade/collections/cmip/cmip5/output1/IPSL\n", "/glade/collections/cmip/cmip5/output1/NCC\n", "/glade/collections/cmip/cmip5/output1/MPI-M\n" ] } ], "source": [ "a = Path(\"/glade/collections/cmip/cmip5/output1/\")\n", "a\n", "for d in a.iterdir():\n", " print(d)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['output1', 'output2', 'output']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "activity_ids = list(Path(persist_path).rglob(\"*.txt\"))\n", "activity_ids = [activity_id.stem for activity_id in activity_ids]\n", "activity_ids" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
path
0/glade/collections/cmip/cmip5/output/CCCma/Can...
1/glade/collections/cmip/cmip5/output/CCCma/Can...
2/glade/collections/cmip/cmip5/output/CCCma/Can...
3/glade/collections/cmip/cmip5/output/CCCma/Can...
4/glade/collections/cmip/cmip5/output/CCCma/Can...
\n", "
" ], "text/plain": [ " path\n", "0 /glade/collections/cmip/cmip5/output/CCCma/Can...\n", "1 /glade/collections/cmip/cmip5/output/CCCma/Can...\n", "2 /glade/collections/cmip/cmip5/output/CCCma/Can...\n", "3 /glade/collections/cmip/cmip5/output/CCCma/Can...\n", "4 /glade/collections/cmip/cmip5/output/CCCma/Can..." ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = dd.read_csv(f\"{persist_path}/*.txt\", header=None).compute()\n", "df.columns = [\"path\"]\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "927318" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(df)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "def _reverse_filename_format(file_basename, filename_template=None, gridspec_template=None):\n", " \"\"\"\n", " Uses intake's ``reverse_format`` utility to reverse the string method format.\n", " Given format_string and resolved_string, find arguments\n", " that would give format_string.format(arguments) == resolved_string\n", " \"\"\"\n", " try:\n", " return reverse_format(filename_template, file_basename)\n", " except ValueError:\n", " try:\n", " return reverse_format(gridspec_template, file_basename)\n", " except:\n", " print(\n", " f'Failed to parse file: {file_basename} using patterns: {filename_template} and {gridspec_template}'\n", " )\n", " return {}\n", " \n", "def _extract_attr_with_regex(input_str, regex, strip_chars=None):\n", " pattern = re.compile(regex, re.IGNORECASE)\n", " match = re.findall(pattern, input_str)\n", " if match:\n", " match = max(match, key=len)\n", " if strip_chars:\n", " match = match.strip(strip_chars)\n", "\n", " else:\n", " match = match.strip()\n", "\n", " return match\n", "\n", " else:\n", " return None\n", " \n", "\n", "exclude_patterns = ['*/files/*', '*/latest/*']\n", "def _filter_func(path):\n", " return not any(\n", " fnmatch.fnmatch(path, pat=exclude_pattern) for exclude_pattern in exclude_patterns\n", " )\n", "\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.68 s, sys: 94 ms, total: 3.77 s\n", "Wall time: 3.76 s\n" ] } ], "source": [ "%%time\n", "files = df.path.tolist()\n", "filelist = list(filter(_filter_func, files))" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "629942" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(filelist)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# def get_attrs(filepath):\n", "# basename = os.path.basename(filepath)\n", "# dirname = os.path.dirname(filepath)\n", "# filename_template = '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}_{time_range}.nc'\n", "\n", "# gridspec_template = (\n", "# '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}.nc'\n", "# )\n", " \n", "# f = _reverse_filename_format(\n", "# basename, filename_template=filename_template, gridspec_template=gridspec_template\n", "# )\n", "\n", "# fileparts = {}\n", "# fileparts.update(f)\n", "# parent = os.path.dirname(filepath).strip('/')\n", "# parent_split = parent.split(f\"/{fileparts['source_id']}/\")\n", "# part_1 = parent_split[0].strip('/').split('/')\n", "# grid_label = parent.split(f\"/{fileparts['variable_id']}/\")[1].strip('/').split('/')[0]\n", "# fileparts['grid_label'] = grid_label\n", "# fileparts['activity_id'] = part_1[-2]\n", "# fileparts['institution_id'] = part_1[-1]\n", "# version_regex = r'v\\d{4}\\d{2}\\d{2}|v\\d{1}'\n", "# version = _extract_attr_with_regex(parent, regex=version_regex) or 'v0'\n", "# fileparts['version'] = version\n", "# fileparts['path'] = filepath\n", "# return fileparts \n", "\n", "def get_attrs(filepath):\n", " \"\"\" Extract attributes of a file using information from CMIP5 DRS.\n", " Notes\n", " -----\n", " Reference:\n", " - CMIP5 DRS: https://pcmdi.llnl.gov/mips/cmip5/docs/cmip5_data_reference_syntax.pdf?id=27\n", " \"\"\"\n", "\n", " fileparts = {}\n", "\n", " freq_regex = r'/3hr/|/6hr/|/day/|/fx/|/mon/|/monClim/|/subhr/|/yr/'\n", " realm_regex = r'aerosol|atmos|land|landIce|ocean|ocnBgchem|seaIce'\n", " version_regex = r'v\\d{4}\\d{2}\\d{2}|v\\d{1}'\n", "\n", " file_basename = os.path.basename(filepath)\n", " fileparts['path'] = filepath\n", "\n", " filename_template = (\n", " '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}_{temporal_subset}.nc'\n", " )\n", " gridspec_template = '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}.nc'\n", " f = _reverse_filename_format(\n", " file_basename, filename_template=filename_template, gridspec_template=gridspec_template\n", " )\n", " fileparts.update(f)\n", "\n", " frequency = _extract_attr_with_regex(\n", " filepath, regex=freq_regex, strip_chars='/'\n", " )\n", " realm = _extract_attr_with_regex(filepath, regex=realm_regex)\n", " version = _extract_attr_with_regex(filepath, regex=version_regex) or 'v0'\n", " fileparts['frequency'] = frequency\n", " fileparts['modeling_realm'] = realm\n", " fileparts['version'] = version\n", "\n", " return fileparts" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'path': '/glade/collections/cmip/cmip5/output/CCCma/CanCM4/historical/mon/atmos/r10i1p1/v20130331/hfls/hfls_Amon_CanCM4_historical_r10i1p1_196101-200512.nc',\n", " 'variable': 'hfls',\n", " 'mip_table': 'Amon',\n", " 'model': 'CanCM4',\n", " 'experiment': 'historical',\n", " 'ensemble_member': 'r10i1p1',\n", " 'temporal_subset': '196101-200512',\n", " 'frequency': 'mon',\n", " 'modeling_realm': 'atmos',\n", " 'version': 'v20130331'}" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_attrs(filelist[0])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 28.6 s, sys: 639 ms, total: 29.2 s\n", "Wall time: 29.3 s\n" ] } ], "source": [ "%%time\n", "entries = list(map(get_attrs, filelist))" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'path': '/glade/collections/cmip/cmip5/output/CCCma/CanCM4/historical/mon/atmos/r9i1p1/v20130331/tas/tas_Amon_CanCM4_historical_r9i1p1_196101-200512.nc',\n", " 'variable': 'tas',\n", " 'mip_table': 'Amon',\n", " 'model': 'CanCM4',\n", " 'experiment': 'historical',\n", " 'ensemble_member': 'r9i1p1',\n", " 'temporal_subset': '196101-200512',\n", " 'frequency': 'mon',\n", " 'modeling_realm': 'atmos',\n", " 'version': 'v20130331'}" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "entries[10]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "629942" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(entries)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pathvariablemip_tablemodelexperimentensemble_membertemporal_subsetfrequencymodeling_realmversion
0/glade/collections/cmip/cmip5/output/CCCma/Can...hflsAmonCanCM4historicalr10i1p1196101-200512monatmosv20130331
1/glade/collections/cmip/cmip5/output/CCCma/Can...tasAmonCanCM4historicalr10i1p1196101-200512monatmosv20130331
2/glade/collections/cmip/cmip5/output/CCCma/Can...prAmonCanCM4historicalr10i1p1196101-200512monatmosv20130331
3/glade/collections/cmip/cmip5/output/CCCma/Can...hflsAmonCanCM4historicalr1i1p1196101-200512monatmosv20130331
4/glade/collections/cmip/cmip5/output/CCCma/Can...tasAmonCanCM4historicalr1i1p1196101-200512monatmosv20130331
\n", "
" ], "text/plain": [ " path variable mip_table \\\n", "0 /glade/collections/cmip/cmip5/output/CCCma/Can... hfls Amon \n", "1 /glade/collections/cmip/cmip5/output/CCCma/Can... tas Amon \n", "2 /glade/collections/cmip/cmip5/output/CCCma/Can... pr Amon \n", "3 /glade/collections/cmip/cmip5/output/CCCma/Can... hfls Amon \n", "4 /glade/collections/cmip/cmip5/output/CCCma/Can... tas Amon \n", "\n", " model experiment ensemble_member temporal_subset frequency \\\n", "0 CanCM4 historical r10i1p1 196101-200512 mon \n", "1 CanCM4 historical r10i1p1 196101-200512 mon \n", "2 CanCM4 historical r10i1p1 196101-200512 mon \n", "3 CanCM4 historical r1i1p1 196101-200512 mon \n", "4 CanCM4 historical r1i1p1 196101-200512 mon \n", "\n", " modeling_realm version \n", "0 atmos v20130331 \n", "1 atmos v20130331 \n", "2 atmos v20130331 \n", "3 atmos v20130331 \n", "4 atmos v20130331 " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(entries)\n", "df = df.drop_duplicates(subset=['path'], keep='last').reset_index(drop=True)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "629942" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(df)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "ename": "AttributeError", "evalue": "'DataFrame' object has no attribute 'activity_id'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Some entries are invalid\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0minvalids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m~\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mactivity_id\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mactivity_ids\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;31m#df = df[df.activity_id.isin(activity_ids)]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/glade/work/abanihi/softwares/miniconda3/envs/analysis/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 5177\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5178\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5179\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5180\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5181\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'activity_id'" ] } ], "source": [ "# Some entries are invalid\n", "invalids = df[~df.activity_id.isin(activity_ids)]\n", "#df = df[df.activity_id.isin(activity_ids)]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'invalids' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0minvalids\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mNameError\u001b[0m: name 'invalids' is not defined" ] } ], "source": [ "invalids" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "609904" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Separate Decadal Predictions from the rest of activities\n", "\n", "- Decadal prediction catalog requires additional columns (`start_year`)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "dcpp = df[df.activity_id==\"DCPP\"].copy().reset_index(drop=True)\n", "rest = df[~(df.activity_id==\"DCPP\")].copy().reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
activity_idinstitution_idsource_idexperiment_idmember_idtable_idvariable_idgrid_labelversiontime_rangepath
0AerChemMIPBCCBCC-ESM1ssp370r2i1p1f1Amonhflsgnv20190624201501-205512/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
1AerChemMIPBCCBCC-ESM1ssp370r2i1p1f1Amonvagnv20190624201501-205512/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
2AerChemMIPBCCBCC-ESM1ssp370r2i1p1f1Amontasgnv20190624201501-205512/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
3AerChemMIPBCCBCC-ESM1ssp370r2i1p1f1Amonrsdsgnv20190624201501-205512/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
4AerChemMIPBCCBCC-ESM1ssp370r2i1p1f1Amonprgnv20190624201501-205512/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
\n", "
" ], "text/plain": [ " activity_id institution_id source_id experiment_id member_id table_id \\\n", "0 AerChemMIP BCC BCC-ESM1 ssp370 r2i1p1f1 Amon \n", "1 AerChemMIP BCC BCC-ESM1 ssp370 r2i1p1f1 Amon \n", "2 AerChemMIP BCC BCC-ESM1 ssp370 r2i1p1f1 Amon \n", "3 AerChemMIP BCC BCC-ESM1 ssp370 r2i1p1f1 Amon \n", "4 AerChemMIP BCC BCC-ESM1 ssp370 r2i1p1f1 Amon \n", "\n", " variable_id grid_label version time_range \\\n", "0 hfls gn v20190624 201501-205512 \n", "1 va gn v20190624 201501-205512 \n", "2 tas gn v20190624 201501-205512 \n", "3 rsds gn v20190624 201501-205512 \n", "4 pr gn v20190624 201501-205512 \n", "\n", " path \n", "0 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... \n", "1 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... \n", "2 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... \n", "3 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... \n", "4 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columns = [\"activity_id\", \"institution_id\", \"source_id\", \"experiment_id\", \"member_id\", \"table_id\", \"variable_id\",\n", " \"grid_label\", \"version\", \"time_range\", \"path\"]\n", "rest = rest[columns]\n", "rest.head()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "rest.to_csv(\"../catalogs/glade-cmip6.csv.gz\", compression=\"gzip\", index=False)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
variable_idtable_idsource_idexperiment_idmember_idgrid_labeltime_rangeactivity_idinstitution_idversionpath
0tasdayCanESM5dcppA-hindcasts2015-r7i1p2f1gn20160101-20251231DCPPCCCmav20190429/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
1tasdayCanESM5dcppA-hindcasts1977-r2i1p2f1gn19780101-19871231DCPPCCCmav20190429/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
2tasAmonCanESM5dcppA-hindcasts1977-r2i1p2f1gn197801-198712DCPPCCCmav20190429/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
3tasdayCanESM5dcppA-hindcasts1975-r8i1p2f1gn19760101-19851231DCPPCCCmav20190429/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
4tasAmonCanESM5dcppA-hindcasts1975-r8i1p2f1gn197601-198512DCPPCCCmav20190429/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
\n", "
" ], "text/plain": [ " variable_id table_id source_id experiment_id member_id grid_label \\\n", "0 tas day CanESM5 dcppA-hindcast s2015-r7i1p2f1 gn \n", "1 tas day CanESM5 dcppA-hindcast s1977-r2i1p2f1 gn \n", "2 tas Amon CanESM5 dcppA-hindcast s1977-r2i1p2f1 gn \n", "3 tas day CanESM5 dcppA-hindcast s1975-r8i1p2f1 gn \n", "4 tas Amon CanESM5 dcppA-hindcast s1975-r8i1p2f1 gn \n", "\n", " time_range activity_id institution_id version \\\n", "0 20160101-20251231 DCPP CCCma v20190429 \n", "1 19780101-19871231 DCPP CCCma v20190429 \n", "2 197801-198712 DCPP CCCma v20190429 \n", "3 19760101-19851231 DCPP CCCma v20190429 \n", "4 197601-198512 DCPP CCCma v20190429 \n", "\n", " path \n", "0 /glade/collections/cmip/CMIP6/DCPP/CCCma/CanES... \n", "1 /glade/collections/cmip/CMIP6/DCPP/CCCma/CanES... \n", "2 /glade/collections/cmip/CMIP6/DCPP/CCCma/CanES... \n", "3 /glade/collections/cmip/CMIP6/DCPP/CCCma/CanES... \n", "4 /glade/collections/cmip/CMIP6/DCPP/CCCma/CanES... " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dcpp.head()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
variable_idtable_idsource_idexperiment_idmember_idgrid_labeltime_rangeactivity_idinstitution_idversionpathstart_year
0tasdayCanESM5dcppA-hindcastr7i1p2f1gn20160101-20251231DCPPCCCmav20190429/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...2015.0
1tasdayCanESM5dcppA-hindcastr2i1p2f1gn19780101-19871231DCPPCCCmav20190429/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...1977.0
2tasAmonCanESM5dcppA-hindcastr2i1p2f1gn197801-198712DCPPCCCmav20190429/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...1977.0
3tasdayCanESM5dcppA-hindcastr8i1p2f1gn19760101-19851231DCPPCCCmav20190429/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...1975.0
4tasAmonCanESM5dcppA-hindcastr8i1p2f1gn197601-198512DCPPCCCmav20190429/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...1975.0
\n", "
" ], "text/plain": [ " variable_id table_id source_id experiment_id member_id grid_label \\\n", "0 tas day CanESM5 dcppA-hindcast r7i1p2f1 gn \n", "1 tas day CanESM5 dcppA-hindcast r2i1p2f1 gn \n", "2 tas Amon CanESM5 dcppA-hindcast r2i1p2f1 gn \n", "3 tas day CanESM5 dcppA-hindcast r8i1p2f1 gn \n", "4 tas Amon CanESM5 dcppA-hindcast r8i1p2f1 gn \n", "\n", " time_range activity_id institution_id version \\\n", "0 20160101-20251231 DCPP CCCma v20190429 \n", "1 19780101-19871231 DCPP CCCma v20190429 \n", "2 197801-198712 DCPP CCCma v20190429 \n", "3 19760101-19851231 DCPP CCCma v20190429 \n", "4 197601-198512 DCPP CCCma v20190429 \n", "\n", " path start_year \n", "0 /glade/collections/cmip/CMIP6/DCPP/CCCma/CanES... 2015.0 \n", "1 /glade/collections/cmip/CMIP6/DCPP/CCCma/CanES... 1977.0 \n", "2 /glade/collections/cmip/CMIP6/DCPP/CCCma/CanES... 1977.0 \n", "3 /glade/collections/cmip/CMIP6/DCPP/CCCma/CanES... 1975.0 \n", "4 /glade/collections/cmip/CMIP6/DCPP/CCCma/CanES... 1975.0 " ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Note: For 'dcppA-assim' experiment_id, there's no start year\n", "dcpp[\"start_year\"] = dcpp.member_id.map(lambda x: float(x.split(\"-\")[0][1:] if x.startswith(\"s\") else np.nan))\n", "dcpp[\"member_id\"] = dcpp[\"member_id\"].map(lambda x: x.split(\"-\")[-1] if x.startswith(\"s\") else x)\n", "dcpp.head()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
activity_idinstitution_idsource_idexperiment_idmember_idstart_yeartable_idvariable_idgrid_labelversiontime_rangepath
0DCPPCCCmaCanESM5dcppA-hindcastr7i1p2f12015.0daytasgnv2019042920160101-20251231/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
1DCPPCCCmaCanESM5dcppA-hindcastr2i1p2f11977.0daytasgnv2019042919780101-19871231/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
2DCPPCCCmaCanESM5dcppA-hindcastr2i1p2f11977.0Amontasgnv20190429197801-198712/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
3DCPPCCCmaCanESM5dcppA-hindcastr8i1p2f11975.0daytasgnv2019042919760101-19851231/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
4DCPPCCCmaCanESM5dcppA-hindcastr8i1p2f11975.0Amontasgnv20190429197601-198512/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
\n", "
" ], "text/plain": [ " activity_id institution_id source_id experiment_id member_id start_year \\\n", "0 DCPP CCCma CanESM5 dcppA-hindcast r7i1p2f1 2015.0 \n", "1 DCPP CCCma CanESM5 dcppA-hindcast r2i1p2f1 1977.0 \n", "2 DCPP CCCma CanESM5 dcppA-hindcast r2i1p2f1 1977.0 \n", "3 DCPP CCCma CanESM5 dcppA-hindcast r8i1p2f1 1975.0 \n", "4 DCPP CCCma CanESM5 dcppA-hindcast r8i1p2f1 1975.0 \n", "\n", " table_id variable_id grid_label version time_range \\\n", "0 day tas gn v20190429 20160101-20251231 \n", "1 day tas gn v20190429 19780101-19871231 \n", "2 Amon tas gn v20190429 197801-198712 \n", "3 day tas gn v20190429 19760101-19851231 \n", "4 Amon tas gn v20190429 197601-198512 \n", "\n", " path \n", "0 /glade/collections/cmip/CMIP6/DCPP/CCCma/CanES... \n", "1 /glade/collections/cmip/CMIP6/DCPP/CCCma/CanES... \n", "2 /glade/collections/cmip/CMIP6/DCPP/CCCma/CanES... \n", "3 /glade/collections/cmip/CMIP6/DCPP/CCCma/CanES... \n", "4 /glade/collections/cmip/CMIP6/DCPP/CCCma/CanES... " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columns = [\"activity_id\", \"institution_id\", \"source_id\", \"experiment_id\", \"member_id\", \"start_year\", \"table_id\", \"variable_id\",\n", " \"grid_label\", \"version\", \"time_range\", \"path\"]\n", "dcpp = dcpp[columns]\n", "dcpp.head()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "dcpp.to_csv(\"../catalogs/glade-cmip6-dcpp.csv.gz\", compression=\"gzip\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "file_extension": ".py", "kernelspec": { "display_name": "Python [conda env:analysis]", "language": "python", "name": "conda-env-analysis-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" }, "mimetype": "text/x-python", "name": "python", "npconvert_exporter": "python", "pygments_lexer": "ipython3", "version": 3 }, "nbformat": 4, "nbformat_minor": 4 }