{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import fnmatch\n", "import dask.dataframe as dd\n", "from intake.source.utils import reverse_format\n", "import os\n", "import re\n", "import subprocess\n", "from tqdm.auto import tqdm\n", "from pathlib import Path\n", "import shutil\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create text file containing all files available" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def get_file_list(persist_path):\n", " persist_path = Path(persist_path)\n", " persist_path.mkdir(exist_ok=True)\n", " root = Path(\"/glade/collections/cmip/CMIP6\")\n", " dirs = [x for x in root.iterdir() if x.is_dir()]\n", " for directory in tqdm(dirs):\n", " print(directory)\n", " stem = directory.stem\n", " f = open(f\"{persist_path}/{stem}.txt\", \"w\")\n", " cmd = [\"find\", \"-L\", directory.as_posix(), \"-name\", \"*.nc\"]\n", " p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=f)\n", " p.wait()\n", " " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "persist_path = \"./CMIP6_filelist\"\n", "#get_file_list(persist_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "## Extract attributes of a file using information from CMI6 DRS.\n", "\n", "\n", "References\n", " 1. CMIP6 DRS: http://goo.gl/v1drZl\n", " 2. Controlled Vocabularies (CVs) for use in CMIP6:\n", " https://github.com/WCRP-CMIP/CMIP6_CVs\n", " \n", " \n", "Directory structure =\n", "```/\n", " /\n", " /\n", " /\n", " /\n", " /\n", " /\n", " /\n", " /\n", " \n", "```\n", "file name =\n", "```_____[_].nc```\n", "For time-invariant fields, the last segment (time_range) above is omitted.\n", "Example when there is no sub-experiment: `tas_Amon_GFDL-CM4_historical_r1i1p1f1_gn_196001-199912.nc`\n", "Example with a sub-experiment: `pr_day_CNRM-CM6-1_dcppA-hindcast_s1960-r2i1p1f1_gn_198001-198412.nc`\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['CFMIP',\n", " 'CMIP',\n", " 'LUMIP',\n", " 'C4MIP',\n", " 'LS3MIP',\n", " 'OMIP',\n", " 'HighResMIP',\n", " 'DCPP',\n", " 'AerChemMIP',\n", " 'PMIP',\n", " 'DAMIP',\n", " 'PAMIP',\n", " 'ScenarioMIP']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "activity_ids = list(Path(persist_path).rglob(\"*.txt\"))\n", "activity_ids = [activity_id.stem for activity_id in activity_ids]\n", "activity_ids" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
path
0/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
1/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
2/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
3/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
4/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
\n", "
" ], "text/plain": [ " path\n", "0 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...\n", "1 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...\n", "2 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...\n", "3 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...\n", "4 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B..." ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = dd.read_csv(f\"{persist_path}/*.txt\", header=None).compute()\n", "df.columns = [\"path\"]\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "- 10/13/2019: 1,027,617 \n", "- 10/15/2019: 1,113,227\n", "- 10/16/2019: 1,129,214\n", "- 10/16/2019: 1,138,743" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1138743" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(df)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def _reverse_filename_format(file_basename, filename_template=None, gridspec_template=None):\n", " \"\"\"\n", " Uses intake's ``reverse_format`` utility to reverse the string method format.\n", " Given format_string and resolved_string, find arguments\n", " that would give format_string.format(arguments) == resolved_string\n", " \"\"\"\n", " try:\n", " return reverse_format(filename_template, file_basename)\n", " except ValueError:\n", " try:\n", " return reverse_format(gridspec_template, file_basename)\n", " except:\n", " print(\n", " f'Failed to parse file: {file_basename} using patterns: {filename_template} and {gridspec_template}'\n", " )\n", " return {}\n", " \n", "def _extract_attr_with_regex(input_str, regex, strip_chars=None):\n", " pattern = re.compile(regex, re.IGNORECASE)\n", " match = re.findall(pattern, input_str)\n", " if match:\n", " match = max(match, key=len)\n", " if strip_chars:\n", " match = match.strip(strip_chars)\n", "\n", " else:\n", " match = match.strip()\n", "\n", " return match\n", "\n", " else:\n", " return None\n", " \n", "\n", "exclude_patterns = ['*/files/*', '*/latest/*']\n", "def _filter_func(path):\n", " return not any(\n", " fnmatch.fnmatch(path, pat=exclude_pattern) for exclude_pattern in exclude_patterns\n", " )\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.72 s, sys: 0 ns, total: 3.72 s\n", "Wall time: 3.71 s\n" ] } ], "source": [ "%%time\n", "files = df.path.tolist()\n", "filelist = list(filter(_filter_func, files))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "722392" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(filelist)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def get_attrs(filepath):\n", " basename = os.path.basename(filepath)\n", " dirname = os.path.dirname(filepath)\n", " filename_template = '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}_{time_range}.nc'\n", "\n", " gridspec_template = (\n", " '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}.nc'\n", " )\n", " \n", " f = _reverse_filename_format(\n", " basename, filename_template=filename_template, gridspec_template=gridspec_template\n", " )\n", "\n", " fileparts = {}\n", " fileparts.update(f)\n", " parent = os.path.dirname(filepath).strip('/')\n", " parent_split = parent.split(f\"/{fileparts['source_id']}/\")\n", " part_1 = parent_split[0].strip('/').split('/')\n", " grid_label = parent.split(f\"/{fileparts['variable_id']}/\")[1].strip('/').split('/')[0]\n", " fileparts['grid_label'] = grid_label\n", " fileparts['activity_id'] = part_1[-2]\n", " fileparts['institution_id'] = part_1[-1]\n", " version_regex = r'v\\d{4}\\d{2}\\d{2}|v\\d{1}'\n", " version = _extract_attr_with_regex(parent, regex=version_regex) or 'v0'\n", " fileparts['version'] = version\n", " fileparts['path'] = filepath\n", " return fileparts " ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 18 s, sys: 851 ms, total: 18.8 s\n", "Wall time: 18.8 s\n" ] } ], "source": [ "%%time\n", "entries = list(map(get_attrs, filelist))" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'variable_id': 'pr',\n", " 'table_id': 'day',\n", " 'source_id': 'BCC-ESM1',\n", " 'experiment_id': 'ssp370',\n", " 'member_id': 'r2i1p1f1',\n", " 'grid_label': 'gn',\n", " 'time_range': '20150101-20551231',\n", " 'activity_id': 'AerChemMIP',\n", " 'institution_id': 'BCC',\n", " 'version': 'v20190702',\n", " 'path': '/glade/collections/cmip/CMIP6/AerChemMIP/BCC/BCC-ESM1/ssp370/r2i1p1f1/day/pr/gn/v20190702/pr/pr_day_BCC-ESM1_ssp370_r2i1p1f1_gn_20150101-20551231.nc'}" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "entries[0]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "722392" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(entries)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
variable_idtable_idsource_idexperiment_idmember_idgrid_labeltime_rangeactivity_idinstitution_idversionpath
0prdayBCC-ESM1ssp370r2i1p1f1gn20150101-20551231AerChemMIPBCCv20190702/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
1hflsAmonBCC-ESM1ssp370r2i1p1f1gn201501-205512AerChemMIPBCCv20190624/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
2prsnAmonBCC-ESM1ssp370r2i1p1f1gn201501-205512AerChemMIPBCCv20190624/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
3vaAmonBCC-ESM1ssp370r2i1p1f1gn201501-205512AerChemMIPBCCv20190624/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
4tasAmonBCC-ESM1ssp370r2i1p1f1gn201501-205512AerChemMIPBCCv20190624/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
\n", "
" ], "text/plain": [ " variable_id table_id source_id experiment_id member_id grid_label \\\n", "0 pr day BCC-ESM1 ssp370 r2i1p1f1 gn \n", "1 hfls Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n", "2 prsn Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n", "3 va Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n", "4 tas Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n", "\n", " time_range activity_id institution_id version \\\n", "0 20150101-20551231 AerChemMIP BCC v20190702 \n", "1 201501-205512 AerChemMIP BCC v20190624 \n", "2 201501-205512 AerChemMIP BCC v20190624 \n", "3 201501-205512 AerChemMIP BCC v20190624 \n", "4 201501-205512 AerChemMIP BCC v20190624 \n", "\n", " path \n", "0 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... \n", "1 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... \n", "2 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... \n", "3 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... \n", "4 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 = pd.DataFrame(entries)\n", "df1.head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "722392" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(df1)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
variable_idtable_idsource_idexperiment_idmember_idgrid_labeltime_rangeactivity_idinstitution_idversionpath
583241basinOfxabrupt-4xCO2NorESM2-LMr1i1p1f1gnNaNNCCNorESM2-LMv20190815/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...
583243depthoOfxabrupt-4xCO2NorESM2-LMr1i1p1f1gnNaNNCCNorESM2-LMv20190815/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...
583244volcelloOfxabrupt-4xCO2NorESM2-LMr1i1p1f1grNaNNCCNorESM2-LMv20190815/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...
583531depthoOfxhistoricalNorESM2-LMr1i1p1f1gnNaNNCCNorESM2-LMv20190815/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...
584584sftofOfxpiControlNorESM2-LMr1i1p1f1gnNaNNCCNorESM2-LMv20190815/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...
584587depthoOfxpiControlNorESM2-LMr1i1p1f1gnNaNNCCNorESM2-LMv20190815/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...
586036sftofOfx1pctCO2NorESM2-LMr1i1p1f1gnNaNNCCNorESM2-LMv20190815/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...
586039basinOfx1pctCO2NorESM2-LMr1i1p1f1gnNaNNCCNorESM2-LMv20190815/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...
586041depthoOfx1pctCO2NorESM2-LMr1i1p1f1gnNaNNCCNorESM2-LMv20190815/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...
586042volcelloOfx1pctCO2NorESM2-LMr1i1p1f1grNaNNCCNorESM2-LMv20190815/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...
586189thetaoOmonPCMDI-test-1-0piControl-withismr3i1p1f1gn016201-016201v20190926thetaov20190926/glade/collections/cmip/CMIP6/CMIP/FIO-QLNM/FI...
588176sftofOfxhist-GHGNorESM2-LMr1i1p1f1gnNaNNCCNorESM2-LMv20190815/glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM...
588177areacelloOfxhist-GHGNorESM2-LMr1i1p1f1gnNaNNCCNorESM2-LMv20190815/glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM...
588179depthoOfxhist-GHGNorESM2-LMr1i1p1f1gnNaNNCCNorESM2-LMv20190815/glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM...
588181volcelloOfxhist-GHGNorESM2-LMr1i1p1f1grNaNNCCNorESM2-LMv20190815/glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM...
\n", "
" ], "text/plain": [ " variable_id table_id source_id experiment_id member_id \\\n", "583241 basin Ofx abrupt-4xCO2 NorESM2-LM r1i1p1f1 \n", "583243 deptho Ofx abrupt-4xCO2 NorESM2-LM r1i1p1f1 \n", "583244 volcello Ofx abrupt-4xCO2 NorESM2-LM r1i1p1f1 \n", "583531 deptho Ofx historical NorESM2-LM r1i1p1f1 \n", "584584 sftof Ofx piControl NorESM2-LM r1i1p1f1 \n", "584587 deptho Ofx piControl NorESM2-LM r1i1p1f1 \n", "586036 sftof Ofx 1pctCO2 NorESM2-LM r1i1p1f1 \n", "586039 basin Ofx 1pctCO2 NorESM2-LM r1i1p1f1 \n", "586041 deptho Ofx 1pctCO2 NorESM2-LM r1i1p1f1 \n", "586042 volcello Ofx 1pctCO2 NorESM2-LM r1i1p1f1 \n", "586189 thetao Omon PCMDI-test-1-0 piControl-withism r3i1p1f1 \n", "588176 sftof Ofx hist-GHG NorESM2-LM r1i1p1f1 \n", "588177 areacello Ofx hist-GHG NorESM2-LM r1i1p1f1 \n", "588179 deptho Ofx hist-GHG NorESM2-LM r1i1p1f1 \n", "588181 volcello Ofx hist-GHG NorESM2-LM r1i1p1f1 \n", "\n", " grid_label time_range activity_id institution_id version \\\n", "583241 gn NaN NCC NorESM2-LM v20190815 \n", "583243 gn NaN NCC NorESM2-LM v20190815 \n", "583244 gr NaN NCC NorESM2-LM v20190815 \n", "583531 gn NaN NCC NorESM2-LM v20190815 \n", "584584 gn NaN NCC NorESM2-LM v20190815 \n", "584587 gn NaN NCC NorESM2-LM v20190815 \n", "586036 gn NaN NCC NorESM2-LM v20190815 \n", "586039 gn NaN NCC NorESM2-LM v20190815 \n", "586041 gn NaN NCC NorESM2-LM v20190815 \n", "586042 gr NaN NCC NorESM2-LM v20190815 \n", "586189 gn 016201-016201 v20190926 thetao v20190926 \n", "588176 gn NaN NCC NorESM2-LM v20190815 \n", "588177 gn NaN NCC NorESM2-LM v20190815 \n", "588179 gn NaN NCC NorESM2-LM v20190815 \n", "588181 gr NaN NCC NorESM2-LM v20190815 \n", "\n", " path \n", "583241 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", "583243 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", "583244 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", "583531 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", "584584 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", "584587 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", "586036 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", "586039 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", "586041 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", "586042 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n", "586189 /glade/collections/cmip/CMIP6/CMIP/FIO-QLNM/FI... \n", "588176 /glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM... \n", "588177 /glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM... \n", "588179 /glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM... \n", "588181 /glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM... " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Some entries are invalid\n", "invalids = df1[~df1.activity_id.isin(activity_ids)]\n", "df = df1[df1.activity_id.isin(activity_ids)]\n", "invalids" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/abrupt-4xCO2/r1i1p1f1/Ofx/basin/gn/v20190815/basin/basin_Ofx_abrupt-4xCO2_NorESM2-LM_r1i1p1f1_gn.nc',\n", " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/abrupt-4xCO2/r1i1p1f1/Ofx/deptho/gn/v20190815/deptho/deptho_Ofx_abrupt-4xCO2_NorESM2-LM_r1i1p1f1_gn.nc',\n", " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/abrupt-4xCO2/r1i1p1f1/Ofx/volcello/gr/v20190815/volcello/volcello_Ofx_abrupt-4xCO2_NorESM2-LM_r1i1p1f1_gr.nc',\n", " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/historical/r1i1p1f1/Ofx/deptho/gn/v20190815/deptho/deptho_Ofx_historical_NorESM2-LM_r1i1p1f1_gn.nc',\n", " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/piControl/r1i1p1f1/Ofx/sftof/gn/v20190815/sftof/sftof_Ofx_piControl_NorESM2-LM_r1i1p1f1_gn.nc',\n", " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/piControl/r1i1p1f1/Ofx/deptho/gn/v20190815/deptho/deptho_Ofx_piControl_NorESM2-LM_r1i1p1f1_gn.nc',\n", " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/1pctCO2/r1i1p1f1/Ofx/sftof/gn/v20190815/sftof/sftof_Ofx_1pctCO2_NorESM2-LM_r1i1p1f1_gn.nc',\n", " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/1pctCO2/r1i1p1f1/Ofx/basin/gn/v20190815/basin/basin_Ofx_1pctCO2_NorESM2-LM_r1i1p1f1_gn.nc',\n", " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/1pctCO2/r1i1p1f1/Ofx/deptho/gn/v20190815/deptho/deptho_Ofx_1pctCO2_NorESM2-LM_r1i1p1f1_gn.nc',\n", " '/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/1pctCO2/r1i1p1f1/Ofx/volcello/gr/v20190815/volcello/volcello_Ofx_1pctCO2_NorESM2-LM_r1i1p1f1_gr.nc',\n", " '/glade/collections/cmip/CMIP6/CMIP/FIO-QLNM/FIO-ESM-2-0/piControl/r1i1p1f1/Omon/thetao/gn/v20190926/thetao/thetao_Omon_PCMDI-test-1-0_piControl-withism_r3i1p1f1_gn_016201-016201.nc',\n", " '/glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM2-LM/hist-GHG/r1i1p1f1/Ofx/sftof/gn/v20190815/sftof/sftof_Ofx_hist-GHG_NorESM2-LM_r1i1p1f1_gn.nc',\n", " '/glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM2-LM/hist-GHG/r1i1p1f1/Ofx/areacello/gn/v20190815/areacello/areacello_Ofx_hist-GHG_NorESM2-LM_r1i1p1f1_gn.nc',\n", " '/glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM2-LM/hist-GHG/r1i1p1f1/Ofx/deptho/gn/v20190815/deptho/deptho_Ofx_hist-GHG_NorESM2-LM_r1i1p1f1_gn.nc',\n", " '/glade/collections/cmip/CMIP6/DAMIP/NCC/NorESM2-LM/hist-GHG/r1i1p1f1/Ofx/volcello/gr/v20190815/volcello/volcello_Ofx_hist-GHG_NorESM2-LM_r1i1p1f1_gr.nc']" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "invalids.path.tolist()\n", "\n", "## Keep latest version" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "grpby = list(set(df.columns.tolist()) - {'path', 'version'})\n", "groups = df.groupby(grpby)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3min 36s, sys: 2.86 s, total: 3min 39s\n", "Wall time: 3min 37s\n" ] } ], "source": [ "%%time\n", "idx_to_remove = []\n", "for _, group in groups:\n", " if group.version.nunique() > 1:\n", " idx_to_remove.extend(group.sort_values(by=['version'], ascending=False).index[1:].values.tolist())" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "23653" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(idx_to_remove)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "722377" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(df)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "698724" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = df.drop(index=idx_to_remove)\n", "len(df)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
variable_idtable_idsource_idexperiment_idmember_idgrid_labeltime_rangeactivity_idinstitution_idversionpathdcpp_init_year
0prdayBCC-ESM1ssp370r2i1p1f1gn20150101-20551231AerChemMIPBCCv20190702/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...NaN
1hflsAmonBCC-ESM1ssp370r2i1p1f1gn201501-205512AerChemMIPBCCv20190624/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...NaN
2prsnAmonBCC-ESM1ssp370r2i1p1f1gn201501-205512AerChemMIPBCCv20190624/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...NaN
3vaAmonBCC-ESM1ssp370r2i1p1f1gn201501-205512AerChemMIPBCCv20190624/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...NaN
4tasAmonBCC-ESM1ssp370r2i1p1f1gn201501-205512AerChemMIPBCCv20190624/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...NaN
\n", "
" ], "text/plain": [ " variable_id table_id source_id experiment_id member_id grid_label \\\n", "0 pr day BCC-ESM1 ssp370 r2i1p1f1 gn \n", "1 hfls Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n", "2 prsn Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n", "3 va Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n", "4 tas Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n", "\n", " time_range activity_id institution_id version \\\n", "0 20150101-20551231 AerChemMIP BCC v20190702 \n", "1 201501-205512 AerChemMIP BCC v20190624 \n", "2 201501-205512 AerChemMIP BCC v20190624 \n", "3 201501-205512 AerChemMIP BCC v20190624 \n", "4 201501-205512 AerChemMIP BCC v20190624 \n", "\n", " path dcpp_init_year \n", "0 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... NaN \n", "1 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... NaN \n", "2 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... NaN \n", "3 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... NaN \n", "4 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... NaN " ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"dcpp_init_year\"] = df.member_id.map(lambda x: float(x.split(\"-\")[0][1:] if x.startswith(\"s\") else np.nan))\n", "df[\"member_id\"] = df[\"member_id\"].map(lambda x: x.split(\"-\")[-1] if x.startswith(\"s\") else x)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
activity_idinstitution_idsource_idexperiment_idmember_idtable_idvariable_idgrid_labeldcpp_init_yearversiontime_rangepath
0AerChemMIPBCCBCC-ESM1ssp370r2i1p1f1dayprgnNaNv2019070220150101-20551231/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
1AerChemMIPBCCBCC-ESM1ssp370r2i1p1f1AmonhflsgnNaNv20190624201501-205512/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
2AerChemMIPBCCBCC-ESM1ssp370r2i1p1f1AmonprsngnNaNv20190624201501-205512/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
3AerChemMIPBCCBCC-ESM1ssp370r2i1p1f1AmonvagnNaNv20190624201501-205512/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
4AerChemMIPBCCBCC-ESM1ssp370r2i1p1f1AmontasgnNaNv20190624201501-205512/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
\n", "
" ], "text/plain": [ " activity_id institution_id source_id experiment_id member_id table_id \\\n", "0 AerChemMIP BCC BCC-ESM1 ssp370 r2i1p1f1 day \n", "1 AerChemMIP BCC BCC-ESM1 ssp370 r2i1p1f1 Amon \n", "2 AerChemMIP BCC BCC-ESM1 ssp370 r2i1p1f1 Amon \n", "3 AerChemMIP BCC BCC-ESM1 ssp370 r2i1p1f1 Amon \n", "4 AerChemMIP BCC BCC-ESM1 ssp370 r2i1p1f1 Amon \n", "\n", " variable_id grid_label dcpp_init_year version time_range \\\n", "0 pr gn NaN v20190702 20150101-20551231 \n", "1 hfls gn NaN v20190624 201501-205512 \n", "2 prsn gn NaN v20190624 201501-205512 \n", "3 va gn NaN v20190624 201501-205512 \n", "4 tas gn NaN v20190624 201501-205512 \n", "\n", " path \n", "0 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... \n", "1 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... \n", "2 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... \n", "3 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... \n", "4 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columns = [\"activity_id\", \"institution_id\", \"source_id\", \"experiment_id\", \"member_id\", \"table_id\", \"variable_id\",\n", " \"grid_label\", \"dcpp_init_year\", \"version\", \"time_range\", \"path\"]\n", "df = df[columns]\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "df.to_csv(\"../catalogs/glade-cmip6.csv.gz\", compression=\"gzip\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "file_extension": ".py", "kernelspec": { "display_name": "Python [conda env:analysis]", "language": "python", "name": "conda-env-analysis-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" }, "mimetype": "text/x-python", "name": "python", "npconvert_exporter": "python", "pygments_lexer": "ipython3", "version": 3 }, "nbformat": 4, "nbformat_minor": 4 }