Commit 7926f9b6 authored by Anderson Banihirwe
Browse files

Update CMIP6 catalog

parent 0071072a
......@@ -33,6 +33,8 @@
"outputs": [],
"source": [
"def get_file_list(persist_path):\n",
" persist_path = Path(persist_path)\n",
" persist_path.mkdir(exist_ok=True)\n",
" root = Path(\"/glade/collections/cmip/CMIP6\")\n",
" dirs = [x for x in root.iterdir() if x.is_dir()]\n",
" for directory in tqdm(dirs):\n",
......@@ -49,10 +51,44 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "92e78f720bfa4eef97649b014a4f91d8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=12), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"/glade/collections/cmip/CMIP6/HighResMIP\n",
"/glade/collections/cmip/CMIP6/ScenarioMIP\n",
"/glade/collections/cmip/CMIP6/AerChemMIP\n",
"/glade/collections/cmip/CMIP6/OMIP\n",
"/glade/collections/cmip/CMIP6/C4MIP\n",
"/glade/collections/cmip/CMIP6/DCPP\n",
"/glade/collections/cmip/CMIP6/CMIP\n",
"/glade/collections/cmip/CMIP6/CFMIP\n",
"/glade/collections/cmip/CMIP6/LUMIP\n",
"/glade/collections/cmip/CMIP6/PAMIP\n",
"/glade/collections/cmip/CMIP6/DAMIP\n",
"/glade/collections/cmip/CMIP6/LS3MIP\n",
"\n"
]
}
],
"source": [
"persist_path = \"./CMIP6_filelist\"\n",
"#get_file_list(persist_path)"
"get_file_list(persist_path)"
]
},
{
......@@ -99,11 +135,13 @@
"['CFMIP',\n",
" 'CMIP',\n",
" 'LUMIP',\n",
" 'C4MIP',\n",
" 'LS3MIP',\n",
" 'OMIP',\n",
" 'HighResMIP',\n",
" 'DCPP',\n",
" 'AerChemMIP',\n",
" 'DAMIP',\n",
" 'PAMIP',\n",
" 'ScenarioMIP']"
]
......@@ -193,6 +231,16 @@
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"- 10/13/2019: 1,027,617 \n",
"- 10/15/2019: 1,113,227\n",
"- 10/16/2019: 1,129,214"
]
},
{
"cell_type": "code",
"execution_count": 6,
......@@ -201,7 +249,7 @@
{
"data": {
"text/plain": [
"1027617"
"1129214"
]
},
"execution_count": 6,
......@@ -263,15 +311,22 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4.24 s, sys: 19.3 ms, total: 4.26 s\n",
"Wall time: 4.26 s\n"
"CPU times: user 4.16 s, sys: 38.2 ms, total: 4.2 s\n",
"Wall time: 4.2 s\n"
]
}
],
......@@ -283,16 +338,16 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"609911"
"711508"
]
},
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
......@@ -303,7 +358,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
......@@ -338,15 +393,15 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 18.5 s, sys: 528 ms, total: 19.1 s\n",
"Wall time: 19.1 s\n"
"CPU times: user 18.5 s, sys: 816 ms, total: 19.3 s\n",
"Wall time: 19.3 s\n"
]
}
],
......@@ -357,26 +412,26 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'variable_id': 'hfls',\n",
" 'table_id': 'Amon',\n",
"{'variable_id': 'pr',\n",
" 'table_id': 'day',\n",
" 'source_id': 'BCC-ESM1',\n",
" 'experiment_id': 'ssp370',\n",
" 'member_id': 'r2i1p1f1',\n",
" 'grid_label': 'gn',\n",
" 'time_range': '201501-205512',\n",
" 'time_range': '20150101-20551231',\n",
" 'activity_id': 'AerChemMIP',\n",
" 'institution_id': 'BCC',\n",
" 'version': 'v20190624',\n",
" 'path': '/glade/collections/cmip/CMIP6/AerChemMIP/BCC/BCC-ESM1/ssp370/r2i1p1f1/Amon/hfls/gn/v20190624/hfls/hfls_Amon_BCC-ESM1_ssp370_r2i1p1f1_gn_201501-205512.nc'}"
" 'version': 'v20190702',\n",
" 'path': '/glade/collections/cmip/CMIP6/AerChemMIP/BCC/BCC-ESM1/ssp370/r2i1p1f1/day/pr/gn/v20190702/pr/pr_day_BCC-ESM1_ssp370_r2i1p1f1_gn_20150101-20551231.nc'}"
]
},
"execution_count": 12,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
......@@ -387,16 +442,16 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"609911"
"711508"
]
},
"execution_count": 21,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
......@@ -407,7 +462,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 15,
"metadata": {},
"outputs": [
{
......@@ -447,21 +502,21 @@
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>hfls</td>\n",
" <td>Amon</td>\n",
" <td>pr</td>\n",
" <td>day</td>\n",
" <td>BCC-ESM1</td>\n",
" <td>ssp370</td>\n",
" <td>r2i1p1f1</td>\n",
" <td>gn</td>\n",
" <td>201501-205512</td>\n",
" <td>20150101-20551231</td>\n",
" <td>AerChemMIP</td>\n",
" <td>BCC</td>\n",
" <td>v20190624</td>\n",
" <td>v20190702</td>\n",
" <td>/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>va</td>\n",
" <td>hfls</td>\n",
" <td>Amon</td>\n",
" <td>BCC-ESM1</td>\n",
" <td>ssp370</td>\n",
......@@ -475,7 +530,7 @@
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>tas</td>\n",
" <td>prsn</td>\n",
" <td>Amon</td>\n",
" <td>BCC-ESM1</td>\n",
" <td>ssp370</td>\n",
......@@ -489,7 +544,7 @@
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>rsds</td>\n",
" <td>va</td>\n",
" <td>Amon</td>\n",
" <td>BCC-ESM1</td>\n",
" <td>ssp370</td>\n",
......@@ -503,7 +558,7 @@
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>pr</td>\n",
" <td>tas</td>\n",
" <td>Amon</td>\n",
" <td>BCC-ESM1</td>\n",
" <td>ssp370</td>\n",
......@@ -521,18 +576,18 @@
],
"text/plain": [
" variable_id table_id source_id experiment_id member_id grid_label \\\n",
"0 hfls Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n",
"1 va Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n",
"2 tas Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n",
"3 rsds Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n",
"4 pr Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n",
"0 pr day BCC-ESM1 ssp370 r2i1p1f1 gn \n",
"1 hfls Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n",
"2 prsn Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n",
"3 va Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n",
"4 tas Amon BCC-ESM1 ssp370 r2i1p1f1 gn \n",
"\n",
" time_range activity_id institution_id version \\\n",
"0 201501-205512 AerChemMIP BCC v20190624 \n",
"1 201501-205512 AerChemMIP BCC v20190624 \n",
"2 201501-205512 AerChemMIP BCC v20190624 \n",
"3 201501-205512 AerChemMIP BCC v20190624 \n",
"4 201501-205512 AerChemMIP BCC v20190624 \n",
" time_range activity_id institution_id version \\\n",
"0 20150101-20551231 AerChemMIP BCC v20190702 \n",
"1 201501-205512 AerChemMIP BCC v20190624 \n",
"2 201501-205512 AerChemMIP BCC v20190624 \n",
"3 201501-205512 AerChemMIP BCC v20190624 \n",
"4 201501-205512 AerChemMIP BCC v20190624 \n",
"\n",
" path \n",
"0 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... \n",
......@@ -542,7 +597,7 @@
"4 /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B... "
]
},
"execution_count": 24,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
......@@ -554,95 +609,16 @@
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"609911"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(df1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Keep latest version"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"grpby = list(set(df1.columns.tolist()) - {'path', 'version'})\n",
"groups = df1.groupby(grpby)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4min 11s, sys: 4.39 s, total: 4min 15s\n",
"Wall time: 4min 12s\n"
]
}
],
"source": [
"%%time\n",
"idx_to_remove = []\n",
"for _, group in groups:\n",
" if group.version.nunique() > 1:\n",
" idx_to_remove.extend(group.sort_values(by=['version'], ascending=False).index[1:].values.tolist())"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"19169"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(idx_to_remove)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"609911"
"711508"
]
},
"execution_count": 31,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
......@@ -653,28 +629,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"590742"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df1.drop(index=idx_to_remove)\n",
"len(df)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 18,
"metadata": {},
"outputs": [
{
......@@ -713,50 +668,8 @@
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>507544</td>\n",
" <td>sftof</td>\n",
" <td>Ofx</td>\n",
" <td>historical</td>\n",
" <td>NorESM2-LM</td>\n",
" <td>r1i1p1f1</td>\n",
" <td>gn</td>\n",
" <td>NaN</td>\n",
" <td>NCC</td>\n",
" <td>NorESM2-LM</td>\n",
" <td>v20190815</td>\n",
" <td>/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>507546</td>\n",
" <td>basin</td>\n",
" <td>Ofx</td>\n",
" <td>historical</td>\n",
" <td>NorESM2-LM</td>\n",
" <td>r1i1p1f1</td>\n",
" <td>gn</td>\n",
" <td>NaN</td>\n",
" <td>NCC</td>\n",
" <td>NorESM2-LM</td>\n",
" <td>v20190815</td>\n",
" <td>/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>507549</td>\n",
" <td>volcello</td>\n",
" <td>Ofx</td>\n",
" <td>historical</td>\n",
" <td>NorESM2-LM</td>\n",
" <td>r1i1p1f1</td>\n",
" <td>gr</td>\n",
" <td>NaN</td>\n",
" <td>NCC</td>\n",
" <td>NorESM2-LM</td>\n",
" <td>v20190815</td>\n",
" <td>/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>508393</td>\n",
" <td>areacello</td>\n",
" <td>582271</td>\n",
" <td>deptho</td>\n",
" <td>Ofx</td>\n",
" <td>piControl</td>\n",
" <td>NorESM2-LM</td>\n",
......@@ -769,35 +682,7 @@
" <td>/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>508395</td>\n",
" <td>basin</td>\n",
" <td>Ofx</td>\n",
" <td>piControl</td>\n",
" <td>NorESM2-LM</td>\n",
" <td>r1i1p1f1</td>\n",
" <td>gn</td>\n",
" <td>NaN</td>\n",
" <td>NCC</td>\n",
" <td>NorESM2-LM</td>\n",
" <td>v20190815</td>\n",
" <td>/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>508396</td>\n",
" <td>volcello</td>\n",
" <td>Ofx</td>\n",
" <td>piControl</td>\n",
" <td>NorESM2-LM</td>\n",
" <td>r1i1p1f1</td>\n",
" <td>gr</td>\n",
" <td>NaN</td>\n",
" <td>NCC</td>\n",
" <td>NorESM2-LM</td>\n",
" <td>v20190815</td>\n",
" <td>/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>509418</td>\n",
" <td>583805</td>\n",
" <td>thetao</td>\n",
" <td>Omon</td>\n",
" <td>PCMDI-test-1-0</td>\n",
......@@ -816,48 +701,153 @@
],
"text/plain": [
" variable_id table_id source_id experiment_id member_id \\\n",
"507544 sftof Ofx historical NorESM2-LM r1i1p1f1 \n",
"507546 basin Ofx historical NorESM2-LM r1i1p1f1 \n",
"507549 volcello Ofx historical NorESM2-LM r1i1p1f1 \n",
"508393 areacello Ofx piControl NorESM2-LM r1i1p1f1 \n",
"508395 basin Ofx piControl NorESM2-LM r1i1p1f1 \n",
"508396 volcello Ofx piControl NorESM2-LM r1i1p1f1 \n",
"509418 thetao Omon PCMDI-test-1-0 piControl-withism r3i1p1f1 \n",
"582271 deptho Ofx piControl NorESM2-LM r1i1p1f1 \n",
"583805 thetao Omon PCMDI-test-1-0 piControl-withism r3i1p1f1 \n",
"\n",
" grid_label time_range activity_id institution_id version \\\n",
"507544 gn NaN NCC NorESM2-LM v20190815 \n",
"507546 gn NaN NCC NorESM2-LM v20190815 \n",
"507549 gr NaN NCC NorESM2-LM v20190815 \n",
"508393 gn NaN NCC NorESM2-LM v20190815 \n",
"508395 gn NaN NCC NorESM2-LM v20190815 \n",
"508396 gr NaN NCC NorESM2-LM v20190815 \n",
"509418 gn 016201-016201 v20190926 thetao v20190926 \n",
"582271 gn NaN NCC NorESM2-LM v20190815 \n",
"583805 gn 016201-016201 v20190926 thetao v20190926 \n",
"\n",
" path \n",
"507544 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n",
"507546 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n",
"507549 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n",
"508393 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n",
"508395 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n",
"508396 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n",
"509418 /glade/collections/cmip/CMIP6/CMIP/FIO-QLNM/FI... "
"582271 /glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2... \n",
"583805 /glade/collections/cmip/CMIP6/CMIP/FIO-QLNM/FI... "
]
},
"execution_count": 34,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Some entries are invalid\n",
"invalids = df[~df.activity_id.isin(activity_ids)]\n",
"df = df[df.activity_id.isin(activity_ids)]\n",
"invalids = df1[~df1.activity_id.isin(activity_ids)]\n",
"df = df1[df1.activity_id.isin(activity_ids)]\n",
"invalids"
]
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/piControl/r1i1p1f1/Ofx/deptho/gn/v20190815/deptho/deptho_Ofx_piControl_NorESM2-LM_r1i1p1f1_gn.nc'"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"invalids.iloc[0].path\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Keep latest version"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"grpby = list(set(df.columns.tolist()) - {'path', 'version'})\n",
"groups = df.groupby(grpby)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3min 59s, sys: 1.85 s, total: 4min\n",
"Wall time: 3min 59s\n"
]
}
],
"source": [
"%%time\n",
"idx_to_remove = []\n",
"for _, group in groups:\n",
" if group.version.nunique() > 1:\n",
" idx_to_remove.extend(group.sort_values(by=['version'], ascending=False).index[1:].values.tolist())"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"23587"