Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
I
intake-esm_support
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
data-infrastructure-services
intake-esm_support
Commits
e543ce66
Commit
e543ce66
authored
2 years ago
by
Fabian Wachsmann
Browse files
Options
Downloads
Patches
Plain Diff
Updated for era5 catalog
parent
dbdbee5e
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
builder/data-pool_candidate-check-and-append.ipynb
+51
-202
51 additions, 202 deletions
builder/data-pool_candidate-check-and-append.ipynb
with
51 additions
and
202 deletions
builder/data-pool_candidate-check-and-append.ipynb
+
51
−
202
View file @
e543ce66
...
...
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count":
1
,
"execution_count":
null
,
"id": "3e5313a4-8e12-4ba3-8517-7ac34896bc6c",
"metadata": {},
"outputs": [],
...
...
@@ -17,7 +17,7 @@
},
{
"cell_type": "code",
"execution_count":
2
,
"execution_count":
null
,
"id": "6b814955-2ca4-40c1-b4cb-0b70cf58c078",
"metadata": {},
"outputs": [],
...
...
@@ -30,7 +30,7 @@
},
{
"cell_type": "code",
"execution_count":
3
,
"execution_count":
null
,
"id": "e6591a2f-bcb1-43b0-8809-e1ad4e2827f1",
"metadata": {},
"outputs": [],
...
...
@@ -44,7 +44,7 @@
},
{
"cell_type": "code",
"execution_count":
4
,
"execution_count":
null
,
"id": "ec66e472-a123-4ab0-a1c4-0d480c5c07a6",
"metadata": {},
"outputs": [],
...
...
@@ -80,7 +80,7 @@
},
{
"cell_type": "code",
"execution_count":
5
,
"execution_count":
null
,
"id": "ac66ad8f-03de-4173-bc82-71be4a37f011",
"metadata": {},
"outputs": [],
...
...
@@ -94,7 +94,7 @@
},
{
"cell_type": "code",
"execution_count":
6
,
"execution_count":
null
,
"id": "ad223ff4-3ec8-4e6f-8d4e-6b3f2b24b306",
"metadata": {},
"outputs": [],
...
...
@@ -119,27 +119,29 @@
},
{
"cell_type": "code",
"execution_count":
7
,
"execution_count":
null
,
"id": "306b6e70-e14c-4838-812f-7268ed6d7451",
"metadata": {},
"outputs": [],
"source": [
"def load_catalog(col,l_hastime=True, l_isgrib=False):\n",
" col.df=col.df.loc[0:MIN_NO_OF_LINES_FOR_LOAD,:]\n",
" kwargs={\"zarr_kwargs\":{\"consolidated\": True}}\n",
" if l_hastime :\n",
" kwargs[\"cdf_kwargs\"]={\"chunks\":{\"time\":1}}\n",
" kwargs[\"zarr_kwargs\"][\"decode_times\"]=True\n",
" kwargs[\"zarr_kwargs\"][\"use_cftime\"]=True\n",
" cat=col.search(uri=col.df.loc[0,\"uri\"])\n",
" kwargs={}\n",
" if l_isgrib:\n",
" kwargs[\"cdf_kwargs\"][\"engine\"]=\"cfgrib\"\n",
" kwargs[\"cdf_kwargs\"]={\"engine\":\"cfgrib\"}\n",
" else:\n",
" kwargs[\"zarr_kwargs\"]={\"consolidated\": True}\n",
" if l_hastime :\n",
" kwargs[\"cdf_kwargs\"]={\"chunks\":{\"time\":1}}\n",
" kwargs[\"zarr_kwargs\"][\"decode_times\"]=True\n",
" kwargs[\"zarr_kwargs\"][\"use_cftime\"]=True\n",
" \n",
" return c
ol
.to_dataset_dict(kwargs)"
" return c
at
.to_dataset_dict(
**
kwargs)"
]
},
{
"cell_type": "code",
"execution_count":
8
,
"execution_count":
null
,
"id": "ecccdf7a-2ba7-4fb3-8e03-ab1f26929bc1",
"metadata": {},
"outputs": [],
...
...
@@ -149,6 +151,12 @@
" col=open_catalog(template_or_catalog, df)\n",
" if VERBOSE:\n",
" print(\"Successfully opened catalog\")\n",
" l_isgrib=False\n",
" try:\n",
" if \"grib\" in col.df[\"format\"].unique() :\n",
" l_isgrib=True\n",
" except Exception as e:\n",
" pass\n",
" cat=search_catalog(col, candidate)\n",
" if not cat:\n",
" raise ValueError(\"Could not find project of catalog in catalog\")\n",
...
...
@@ -156,13 +164,13 @@
" print(\"Successfully searched catalog\")\n",
" if not \"archive\" in candidate:\n",
" print(candidate)\n",
" if not \"cloud\" in candidate:\n",
" for path in tqdm.tqdm(col.df[\"uri\"].values):\n",
" if not os.path.exists(path):\n",
" if VERBOSE:\n",
" print(f\"path {path} does not exist. Check permissions.\")\n",
" return F
A
lse\n",
" dset_dict=load_catalog(col)\n",
"
#
if not \"cloud\" in candidate:\n",
"
#
for path in tqdm.tqdm(col.df[\"uri\"].values):\n",
"
#
if not os.path.exists(path):\n",
"
#
if VERBOSE:\n",
"
#
print(f\"path {path} does not exist. Check permissions.\")\n",
"
#
return F
a
lse\n",
" dset_dict=load_catalog(col
, l_isgrib=l_isgrib
)\n",
" if VERBOSE:\n",
" print(\"Successfully loaded catalog\")\n",
" return True\n",
...
...
@@ -174,7 +182,7 @@
},
{
"cell_type": "code",
"execution_count":
9
,
"execution_count":
null
,
"id": "4605a81c-962c-4597-97fa-60db8a75c847",
"metadata": {},
"outputs": [],
...
...
@@ -192,7 +200,7 @@
},
{
"cell_type": "code",
"execution_count":
10
,
"execution_count":
null
,
"id": "a6310f65-e80d-4180-aa0e-74b9b22db15f",
"metadata": {},
"outputs": [],
...
...
@@ -211,7 +219,7 @@
},
{
"cell_type": "code",
"execution_count":
11
,
"execution_count":
null
,
"id": "b1b8ec2a-5629-45cf-849f-b83ea5dc3642",
"metadata": {},
"outputs": [],
...
...
@@ -227,7 +235,7 @@
},
{
"cell_type": "code",
"execution_count":
12
,
"execution_count":
null
,
"id": "5307f97f-b81d-4695-be65-0bd9274f550e",
"metadata": {},
"outputs": [],
...
...
@@ -242,7 +250,7 @@
},
{
"cell_type": "code",
"execution_count":
13
,
"execution_count":
null
,
"id": "1fd8a674-ad4e-400b-a493-099b2e4f482b",
"metadata": {},
"outputs": [],
...
...
@@ -296,7 +304,7 @@
},
{
"cell_type": "code",
"execution_count":
1
,
"execution_count":
null
,
"id": "efae06d3-866b-4ba8-b7d4-8497c52d8bc3",
"metadata": {},
"outputs": [],
...
...
@@ -331,7 +339,7 @@
},
{
"cell_type": "code",
"execution_count":
15
,
"execution_count":
null
,
"id": "00631fde-6506-4245-a301-26e92a8d7ff9",
"metadata": {},
"outputs": [],
...
...
@@ -345,7 +353,7 @@
},
{
"cell_type": "code",
"execution_count":
16
,
"execution_count":
null
,
"id": "484f8e3c-d89e-4244-b623-dd074377724a",
"metadata": {},
"outputs": [],
...
...
@@ -382,179 +390,10 @@
},
{
"cell_type": "code",
"execution_count":
17
,
"execution_count":
null
,
"id": "132b3dbf-2784-465e-a3d8-1229d25e51a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Will check ['dkrz_nextgems_disk.json', 'dkrz_dyamond-winter_disk.json']\n",
"Linked from ['/work/ik1017/Catalogs/dkrz_nextgems_disk.json', '/work/ik1017/Catalogs/dkrz_dyamond-winter_disk.json']\n",
"Checking candidate dkrz_nextgems_disk.json...\n",
"Found catalog format json\n",
"Successfully opened catalog\n",
"Successfully searched catalog\n",
"dkrz_nextgems_disk.json\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 43054/43054 [00:10<00:00, 4009.38it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"--> The keys in the returned dictionary of datasets are constructed as follows:\n",
"\t'project.institution_id.source_id.experiment_id.simulation_id.realm.frequency.time_reduction.grid_label.level_type'\n"
]
},
{
"data": {
"text/html": [
"\n",
"<style>\n",
" /* Turns off some styling */\n",
" progress {\n",
" /* gets rid of default border in Firefox and Opera. */\n",
" border: none;\n",
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
" background-size: auto;\n",
" }\n",
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
" background: #F44336;\n",
" }\n",
"</style>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <progress value='1' class='' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" 100.00% [1/1 00:00<00:00]\n",
" </div>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Successfully loaded catalog\n",
"Can use dkrz_nextgems_disk.json\n",
"Found existing project catalog/pool/data/Catalogs/dkrz_nextgems_disk.json\n",
"The catalog will be saved entirely as json\n",
"Target /work/ik1017/Catalogs/dkrz_nextgems_disk.json will be overwritten\n",
"Candidate /work/ik1017/Catalogs/dkrz_nextgems_disk.json is the same as target /work/ik1017/Catalogs/dkrz_nextgems_disk.json\n",
"Catalogs are equal. Nothing to append.\n",
"Checking candidate dkrz_dyamond-winter_disk.json...\n",
"Found catalog format json\n",
"Successfully opened catalog\n",
"Successfully searched catalog\n",
"dkrz_dyamond-winter_disk.json\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 40640/40640 [00:10<00:00, 3997.74it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"--> The keys in the returned dictionary of datasets are constructed as follows:\n",
"\t'project.institution_id.source_id.experiment_id.simulation_id.realm.frequency.time_reduction.grid_label.level_type'\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"<style>\n",
" /* Turns off some styling */\n",
" progress {\n",
" /* gets rid of default border in Firefox and Opera. */\n",
" border: none;\n",
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
" background-size: auto;\n",
" }\n",
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
" background: #F44336;\n",
" }\n",
"</style>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <progress value='1' class='' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" 100.00% [1/1 00:00<00:00]\n",
" </div>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Successfully loaded catalog\n",
"Can use dkrz_dyamond-winter_disk.json\n",
"Will be a new project catalog in pool dkrz_dyamond-winter_disk.json\n",
"The catalog will be saved entirely as json\n",
"Target /work/ik1017/Catalogs/dkrz_dyamond-winter_disk.json will be overwritten\n",
"Candidate /work/ik1017/Catalogs/dkrz_dyamond-winter_disk.json is the same as target /work/ik1017/Catalogs/dkrz_dyamond-winter_disk.json\n",
"Create catalog /work/ik1017/Catalogs/dkrz_dyamond-winter_disk.json\n",
"Writing catalog with 40640 entries into: /work/ik1017/Catalogs/dkrz_dyamond-winter_disk.json\n",
"Writing ESM collection json file to: /work/ik1017/Catalogs/dkrz_dyamond-winter_disk.json\n",
"Create link /pool/data/Catalogs/dkrz_dyamond-winter_disk.json to catalog\n"
]
}
],
"outputs": [],
"source": [
"candidates=os.listdir(SOURCETRUNK)\n",
"candidatepathes=[]\n",
...
...
@@ -592,6 +431,16 @@
"id": "49f39d3f-383d-4ca4-9387-f718bc0ec3a9",
"metadata": {},
"outputs": [],
"source": [
"!ls /work/bk1099/data/ml00_1H/1979/E5ml00_1H_1979-01-04_075"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "578c4f59-fd80-4058-a0af-6231b977de67",
"metadata": {},
"outputs": [],
"source": []
}
],
...
...
%% Cell type:code id:3e5313a4-8e12-4ba3-8517-7ac34896bc6c tags:
```
python
import
intake
import
os
import
pandas
as
pd
import
filecmp
import
tqdm
import
json
```
%% Cell type:code id:6b814955-2ca4-40c1-b4cb-0b70cf58c078 tags:
```
python
# Root of the pool-side catalog tree.
TRUNK = "/pool/data/Catalogs"
# Incoming candidate catalogs (symlinks into the work space).
SOURCETRUNK = f"{TRUNK}/Candidates"
# intake-esm templates, one per project.
TEMPLATETRUNK = f"{TRUNK}/Templates"
# Where merged catalogs are actually written.
TARGETTRUNK = "/work/ik1017/Catalogs"
```
%% Cell type:code id:e6591a2f-bcb1-43b0-8809-e1ad4e2827f1 tags:
```
python
# Allowed fourth element of a candidate name (auxiliary datasets).
allowed_auxiliaries = ["grid"]
# Known providing institutes (first name element).
institutes = ["dkrz"]
# Known storage locations (third name element).
stores = ["disk", "cloud", "archive"]
# Chatty progress output.
VERBOSE = True
# Number of catalog rows kept for the test load.
MIN_NO_OF_LINES_FOR_LOAD = 3
```
%% Cell type:code id:ec66e472-a123-4ab0-a1c4-0d480c5c07a6 tags:
```
python
def check_naming_convention(candidate):
    """Validate a candidate name of the form institute_project_store[_auxiliary].

    Returns 1 when the name follows the convention, 0 otherwise; the
    reason is printed when VERBOSE is set.
    """
    def _reject(message):
        # Central rejection helper so every branch reports the same way.
        if VERBOSE:
            print(message)
        return 0

    if '_' not in candidate:
        return _reject("candidate should contain '_'")
    name_parts = candidate.split('_')
    no_of_parts = len(name_parts)
    if no_of_parts < 3 or no_of_parts > 4:
        return _reject("No of catalog name elements separated with '_' must be 3 or 4")
    if no_of_parts == 4 and name_parts[3] not in allowed_auxiliaries:
        return _reject("No of catalog name elements separated with '_' is 4 "
                       "but the last element is not in the list of allowed auxiliary names")
    if name_parts[0] not in institutes:
        return _reject(f"Candidate's institute {name_parts[0]} not known")
    if name_parts[2] not in stores:
        return _reject(f"Candidate's store {name_parts[2]} not known")
    return 1
```
%% Cell type:code id:ac66ad8f-03de-4173-bc82-71be4a37f011 tags:
```
python
def open_catalog(template_or_catalog, df=None):
    """Open an intake-esm datastore.

    Parameters
    ----------
    template_or_catalog :
        Path of a catalog json file, or (when ``df`` is given) the esmcol
        metadata/template to combine with the dataframe.
    df : pandas.DataFrame, optional
        Catalog content; when provided, the datastore is built from the
        dataframe plus the template metadata.
    """
    # BUGFIX: ``if df:`` raises "The truth value of a DataFrame is
    # ambiguous" whenever a DataFrame is passed — compare against None.
    if df is not None:
        return intake.open_esm_datastore(df, esmcol_data=template_or_catalog)
    return intake.open_esm_datastore(template_or_catalog)
%% Cell type:code id:ad223ff4-3ec8-4e6f-8d4e-6b3f2b24b306 tags:
```
python
def search_catalog(col, candidate):
    """Search *col* for the project encoded in the candidate's file name.

    The project is the second '_'-separated element of *candidate*; the
    comparison with the catalog's project values is case-insensitive.
    """
    project = candidate.split('_')[1]
    catalog_projects = col.unique("project")["project"]["values"]
    if len(catalog_projects) > 1:
        print("Found more than one project")
    lowered = [entry.lower() for entry in catalog_projects]
    if project in lowered:
        # Map back to the exact spelling used inside the catalog.
        project = catalog_projects[lowered.index(project)]
    else:
        print(f"Did not find {project} in catalog's projects")
        print(lowered)
    return col.search(project=project)
```
%% Cell type:code id:306b6e70-e14c-4838-812f-7268ed6d7451 tags:
```
python
def load_catalog(col, l_hastime=True, l_isgrib=False):
    """Test-load a small sample of the catalog into a dataset dictionary.

    Only the first MIN_NO_OF_LINES_FOR_LOAD rows are kept and just the
    first asset (its uri) is opened — enough to prove the catalog works.
    """
    col.df = col.df.loc[0:MIN_NO_OF_LINES_FOR_LOAD, :]
    cat = col.search(uri=col.df.loc[0, "uri"])
    if l_isgrib:
        # Grib data must be decoded with the cfgrib engine.
        kwargs = {"cdf_kwargs": {"engine": "cfgrib"}}
    else:
        zarr_opts = {"consolidated": True}
        kwargs = {"zarr_kwargs": zarr_opts}
        if l_hastime:
            kwargs["cdf_kwargs"] = {"chunks": {"time": 1}}
            zarr_opts["decode_times"] = True
            zarr_opts["use_cftime"] = True
    return cat.to_dataset_dict(**kwargs)
```
%% Cell type:code id:ecccdf7a-2ba7-4fb3-8e03-ab1f26929bc1 tags:
```
python
def open_search_load_catalog(candidate, template_or_catalog, df=None):
    """Open a catalog, search it for the candidate's project and test-load it.

    Parameters
    ----------
    candidate : str
        File name of the candidate catalog (institute_project_store[...]).
    template_or_catalog :
        Path of a catalog json or an esmcol template (see ``open_catalog``).
    df : pandas.DataFrame, optional
        Catalog content when *template_or_catalog* is only a template.

    Returns
    -------
    bool
        True when open, search and (for non-archive stores) a sample load
        all succeed; False otherwise (the exception is printed).
    """
    try:
        col = open_catalog(template_or_catalog, df)
        if VERBOSE:
            print("Successfully opened catalog")
        # Grib assets need the cfgrib engine. The "format" column is
        # optional, so failures here are deliberately swallowed
        # (best effort, default to non-grib).
        l_isgrib = False
        try:
            if "grib" in col.df["format"].unique():
                l_isgrib = True
        except Exception:
            pass
        cat = search_catalog(col, candidate)
        if not cat:
            raise ValueError("Could not find project of catalog in catalog")
        if VERBOSE:
            print("Successfully searched catalog")
        if not "archive" in candidate:
            print(candidate)
            dset_dict = load_catalog(col, l_isgrib=l_isgrib)
            if VERBOSE:
                print("Successfully loaded catalog")
            return True
        return True
    except Exception as e:
        print(e)
        return False
```
%% Cell type:code id:4605a81c-962c-4597-97fa-60db8a75c847 tags:
```
python
def find_template(candidate):
    """Return the path of the intake-esm template for the candidate's project.

    Returns None (optionally with a message) when no template exists.
    """
    project = candidate.split('_')[1]
    wanted = "intake-esm_template_" + project
    if wanted in os.listdir(TEMPLATETRUNK):
        return TEMPLATETRUNK + "/" + wanted
    if VERBOSE:
        print(f"Could not find any template for project {project}")
    return None
```
%% Cell type:code id:a6310f65-e80d-4180-aa0e-74b9b22db15f tags:
```
python
def find_existing_project_catalog(candidate):
    """Return the pool-side json path of an already-registered project catalog.

    Matching is done on the name without extension; None is returned when
    the candidate would be a brand-new catalog.
    """
    candidatename = candidate.split('.')[0]
    registered = [entry.split('.')[0] for entry in os.listdir(TRUNK)]
    if candidatename in registered:
        target = TRUNK + "/" + candidatename + ".json"
        if VERBOSE:
            # NOTE: no space after "catalog" — kept identical to the original output.
            print("Found existing project catalog" + target)
        return target
    if VERBOSE:
        print(f"Will be a new project catalog in pool {candidate}")
    return None
```
%% Cell type:code id:b1b8ec2a-5629-45cf-849f-b83ea5dc3642 tags:
```
python
def check_catalog_format(candidate):
    """Classify a candidate file name as 'csv', 'json' or None.

    Note the asymmetry kept from the original: '.csv' may occur anywhere
    in the name (e.g. compressed '.csv.gz' catalogs), while json must be
    the actual suffix.
    """
    if ".csv" in candidate:
        return "csv"
    if candidate.endswith(".json"):
        return "json"
    return None
```
%% Cell type:code id:5307f97f-b81d-4695-be65-0bd9274f550e tags:
```
python
def open_candidate_dataframe(candidate):
    """Read the candidate csv catalog with pandas.

    Best effort: on any failure the error is printed and None returned.
    """
    try:
        return pd.read_csv(candidate)
    except Exception as e:
        print(e)
        return None
```
%% Cell type:code id:1fd8a674-ad4e-400b-a493-099b2e4f482b tags:
```
python
def test_catalog(candidate, idx):
    """Check whether the candidate catalog can be used.

    Returns a tuple ``(esmcol_data, df)``:
    - json candidates: ``(candidatepath, None)`` on success,
    - csv candidates:  ``(template_or_existing_json, candidate)`` on success,
    - ``(None, None)`` on any failure.
    """
    candidatepath = candidatepathes[idx]
    catalog_format = check_catalog_format(candidate)
    # BUGFIX: the original condition was ``if not catalog_format and
    # VERBOSE`` — with VERBOSE=False an undetected format fell through
    # and was treated as csv. Now we always bail out.
    if not catalog_format:
        if VERBOSE:
            print(f"Could not detect fileformat of {candidate}. "
                  "Choose one of csv or json.")
        return None, None
    print(f"Found catalog format {catalog_format}")
    if catalog_format == "json":
        if open_search_load_catalog(candidate, candidatepath):
            print(f"Can use {candidate}")
            return candidatepath, None
        if VERBOSE:
            print("Open, search or load failed")
        return None, None
    # csv candidate: needs a template or an existing catalog as metadata.
    candidate_df = open_candidate_dataframe(candidatepath)
    # BUGFIX: ``if not candidate_df`` raises "The truth value of a
    # DataFrame is ambiguous" — compare against None.
    if candidate_df is None:
        print(f"Candidate {candidate} could not be opened with pandas as a dataframe")
        return None, None
    template = find_template(candidate)
    existing_json = find_existing_project_catalog(candidate)
    if not template:
        if not existing_json:
            print(f"Do not have a template or an existing catalog for {candidate}")
            return None, None
        template = existing_json
    if open_search_load_catalog(candidate, template, candidate_df):
        print(f"Can use {candidate} with template {template}")
        # NOTE(review): returns the candidate *name*, not the dataframe,
        # as the second element — kept from the original; verify against
        # how append_or_write_catalog consumes it.
        return template, candidate
    if VERBOSE:
        print(f"Open, search or load failed with template {template}")
    if template != existing_json:
        if open_search_load_catalog(candidate, existing_json, candidate_df):
            print(f"Can use {candidate} with existing json {existing_json}")
            return existing_json, candidate
        if VERBOSE:
            print(f"Open, search or load failed with existing json {existing_json}")
        print(f"Catalog candidate {candidate} does neither work with template {template} nor with {existing_json}")
    return None, None
```
%% Cell type:code id:efae06d3-866b-4ba8-b7d4-8497c52d8bc3 tags:
```
python
def append_catalog(cat, existing_json, candidatename, catalog_type):
    """Append the new rows of *cat* to the parent catalog and serialize it.

    Returns the result of ``serialize`` or None when the catalogs are
    already equal (nothing to append).
    """
    parent_catalog = open_catalog(existing_json)
    if cat.df.equals(parent_catalog.df):
        if VERBOSE:
            print("Catalogs are equal. Nothing to append.")
        return None
    new_df = parent_catalog.df.copy()
    columns = list(new_df.columns.values)
    # Keep only candidate rows that are not yet present in the parent.
    candidate_index = cat.df.set_index(columns).index
    parent_index = new_df.set_index(columns).index
    fresh_rows = cat.df[~candidate_index.isin(parent_index)]
    # Rows that differ only in their asset location count as duplicates.
    dedup_columns = [c for c in columns if c not in ("uri", "path")]
    deduped_rows = fresh_rows[~fresh_rows[dedup_columns].duplicated(keep="first")]
    if len(deduped_rows) < len(fresh_rows) and VERBOSE:
        print("Catalog to append contains duplications. Those are filtered.")
    parent_catalog.df = pd.concat([new_df, deduped_rows], ignore_index=True)
    return parent_catalog.serialize(
        name="/work/ik1017/Catalogs/" + candidatename,
        catalog_type=catalog_type,
    )
```
%% Cell type:code id:00631fde-6506-4245-a301-26e92a8d7ff9 tags:
```
python
def rewrite_json(catalog):
    """Normalize a catalog json file in place: pretty-print with sorted keys."""
    with open(catalog, "r") as handle:
        content = json.load(handle)
    with open(catalog, "w") as handle:
        json.dump(content, handle, indent=4, sort_keys=True)
```
%% Cell type:code id:484f8e3c-d89e-4244-b623-dd074377724a tags:
```
python
def append_or_write_catalog(esmcol_data, df, idx, candidate):
    """Write the candidate as a new pool catalog or append it to an existing one.

    Returns the path of the newly written catalog, the existing pool link
    when appending, or None when the existing entry cannot be handled.
    """
    candidatepath = candidatepathes[idx]
    targetpath = TARGETTRUNK + "/" + candidate
    candidatename = candidate.split('.')[0]
    cat = open_catalog(esmcol_data, df=df)
    existing_json_link = find_existing_project_catalog(candidate)
    # Without a separate catalog file everything is serialized into one json.
    catalog_type = "file"
    if not cat.catalog_file:
        if VERBOSE:
            print("The catalog will be saved entirely as json")
        catalog_type = "dict"
    if os.path.isfile(targetpath):
        print(f"Target {targetpath} will be overwritten")
        if targetpath == candidatepath:
            print("Candidate " + candidatepath + " is the same as target " + targetpath)
    if not existing_json_link:
        print("Create catalog " + targetpath)
        cat.serialize(name=TARGETTRUNK + "/" + candidatename, catalog_type=catalog_type)
        rewrite_json(TARGETTRUNK + "/" + candidatename + ".json")
        return targetpath
    if not os.path.islink(existing_json_link):
        print(f"The existing file {existing_json_link} is not a link! Can neither write nor append to.")
        return None
    existing_json = os.readlink(existing_json_link)
    l_appended = append_catalog(cat, existing_json, candidatename, catalog_type)
    if l_appended:
        rewrite_json("/work/ik1017/Catalogs/" + candidatename + ".json")
    return existing_json_link
```
%% Cell type:code id:132b3dbf-2784-465e-a3d8-1229d25e51a7 tags:
```
python
# Collect the candidate catalogs linked into SOURCETRUNK and process each:
# validate the name, test-open/search/load it, then write or append it to
# the pool and link it back.
candidates = os.listdir(SOURCETRUNK)
candidatepathes = []
# BUGFIX: the original removed entries from ``candidates`` while iterating
# over it, which skips elements and breaks the index pairing with
# ``candidatepathes``. Build filtered lists instead.
linked_candidates = []
for candidate in candidates:
    sourcepath = SOURCETRUNK + "/" + candidate
    if not os.path.islink(sourcepath):
        print(sourcepath + " is not a link!")
        continue
    linked_candidates.append(candidate)
    candidatepathes.append(os.readlink(sourcepath))
candidates = linked_candidates
print(f"Will check {candidates}")
print(f"Linked from {candidatepathes}")
candidatenames = [c.split('.')[0] for c in candidates]
for idx, candidate in enumerate(candidates):
    if VERBOSE:
        print(f"Checking candidate {candidate}...")
    candidatename = candidate.split('.')[0]
    # Prefer the json variant when the same catalog exists in two formats.
    if (candidatenames.count(candidatename) > 1
            and candidatename + ".json" in candidates
            and candidatename + ".json" != candidate):
        if VERBOSE:
            print(f"There is another {candidatename}.json catalog which is checked instead...")
        continue
    if not check_naming_convention(candidatename):
        continue
    esmcol_data, df = test_catalog(candidate, idx)
    if esmcol_data:
        newcatalog = append_or_write_catalog(esmcol_data, df, idx, candidate)
        # BUGFIX: append_or_write_catalog may return None — guard before
        # the islink test. The original used the ``!ln -s`` notebook magic;
        # os.symlink is the portable equivalent.
        if newcatalog and not os.path.islink(newcatalog):
            if VERBOSE:
                print("Create link " + TRUNK + "/" + candidate + " to catalog")
            os.symlink(newcatalog, TRUNK + "/" + candidate)
```
%% Output
Will check ['dkrz_nextgems_disk.json', 'dkrz_dyamond-winter_disk.json']
Linked from ['/work/ik1017/Catalogs/dkrz_nextgems_disk.json', '/work/ik1017/Catalogs/dkrz_dyamond-winter_disk.json']
Checking candidate dkrz_nextgems_disk.json...
Found catalog format json
Successfully opened catalog
Successfully searched catalog
dkrz_nextgems_disk.json
100%|██████████| 43054/43054 [00:10<00:00, 4009.38it/s]
--> The keys in the returned dictionary of datasets are constructed as follows:
'project.institution_id.source_id.experiment_id.simulation_id.realm.frequency.time_reduction.grid_label.level_type'
Successfully loaded catalog
Can use dkrz_nextgems_disk.json
Found existing project catalog/pool/data/Catalogs/dkrz_nextgems_disk.json
The catalog will be saved entirely as json
Target /work/ik1017/Catalogs/dkrz_nextgems_disk.json will be overwritten
Candidate /work/ik1017/Catalogs/dkrz_nextgems_disk.json is the same as target /work/ik1017/Catalogs/dkrz_nextgems_disk.json
Catalogs are equal. Nothing to append.
Checking candidate dkrz_dyamond-winter_disk.json...
Found catalog format json
Successfully opened catalog
Successfully searched catalog
dkrz_dyamond-winter_disk.json
100%|██████████| 40640/40640 [00:10<00:00, 3997.74it/s]
--> The keys in the returned dictionary of datasets are constructed as follows:
'project.institution_id.source_id.experiment_id.simulation_id.realm.frequency.time_reduction.grid_label.level_type'
%% Cell type:code id:49f39d3f-383d-4ca4-9387-f718bc0ec3a9 tags:
Successfully loaded catalog
Can use dkrz_dyamond-winter_disk.json
Will be a new project catalog in pool dkrz_dyamond-winter_disk.json
The catalog will be saved entirely as json
Target /work/ik1017/Catalogs/dkrz_dyamond-winter_disk.json will be overwritten
Candidate /work/ik1017/Catalogs/dkrz_dyamond-winter_disk.json is the same as target /work/ik1017/Catalogs/dkrz_dyamond-winter_disk.json
Create catalog /work/ik1017/Catalogs/dkrz_dyamond-winter_disk.json
Writing catalog with 40640 entries into: /work/ik1017/Catalogs/dkrz_dyamond-winter_disk.json
Writing ESM collection json file to: /work/ik1017/Catalogs/dkrz_dyamond-winter_disk.json
Create link /pool/data/Catalogs/dkrz_dyamond-winter_disk.json to catalog
```
python
!
ls
/
work
/
bk1099
/
data
/
ml00_1H
/
1979
/
E5ml00_1H_1979
-
01
-
04_075
```
%% Cell type:code id:
49f39d3f-383d-4ca4-9387-f718bc0ec3a9
tags:
%% Cell type:code id:
578c4f59-fd80-4058-a0af-6231b977de67
tags:
```
python
```
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment