diff --git a/archive-catalog.sh b/archive-catalog.sh index b7ae2bae4e75c9eb31705458a96c8e7798701a52..21aca2e626fd4754a5c2ef08b3a0a614c61bbf52 100755 --- a/archive-catalog.sh +++ b/archive-catalog.sh @@ -3,7 +3,7 @@ set -e project=$1 #path with /mnt/mount2 path=$2 -newcatalogzip=/home/k/k204210/intake-esm/catalogs/dkrz_${project}_disk.csv.gz +newcatalogzip=/home/k/k204210/volume/data-infrastructure-services/intake-esm/catalogs/dkrz_${project}_disk.csv.gz oldcatalogzip=${path}/Catalogs/dkrz_${project}_disk.csv.gz oldcatalogArchDir="${path}/Catalogs/archive" mkdir -p ${oldcatalogArchDir} diff --git a/builder/data-pool_collect-create-main.ipynb b/builder/data-pool_collect-create-main.ipynb index 388aaa5e1bbc5ec95fd7609814e0d31732a6ec1f..a00e96a4122ab1cf57e016ff40b57437fd86f107 100644 --- a/builder/data-pool_collect-create-main.ipynb +++ b/builder/data-pool_collect-create-main.ipynb @@ -190,7 +190,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open(\"/home/k/k204210/intake-esm/esm-collections/disk-access/dkrz_catalog.yaml\",\"w\") as f:\n", + "with open(\"/home/k/k204210/volume/data-infrastructure-services/intake-esm/esm-collections/disk-access/dkrz_catalog.yaml\",\"w\") as f:\n", " f.write(yaml.dump(header))\n", " f.write(yaml.dump(sourcesdict))" ] diff --git a/builder/dkrz_cmip5_disk_catalog.py b/builder/dkrz_cmip5_disk_catalog.py index 528804062bb362fd78a7c4a06aba1f458bfaaa39..b0d916b9d53e8f468906c46a8a0c5722a406b8fc 100644 --- a/builder/dkrz_cmip5_disk_catalog.py +++ b/builder/dkrz_cmip5_disk_catalog.py @@ -1,24 +1,25 @@ import sys -sys.path.insert(0, '/home/k/k204210/intake-esm/builder/ncar-builder/builders/') +sys.path.insert(0, '/home/k/k204210/volume/data-infrastructure-services/intake-esm/builder/ncar-builder/builders/') from cmip import build_cmip import pandas as pd from tqdm import tqdm import ast -root_path="/mnt/lustre/work/kd0956/CMIP5/data/cmip5" +root_path="/work/kd0956/CMIP5/data/cmip5" depth=4 pick_latest_version="y" cmip_version=5 -csv_filepath="/home/k/k204210/intake-esm/catalogs/dkrz_cmip5_disk.csv.gz" +csv_filepath="/home/k/k204210/volume/data-infrastructure-services/intake-esm/catalogs/dkrz_cmip5_disk.csv.gz" df = build_cmip(root_path, cmip_version, depth=depth, pick_latest_version=pick_latest_version) +print(len(df["temporal_subset"])) + df["path"]=df["path"].str.replace("/mnt/lustre/",'/') df["project"]="cmip5" df["institution_id"]=df["institute"] df["source_id"]=df["model"] -df["experiment_id"]=df["experiment"] df["simulation_id"]=df["ensemble_member"] df["realm"]=df["modeling_realm"] df["experiment_id"]=df["experiment"] @@ -28,9 +29,39 @@ df["time_max"]=df["time_range"].str.split('-').str[1] df["format"]="netcdf" df["uri"]=df["path"] df["variable_id"]=df["variable"] -df['grid_id']="unkown" -df['level_type']="unkown" -df['time_reduction']="unkown" -df['grid_label']="unkown" +df['grid_id']="None" +df['level_type']="None" +df['time_reduction']="None" +df['grid_label']="None" + +save_columns = ["project", + "product_id", + "institute", + "model", + "experiment", + "frequency", + "modeling_realm", + "mip_table", + "ensemble_member", + "version", + "variable", + "temporal_subset", + "institution_id", + "source_id", + "experiment_id", + "variable_id", + "grid_label", + "realm", + "level_type", + "time_range", + "time_min", + "time_max", + "simulation_id", + "grid_id", + "time_reduction", + "format", + "uri"] +df = df[save_columns] +df = df.sort_values(save_columns, ascending = True).reset_index(drop=True) df.to_csv(csv_filepath, compression='gzip', index=False) diff --git a/builder/dkrz_cmip6_disk_catalog.py b/builder/dkrz_cmip6_disk_catalog.py index b9bcaacc67c196962df6d6b404d733fdd05cea35..4a17107227a56eb000aea5097b0635ee50bfcbf7 100755 --- a/builder/dkrz_cmip6_disk_catalog.py +++ b/builder/dkrz_cmip6_disk_catalog.py @@ -1,12 +1,12 @@ import sys -sys.path.insert(0, '/home/k/k204210/intake-esm/builder/ncar-builder/builders/') +sys.path.insert(0, '/home/k/k204210/volume/data-infrastructure-services/intake-esm/builder/ncar-builder/builders/') from cmip import build_cmip import pandas as pd from tqdm import tqdm import ast import json -root_path="/mnt/lustre/work/ik1017/CMIP6/data/CMIP6" +root_path="/work/ik1017/CMIP6/data/CMIP6" depth=4 pick_latest_version="y" cmip_version=6 @@ -19,7 +19,8 @@ df.to_csv(csv_filepath, compression='gzip', index=False) # Add PIDs and OpenDAPs to catalog df=pd.read_csv(csv_filepath) -pids=pd.read_csv("/home/k/k204210/intake-esm/catalogs/dkrz_cmip6_disk_filepids.csv.gz") +pids=pd.read_csv("/home/k/k204210/volume/data-infrastructure-services/intake-esm/catalogs/dkrz_cmip6_disk_filepids.csv.gz", + low_memory=True) pids["file_pids"]=pids["file_pids"].str.replace("'",'"') pids_dict=[] for index,row in tqdm(pids.iterrows()): @@ -34,15 +35,18 @@ pids_df=pd.DataFrame.from_records(pids_dict, columns=["path", "file_pid"]) pids_df["path"]=pids_df["path"].str.replace("lustre02","lustre") df=df.merge(pids_df, how='left', on='path') -df.to_csv("/home/k/k204210/intake-esm/catalogs/dkrz_cmip6pids_disk.csv.gz", compression="gzip", index=False) +df.to_csv("/home/k/k204210/volume/data-infrastructure-services/intake-esm/catalogs/dkrz_cmip6pids_disk.csv.gz", compression="gzip", index=False) df.drop(columns="file_pid", inplace=True) +del pids +del pids_df + df["path"]=df["path"].str.replace("/mnt/lustre/",'/') # Add long name information -catalog_basedir="/mnt/lustre02/work/ik1017/Catalogs" -c6tables_basedir="/home/k/k204210/cmip6-cmor-tables/Tables/" +catalog_basedir="/work/ik1017/Catalogs" +c6tables_basedir="/home/k/k204210/volume/dicad/cdo-incl-cmor/configuration/cmip6/cmip6-cmor-tables/Tables/" mip_tables={} for table in df["table_id"].unique(): @@ -92,8 +96,9 @@ df["uri"]=df["path"] # Add opendapurl def get_opendap(row): - filename=row["path"] - opendaptrunk="/".join(filename.split("/")[8:]) + #filename=row["path"] + filename=row + opendaptrunk="/".join(filename.split("/")[6:]) # try: # opendapurl="http://esgf3.dkrz.de/thredds/dodsC/cmip6/"+opendaptrunk # headers=requests.head(opendapurl).headers @@ -105,7 +110,19 @@ def get_opendap(row): # return np.nan return "http://esgf3.dkrz.de/thredds/dodsC/cmip6/"+opendaptrunk -df["opendap_url"] =df.apply(lambda row: get_opendap(row), axis=1) +# compress +category_cols = ['activity_id', 'institution_id', + 'source_id', 'experiment_id', 'member_id', 'table_id', 'variable_id', + 'grid_label', + 'project', 'simulation_id', 'grid_id', 'frequency', + 'time_reduction', 'long_name', 'units', 'realm', + 'format'] + +df=df.astype({key: "category" + for key in category_cols + }) + +df["opendap_url"] =df["path"].map(lambda row: get_opendap(row)) columns = ['activity_id', 'institution_id', 'source_id', 'experiment_id', 'member_id', 'table_id', 'variable_id', @@ -116,4 +133,4 @@ columns = ['activity_id', 'institution_id', df = df[columns] df = df.sort_values(columns, ascending = True).reset_index(drop=True) -df.to_csv("/home/k/k204210/intake-esm/catalogs/dkrz_cmip6_disk.csv.gz", compression="gzip", index=False) \ No newline at end of file +df.to_csv("/home/k/k204210/volume/data-infrastructure-services/intake-esm/catalogs/dkrz_cmip6_disk.csv.gz", compression="gzip", index=False) diff --git a/builder/dkrz_cordex_disk_catalog.py b/builder/dkrz_cordex_disk_catalog.py index e68e45ea1c7964fbf14f6fc75ab773d933efd483..eb7fc76ea105812427a98101fbe71554501926fc 100755 --- a/builder/dkrz_cordex_disk_catalog.py +++ b/builder/dkrz_cordex_disk_catalog.py @@ -206,7 +206,7 @@ invalids # In[43]: -with open('/home/k/k204210/intake-esm/invalids-cordex.txt', 'w') as f : +with open('/home/k/k204210/volume/data-infrastructure-services/intake-esm/invalids-cordex.txt', 'w') as f : for file in invalids.path.values : f.write(file+"\n") @@ -283,8 +283,34 @@ df.loc[:,"format"]="netcdf" df.loc[:,"uri"]=df["path"] df[["time_min","time_max"]]=df["time_range"].str.split('-',expand=True,n=1) -columns = ["project","product_id", "CORDEX_domain", "institute_id", "driving_model_id", "experiment_id", "member", "model_id", "institution_id", "simulation_id", "source_id", "grid_label","grid_id", "time_reduction", "realm", "level_type", - "rcm_version_id", "frequency", "variable_id", "version", "time_range", "time_min", "time_max", "path", "format", "uri", "opendap_url"] +columns = ["project", + "product_id", + "CORDEX_domain", + "institute_id", + "driving_model_id", + "experiment_id", + "member", + "model_id", + "rcm_version_id", + "frequency", + "variable_id", + "version", + "time_range", + "uri", + "institution_id", + "source_id", + "simulation_id", + "grid_label", + "grid_id", + "time_reduction", + "realm", + "level_type", + "time_min", + "time_max", + "path", + "format", + "opendap_url"] + df = df[columns] df = df.sort_values(columns, ascending = True).reset_index(drop=True) df.head() @@ -294,7 +320,7 @@ df.head() #df.to_csv("./mistral-cmip6.csv.gz", compression="gzip", index=False) -df.to_csv("/home/k/k204210/intake-esm/catalogs/dkrz_cordex_disk.csv.gz", compression="gzip", index=False) +df.to_csv("/home/k/k204210/volume/data-infrastructure-services/intake-esm/catalogs/dkrz_cordex_disk.csv.gz", compression="gzip", index=False) # In[ ]: diff --git a/builder/mistral-cmip6_catalog.ipynb b/builder/mistral-cmip6_catalog.ipynb deleted file mode 100755 index 9082eb72c391dd266da3cf450e24ebd0bd389241..0000000000000000000000000000000000000000 --- a/builder/mistral-cmip6_catalog.ipynb +++ /dev/null @@ -1,533 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import fnmatch\n", - "import dask.dataframe as dd\n", - "from intake.source.utils import reverse_format\n", - "import os\n", - "import re\n", - "import subprocess\n", - "from tqdm.auto import tqdm\n", - "from pathlib import Path\n", - "import shutil\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_file_list(persist_path):\n", - " persist_path = Path(persist_path)\n", - " if persist_path.exists():\n", - " shutil.rmtree(persist_path)\n", - " persist_path.mkdir()\n", - " root = Path(\"/mnt/lustre02/work/ik1017/CMIP6/data/CMIP6\")\n", - " dirs = [x for x in root.iterdir() if x.is_dir()]\n", - " for directory in tqdm(dirs):\n", - " print(directory)\n", - " stem = directory.stem\n", - " f = open(f\"{persist_path}/{stem}.txt\", \"w\")\n", - " cmd = [\"find\", \"-L\", directory.as_posix(), \"-name\", \"*.nc\", \"-perm\", \"-444\"]\n", - " p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=f)\n", - " p.wait()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "persist_path = \"/home/dkrz/k204210/intake-esm/CMIP6_filelist\"\n", - "get_file_list(persist_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Extract attributes of a file using information from CMIP6 DRS.\n", - "\n", - "\n", - "References\n", - " 1. CMIP6 DRS: http://goo.gl/v1drZl\n", - " 2. Controlled Vocabularies (CVs) for use in CMIP6:\n", - " https://github.com/WCRP-CMIP/CMIP6_CVs\n", - " \n", - " \n", - "Directory structure =\n", - "```<mip_era>/\n", - " <activity_id>/\n", - " <institution_id>/\n", - " <source_id>/\n", - " <experiment_id>/\n", - " <member_id>/\n", - " <table_id>/\n", - " <variable_id>/\n", - " <grid_label>/\n", - " <version>\n", - "```\n", - "file name =\n", - "```<variable_id>_<table_id>_<source_id>_<experiment_id >_<member_id>_<grid_label>[_<time_range>].nc```\n", - "For time-invariant fields, the last segment (time_range) above is omitted.\n", - "Example when there is no sub-experiment: `tas_Amon_GFDL-CM4_historical_r1i1p1f1_gn_196001-199912.nc`\n", - "Example with a sub-experiment: `pr_day_CNRM-CM6-1_dcppA-hindcast_s1960-r2i1p1f1_gn_198001-198412.nc`\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "activity_ids = list(Path(persist_path).rglob(\"*.txt\"))\n", - "activity_ids = [activity_id.stem for activity_id in activity_ids]\n", - "activity_ids" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = dd.read_csv(f\"{persist_path}/*.txt\", header=None).compute()\n", - "df.columns = [\"path\"]\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def _reverse_filename_format(file_basename, filename_template=None, gridspec_template=None):\n", - " \"\"\"\n", - " Uses intake's ``reverse_format`` utility to reverse the string method format.\n", - " Given format_string and resolved_string, find arguments\n", - " that would give format_string.format(arguments) == resolved_string\n", - " \"\"\"\n", - " try:\n", - " return reverse_format(filename_template, file_basename)\n", - " except ValueError:\n", - " try:\n", - " return reverse_format(gridspec_template, file_basename)\n", - " except:\n", - " print(\n", - " f'Failed to parse file: {file_basename} using patterns: {filename_template} and {gridspec_template}'\n", - " )\n", - " return {}\n", - " \n", - "def _extract_attr_with_regex(input_str, regex, strip_chars=None):\n", - " pattern = re.compile(regex, re.IGNORECASE)\n", - " match = re.findall(pattern, input_str)\n", - " if match:\n", - " match = max(match, key=len)\n", - " if strip_chars:\n", - " match = match.strip(strip_chars)\n", - "\n", - " else:\n", - " match = match.strip()\n", - "\n", - " return match\n", - "\n", - " else:\n", - " return None\n", - " \n", - "\n", - "exclude_patterns = ['*/files/*', '*/latest/*']\n", - "def _filter_func(path):\n", - " return not any(\n", - " fnmatch.fnmatch(path, pat=exclude_pattern) for exclude_pattern in exclude_patterns\n", - " )\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add a filter for the retracted files:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "retracted_list=\"/mnt/lustre02/work/ik1017/CMIP6/meta/File.lst/Retracted/CMIP6_retracted_2020-12-10.list\"\n", - "df_ret = dd.read_csv(retracted_list, header=None).compute()\n", - "df_ret[0]=df_ret[0].str.split(\"/\").str[1:].str.join(\"/\")\n", - "#\n", - "df[\"path\"]=df[\"path\"].str.split(\"/\").str[7:].str.join(\"/\")\n", - "df = df[~df[\"path\"].isin(df_ret[0])].reset_index(drop=True)\n", - "df[\"path\"]=\"/mnt/lustre02/work/ik1017/CMIP6/data/\"+df[\"path\"].astype(str)\n", - "#\n", - "del df_ret" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "files = df.path.tolist()\n", - "filelist = list(filter(_filter_func, files))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(filelist)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_attrs(filepath):\n", - " basename = os.path.basename(filepath)\n", - " dirname = os.path.dirname(filepath)\n", - " filename_template = '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}_{time_range}.nc'\n", - "\n", - " gridspec_template = (\n", - " '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}.nc'\n", - " )\n", - "\n", - " f = _reverse_filename_format(\n", - " basename, filename_template=filename_template, gridspec_template=gridspec_template\n", - " )\n", - "\n", - " fileparts = {}\n", - " fileparts.update(f)\n", - " parent = os.path.dirname(filepath).strip('/')\n", - " parent_split = parent.split(f\"/{fileparts['source_id']}/\")\n", - " part_1 = parent_split[0].strip('/').split('/')\n", - " grid_label = parent.split(f\"/{fileparts['variable_id']}/\")[1].strip('/').split('/')[0]\n", - " frequency = parent.split(f\"/{fileparts['variable_id']}/\")[1].strip('/').split('/')[0]\n", - " fileparts['grid_label'] = grid_label\n", - " fileparts['frequency'] = frequency\n", - " fileparts['activity_id'] = part_1[-2]\n", - " fileparts['institution_id'] = part_1[-1]\n", - " version_regex = r'v\\d{4}\\d{2}\\d{2}|v\\d{1}'\n", - " version = _extract_attr_with_regex(parent, regex=version_regex) or 'v0'\n", - " fileparts['version'] = version\n", - " fileparts['path'] = filepath\n", - " return fileparts " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "entries = list(map(get_attrs, filelist))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "entries[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(entries)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df1 = pd.DataFrame(entries)\n", - "df1.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Some entries are invalid\n", - "invalids = df1[~df1.activity_id.isin(activity_ids)]\n", - "df = df1[df1.activity_id.isin(activity_ids)]\n", - "invalids" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('/home/dkrz/k204210/intake-esm/invalids-cmip6.txt', 'w') as f :\n", - " for file in invalids.path.values :\n", - " f.write(file+\"\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "invalids.path.tolist()\n", - "\n", - "## Keep latest version" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pick the latest versions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#df = df1\n", - "grpby = list(set(df.columns.tolist()) - {'path', 'version', 'time_range'})\n", - "groups = df.groupby(grpby)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "idx_to_remove = []\n", - "for _, group in groups:\n", - " if group.version.nunique() > 1:\n", - " recentVersion=group.sort_values(by=['version'], ascending=False)[\"version\"].iloc[0]\n", - " idx_to_remove.extend(group[group[\"version\"]!= recentVersion].index[:].values.tolist())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(idx_to_remove)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('/home/dkrz/k204210/intake-esm/oldVersions.txt', 'w') as f:\n", - " for file in df.path[idx_to_remove].values:\n", - " f.write(file+\"\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = df.drop(index=idx_to_remove)\n", - "len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"dcpp_init_year\"] = df.member_id.map(lambda x: float(x.split(\"-\")[0][1:] if x.startswith(\"s\") else np.nan))\n", - "df[\"member_id\"] = df[\"member_id\"].map(lambda x: x.split(\"-\")[-1] if x.startswith(\"s\") else x)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add PIDs and OpenDAPs to catalog" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#def get_pid(row):\n", - "# file=row[\"path\"]\n", - "# try:\n", - "# filepid = !ncdump -h {file} | grep \"tracking_id =\" | cut -d '\"' -f 2 | cut -d ':' -f 2\n", - "# return filepid\n", - "# except:\n", - "# return np.nan" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_opendap(row):\n", - " filename=row[\"path\"]\n", - " opendaptrunk=\"/\".join(filename.split(\"/\")[8:])\n", - " try:\n", - " opendapurl=\"http://esgf3.dkrz.de/thredds/dodsC/cmip6/\"+opendaptrunk\n", - " return opendapurl\n", - " except:\n", - " return np.nan" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#df[\"pid\"] = df.apply(lambda row: get_pid(row), axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"opendap_url\"] =df.apply(lambda row: get_opendap(row), axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "columns = [\"activity_id\", \"institution_id\", \"source_id\", \"experiment_id\", \"member_id\", \"table_id\", \"variable_id\",\n", - " \"grid_label\", \"dcpp_init_year\", \"version\", \"time_range\", \"path\", \"opendap_url\"]\n", - "df = df[columns]\n", - "df = df.sort_values(columns, ascending = True).reset_index(drop=True)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.to_csv(\"/home/dkrz/k204210/intake-esm/mistral-cmip6.csv.gz\", compression=\"gzip\", index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:root] *", - "language": "python", - "name": "conda-root-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}