{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import fnmatch\n",
    "import dask.dataframe as dd\n",
    "import dask\n",
    "from intake.source.utils import reverse_format\n",
    "import os\n",
    "import re\n",
    "import subprocess\n",
    "from tqdm.auto import tqdm\n",
    "from pathlib import Path\n",
    "import shutil\n",
    "import numpy as np\n",
    "import datetime\n",
    "import xarray\n",
    "from functools import lru_cache\n",
    "import itertools\n",
    "import cfgrib"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ERA5 Restructured 2023"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "#-----------------------------------------------------------------------------\n",
    "#      (A) Directory structure\n",
    "#-----------------------------------------------------------------------------\n",
    "This document describes the /pool/data/ERA5 directory\n",
    "structure that was implemented for all ERA5 data by December 19th, 2022 9am UTC\n",
    "\n",
    "#==L1== Subfolders in /pool/data/ERA5/       \n",
    "The subfolders name is a 2-digit product abbreviation (PROD):\n",
    "E5           ERA5 data 1959 to present (most recent files with ~ 3-month delay)\n",
    "E1           ERA5.1 data (corrected ERA5 data of the years 2000 to 2006)\n",
    "ET           ERA5T is a provisional, early-release version of the ERA5 data \n",
    "EB           ERA5 BE is the preliminary ERA5 backward extension data. \n",
    "             Please use E5 instead. We keep this folder for the sake of traceability\n",
    "             until January 31, 2023. Afterwards, the data will be deleted.\n",
    "\n",
    "/pool/data/ERA5/ additionaly contains the following ancillary subfolders:\n",
    "READMEs      all kind of Readmes for users of Levante /pool/data/ERA5 data\n",
    "custom       storage of special ERA data requests (e.g. paramaters not provided by default)\n",
    "incoming     (for internal use) raw retrievals\n",
    "scripts      (for internal use) various scripts to postprocess or check the ERA5 data\n",
    "\n",
    "#==L2== Subfolders in /pool/data/ERA5/${PROD}/       \n",
    "The subfolders name is a 2-digit abbreviation for the level type (LEVTYPE):\n",
    "pl           pressure level data  (16 parameters)\n",
    "ml           model level data     (16 parameters)\n",
    "sf           surface level data   (49 parameter in sf00 and 55 parameters in sf12)\n",
    "\n",
    "#==L3== Subfolders in /pool/data/ERA5/${PROD}/${LEVTYPE}/       \n",
    "The subfolders name is a 2-digit abbreviation for the ECMWF type (TYPE):\n",
    "an           analysis fields (ECMWF type=an; relevant for ml00, pl00, sf00 data)\n",
    "fc           forecast fields (ECMWF type=fc; relevant for sf12 data)\n",
    "\n",
    "#==L4== Subfolders in /pool/data/ERA5/${PROD}/${LEVTYPE}/${TYPE}/\n",
    "The subfolders name is a 2-digit abbreviation for temporal resolution (TRES):\n",
    "1H           hourly temporal resolution\n",
    "1D           daily  temporal resolution\n",
    "1M           monthly temporal resolution\n",
    "\n",
    "#==L5== Subfolders in /pool/data/ERA5/${PROD}/${LEVTYPE}/${TYPE}/${TRES}/\n",
    "PAR          3-digit GRIB parameter code as directory name\n",
    " \n",
    "PAR=027,028,029,030,031,032,033,034,035,036,037,038,039,040,041,042,043,044,045,049,050,\n",
    "    057,059,060,075,076,077,078,079,129,130,131,132,133,134,135,136,137,138,139,141,142,143,\n",
    "    144,145,146,147,151,152,155,157,159,160,161,162,163,164,165,166,167,168,169,170,172,175,\n",
    "    176,177,178,179,180,181,182,183,186,187,188,195,196,197,198,201,202,203,205,206,208,209,\n",
    "    210,211,212,228,235,236,238,239,240,243,244,245,246,247,248,277,278,279,285,385,386,474,\n",
    "    475,476,477,502,503,507\n",
    "\n",
    "Please note that PAR refers to the last three digits of the parameter ID specified in ECMWF code table 128, except for PAR values >= 256 which refer to ECMWF code table 228. All table 228 PAR values have to be substracted by 256 to obtain the correct table 228 parameter ID.  Example: PAR 502 in /pool/data/ERA5 corresponds to parameter ID 246 of code table 228 (502=246+256).\n",
    "\n",
    "\n",
    "#-----------------------------------------------------------------------------\n",
    "#      (B) File naming convention\n",
    "#-----------------------------------------------------------------------------\n",
    "\n",
    "ERA5 files on /pool/data/ERA5 have the following file naming convention:\n",
    "#==L5== Subfolders in /pool/data/ERA5/${PROD}/${LEVTYPE}/${TYPE}/${TRES}/${PAR}\n",
    "    \n",
    "       ${PROD}${LEVTYPE}${TYPEID}_${TRES}_${DATE}_${PAR}.grb\n",
    "\n",
    "TYPEID       a 2-digit identifier for the ECMWF type (TYPE) and the forecast step\n",
    "             00     all data with ECMWF type (TYPE) = an        \n",
    "             12     for data with ECMWF type (TYPE) = fc \n",
    "                    the number 12 reflects the forecast step\n",
    "                    on /pool/data/ERA5 only fc data with forecast step 12 are provided \n",
    "DATE         ${YYYY}-${MM}-${DD}\n",
    "All other variables are defined in section (A) above.     "
   ]
  },
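  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The cell below is a small sketch (not part of the original workflow) illustrating sections (A) and (B): it builds the expected directory and file name from the DRS components and maps a virtual parameter code (PAR) to its GRIB code table, assuming the `/pool/data/ERA5` root and the `.grb` suffix described above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch only: illustrates the directory structure (A) and the file naming convention (B)\n",
    "def era5_path(prod, levtype, typeid, tres, par, date, root=\"/pool/data/ERA5\"):\n",
    "    \"\"\"Return the expected path of an ERA5 GRIB file for the given DRS components.\"\"\"\n",
    "    ecmwf_type = \"an\" if typeid == \"00\" else \"fc\"\n",
    "    filename = f\"{prod}{levtype}{typeid}_{tres}_{date}_{par:03d}.grb\"\n",
    "    return f\"{root}/{prod}/{levtype}/{ecmwf_type}/{tres}/{par:03d}/{filename}\"\n",
    "\n",
    "def grib_table_and_code(par):\n",
    "    \"\"\"Map a virtual PAR code to (code table, parameter ID), cf. the note in section (A).\"\"\"\n",
    "    return (228, par - 256) if par > 256 else (128, par)\n",
    "\n",
    "print(era5_path(\"E5\", \"pl\", \"00\", \"1H\", 157, \"1979-10-02\"))\n",
    "print(grib_table_and_code(502))  # -> (228, 246)"
   ]
  },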
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "## Extract attributes of a file using information from ERA5 \"DRS\".\n",
    "\n",
    "\n",
    "The filename is consructed by:\n",
    "- `<era_id><level_type><type>_<frequency>_<validation_date>_<virt_code>`\n",
    "- `<era_id><level_type><type>_<frequency>_<initialization_date>_<virt_code>`\n",
    "\n",
    "E.g.: _\"E5pl00_1H_1979-10-02_157\"_\n",
    "\n",
    "The directory root is constructed by:\n",
    "- `/work/bk1099/ERA5/<era_id>/<level_type>/<type>/<frequency>/<virt_code>`\n",
    "\n",
    "E.g.: _\"/pool/data/ERA5/E5/sf/an/1H/027/\"_ \n",
    "\n",
    "`era_id`: E5 <br>\n",
    "`virt_code`: %03d <br>\n",
    "`initialization_date`: YYYY[-MM][[-DD]] <br>\n",
    "`validation_date`: YYYY[-MM][[-DD]] <br>\n",
    "`frequency`: hourly=01=1H, monthly=1M <br>\n",
    "`type`: an=00, fc=12 <br>\n",
    "`level_type`: ml, pl, sf <br>\n",
    "`path` : \"STRING\" <br>\n",
    "`step`: 01 <br>\n",
    "\n",
    "From inside the file (`cdo sinfo`): <br>\n",
    "`startdate`: \"YYYY-MM-DD HH:MM:SS\"  <br>\n",
    "`enddate`: \"YYYY-MM-DD HH:MM:SS\" <br>\n",
    "`cell_methods`:\n",
    "- \"time: mean within days time: mean over days\", #synoptical\n",
    "- \"time: maximum within days time: mean over days\"\n",
    "- \"time: minimum within days time: mean over days\"\n",
    "- \"time: mean\"\n",
    "- \"time: point\"\n",
    "- \"time: maximum\"\n",
    "- \"time: minimum\""
   ]
  },
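  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick illustration of the convention above (a sketch only, independent of the `parse_filename` function defined further below), the example filename can be split into its DRS components with a named-group regular expression:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: split an ERA5 file name into its DRS components with a regular expression\n",
    "drs_pattern = re.compile(\n",
    "    r\"(?P<era_id>E5|E1|ET)(?P<level_type>ml|pl|sf)(?P<type>00|12)\"\n",
    "    r\"_(?P<frequency>1H|1D|1M|IV)_(?P<date>[0-9-]+|INVARIANT)_(?P<virt_code>[0-9]{3})\"\n",
    ")\n",
    "\n",
    "drs_pattern.match(\"E5pl00_1H_1979-10-02_157\").groupdict()"
   ]
  },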
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "era_cv={}\n",
    "era_cv[\"era_id\"]=[\"E5\", \"E1\", \"ET\"]\n",
    "era_cv[\"virt_code\"]=\"[0-9][0-9][0-9]\"\n",
    "era_cv[\"initialization_date\"]=\"%Y-%m-%d\"\n",
    "era_cv[\"validation_date\"]=\"%Y-%m-%d\" #or INVARIANT\n",
    "era_cv[\"frequency\"]={\"1H\":\"hourly\",\"1D\":\"daily\", \"IV\":\"invariant\"} #, \"1M\":\"monthly\"\n",
    "era_cv[\"dataType\"]={\"00\":\"an\", \"12\":\"fc\"}\n",
    "era_cv[\"level_type\"]={\"ml\":\"model_level\", \"pl\":\"pressure_level\",\"sf\":\"surface\"}\n",
    "era_cv[\"step\"]=[\"01\"]\n",
    "era_cv[\"startdate\"]=\"%Y-%m-%d\"\n",
    "era_cv[\"enddate\"]=\"%Y-%m-%d\"\n",
    "era_cv[\"cell_methods\"]=[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "@lru_cache(maxsize=None)\n",
    "def get_file_list(root_path): #, depth=0, extension='*.nc'):        \n",
    "    depth=1\n",
    "    from dask.diagnostics import ProgressBar\n",
    "\n",
    "    #dirs=[p for p in Path(root_path).glob(\"*/*/\") if p.is_dir()]\n",
    "    \n",
    "    cmd = ['find', \"/work/bk1099/data/E5\", '-mindepth', '4', '-maxdepth', '4','-type', \"d\"]\n",
    "    proc = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)\n",
    "    output = proc.stdout.read().decode('utf-8').split()\n",
    "    dirs=[Path(entry) for entry in output]\n",
    "    cmd = ['find', \"/work/bk1099/data/ET\", '-mindepth', '4', '-maxdepth', '4','-type', \"d\"]\n",
    "    proc = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)\n",
    "    output = proc.stdout.read().decode('utf-8').split()\n",
    "    dirs+=[Path(entry) for entry in output]\n",
    "    cmd = ['find', \"/work/bk1099/data/E1\", '-mindepth', '4', '-maxdepth', '4','-type', \"d\"]\n",
    "    proc = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)\n",
    "    output = proc.stdout.read().decode('utf-8').split()\n",
    "    dirs+=[Path(entry) for entry in output]\n",
    "    \n",
    "    @dask.delayed\n",
    "    def _file_dir_files(directory):\n",
    "        try:\n",
    "            cmd = ['find', '-L', directory.as_posix(), '-name', '*.grb','-type', \"f\", \"-perm\", \"-444\"]\n",
    "            proc = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)\n",
    "            output = proc.stdout.read().decode('utf-8').split()\n",
    "        except Exception:\n",
    "            output = []\n",
    "        return output\n",
    "\n",
    "    print('Getting list of assets...\\n')\n",
    "    filelist = [_file_dir_files(directory) for directory in dirs]\n",
    "    # watch progress\n",
    "    with ProgressBar():\n",
    "        filelist = dask.compute(*filelist)\n",
    "\n",
    "    filelist = list(itertools.chain(*filelist))\n",
    "    return filelist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "E1  EC\tincoming  READMEs  sf00_1D  sf12_01  sf12_MM\n",
      "E5  ET\tpl00_1D   scripts  sf00_MM  sf12_1D\n"
     ]
    }
   ],
   "source": [
    "!ls /work/bk1099/data/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_filename(path):\n",
    "    fileparts={}\n",
    "    fileparts[\"path\"] = path\n",
    "    filename=path.split(os.sep)[-1]\n",
    "    if len(filename.split(\"_\")) < 4 or len(filename.split(\"_\")[0])<6:\n",
    "        return fileparts\n",
    "    \n",
    "    fileparts[\"era_id\"]=filename[0:2]\n",
    "    fileparts[\"level_type\"]=filename[2:4]\n",
    "    fileparts[\"dataType\"]=filename[4:6]\n",
    "    fileparts[\"frequency\"]=filename.split(\"_\")[1]\n",
    "    for key in fileparts :\n",
    "        if key != \"path\" :\n",
    "            if not fileparts[key] in era_cv[key]:\n",
    "                return fileparts\n",
    "        \n",
    "    fileparts[\"validation_date\"]=filename.split(\"_\")[2]\n",
    "    if not fileparts[\"validation_date\"]==\"INVARIANT\":\n",
    "        if \"-\" in fileparts[\"validation_date\"] and len(fileparts[\"validation_date\"].split(\"-\")[1]) == 4:\n",
    "            fileparts[\"validation_date\"]=fileparts[\"validation_date\"].split(\"-\")[0]\n",
    "        teststring = era_cv[\"validation_date\"]\n",
    "        \n",
    "        if fileparts[\"frequency\"] == \"daily\" :\n",
    "            teststring = '-'.join(teststring.split(\"-\")[0:-1])\n",
    "        try :\n",
    "            datetime.datetime.strptime(fileparts[\"validation_date\"],teststring)\n",
    "        except:\n",
    "            return fileparts\n",
    "\n",
    "    fileparts[\"initialization_date\"] = np.nan\n",
    "    if fileparts[\"dataType\"] == \"12\":\n",
    "        fileparts[\"initialization_date\"] = fileparts[\"validation_date\"]\n",
    "        fileparts[\"validation_date\"] = np.nan\n",
    "    fileparts[\"virt_code\"]=filename.split(\"_\")[3].split('.')[0]\n",
    "    if '000' == fileparts[\"virt_code\"] :\n",
    "        return fileparts\n",
    "    if not re.search(era_cv[\"virt_code\"],fileparts[\"virt_code\"]):\n",
    "        return fileparts\n",
    "    \n",
    "    fileparts[\"code\"]=fileparts[\"virt_code\"]\n",
    "    fileparts[\"table_id\"]=128\n",
    "    if int(fileparts[\"virt_code\"]) > 256 :\n",
    "        fileparts[\"code\"]=f'{(int(fileparts[\"code\"])-256):02}'\n",
    "        fileparts[\"table_id\"]=256\n",
    "\n",
    "    fileparts[\"step\"]=np.nan\n",
    "    if fileparts[\"frequency\"] in era_cv[\"step\"]:\n",
    "        fileparts[\"step\"] = fileparts[\"frequency\"]\n",
    "    \n",
    "    for key in era_cv :\n",
    "        if type(era_cv[key]) == dict :\n",
    "            fileparts[key] = era_cv[key][fileparts[key]]\n",
    "    #For cross-checking against root dir\n",
    "    fileparts[\"checkpath\"]=f\"/work/bk1099/data/{fileparts['era_id']}/{filename[2:4]}/{fileparts['dataType']}/{filename.split('_')[1]}/{fileparts['virt_code']}/{filename}\"\n",
    "\n",
    "    return fileparts    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Getting list of assets...\n",
      "\n",
      "[########################################] | 100% Completed |  2min 32.9s\n"
     ]
    }
   ],
   "source": [
    "entries = list(map(parse_filename, get_file_list(\"/work/bk1099/data\")))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>path</th>\n",
       "      <th>era_id</th>\n",
       "      <th>level_type</th>\n",
       "      <th>dataType</th>\n",
       "      <th>frequency</th>\n",
       "      <th>validation_date</th>\n",
       "      <th>initialization_date</th>\n",
       "      <th>virt_code</th>\n",
       "      <th>code</th>\n",
       "      <th>table_id</th>\n",
       "      <th>step</th>\n",
       "      <th>checkpath</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>/work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_20...</td>\n",
       "      <td>E5</td>\n",
       "      <td>model_level</td>\n",
       "      <td>an</td>\n",
       "      <td>hourly</td>\n",
       "      <td>2004-02-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>248</td>\n",
       "      <td>248</td>\n",
       "      <td>128.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>/work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_20...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>/work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_19...</td>\n",
       "      <td>E5</td>\n",
       "      <td>model_level</td>\n",
       "      <td>an</td>\n",
       "      <td>hourly</td>\n",
       "      <td>1999-03-16</td>\n",
       "      <td>NaN</td>\n",
       "      <td>248</td>\n",
       "      <td>248</td>\n",
       "      <td>128.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>/work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_19...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_19...</td>\n",
       "      <td>E5</td>\n",
       "      <td>model_level</td>\n",
       "      <td>an</td>\n",
       "      <td>hourly</td>\n",
       "      <td>1999-06-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>248</td>\n",
       "      <td>248</td>\n",
       "      <td>128.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>/work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_19...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>/work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_19...</td>\n",
       "      <td>E5</td>\n",
       "      <td>model_level</td>\n",
       "      <td>an</td>\n",
       "      <td>hourly</td>\n",
       "      <td>1999-06-18</td>\n",
       "      <td>NaN</td>\n",
       "      <td>248</td>\n",
       "      <td>248</td>\n",
       "      <td>128.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>/work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_19...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>/work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_20...</td>\n",
       "      <td>E5</td>\n",
       "      <td>model_level</td>\n",
       "      <td>an</td>\n",
       "      <td>hourly</td>\n",
       "      <td>2004-04-21</td>\n",
       "      <td>NaN</td>\n",
       "      <td>248</td>\n",
       "      <td>248</td>\n",
       "      <td>128.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>/work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_20...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                path era_id   level_type  \\\n",
       "0  /work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_20...     E5  model_level   \n",
       "1  /work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_19...     E5  model_level   \n",
       "2  /work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_19...     E5  model_level   \n",
       "3  /work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_19...     E5  model_level   \n",
       "4  /work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_20...     E5  model_level   \n",
       "\n",
       "  dataType frequency validation_date initialization_date virt_code code  \\\n",
       "0       an    hourly      2004-02-28                 NaN       248  248   \n",
       "1       an    hourly      1999-03-16                 NaN       248  248   \n",
       "2       an    hourly      1999-06-20                 NaN       248  248   \n",
       "3       an    hourly      1999-06-18                 NaN       248  248   \n",
       "4       an    hourly      2004-04-21                 NaN       248  248   \n",
       "\n",
       "   table_id  step                                          checkpath  \n",
       "0     128.0   NaN  /work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_20...  \n",
       "1     128.0   NaN  /work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_19...  \n",
       "2     128.0   NaN  /work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_19...  \n",
       "3     128.0   NaN  /work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_19...  \n",
       "4     128.0   NaN  /work/bk1099/data/E5/ml/an/1H/248/E5ml00_1H_20...  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1 = pd.DataFrame(entries)\n",
    "df1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5031180"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>path</th>\n",
       "      <th>era_id</th>\n",
       "      <th>level_type</th>\n",
       "      <th>dataType</th>\n",
       "      <th>frequency</th>\n",
       "      <th>validation_date</th>\n",
       "      <th>initialization_date</th>\n",
       "      <th>virt_code</th>\n",
       "      <th>code</th>\n",
       "      <th>table_id</th>\n",
       "      <th>step</th>\n",
       "      <th>checkpath</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>735798</th>\n",
       "      <td>/work/bk1099/data/E5/sf/fc/1H/197/E5sf12_1H_20...</td>\n",
       "      <td>E5</td>\n",
       "      <td>sf</td>\n",
       "      <td>12</td>\n",
       "      <td>1H</td>\n",
       "      <td>2014-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>735799</th>\n",
       "      <td>/work/bk1099/data/E5/sf/fc/1H/197/E5sf12_1H_20...</td>\n",
       "      <td>E5</td>\n",
       "      <td>sf</td>\n",
       "      <td>12</td>\n",
       "      <td>1H</td>\n",
       "      <td>2015-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>735800</th>\n",
       "      <td>/work/bk1099/data/E5/sf/fc/1H/197/E5sf12_1H_20...</td>\n",
       "      <td>E5</td>\n",
       "      <td>sf</td>\n",
       "      <td>12</td>\n",
       "      <td>1H</td>\n",
       "      <td>2013-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>735801</th>\n",
       "      <td>/work/bk1099/data/E5/sf/fc/1H/197/E5sf12_1H_20...</td>\n",
       "      <td>E5</td>\n",
       "      <td>sf</td>\n",
       "      <td>12</td>\n",
       "      <td>1H</td>\n",
       "      <td>2012-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>735802</th>\n",
       "      <td>/work/bk1099/data/E5/sf/fc/1H/197/E5sf12_1H_20...</td>\n",
       "      <td>E5</td>\n",
       "      <td>sf</td>\n",
       "      <td>12</td>\n",
       "      <td>1H</td>\n",
       "      <td>2016-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4006185</th>\n",
       "      <td>/work/bk1099/data/E5/sf/an/1D/164/E5sf00_1D_19...</td>\n",
       "      <td>E5</td>\n",
       "      <td>sf</td>\n",
       "      <td>00</td>\n",
       "      <td>1D</td>\n",
       "      <td>1944-11</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4006186</th>\n",
       "      <td>/work/bk1099/data/E5/sf/an/1D/164/E5sf00_1D_20...</td>\n",
       "      <td>E5</td>\n",
       "      <td>sf</td>\n",
       "      <td>00</td>\n",
       "      <td>1D</td>\n",
       "      <td>2008-12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4006187</th>\n",
       "      <td>/work/bk1099/data/E5/sf/an/1D/164/E5sf00_1D_19...</td>\n",
       "      <td>E5</td>\n",
       "      <td>sf</td>\n",
       "      <td>00</td>\n",
       "      <td>1D</td>\n",
       "      <td>1967-06</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4006188</th>\n",
       "      <td>/work/bk1099/data/E5/sf/an/1D/164/E5sf00_1D_20...</td>\n",
       "      <td>E5</td>\n",
       "      <td>sf</td>\n",
       "      <td>00</td>\n",
       "      <td>1D</td>\n",
       "      <td>2008-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4006189</th>\n",
       "      <td>/work/bk1099/data/E5/sf/an/1D/164/E5sf00_1D_20...</td>\n",
       "      <td>E5</td>\n",
       "      <td>sf</td>\n",
       "      <td>00</td>\n",
       "      <td>1D</td>\n",
       "      <td>2020-04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>117500 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                      path era_id level_type  \\\n",
       "735798   /work/bk1099/data/E5/sf/fc/1H/197/E5sf12_1H_20...     E5         sf   \n",
       "735799   /work/bk1099/data/E5/sf/fc/1H/197/E5sf12_1H_20...     E5         sf   \n",
       "735800   /work/bk1099/data/E5/sf/fc/1H/197/E5sf12_1H_20...     E5         sf   \n",
       "735801   /work/bk1099/data/E5/sf/fc/1H/197/E5sf12_1H_20...     E5         sf   \n",
       "735802   /work/bk1099/data/E5/sf/fc/1H/197/E5sf12_1H_20...     E5         sf   \n",
       "...                                                    ...    ...        ...   \n",
       "4006185  /work/bk1099/data/E5/sf/an/1D/164/E5sf00_1D_19...     E5         sf   \n",
       "4006186  /work/bk1099/data/E5/sf/an/1D/164/E5sf00_1D_20...     E5         sf   \n",
       "4006187  /work/bk1099/data/E5/sf/an/1D/164/E5sf00_1D_19...     E5         sf   \n",
       "4006188  /work/bk1099/data/E5/sf/an/1D/164/E5sf00_1D_20...     E5         sf   \n",
       "4006189  /work/bk1099/data/E5/sf/an/1D/164/E5sf00_1D_20...     E5         sf   \n",
       "\n",
       "        dataType frequency validation_date initialization_date virt_code code  \\\n",
       "735798        12        1H         2014-01                 NaN       NaN  NaN   \n",
       "735799        12        1H         2015-01                 NaN       NaN  NaN   \n",
       "735800        12        1H         2013-01                 NaN       NaN  NaN   \n",
       "735801        12        1H         2012-01                 NaN       NaN  NaN   \n",
       "735802        12        1H         2016-01                 NaN       NaN  NaN   \n",
       "...          ...       ...             ...                 ...       ...  ...   \n",
       "4006185       00        1D         1944-11                 NaN       NaN  NaN   \n",
       "4006186       00        1D         2008-12                 NaN       NaN  NaN   \n",
       "4006187       00        1D         1967-06                 NaN       NaN  NaN   \n",
       "4006188       00        1D         2008-05                 NaN       NaN  NaN   \n",
       "4006189       00        1D         2020-04                 NaN       NaN  NaN   \n",
       "\n",
       "         table_id  step checkpath  \n",
       "735798        NaN   NaN       NaN  \n",
       "735799        NaN   NaN       NaN  \n",
       "735800        NaN   NaN       NaN  \n",
       "735801        NaN   NaN       NaN  \n",
       "735802        NaN   NaN       NaN  \n",
       "...           ...   ...       ...  \n",
       "4006185       NaN   NaN       NaN  \n",
       "4006186       NaN   NaN       NaN  \n",
       "4006187       NaN   NaN       NaN  \n",
       "4006188       NaN   NaN       NaN  \n",
       "4006189       NaN   NaN       NaN  \n",
       "\n",
       "[117500 rows x 12 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Some entries are invalid\n",
    "invalids = df1[~df1.checkpath.isin(df1[\"path\"])]\n",
    "invalids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1=df1[df1.checkpath.isin(df1[\"path\"])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#with open('/home/k/k204210/intake-esm/invalids-era5.txt', 'w') as f :\n",
    "#    for file in invalids.path.values :\n",
    "#        if type(file) == str :\n",
    "#            f.write(file+\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/work/bk1099/data/E5/sf/fc/1H/197/E5sf12_1H_2014-01_197.grb'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "invalids[\"path\"].values[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1[\"validation_date\"]=df1[\"validation_date\"].fillna(False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1=df1[~((df1[\"dataType\"]==\"an\") &\n",
    "    (df1[\"level_type\"]==\"model_level\") &\n",
    "    (df1[\"frequency\"]==\"hourly\") &\n",
    "    (df1[\"validation_date\"].str.contains('|'.join([str(a) for a in range(1940,1959)]))))\n",
    "   ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1=df1[~((df1[\"frequency\"]==\"daily\" ) &\n",
    "          (df1[\"level_type\"]!=\"surface\"))\n",
    "       ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4510994"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "#dfnew = df1[df1.era_id.isin(activity_ids)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfnew=df1[~df1[list(set(list(df1.keys()))-{\"path\"})].duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "261"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfnew.loc[:,[\"short_name\", \"long_name\", \"units\", \"stepType\"]] = pd.DataFrame([[\"\", \"\", \"\", \"\"]], index=dfnew.index)\n",
    "groups = dfnew.groupby([\"level_type\", \"dataType\", \"frequency\", \"code\", \"table_id\"])\n",
    "len(groups)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6a458b1f37df46ba990f2343c7e8f41a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/261 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "for group_name, df_group in tqdm(groups):\n",
    "    groupxr = xarray.open_dataset(df_group[\"path\"].values[0], engine=\"cfgrib\", chunks=\"auto\")\n",
    "    varname = list(groupxr.data_vars)[0]\n",
    "    varxr = groupxr[varname]\n",
    "    group_indices = df_group.index[:].values.tolist()\n",
    "    dfnew.loc[group_indices, \"short_name\"] = varname\n",
    "    dfnew.loc[group_indices, \"long_name\"] = varxr.attrs[\"long_name\"]\n",
    "    dfnew.loc[group_indices, \"units\"] = varxr.attrs[\"units\"]\n",
    "    dfnew.loc[group_indices, \"stepType\"] = varxr.attrs[\"GRIB_stepType\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfnew.loc[:,\"project\"]=\"era5\"\n",
    "dfnew.loc[:,\"institution_id\"]=\"ecmwf\"\n",
    "dfnew.loc[:,\"source_id\"]=\"IFS\"\n",
    "dfnew.loc[:,\"format\"]=\"grib\"\n",
    "dfnew.loc[:,\"uri\"]=dfnew[\"path\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns = [\"project\", \"institution_id\", \"source_id\", \"era_id\", \"dataType\", \"level_type\", \"frequency\", \"stepType\", \"table_id\", \"code\", \"short_name\", \"long_name\", \"units\", \"validation_date\", \"initialization_date\", \"step\", \"uri\", \"path\", \"format\"]\n",
    "dfnew = dfnew[columns]\n",
    "dfnew = dfnew.sort_values(columns, ascending = True).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#df=pd.concat([df,dfnew], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfnew"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfnew.to_csv(\"/home/k/k204210/intake-esm/catalogs/dkrz_era5_disk.csv.gz\", compression=\"gzip\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "dfnew[\"validation_date\"].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(dfnew[\"validation_date\"].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(dfnew.groupby([\"dataType\", \"level_type\", \"frequency\"]).groups.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (based on the module python3/2022.01)",
   "language": "python",
   "name": "python3_2022_01"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}