# %% Imports
import sys

import intake
import pandas as pd

# %% Make the builder helper modules importable
# The directory is prepended (position 0) so modules found there take
# precedence over same-named modules elsewhere on sys.path.
BUILDERS_DIR = '/home/k/k204210/volume/data-infrastructure-services/intake-esm/builder/ncar-builder/builders/'
sys.path.insert(0, BUILDERS_DIR)

# %% Load the tape index listing
# The file is read as a single-space separated table: its first three lines
# are skipped and the eight column names below are assigned positionally.
# In the preview rendered by this notebook the rows look like
# "<permissions> <owner> <size> <date> <time> <path> -> <tar archive>",
# so "temp" normally holds the "->" separator.
INDEX_FILE = "/work/ik1017/CMIP6/meta/cmip6_tape_index_paths_adjusted_2.txt"
INDEX_COLUMNS = [
    "permissions",
    "owner",
    "size",
    "date",
    "time",
    "path",
    "temp",
    "tar",
]
indexes = pd.read_csv(INDEX_FILE, sep=' ', skiprows=3, names=INDEX_COLUMNS)
<th>date</th>\n", + " <th>time</th>\n", + " <th>path</th>\n", + " <th>temp</th>\n", + " <th>tar</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>-r--r--r--</td>\n", + " <td>k204145/esgf</td>\n", + " <td>2.508455e+09</td>\n", + " <td>2019-09-30</td>\n", + " <td>23:11</td>\n", + " <td>./DCPP/MPI-M/MPI-ESM1-2-HR/dcppA-hindcast/s196...</td>\n", + " <td>-></td>\n", + " <td>DCPP_001.tar</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>-r--r--r--</td>\n", + " <td>k204145/esgf</td>\n", + " <td>2.370545e+09</td>\n", + " <td>2019-09-29</td>\n", + " <td>22:50</td>\n", + " <td>./DCPP/MPI-M/MPI-ESM1-2-HR/dcppA-hindcast/s196...</td>\n", + " <td>-></td>\n", + " <td>DCPP_001.tar</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>-r--r--r--</td>\n", + " <td>k204145/esgf</td>\n", + " <td>1.806243e+09</td>\n", + " <td>2019-09-29</td>\n", + " <td>22:50</td>\n", + " <td>./DCPP/MPI-M/MPI-ESM1-2-HR/dcppA-hindcast/s196...</td>\n", + " <td>-></td>\n", + " <td>DCPP_001.tar</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>-r--r--r--</td>\n", + " <td>k204145/esgf</td>\n", + " <td>2.491425e+09</td>\n", + " <td>2019-09-29</td>\n", + " <td>22:50</td>\n", + " <td>./DCPP/MPI-M/MPI-ESM1-2-HR/dcppA-hindcast/s196...</td>\n", + " <td>-></td>\n", + " <td>DCPP_001.tar</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>-r--r--r--</td>\n", + " <td>k204145/esgf</td>\n", + " <td>2.527008e+09</td>\n", + " <td>2019-09-29</td>\n", + " <td>22:50</td>\n", + " <td>./DCPP/MPI-M/MPI-ESM1-2-HR/dcppA-hindcast/s196...</td>\n", + " <td>-></td>\n", + " <td>DCPP_001.tar</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1626476</th>\n", + " <td>-r--r--r--</td>\n", + " <td>k204210/esgf</td>\n", + " 
<td>7.618832e+07</td>\n", + " <td>2021-01-18</td>\n", + " <td>20:00</td>\n", + " <td>./ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r9i1p...</td>\n", + " <td>-></td>\n", + " <td>Update_2021-02-16_1206.tar</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1626477</th>\n", + " <td>-r--r--r--</td>\n", + " <td>k204210/esgf</td>\n", + " <td>7.614116e+07</td>\n", + " <td>2021-01-18</td>\n", + " <td>20:00</td>\n", + " <td>./ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r9i1p...</td>\n", + " <td>-></td>\n", + " <td>Update_2021-02-16_1206.tar</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1626478</th>\n", + " <td>-r--r--r--</td>\n", + " <td>k204210/esgf</td>\n", + " <td>7.609798e+07</td>\n", + " <td>2021-01-18</td>\n", + " <td>20:00</td>\n", + " <td>./ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r9i1p...</td>\n", + " <td>-></td>\n", + " <td>Update_2021-02-16_1206.tar</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1626479</th>\n", + " <td>-r--r--r--</td>\n", + " <td>k204210/esgf</td>\n", + " <td>7.611006e+07</td>\n", + " <td>2021-01-18</td>\n", + " <td>20:00</td>\n", + " <td>./ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r9i1p...</td>\n", + " <td>-></td>\n", + " <td>Update_2021-02-16_1206.tar</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1626480</th>\n", + " <td>-r--r--r--</td>\n", + " <td>k204210/esgf</td>\n", + " <td>2.439534e+07</td>\n", + " <td>2021-01-18</td>\n", + " <td>19:58</td>\n", + " <td>./ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r9i1p...</td>\n", + " <td>-></td>\n", + " <td>Update_2021-02-16_1206.tar</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>1626481 rows × 8 columns</p>\n", + "</div>" + ], + "text/plain": [ + " permissions owner size date time \\\n", + "0 -r--r--r-- k204145/esgf 2.508455e+09 2019-09-30 23:11 \n", + "1 -r--r--r-- k204145/esgf 2.370545e+09 2019-09-29 22:50 \n", + "2 -r--r--r-- k204145/esgf 1.806243e+09 2019-09-29 22:50 \n", + "3 -r--r--r-- k204145/esgf 2.491425e+09 2019-09-29 22:50 \n", + "4 -r--r--r-- k204145/esgf 2.527008e+09 2019-09-29 22:50 \n", + "... ... ... ... 
... ... \n", + "1626476 -r--r--r-- k204210/esgf 7.618832e+07 2021-01-18 20:00 \n", + "1626477 -r--r--r-- k204210/esgf 7.614116e+07 2021-01-18 20:00 \n", + "1626478 -r--r--r-- k204210/esgf 7.609798e+07 2021-01-18 20:00 \n", + "1626479 -r--r--r-- k204210/esgf 7.611006e+07 2021-01-18 20:00 \n", + "1626480 -r--r--r-- k204210/esgf 2.439534e+07 2021-01-18 19:58 \n", + "\n", + " path temp \\\n", + "0 ./DCPP/MPI-M/MPI-ESM1-2-HR/dcppA-hindcast/s196... -> \n", + "1 ./DCPP/MPI-M/MPI-ESM1-2-HR/dcppA-hindcast/s196... -> \n", + "2 ./DCPP/MPI-M/MPI-ESM1-2-HR/dcppA-hindcast/s196... -> \n", + "3 ./DCPP/MPI-M/MPI-ESM1-2-HR/dcppA-hindcast/s196... -> \n", + "4 ./DCPP/MPI-M/MPI-ESM1-2-HR/dcppA-hindcast/s196... -> \n", + "... ... ... \n", + "1626476 ./ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r9i1p... -> \n", + "1626477 ./ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r9i1p... -> \n", + "1626478 ./ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r9i1p... -> \n", + "1626479 ./ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r9i1p... -> \n", + "1626480 ./ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r9i1p... -> \n", + "\n", + " tar \n", + "0 DCPP_001.tar \n", + "1 DCPP_001.tar \n", + "2 DCPP_001.tar \n", + "3 DCPP_001.tar \n", + "4 DCPP_001.tar \n", + "... ... 
# %% Preview the listing
# Show only the first rows instead of rendering the whole frame.
indexes.head()

# %% Repair rows whose fields landed in the wrong column
# Every string test uses na=False so that a missing value counts as "no match"
# instead of propagating NaN into a boolean mask (which pandas rejects).
# Patterns are matched literally (regex=False) wherever no regex is needed,
# which also avoids invalid escape sequences such as '\:' in string literals.

# A ':' in "path" means the real path sits in "temp"
# (presumably the time field was shifted one column to the right - TODO confirm).
path_has_colon = indexes["path"].str.contains(":", regex=False, na=False)
indexes.loc[path_has_colon, "path"] = indexes["temp"]

# A 2019-/2020-/2021- stamp in a "path" that is not a NetCDF file means the
# real value sits in "tar" (presumably the date field was shifted - TODO confirm).
# One combined pattern replaces three copy-pasted statements; the result is the
# same because each of them assigned the row's own "tar" value.
path_has_date = indexes["path"].str.contains(r"(?:2019|2020|2021)-", na=False)
path_is_netcdf = indexes["path"].str.contains(".nc", regex=False, na=False)
indexes.loc[path_has_date & ~path_is_netcdf, "path"] = indexes["tar"]

# %% Drop rows that cannot be cataloged
# A row is kept only if it names a tar archive, the "tar" field is not the
# "->" separator itself, and a path is available for the parser.
tar_is_arrow = indexes["tar"].str.contains("->", regex=False, na=False)
usable = indexes["tar"].notna() & ~tar_is_arrow & indexes["path"].notna()
indexes = indexes[usable]

# %% Parse the paths into catalog columns
from core import Builder, extract_attr_with_regex, get_asset_list, reverse_filename_format
from cmip import cmip6_parser

cmip_columns = [
    'activity_id',
    'institution_id',
    'source_id',
    'experiment_id',
    'member_id',
    'table_id',
    'variable_id',
    'grid_label',
    'dcpp_init_year',
    'version',
    'time_range',
    'path',
]
b = Builder(cmip_columns)
df = b(list(indexes["path"]), cmip6_parser)

# %% Attach the archive location of every asset
ARCHIVE_ROOT = "/arch/ik1017/cmip6/CMIP6/"

# "indexes" was row-filtered above, so its index labels have gaps, whereas
# "df" is a newly built frame. Assigning the Series directly would align on
# labels and silently attach wrong or missing tar names, so the values are
# assigned by position instead.
# NOTE(review): this assumes Builder returns exactly one row per input path,
# in input order - confirm against core.Builder. The length check below turns
# a violation of that assumption into an error instead of a corrupt catalog.
if len(df) != len(indexes):
    raise ValueError(
        f"Builder returned {len(df)} rows for {len(indexes)} input paths; "
        "tar archives cannot be matched to catalog rows by position."
    )
df["tar"] = (ARCHIVE_ROOT + indexes["tar"]).to_numpy()

# %% Write the catalog
df.to_csv("/work/ik1017/CMIP6/meta/dkrz_cmip6_archive.csv", index=False)