diff --git a/TODO b/TODO new file mode 100644 index 0000000000000000000000000000000000000000..8cc24cfdb6c630548ee5cc6c4b49752baa6aaf70 --- /dev/null +++ b/TODO @@ -0,0 +1,138 @@ +I need to create a csv to json reader that: + +1. extracts the parameters and maps them to different names in 4 files: fx, 1hr, day, mon: + +```json +variable_entry = { + row["CMPAR"]: { + "cell_measures": --> search either in cmip6, obs4mips, crosscheck with realm or a default value e.g. "area: areacella", + "cell_methods": --> same as with cell_measures, + "comment": --> from cmip6,obs4mips,append any comment from CFNAME_COMMENT, + "dimensions": --> same as with cell_methods, + "frequency": 1hr,day,mon, depends on the file. + "long_name": row["CMLNAME"], + "modeling_realm": row["REALM"], + "out_name": row["REALM"], + "positive":"", derived from the row["CMFACT"] + "standard_name": row["CFNAME"], + "type":"real", + "units": row["CMUNIT"], + "valid_max":"", + "valid_min":"", + --- + "grib_paramID": row["ECTABLE"], variable + "grib_code": row["ECCODE"], variable + "orig_short_name": row["ECPAR"], global + "orig_name": row["ECNAME"], global + "orig_units": row["ECUNIT"], global + "grib_description": row["ECDESC"],global + "orig_grid": row["ECGRID"], global --> here we should have all, in the netcdf file only the selected one. 
+ "conversion": row["CMFACT"], global + "table": row["CMTABLE"], global --> we look there but also in the table_id of each file of cmip6/obs4mips + "mapping": row["CMIP"], that should be either era5,cf,cf-cmip, cmip5, cmip6, obs4mips --> in principle from the table but if I find the variable at Obs4mips or CMIP6 then it goes for them + + }, +} +``` + +based on .e.g `obs4MIPs_A1hr.json`: + +```json + "variable_entry":{ + "clt":{ + "cell_measures":"area: areacella", + "cell_methods":"area: time: mean", + "comment":"Total cloud area fraction (reported as a percentage) for the whole atmospheric column, as seen from the surface or the top of the atmosphere. Includes both large-scale and convective cloud.", + "dimensions":"longitude latitude time", + "frequency":"3hr", + "long_name":"Total Cloud Cover Percentage", + "modeling_realm":"atmos", + "ok_max_mean_abs":"", + "ok_min_mean_abs":"", + "out_name":"clt", + "positive":"", + "standard_name":"cloud_area_fraction", + "type":"real", + "units":"%", + "valid_max":"", + "valid_min":"" + }, +``` + +for certain variables it needs to check in cmip6 and obs4mips tables, +- if they are the same, mapping --> obs4mips +- if they differ: + 1. info in the logfile + 2. two entries of the same parameter in the variable dict + +- additionally we have to add some extra info that needs to be elsewhere: + +``` + :institution = "European Centre for Medium-Range Weather Forecasts" ; + :institute_id = "ECMWF" ; + :model = "IFS" + :Conventions = "CF-1.8" ; <--- UPDATED, LET THE CHECKER RUN THROUGH AND THEN WE GET THE NUMBER + :license = "Contains modified Copernicus Atmosphere Monitoring Service information [2024]. Neither the European Commission nor ECMWF is responsible for any use that may be made of the Copernicus information or data it contains." 
; +``` + + + +it should look like: + + +```shell +╰─$ ncdump -h /work/bm1159/XCES/data4xces/reanalysis/reanalysis/ECMWF/IFS/ERA5/mon/atmos/pr/r1i1p1/pr_Amon_reanalysis_era5_r1i1p1_20240101-20241231.nc +netcdf pr_Amon_reanalysis_era5_r1i1p1_20240101-20241231 { +dimensions: + time = UNLIMITED ; // (10 currently) + lon = 1280 ; + bnds = 2 ; + lat = 640 ; +variables: + double time(time) ; + time:standard_name = "time" ; + time:units = "hours since 2023-12-31 18:00:00" ; + time:calendar = "proleptic_gregorian" ; + time:axis = "T" ; + double lon(lon) ; + lon:standard_name = "longitude" ; + lon:long_name = "longitude" ; + lon:units = "degrees_east" ; + lon:axis = "X" ; + lon:bounds = "lon_bnds" ; + double lon_bnds(lon, bnds) ; + double lat(lat) ; + lat:standard_name = "latitude" ; + lat:long_name = "latitude" ; + lat:units = "degrees_north" ; + lat:axis = "Y" ; + lat:bounds = "lat_bnds" ; + double lat_bnds(lat, bnds) ; + float pr(time, lat, lon) ; + pr:standard_name = "precipitation_flux" ; + pr:long_name = "Precipitation" ; + pr:units = "kg m-2 s-1" ; + pr:cell_methods = "time: mean" ; <--- NEW (this is particular for precip, because it comes from "m" in sum), to add to the excel table + pr:cell_measures = "area: areacella" ; <--- NEW (for this we would need to provide ONE area cell variable as well for the weights), to add to excel table -> ANGELIKA + pr:grib_code = 228 ; <--- RENAME + pr:grib_table = 128 ; <--- RENAME + pr:_FillValue = 1.e+20f ; + pr:missing_value = 1.e+20f ; +// global attributes: + :ECDESC = "This parameter is the accumulated liquid and frozen water, comprising rain and snow, that falls to the Earth's surface. It is the sum of large-scale precipitation and convective precipitation. Large-scale precipitation is generated by the cloud scheme in the ECMWF Integrated Forecasting System (IFS). 
The cloud scheme represents the formation and dissipation of clouds and large-scale precipitation due to changes in atmospheric quantities (such as pressure, temperature and moisture) predicted directly by the IFS at spatial scales of the [grid box](https://confluence.ecmwf.int/display/CKB/Model+grid+box+and+time+step) or larger. Convective precipitation is generated by the convection scheme in the IFS, which represents convection at spatial scales smaller than the grid box. [See further information.](https://confluence.ecmwf.int/display/CKB/Convective+and+large- scale+precipitation) This parameter does not include fog, dew or the precipitation that evaporates in the atmosphere before it lands at the surface of the Earth. This parameter is the total amount of water [accumulated over a particular time period which depends on the data extracted](https://confluence.ecmwf.int/display/CKB/ERA5%3A+data+documentation#ERA5:datadocumentation- Meanrates/fluxesandaccumulations). The units of this parameter are depth in metres of water equivalent. It is the depth the water would have if it were spread evenly over the grid box. Care should be taken when comparing model parameters with observations, because observations are often local to a particular point in space and time, rather than representing averages over a model grid box. "; <--- NEW long original ECMWF description + :orig_name = "Total precipitation"; <--- NEW + :orig_short_name = "tp"; <--- NEW + :orig_units = "m"; <---- NEW + :conversion = "1.0/3.6"; <--- NEW + :history = "Tue Oct 15 00:33:10 2024: cdo -f nc4 -s -mergetime era5dkrzmon..."; <--- KEEP + :institution = "European Centre for Medium-Range Weather Forecasts" ; + :institute_id = "ECMWF" ; + :model = "IFS" + :Conventions = "CF-1.8" ; <--- UPDATED, LET THE CHECKER RUN THROUGH AND THEN WE GET THE NUMBER + :license = "Contains modified Copernicus Atmosphere Monitoring Service information [2024]. 
Neither the European Commission nor ECMWF is responsible for any use that may be made of the Copernicus information or data it contains." ; + :tracking_id = "d5b13485-16f3-5f65-8dfd-cf03615bcc01" ; + :creation_date = "2024-10-15T00:07:37Z" ; + :CDO = "Climate Data Operators version 1.9.6 (http://mpimet.mpg.de/cdo)" ; +} +``` \ No newline at end of file diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..58f0c58d2edf74cad617cdd2721981083b718f12 --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,256 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "28bcf6fa-56fd-4d42-bca8-f735b59d579c", + "metadata": {}, + "outputs": [], + "source": [ + "import csv\n", + "import json\n", + "import sys, os, shutil\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2323c67-b7e4-41e1-8951-73d7e2d03989", + "metadata": {}, + "outputs": [], + "source": [ + "csv_file = \"tables/CSV/ct_ecmwf.rc\"\n", + "output_folder = \"test\"\n", + "output_file = \"out.json\"\n", + "json_file = os.path.join(output_folder, output_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "885ca828-a4d6-4ece-b628-de0d7b86826f", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_folder(path):\n", + " try:\n", + " shutil.rmtree(path)\n", + " print(f\"Folder '{path}' and its contents removed successfully.\")\n", + " except FileNotFoundError:\n", + " print(f\"Error: Folder '{path}' not found.\")\n", + " except OSError as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb3ce294-ff9a-40ba-bee9-d42a42c78575", + "metadata": {}, + "outputs": [], + "source": [ + "def map_columns_to_json(csv_file, json_file):\n", + " data = []\n", + " with open(csv_file, 'r') as csvfile:\n", + " csvreader = csv.DictReader(csvfile, delimiter=':')\n", + " for row in csvreader:\n", + " variable_entry = {\n", + " 
row[\"None\"]: {\n", + " \"frequency\": row[\"GRIDTYPE\"],\n", + " \"modeling_realm\": row[\"REALM\"],\n", + " \"standard_name\": row[\"PARDES\"],\n", + " \"units\": row[\"CMUNIT\"],\n", + " \"cell_methods\": \"area: mean\",\n", + " \"cell_measures\": \"area: areacella\",\n", + " \"long_name\": row[\"CMLNAME\"],\n", + " \"comment\": \"Percentage of horizontal area occupied by land.\",\n", + " \"dimensions\": \"longitude latitude\",\n", + " \"type\": \"real\",\n", + " \"positive\": \"\",\n", + " \"conversion\": row[\"CMFACT\"],\n", + " \"grid\": row[\"GRIBVERS\"],\n", + " \"table\": row[\"CMTABLE\"],\n", + " \"code\": row[\"#CCC\"],\n", + " \"orig_short_name\": row[\"ECPAR\"],\n", + " \"orig_name\": row[\"ECNAME\"],\n", + " \"grib_description\": row[\"OBS\"],\n", + " \"orig_units\": row[\"ECUNIT\"],\n", + " \"orig_grid\": row[\"LEVELS\"],\n", + " \"level_type\": row[\"LTYPE\"],\n", + " \"mapping\": \"CMIP6\"\n", + " }\n", + " }\n", + " data.append(variable_entry)\n", + "\n", + " with open(json_file, 'w') as jsonfile:\n", + " jsonfile.write(json.dumps(data, indent=4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "183dcc20-e401-47d1-9b6a-1745c06dba91", + "metadata": {}, + "outputs": [], + "source": [ + "remove_folder(output_folder)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bebb72ac-2463-40fd-af38-0c852581592a", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "map_columns_to_json(csv_file, json_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b902c0e5-2af8-4483-a3b9-3096a8d0d30d", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def skip_lines(row):\n", + " return not row['#CCC'].startswith(\"#\")\n", + "data = []\n", + "\n", + "# Manually skip lines until the header is found\n", + "with open(csv_file, 'r') as 
csvfile:\n", + " csvreader = csv.reader(csvfile, delimiter=':')\n", + " \n", + " for row in csvreader:\n", + " if row[0].startswith('#CCC'):\n", + " break\n", + "\n", + "# Read the CSV file with specified fieldnames\n", + "with open(csv_file, 'r') as csvfile:\n", + " csvreader = csv.DictReader(csvfile, delimiter=':', fieldnames=row)\n", + " filtered_rows = list(filter(skip_lines, csvreader))\n", + " for row in filtered_rows:\n", + "# variable_entry = {\n", + "# row[\"CMPAR\"]: {\n", + "# \"frequency\": row[\"GRIDTYPE\"],\n", + "# \"modeling_realm\": row[\"REALM\"],\n", + "# \"standard_name\": row[\"PARDES\"],\n", + "# \"units\": row[\"CMUNIT\"],\n", + "# \"cell_methods\": \"area: mean\",\n", + "# \"cell_measures\": \"area: areacella\",\n", + "# \"long_name\": row[\"CMLNAME\"],\n", + "# \"comment\": \"Percentage of horizontal area occupied by land.\",\n", + "# \"dimensions\": \"longitude latitude\",\n", + "# \"type\": \"real\",\n", + "# \"positive\": \"\",\n", + "# \"conversion\": row[\"CMFACT\"],\n", + "# \"grid\": row[\"GRIBVERS\"],\n", + "# \"table\": row[\"CMTABLE\"],\n", + "# \"code\": row[\"#CCC\"],\n", + "# \"orig_short_name\": row[\"ECPAR\"],\n", + "# \"orig_name\": row[\"ECNAME\"],\n", + "# \"grib_description\": row[\"OBS\"],\n", + "# \"orig_units\": row[\"ECUNIT\"],\n", + "# \"orig_grid\": row[\"LEVELS\"],\n", + "# \"level_type\": row[\"LTYPE\"],\n", + "# \"mapping\": \"CMIP6\"\n", + "# }\n", + "# }\n", + "# data.append(variable_entry)\n", + "# for column_name, value in row.items():\n", + "# print(f\"{column_name}: {value}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "56bc6018-2a96-4054-a834-8cc98fa8fa30", + "metadata": {}, + "source": [ + "```json\n", + "\"variable\": CFNAME\n", + "\"frequency\": FREQUENCY <--- NEW VARIABLE ?? 
([1hr, mon], [1hr, day, mon], [fx]) \"FREQUENCY\"\n", + "\"modeling_realm\": REALM,\n", + "\"standard_name\": \"land_area_fraction\", <---?\n", + "\"units\": CMUNIT,\n", + "\"cell_methods\": CELL_METHODS,\n", + "\"cell_measures\": CELL_MEASURES,\n", + "\"long_name\": CMLNAME,\n", + "\"comment\": COMMENT, \n", + "\"dimensions\": \"longitude latitude\", <--- ? NEW VARIABLE ?? \"DIM\"\n", + "\"type\": \"real\", <--- I guess that this will always be REAL\n", + "\"positive\": \"\", <--- this should be derived from the conversion factor (the sign)\n", + "\"conversion\": CMFACT,\n", + "\"grid\": GRIDTYPE,\n", + "\"table\": \"180\", <---?\n", + "\"code\": #CCC,\n", + "\"orig_short_name\": ECPAR, \n", + "\"orig_name\": ECNAME,\n", + "\"grib_description\": DESCRIPTION, \n", + "\"orig_units\": ECUNIT,\n", + "\"orig_grid\": GRIDTYPE,\n", + "\"level_type\": LTYPE <--- \"pl_an, ml_an, sfc_an, sfc_fc\",\n", + "\"mapping\": CMIP\n", + "```\n", + "\n", + "```\n", + "#CCC ECPAR ECNAME ECUNIT CMIP CMPAR CFNAME CMUNIT CMFACT CELL_METHODS CELL_MEASURES COMMENT DESCRIPTION CMLNAME CMTABLE GRIDTYPE GRIBVERS REALM PTYPE LTYPE DATASET LEVELS XCES OBS\n", + "\n", + "#CCC:PARDES:ECMWF:ECPAR:ECNAME:ECUNIT:CMIP:CMPAR:CMNAME:CMUNIT:CMFACT:CMOFFSET:CMLNAME:CMTABLE:GRIDTYPE:GRIBVERS:REALM:PTYPE:LTYPE:DATASET:LEVELS:XCES:OBS\n", + "27:var27:CVL:cvl:Low vegetation cover:0..1:1:cvl:vegetation_area_fraction:%:100:0.0:Low Vegetation Cover:mon-cf:gr:GRIB:land:I:sf00:sf00:None:1.0::\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3ec0820-2b09-4cda-b128-cea475866d1b", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "if __name__ == \"__main__\":\n", + " if len(sys.argv) != 3:\n", + " print(\"Usage: python map_columns_to_json.py input.csv output.json\")\n", + " else:\n", + " input_csv = sys.argv[1]\n", + " output_json = sys.argv[2]\n", + " map_columns_to_json(input_csv, output_json)\n", + " print(f\"Conversion complete. 
JSON file saved to {output_json}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}