WIP: test for csv to json tool

15f99089 · Etor Lucio Eceiza · eb797d5c · 15f99089 · 15f99089
Commit 15f99089 authored 3 weeks ago by Etor Lucio Eceiza
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@
 *.log
 .idea*
 conda_env/
+**/*.xlsx
 **/*.sw[pco]
 **/*.*.sw[pco]
 **/.ipynb_checkpoints*

--- a/src/converter.py
+++ b/src/converter.py
 import pandas as pd
 import json
 import csv
+import sys, os, shutil
 from pathlib import Path

-def excel_to_csv(excel_path, csv_path, sheet_name=0, field_separator="|"):
-    df = pd.read_excel(excel_path, sheet_name=sheet_name)
+# Root of the table tree, resolved relative to the directory of this module.
+tables_dir: str = f"{os.path.dirname(os.path.abspath(__file__))}/../Tables"
+# Pipe-separated input table; default ``csv_filepath`` of csv_to_cmor_json.
+csv_file: str = f"{tables_dir}/original_tables/ct_ecmwf.rc"
+# Locations of the CMIP6 and obs4MIPs table sets (presumably upstream
+# reference copies -- confirm against the repository layout).
+cmip6_tables: str = f"{tables_dir}/source_tables/cmip6-cmor-tables/Tables"
+obs_tables: str = f"{tables_dir}/source_tables/obs4MIPs-cmor-tables/Tables"
+# Output directory; default ``json_output_path`` of csv_to_cmor_json.
+era5_tables: str = f"{tables_dir}/era5-cmor-tables/Tables"
+
+# Template values for the "Header" section of a generated table.
+# Entries marked "tbd" are placeholders that still have to be settled.
+header_info: dict[str, str] = {
+    "Conventions": "CF-1.7 ODS-2.1",
+    "approx_interval": "0.0416666666",  # should change depending on day, month, year
+    "cmor_version": "3.5",  # tbd
+    "data_specs_version": "2.1.0",  # tbd
+    "generic_levels": "",
+    "int_missing_value": "-999",  # tbd
+    "mip_era": "CMIP6",
+    "missing_value": "1e20",
+    "product": "model-output",
+    "realm": "atmos",  # should change depending on the realm
+    "table_date": "18 November 2020",  # tbd
+    "table_id": "Table obs4MIPs_A1hr",  # should relate to the filename
+}
+
+
+def excel_to_csv(excel_path, csv_path, sheet_name=0, field_separator="|") -> str:
+    """Export one sheet of an Excel workbook to a delimited text file.
+
+    Args:
+        excel_path: Path of the workbook to read.
+        csv_path: Path of the text file to write.
+        sheet_name: Sheet selector handed to ``pandas.read_excel``.
+        field_separator: Column delimiter used in the output file.
+
+    Returns:
+        ``csv_path``, so the call can be chained with further processing.
+    """
+    df: pd.DataFrame = pd.read_excel(excel_path, sheet_name=sheet_name)
    df.to_csv(csv_path, sep=field_separator, index=False)
    return csv_path

-def csv_to_json(csv_path, json_path):
-    with open(csv_path, encoding='utf-8') as csvf:
-        csv_reader = csv.DictReader(csvf)
-        data = [row for row in csv_reader]
+def csv_to_excel(csv_path, excel_path, sheet_name="cmor_table", field_separator="|") -> str:
+    """Write a delimited text table to a sheet of an Excel workbook.
+
+    Args:
+        csv_path: Path of the delimited text file to read.
+        excel_path: Path of the workbook to write.
+        sheet_name: Name of the sheet that receives the table.
+        field_separator: Column delimiter used in the input file.
+
+    Returns:
+        ``excel_path``, unchanged.
+    """
+    pd.read_csv(csv_path, sep=field_separator).to_excel(
+        excel_path, sheet_name=sheet_name, index=False
+    )
+    return excel_path
+      
+        
+def csv_to_cmor_json(csv_filepath=csv_file, json_output_path=era5_tables, default_cell_measures="area: areacella", frequency="mon"):
+    """Convert a pipe-separated variable table into a CMOR-style JSON table.
+
+    The input must contain a header line starting with ``#CCC|``. That line
+    (minus the leading ``#``) supplies the column names; every line after it
+    is parsed as data. One ``variable_entry`` is produced per data row, keyed
+    by the ``CMPAR`` column.
+
+    Args:
+        csv_filepath: Path of the pipe-separated input table.
+        json_output_path: Directory in which ``ERA5_<Frequency>.json`` is
+            written.
+        default_cell_measures: Value stored as ``cell_measures`` for every
+            variable.
+        frequency: Frequency label; stored with each variable and, after
+            ``str.capitalize()``, used in the file name and ``table_id``.
+
+    Returns:
+        The path of the JSON file that was written.
+
+    Raises:
+        ValueError: If the input has no line starting with ``#CCC|``.
+    """
+    from io import StringIO
+
+    def _text(value):
+        # pandas reports an empty cell as NaN, which json.dump would emit as
+        # the non-standard token ``NaN``; store an empty string instead.
+        return value if pd.notna(value) else ""
+
+    with open(csv_filepath, 'r', encoding='utf-8') as f:
+        lines = f.readlines()
+
+    # Locate the header line; fail with a clear message when it is absent.
+    header_line_idx = next(
+        (idx for idx, line in enumerate(lines) if line.startswith("#CCC|")),
+        None,
+    )
+    if header_line_idx is None:
+        raise ValueError(f"No header line starting with '#CCC|' found in {csv_filepath}")
+
+    # Column names come from the header line; the data follows it.
+    headers = lines[header_line_idx].strip().lstrip('#').split('|')
+    data_lines = lines[header_line_idx + 1:]
+    df = pd.read_csv(StringIO(''.join(data_lines)), sep='|', names=headers, engine='python')
+
+    table_name = frequency.capitalize()
+    json_filepath = f"{json_output_path}/ERA5_{table_name}.json"
+
+    # Skeleton of the output document; variable entries are filled in below.
+    cmor_json = {
+        "Header": {
+            "Conventions": "CF-1.7 ODS-2.1",
+            "approx_interval": "0.0416666666",
+            "cmor_version": "3.5",
+            "data_specs_version": "2.1.0",
+            "generic_levels": "",
+            "int_missing_value": "-999",
+            "mip_era": "CMIP6",
+            "missing_value": "1e20",
+            "product": "model-output",
+            "realm": "atmos",
+            "table_date": "18 November 2020",
+            "table_id": f"Table obs4MIPs_{table_name}",
+        },
+        "variable_entry": {},
+    }
+
+    # Every frequency currently shares the same dimensions and cell methods,
+    # so they are set once here rather than per row.
+    # TODO: differentiate per frequency ("1hr", "day", "mon") once defined.
+    dimensions = "longitude latitude time"
+    cell_methods = "area: time: mean"
+
+    for _, row in df.iterrows():
+        var_name = row["CMPAR"]
+        units = _text(row["CMUNIT"])
+
+        # Only variables whose units contain "W m-2" get a direction: a minus
+        # sign in the conversion factor selects "down", anything else "up".
+        if "W m-2" in str(units):
+            positive = "down" if "-" in str(row["CMFACT"]) else "up"
+        else:
+            positive = ""
+
+        cmor_json["variable_entry"][var_name] = {
+            "cell_measures": default_cell_measures,
+            "cell_methods": cell_methods,
+            "comment": _text(row["COMMENT"]),
+            "dimensions": dimensions,
+            "frequency": frequency,
+            "long_name": _text(row["CMLNAME"]),
+            "modeling_realm": _text(row["REALM"]),
+            "ok_max_mean_abs": "",
+            "ok_min_mean_abs": "",
+            "out_name": var_name,
+            "positive": positive,
+            "standard_name": _text(row["CFNAME"]),
+            "type": "real",
+            "units": units,
+            "valid_max": "",
+            "valid_min": "",
+        }
+
+    with open(json_filepath, 'w', encoding='utf-8') as json_file:
+        json.dump(cmor_json, json_file, indent=4)
+
+    # Report the file that was actually written, not just its directory.
+    print(f"JSON file successfully created at {json_filepath}")
+    return json_filepath
+
+# Example usage. Guarded so that importing this module does not trigger a
+# conversion of the placeholder paths below; it runs only when the file is
+# executed as a script. Note that json_output_path is a directory: the file
+# name (ERA5_<Frequency>.json) is derived from the frequency.
+if __name__ == "__main__":
+    csv_to_cmor_json(
+        csv_filepath='path_to_your_csv_file.csv',
+        json_output_path='path_to_your_output_directory',
+        default_cell_measures="area: areacella",
+        frequency="mon",
+    )

-    with open(json_path, 'w', encoding='utf-8') as jsonf:
-        json.dump(data, jsonf, indent=2)
-    
-    return json_path