diff --git a/.gitignore b/.gitignore index 0678e48561e38e9fb247808b295fd5232a323349..34a69c7d84d562d61d7a4ad6bd39fd02b4ec7d71 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ *.log .idea* conda_env/ +**/*.xlsx **/*.sw[pco] **/*.*.sw[pco] **/.ipynb_checkpoints* diff --git a/src/converter.py b/src/converter.py index 803128db76882f8af9dd9c021c158e1e0ac0ea72..3e094c587d85909d012c5762cc2c73cd73df3917 100644 --- a/src/converter.py +++ b/src/converter.py @@ -1,19 +1,137 @@ import pandas as pd import json import csv +import sys, os, shutil from pathlib import Path -def excel_to_csv(excel_path, csv_path, sheet_name=0, field_separator="|"): - df = pd.read_excel(excel_path, sheet_name=sheet_name) +tables_dir: str = f"{os.path.dirname(os.path.abspath(__file__))}/../Tables" +csv_file: str = f"{tables_dir}/original_tables/ct_ecmwf.rc" +cmip6_tables: str = f"{tables_dir}/source_tables/cmip6-cmor-tables/Tables" +obs_tables: str = f"{tables_dir}/source_tables/obs4MIPs-cmor-tables/Tables" +era5_tables: str = f"{tables_dir}/era5-cmor-tables/Tables" + +header_info: dict[str, str] = { + "Conventions":"CF-1.7 ODS-2.1", + "approx_interval":"0.0416666666", # should change depending on day, month, year + "cmor_version":"3.5", # tbd + "data_specs_version":"2.1.0", # tbd + "generic_levels":"", + "int_missing_value":"-999", # tbd + "mip_era":"CMIP6", + "missing_value":"1e20", + "product":"model-output", + "realm":"atmos", # should change depending on the realm + "table_date":"18 November 2020", # tbd + "table_id":"Table obs4MIPs_A1hr" # should relate to the filename + } + + +def excel_to_csv(excel_path, csv_path, sheet_name=0, field_separator="|") -> Any: + df: pd.DataFrame = pd.read_excel(excel_path, sheet_name=sheet_name) df.to_csv(csv_path, sep=field_separator, index=False) return csv_path -def csv_to_json(csv_path, json_path): - with open(csv_path, encoding='utf-8') as csvf: - csv_reader = csv.DictReader(csvf) - data = [row for row in csv_reader] +def csv_to_excel(csv_path, excel_path, sheet_name="cmor_table", field_separator="|") -> str: + df: pd.DataFrame = pd.read_csv(csv_path, sep=field_separator) + df.to_excel(excel_path, sheet_name=sheet_name, index=False) + return excel_path + + +def csv_to_cmor_json(csv_filepath=csv_file, json_output_path=era5_tables, default_cell_measures="area: areacella", frequency="mon"): + with open(csv_filepath, 'r', encoding='utf-8') as f: + lines = f.readlines() + + # Find the header line + header_line_idx = next(idx for idx, line in enumerate(lines) if line.startswith("#CCC|")) + + # Extract headers + headers = lines[header_line_idx].strip().lstrip('#').split('|') + + # Read data after header + data_lines = lines[header_line_idx + 1:] + + # Use pandas to read CSV from the string + from io import StringIO + df = pd.read_csv(StringIO(''.join(data_lines)), sep='|', names=headers, engine='python') + + json_filename = f"ERA5_{frequency.capitalize()}.json" + json_filepath = f"{json_output_path}/{json_filename}" + # Initialize the structure for the JSON + cmor_json = { + "Header": { + "Conventions": "CF-1.7 ODS-2.1", + "approx_interval": "0.0416666666", + "cmor_version": "3.5", + "data_specs_version": "2.1.0", + "generic_levels": "", + "int_missing_value": "-999", + "mip_era": "CMIP6", + "missing_value": "1e20", + "product": "model-output", + "realm": "atmos", + "table_date": "18 November 2020", + "table_id": f"Table obs4MIPs_{frequency.capitalize()}" + }, + "variable_entry": {} + } + + # Iterate over dataframe rows to fill variable entries + for _, row in df.iterrows(): + var_name = row["CMPAR"] + cmfactor = row["CMFACT"] + comment = row["COMMENT"] if pd.notna(row["COMMENT"]) else "" + + # Determine positivity + if pd.notna(row["CMUNIT"]) and "W m-2" in str(row["CMUNIT"]): + if "-" in str(cmfactor): + positive = "down" + else: + positive = "up" + else: + positive: Literal[''] = "" + + # Cell_methods based on frequency + if frequency == "1hr": + dimensions = "longitude latitude time" + cell_methods = "area: time: mean" + elif frequency == "day": + dimensions = "longitude latitude time" + cell_methods = "area: time: mean" + else: # monthly or other defaults + dimensions = "longitude latitude time" + cell_methods = "area: time: mean" + + # Construct variable entry + cmor_json["variable_entry"][var_name] = { + "cell_measures": default_cell_measures, + "cell_methods": cell_methods, + "comment": comment, + "dimensions": dimensions, + "frequency": frequency, + "long_name": row["CMLNAME"], + "modeling_realm": row["REALM"], + "ok_max_mean_abs": "", + "ok_min_mean_abs": "", + "out_name": var_name, + "positive": positive, + "standard_name": row["CFNAME"], + "type": "real", + "units": row["CMUNIT"], + "valid_max": "", + "valid_min": "", + } + + # Write JSON file + with open(json_filepath, 'w', encoding='utf-8') as json_file: + json.dump(cmor_json, json_file, indent=4) + + print(f"JSON file successfully created at {json_output_path}") + +# Example usage: +csv_to_cmor_json( + csv_filepath='path_to_your_csv_file.csv', + json_output_path='cmor_variables.json', + default_cell_measures="area: areacella", + frequency="mon" +) - with open(json_path, 'w', encoding='utf-8') as jsonf: - json.dump(data, jsonf, indent=2) - - return json_path