diff --git a/src/converter.py b/src/converter.py index b821b88f82000e26058a135e8e7d826438145ab1..c3a320fafee968f5ec9a7934cbf06751a8ab7653 100644 --- a/src/converter.py +++ b/src/converter.py @@ -2,14 +2,18 @@ import os import json import pandas as pd from io import StringIO +from pathlib import Path from typing import Optional, Dict, List +from datetime import datetime -# Define paths -csv_file = "/home/etor/work/data/era5-tables/Tables/original_tables/ct_ecmwf.rc" -json_output_path = "/home/etor/work/data/era5-tables/Tables/era5-cmor-tables/Tables" -search_dirs=[ - "/home/etor/work/data/era5-tables/Tables/source_tables/obs4MIPs-cmor-tables/Tables", - "/home/etor/work/data/era5-tables/Tables/source_tables/cmip6-cmor-tables/Tables" +today_date = datetime.today().strftime("%d %B %Y") + +BASE_DIR = Path(__file__).resolve().parent +csv_file = BASE_DIR.parent / "Tables/original_tables/ct_ecmwf.rc" +json_output_path = BASE_DIR.parent / "Tables/era5-cmor-tables/Tables" +search_dirs = [ + BASE_DIR.parent / "Tables/source_tables/obs4MIPs-cmor-tables/Tables", + BASE_DIR.parent / "Tables/source_tables/cmip6-cmor-tables/Tables" ] frequency_priority = { @@ -42,133 +46,191 @@ frequency_priority = { ] } -level_category_priority = { - "sfc": ["sfc_an", "sfc_fc", "sfc_an_land", "sfc_fc_land"], +level_categories = { + "sfc": ["sfc_an", "sfc_fc"], + "sfc_land": ["sfc_an_land", "sfc_fc_land"], "pl": ["pl_an", "pl_fc"], - "ml": ["ml_an"], - "inv": ["fx"] + "ml": ["ml_an"] +} + +approx_interval_map = { + "1hr": round(1 / 24, 5), # 0.04167 + "day": 1.00000, + "mon": 30.00000, + "fx": 0.00000 } +pressure_level_number = 37 + def determine_level_category(level_type: str) -> str: - for category, values in level_category_priority.items(): + for category, values in level_categories.items(): if level_type in values: return category return "sfc" -def load_variable_from_table(table_path: str, variable_name: str) -> Optional[Dict]: +def load_variable_from_table(table_path: str, variable_name: str, 
table_prefix: str) -> Optional[Dict]: try: with open(table_path, 'r') as f: data = json.load(f) - return data.get("variable_entry", {}).get(variable_name) + + if "variable_entry" in data: + return data["variable_entry"].get(variable_name) + if table_prefix in data and "variable_entry" in data[table_prefix]: + return data[table_prefix]["variable_entry"].get(variable_name) + + return None except Exception: return None -def find_best_matching_variable(variable: str, frequency: str, realm: str, level_type: str, + +def find_best_matching_variable(variable: str, frequency: str, level_type: str, search_dirs: List[str]) -> Optional[Dict]: level_category = determine_level_category(level_type) priority_tables = frequency_priority.get(frequency, []) for search_dir in search_dirs: - for root, _, files in os.walk(search_dir): - for file in files: - if file.endswith(".json"): - for table_prefix in priority_tables: - if f"_{table_prefix}.json" in file or file == f"{table_prefix}.json": - if level_category == "pl" and "Plev" not in file: - continue - if level_category == "sfc" and "Plev" in file: - continue - table_path = os.path.join(root, file) - variable_data = load_variable_from_table(table_path, variable) - if variable_data: - variable_data["source_table"] = os.path.basename(file) - return variable_data + for table_prefix in priority_tables: + for root, _, files in os.walk(search_dir): + matching_files = [file for file in files if file.endswith(".json") and f"_{table_prefix}" in file or file == f"{table_prefix}.json"] + for file in matching_files: + if level_category == "pl" and "Plev" not in file: + continue + if level_category == "sfc" and "Plev" in file: + continue + + table_path = os.path.join(root, file) + variable_data = load_variable_from_table(table_path, variable, table_prefix) + if variable_data: + variable_data["source_table"] = os.path.basename(file) + return variable_data return None -def csv_to_cmor_json(csv_filepath=csv_file, json_output_path=json_output_path): 
def _read_csv(csv_filepath: str) -> pd.DataFrame:
    """Load the ECMWF parameter table (ct_ecmwf.rc) into a DataFrame.

    The file is pipe-separated; the column header is the first line that
    starts with "#CCC|".  Rows with no CMOR parameter name (``CMPAR``) are
    dropped, and remaining NaNs become empty strings so later string
    operations (split/strip) never see a float NaN.
    """
    with open(csv_filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    header_line_idx = next(
        idx for idx, line in enumerate(lines) if line.startswith("#CCC|")
    )
    headers = lines[header_line_idx].strip().lstrip('#').split('|')
    data_lines = lines[header_line_idx + 1:]
    df = pd.read_csv(
        StringIO(''.join(data_lines)), sep='|', names=headers, engine='python'
    )
    df.columns = df.columns.str.strip()
    return df.dropna(subset=["CMPAR"]).fillna("")


def _get_mapping_source(cmip_val, source_table: str = "") -> str:
    """Determine the mapping source based on CMIP value and table origin.

    0 -> "ECMWF", 1 -> "CF", 6 -> "obs4MIPs" or "CMIP6" depending on the
    source table the variable was matched in; unparsable or unrecognised
    values -> "unknown".
    """
    try:
        cmip_val = int(cmip_val)
    except (ValueError, TypeError):
        return "unknown"

    if cmip_val == 0:
        return "ECMWF"
    if cmip_val == 1:
        return "CF"
    if cmip_val == 6:
        if "obs4mips" in source_table.lower():
            return "obs4MIPs"
        # CMIP6 is also the fallback when the table origin is unrecognised.
        return "CMIP6"
    return "unknown"


def _build_cmor_entry(
    var_name: str,
    matched: Optional[Dict],
    row: pd.Series,
    frequency: str,
) -> Dict:
    """Assemble one CMOR ``variable_entry`` record for *var_name*.

    Values from a matched source-table entry take precedence; otherwise the
    CSV row supplies the metadata, with frequency-dependent defaults for
    dimensions and cell_methods.
    """
    src = matched or {}
    comment = row.get("COMMENT", "") or ""

    source_table = src.get("source_table", "")
    mapping = _get_mapping_source(row["CMIP"], source_table)
    # Fixed ("fx") fields have no time axis and no time cell method.
    default_dimensions = "lat lon" if frequency == "fx" else "time lat lon"
    default_cell_methods = "" if frequency == "fx" else "time: mean"
    matched_comment = src.get("comment", "")
    final_comment = (
        f"{matched_comment}. {comment}".strip(". ") if comment else matched_comment
    )

    return {
        "cell_measures": src.get("cell_measures", ""),
        "cell_methods": src.get("cell_methods", default_cell_methods),
        "comment": final_comment,
        "dimensions": src.get("dimensions", default_dimensions),
        "frequency": frequency,
        "long_name": row["CMLNAME"],
        # Fall back to the CSV realm when the matched entry lacks one
        # (previously a matched entry without modeling_realm yielded "").
        "modeling_realm": src.get("modeling_realm") or row["REALM"],
        "ok_max_mean_abs": src.get("ok_max_mean_abs", ""),
        "ok_min_mean_abs": src.get("ok_min_mean_abs", ""),
        "out_name": var_name,
        "positive": src.get("positive", ""),
        "standard_name": row["CFNAME"],
        "type": src.get("type", "real"),
        "units": row["CMUNIT"],
        "valid_max": src.get("valid_max", ""),
        "valid_min": src.get("valid_min", ""),
        # Additional (non-CMOR) provenance metadata:
        "grib_paramID": row["ECTABLE"],
        "grib_code": row["ECCODE"],
        "orig_short_name": row["ECPAR"],
        "orig_name": row["ECNAME"],
        "orig_units": row["ECUNIT"],
        "grib_description": row["ECDESC"],
        "orig_grid": row["ECGRID"],
        "level_type": row["LTYPE"],
        "conversion": row["CMFACT"],
        "source_table": source_table,
        # After _read_csv's fillna(""), pd.notna(...) was always True and the
        # intended source_table fallback never fired; test emptiness instead.
        "table": row["CMTABLE"] or source_table,
        "mapping": mapping,
    }


def csv_to_cmor_json(csv_filepath=csv_file, json_output_path=json_output_path):
    """Convert the ECMWF CSV table into a set of CMOR JSON table files.

    Entries are grouped by (frequency, level category); each group is
    written to ``<json_output_path>/<ERA5|ERA5Land>_<freq>_<category>.json``.
    """
    # Removed: debug-only filter to a single variable ("zg") and the
    # per-frequency separator print()s left over from development.
    df = _read_csv(csv_filepath)

    grouped_json = {}

    for _, row in df.iterrows():
        var_name = row["CMPAR"]
        # _read_csv guarantees these columns are strings (fillna("")), so a
        # pd.notna() guard inside the comprehensions is redundant.
        time_representation = [t.strip().upper() for t in row["TREPR"].split(',')]
        level_types = [lt.strip() for lt in row["LTYPE"].split(',')]

        for level_type in level_types:
            level_group = determine_level_category(level_type)

            # Time-invariant ("fx") tables are produced only when the CSV
            # marks the variable with the INV time representation.
            frequencies = ["1hr", "day", "mon"]
            if "INV" in time_representation:
                frequencies.append("fx")

            for freq in frequencies:
                key = (freq, level_group)
                matched = find_best_matching_variable(
                    var_name, freq, level_type, search_dirs
                )
                entry = _build_cmor_entry(var_name, matched, row, freq)
                grouped_json.setdefault(key, {})[var_name] = entry

    os.makedirs(json_output_path, exist_ok=True)

    # Write one JSON table per (frequency, level category) group.
    for (freq, level_group), variable_entry in grouped_json.items():
        # ERA5-Land categories get their own table prefix.
        if level_group.endswith("_land"):
            filename = f"ERA5Land_{freq}_{level_group.replace('_land', '')}"
        else:
            filename = f"ERA5_{freq}_{level_group}"

        # Collect the realms actually present, dropping empties so the
        # joined Header value has no stray spaces.
        realms = sorted(
            {v.get("modeling_realm", "") for v in variable_entry.values()} - {""}
        )

        cmor_json = {
            "Header": {
                "Conventions": "CF-1.7 ODS-2.1",
                "approx_interval": f"{approx_interval_map[freq]}",
                "cmor_version": "3.5",
                "data_specs_version": "2.1.0",
                "generic_levels": "",
                # NOTE(review): the original file carries one more unchanged
                # Header field here (elided by the diff hunk boundary between
                # "generic_levels" and "mip_era", likely "int_missing_value")
                # — confirm against the source file and preserve it.
                "mip_era": "CMIP6",
                "missing_value": "1e20",
                "product": "model-output",
                "realm": " ".join(realms),
                "table_date": today_date,
                # Use the computed table name; it was previously built but
                # never used, and a placeholder was written instead.
                "table_id": f"Table {filename}",
            },
            "variable_entry": variable_entry,
        }

        output_path = os.path.join(json_output_path, f"{filename}.json")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(cmor_json, f, indent=4)