#!/usr/bin/env python
#
# Call syntax: python convert-stations.py file-that-lists-paths outputdir
import os.path as osp
import datetime as dt
from collections import defaultdict
import pandas as pd
import numpy as np
import xarray as xr
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    'input', help="File that contains the paths to the input data")
parser.add_argument(
    'output', help="Output directory for the concatenated input files")
parser.add_argument(
    '-map', '--mapping', default="colnames-mapping-BG.csv",
    help=("The mapping of CSV column names to netCDF variables. "
          "Default: %(default)s"))
parser.add_argument(
    "-meta", default="meta/BSRN_Metadaten_Geographie.nc",
    help=("NetCDF file that contains the meta data for every station. "
          "Default: %(default)s"))
parser.add_argument(
    "-c", "--combine-only", action='store_true',
    help="If set, CSV files are not converted but only combined")
parser.add_argument(
    "-o", '--overwrite', action='store_true',
    help="If set, overwrite existing files")
parser.add_argument(
    "-s", "--source",
    help=("Path to a file with the same number of rows as `input`. Every "
          "line in this file must correspond to the source file, and the "
          "modification date is extracted and stored as the "
          "modification_date attribute. If this parameter is not set, the "
          "modification dates of the input files are used."))


def log_progress(iterator, total=None):
    """Yield the items of `iterator` while printing a progress bar."""
    if total is None:
        total = len(iterator)
    length = 80
    fill = '█'
    current = 0
    t0 = dt.datetime.now()
    print(f"Starting at {t0}")
    first = True
    for i, arg in enumerate(iterator):
        percent = 100 * (i / total)
        # only redraw the bar when the rounded percentage changes
        if first or np.round(percent) > current:
            current = np.round(percent)
            filled_length = int(length * i // total)
            bar = fill * filled_length + '-' * (length - filled_length)
            secs = (dt.datetime.now() - t0).total_seconds()
            # extrapolate the remaining time from the mean time per item
            left = np.nan if first else ((secs * total / i) - secs) / 60
            print(f'\r|{bar}| {percent:0.1f}%. Time left: {left:1.3f} minutes',
                  end='\r')
        first = False
        yield arg
    # print a full bar and a new line on completion
    print('\r|%s| %0.1f%%' % (fill * length, 100), end='\r')
    t1 = dt.datetime.now()
    td = (t1 - t0).total_seconds() / 60
    print(f"\nFinished at {t1}. Time needed: {td:1.3f} minutes")
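
# A minimal usage sketch for log_progress (hypothetical example, not part of
# the pipeline below): it wraps any sized iterable, redraws the bar in place
# and yields the items unchanged, e.g.
#
#     for path in log_progress(['a.tab', 'b.tab', 'c.tab']):
#         ...  # per-item work; remaining time is extrapolated from the mean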


def format_dataset(ds, source):
    """Rename the variables in `ds` and attach CF metadata.

    Relies on the module-level `mapping`, `meta`, `stationid` and `now`
    variables defined below."""
    for key in list(ds):
        var = ds[key]
        var.attrs['bsrn_name'] = key
        try:
            row = mapping.loc[key]
        except KeyError:
            # drop every column that is not listed in the mapping
            if key not in ['station', 'stationid']:
                del ds[key]
        else:
            # all non-null mapping entries except 'name' become attributes
            for attr, val in row[row.notnull()].items():
                if attr != 'name':
                    var.attrs[attr] = val
            ds = ds.rename({key: row['name']})
    # use the time column as index and add the station dimension
    ds = ds.set_index(index='time').rename(index='time')
    ds = ds.expand_dims('stationid', axis=1)
    station_meta = meta.isel(
        stationid=np.where(meta.station == stationid)[0])
    for key in list(station_meta):
        if station_meta[key].isnull().all():
            del station_meta[key]
    ds.update(station_meta)
    ds.attrs['featureType'] = 'timeSeries'
    ds.attrs['Conventions'] = 'CF-1.8'
    ds.attrs['station_id'] = stationid
    ds.attrs['source'] = "surface observation"
    ds.attrs['conventionsURL'] = ('http://www.unidata.ucar.edu/packages/'
                                  'netcdf/conventions.html')
    ds.attrs['download_site'] = "ftp://ftp.bsrn.awi.de/"
    ds.attrs['station'] = ds['station'].values[0]
    ds.attrs['creation_date'] = 'transformation to netCDF: ' + now
    mtime = dt.datetime.fromtimestamp(osp.getmtime(source)).isoformat()
    ds.attrs['modification_date'] = (
        "Modification date of source file %s: %s" % (
            osp.basename(source), mtime))
    ds.attrs['history'] = '\n'.join(
        [ds.attrs['creation_date'], ds.attrs['modification_date']])
    ds.attrs['references'] = (
        "Driemel, A., Augustine, J., Behrens, K., Colle, S., Cox, C., "
        "Cuevas-Agulló, E., Denn, F. M., Duprat, T., Fukuda, M., "
        "Grobe, H., Haeffelin, M., Hodges, G., Hyett, N., Ijima, O., "
        "Kallis, A., Knap, W., Kustov, V., Long, C. N., Longenecker, D., "
        "Lupi, A., Maturilli, M., Mimouni, M., Ntsangwane, L., "
        "Ogihara, H., Olano, X., Olefs, M., Omori, M., Passamani, L., "
        "Pereira, E. B., Schmithüsen, H., Schumacher, S., Sieger, R., "
        "Tamlyn, J., Vogt, R., Vuilleumier, L., Xia, X., Ohmura, A., and "
        "König-Langlo, G.: Baseline Surface Radiation Network (BSRN): "
        "structure and data description (1992–2017), "
        "Earth Syst. Sci. Data, 10, 1491-1501, "
        "doi:10.5194/essd-10-1491-2018, 2018.")
    if 'institution' in ds:
        ds.attrs['institution'] = ds['institution'].values[0]
    if 'station_name' in ds:
        ds.attrs['station'] = ds['station_name'].values[0]
    if 'time' in ds:
        ds['time'] = ds['time'].copy(data=pd.to_datetime(ds['time']))
        if 'units' in ds['time'].attrs:
            # move the units to the encoding so that xarray encodes the
            # datetimes itself when writing to netCDF
            ds['time'].encoding['units'] = ds['time'].attrs.pop('units')
    return ds
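
# Sketch of the mapping CSV assumed by format_dataset (the headers other than
# 'name' are illustrative, not confirmed): the index column holds the BSRN
# column name, 'name' the target netCDF variable name, and every other
# non-null cell becomes a variable attribute, e.g.
#
#     bsrn_name,name,units,long_name
#     SWD,swd,W m-2,shortwave downward radiation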


args = parser.parse_args()

with open(args.input) as f:
    files = sorted(map(str.strip, f.readlines()))

outdir = args.output

mapping = pd.read_csv(args.mapping, index_col=0)
meta = xr.open_dataset(args.meta)

if args.source is not None:
    with open(args.source) as f:
        source_files = list(map(str.strip, f.readlines()))
else:
    source_files = files[:]

# group the input files by their station id
ids = defaultdict(list)
now = dt.datetime.now().isoformat()
for path, source in zip(files, source_files):
    base = osp.splitext(path)[0]
    # the station id is the part of the file name before the first underscore
    stationid = osp.basename(base.split('_')[0])
    output = base + '.nc'
    ids[stationid].append((path, source, base, output))

for stationid, files in ids.items():
    full_df = None
    name = meta.isel(
        stationid=np.where(meta.station == stationid)[0][0]).station.values[()]
    print("Processing %i files of station %s" % (len(files), name))
    files.sort()
    for path, source, base, output in log_progress(files):
        # read the BSRN data file and collect it for the combined dataset
        df = pd.read_csv(path, sep='\t')
        full_df = df if full_df is None else pd.concat(
            [full_df, df], ignore_index=True, sort=False)
        ds = df.to_xarray()
        ds = format_dataset(ds, source)
        if not args.combine_only and (
                args.overwrite or not osp.exists(output)):
            encoding = {key: {'zlib': True, 'complevel': 4}
                        for key in ds.variables}
            encoding['time']['dtype'] = float
            ds.to_netcdf(output, encoding=encoding)
    if full_df is not None:
        # write the combined dataset of the station; `source` is the last
        # source file from the loop above
        ds = full_df.to_xarray()
        ds = format_dataset(ds, source)
        t0 = str(ds.time.min().dt.strftime('%Y-%m-%d').values)
        t1 = str(ds.time.max().dt.strftime('%Y-%m-%d').values)
        name = ds['station_name'].values[0].split(',')[0].replace(
            ' ', '_').lower()
        ds = ds.sortby('time')
        full_output = osp.join(
            outdir, f'BSRN-stationdata.{name}.{t0}-{t1}.nc')
        encoding = {key: {'zlib': True, 'complevel': 4}
                    for key in ds.variables}
        encoding['time']['dtype'] = float
        ds.to_netcdf(full_output, encoding=encoding)
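
# Example invocation (hypothetical paths), assuming filelist.txt lists one
# tab-separated input file per line, e.g. produced with
# `find data -name '*.tab' | sort > filelist.txt`:
#
#     python convert-stations.py filelist.txt out/ --mapping colnames-mapping-BG.csv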