#!/usr/bin/env python
#
# Call syntax: python convert-stations.py file-that-lists-paths outputdir
import os.path as osp
import datetime as dt
from collections import defaultdict
import pandas as pd
import numpy as np
import xarray as xr
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    'input', help="File that contains the paths to the input data")
parser.add_argument(
    'output', help="Output directory for the concatenated station files")
parser.add_argument(
    '-map', '--mapping', default="colnames-mapping-BG.csv",
    help=("The mapping of CSV column names to netCDF variables. "
          "Default: %(default)s"))
parser.add_argument(
    "-meta", default="meta/BSRN_Metadaten_Geographie.nc",
    help=("NetCDF file that contains the meta data for every station. "
          "Default: %(default)s"))
parser.add_argument(
    "-c", "--combine-only", action='store_true',
    help="If set, CSV files are not converted but only combined")
parser.add_argument(
    "-o", '--overwrite', action='store_true',
    help="If set, overwrite existing files")
parser.add_argument(
    "-s", "--source",
    help=("Path to a file with the same number of rows as `input`. Every "
          "line in this file must correspond to the source file, and its "
          "modification date is extracted and stored as the "
          "modification_date attribute. If this parameter is not set, the "
          "modification dates of the input files are used."))


def log_progress(iterator, total=None):
    """Yield the items of `iterator` while displaying a textual progress bar."""
    if total is None:
        total = len(iterator)
    length = 80
    fill = '█'
    current = 0
    t0 = dt.datetime.now()
    print(f"Starting at {t0}")
    first = True
    for i, arg in enumerate(iterator):
        percent = 100 * (i / total)
        if first or np.round(percent) > current:
            current = percent
            filledLength = int(length * i // total)
            bar = fill * filledLength + '-' * (length - filledLength)
            secs = (dt.datetime.now() - t0).total_seconds()
            # estimate the remaining time from the average time per item
            left = np.nan if first else ((secs * total / i) - secs) / 60
            print(f'\r|{bar}| {percent:0.1f}%. '
                  f'Time left: {left:1.3f} minutes', end='\r')
        first = False
        yield arg
    # print a full bar and a new line on completion
    print('\r|%s| %0.1f%%' % (fill * length, 100), end='\r')
    t1 = dt.datetime.now()
    td = (t1 - t0).total_seconds() / 60
    print(f"\nFinished at {t1}. Time needed: {td:1.3f} minutes")

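# A sketch of the layout expected for the ``--mapping`` CSV, inferred from
# how it is used below (read with its first column as index, the BSRN column
# name). The column and attribute names in this example are hypothetical and
# shown only for illustration:
#
#     ,name,units,long_name
#     <BSRN column>,<netCDF variable>,<units>,<long name>
#
# The ``name`` column gives the target netCDF variable name; every other
# non-empty cell of a row is stored as an attribute of that variable.
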
args = parser.parse_args()

with open(args.input) as f:
    files = sorted(map(str.strip, f.readlines()))

outdir = args.output

# mapping of BSRN column names to netCDF variable names and attributes,
# indexed by the BSRN column name
mapping = pd.read_csv(args.mapping, index_col=0)

# station meta data (names, coordinates, etc.) indexed by `stationid`
meta = xr.open_dataset(args.meta)

if args.source is not None:
    with open(args.source) as f:
        source_files = list(map(str.strip, f.readlines()))
else:
    source_files = files[:]

# collect the generated netCDF files per station id
ids = defaultdict(list)

now = dt.datetime.now().isoformat()

for i, (path, source) in log_progress(
        enumerate(zip(files, source_files)), len(files)):
    # read BSRN data file
    base = osp.splitext(path)[0]
    stationid = osp.basename(base.split('_')[0])
    output = base + '.nc'
    ids[stationid].append(output)
    if not args.combine_only and (args.overwrite or not osp.exists(output)):
        df = pd.read_csv(path, sep='\t')
        ds = df.to_xarray()
        # rename the columns according to the mapping and attach the
        # attributes from the mapping file; unmapped columns are dropped
        for key in list(ds):
            var = ds[key]
            var.attrs['bsrn_name'] = key
            try:
                row = mapping.loc[key]
            except KeyError:
                if key not in ['station', 'stationid']:
                    del ds[key]
            else:
                for attr, val in row[row.notnull()].items():
                    if attr != 'name':
                        var.attrs[attr] = val
                ds = ds.rename({key: row['name']})
        # use the time variable as dimension and add the station dimension
        ds = ds.set_index(index='time').rename(index='time')
        ds = ds.expand_dims('stationid')
        station_meta = meta.sel(stationid=[stationid])
        for key in list(station_meta):
            if station_meta[key].isnull().all():
                del station_meta[key]
        ds.update(station_meta)
        # global attributes following the CF conventions
        ds.attrs['featureType'] = 'timeSeries'
        ds.attrs['Conventions'] = 'CF-1.8'
        ds.attrs['station_id'] = stationid
        ds.attrs['source'] = "surface observation"
        ds.attrs['conventionsURL'] = ('http://www.unidata.ucar.edu/packages/'
                                      'netcdf/conventions.html')
        ds.attrs['download_site'] = "ftp://ftp.bsrn.awi.de/"
        ds.attrs['station'] = ds['station'].values[0]
        ds.attrs['creation_date'] = 'transformation to netCDF: ' + now
        mtime = dt.datetime.fromtimestamp(osp.getmtime(source)).isoformat()
        ds.attrs['modification_date'] = (
            "Modification date of source file %s: %s" % (
                osp.basename(source), mtime))
        ds.attrs['history'] = '\n'.join(
            [ds.attrs['creation_date'], ds.attrs['modification_date']])
        ds.attrs['references'] = (
            "Driemel, A., Augustine, J., Behrens, K., Colle, S., Cox, C., "
            "Cuevas-Agulló, E., Denn, F. M., Duprat, T., Fukuda, M., "
            "Grobe, H., Haeffelin, M., Hodges, G., Hyett, N., Ijima, O., "
            "Kallis, A., Knap, W., Kustov, V., Long, C. N., Longenecker, D., "
            "Lupi, A., Maturilli, M., Mimouni, M., Ntsangwane, L., "
            "Ogihara, H., Olano, X., Olefs, M., Omori, M., Passamani, L., "
            "Pereira, E. B., Schmithüsen, H., Schumacher, S., Sieger, R., "
            "Tamlyn, J., Vogt, R., Vuilleumier, L., Xia, X., Ohmura, A., and "
            "König-Langlo, G.: Baseline Surface Radiation Network (BSRN): "
            "structure and data description (1992–2017), "
            "Earth Syst. Sci. Data, 10, 1491-1501, "
            "doi:10.5194/essd-10-1491-2018, 2018.")
        if 'institution' in ds:
            ds.attrs['institution'] = ds['institution'].values[0]
        # store institution and station as global string attributes
        for attr in ['institution', 'station']:
            if attr in ds:
                ds.attrs[attr] = str(ds[attr].values[0])
        if 'time' in ds:
            ds['time'] = ds['time'].copy(data=pd.to_datetime(ds['time']))
            if 'units' in ds['time'].attrs:
                ds['time'].encoding['units'] = ds['time'].attrs.pop('units')
        ds.to_netcdf(
            output,
            encoding={key: {'zlib': True, 'complevel': 4}
                      for key in ds.variables})

# concatenate all netCDF files of one station into a single file
for stationid, files in ids.items():
    print(f"Concatenating files for {stationid}")
    ds = xr.open_mfdataset(files, combine='nested', join='override',
                           data_vars='minimal', coords='minimal',
                           concat_dim='time')
    name = meta.sel(stationid=stationid).station.values[()]
    output = osp.join(outdir, name + '.nc')
    if not osp.exists(output) or args.overwrite:
        ds.to_netcdf(output)
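
# A minimal usage sketch; the file and directory names below are hypothetical
# and only illustrate the expected layout:
#
#     python convert-stations.py bsrn-files.txt combined/
#
# where ``bsrn-files.txt`` lists one tab-separated BSRN CSV file per line,
# e.g. ``data/PAY_2019-01.csv`` (the station id is taken from the part of the
# file name before the first underscore). Each listed file is first converted
# to a netCDF file next to it (``data/PAY_2019-01.nc``), and all files of a
# station are then concatenated into ``combined/<station name>.nc``.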