Commit d796f8a0 authored by Philipp Sommer's avatar Philipp Sommer
Browse files

Outsourced formatting into separate function

parent c819fe02
......@@ -74,6 +74,82 @@ def log_progress(iterator, total=None):
print(f"\nFinished at {t1}. Time needed: {td:1.3f} minutes")
def format_dataset(ds, source):
    """Apply BSRN/CF metadata conventions to *ds* before netCDF export.

    Renames data variables according to the module-level ``mapping`` table,
    promotes the ``time`` variable to the index coordinate, merges in the
    per-station metadata from the module-level ``meta`` dataset, and fills in
    the global CF / provenance attributes.

    Parameters
    ----------
    ds : xarray.Dataset
        Raw dataset built from a BSRN data file (e.g. via
        ``DataFrame.to_xarray``); must contain an ``index`` dimension with a
        ``time`` variable and a ``station`` variable.
    source : str
        Path to the original source file; its basename and modification time
        are recorded in the ``modification_date`` attribute.

    Returns
    -------
    xarray.Dataset
        The reformatted dataset. Note that ``ds`` is rebound by ``rename`` /
        ``expand_dims`` calls, but variable ``attrs`` are edited in place.

    NOTE(review): besides ``mapping`` and ``meta``, this function silently
    reads the module-level globals ``stationid`` and ``now`` — confirm both
    are set by the enclosing loop before this is called; consider passing
    them as parameters.
    """
    for key in list(ds):
        var = ds[key]
        # Preserve the original BSRN column name before any renaming.
        var.attrs['bsrn_name'] = key
        try:
            row = mapping.loc[key]
        except KeyError:
            # Column unknown to the mapping table: drop it, except for the
            # station identifier columns which are needed below.
            if key not in ['station', 'stationid']:
                del ds[key]
        else:
            # Copy every non-null metadata field (units, long_name, ...)
            # onto the variable, then rename it to its mapped (CF) name.
            for attr, val in row[row.notnull()].items():
                if attr != 'name':
                    var.attrs[attr] = val
            ds = ds.rename({key: row['name']})
    # Make 'time' the coordinate of the former 'index' dimension and rename
    # the dimension itself to 'time'.
    ds = ds.set_index(index='time').rename(index='time')
    ds = ds.expand_dims('stationid', axis=1)
    # Pick this station's row(s) from the station metadata; drop columns
    # that are entirely null before merging into the dataset.
    station_meta = meta.isel(
        stationid=np.where(meta.station == stationid)[0])
    for key in list(station_meta):
        if station_meta[key].isnull().all():
            del station_meta[key]
    ds.update(station_meta)
    # Global CF / discovery / provenance attributes.
    ds.attrs['featureType'] = 'timeSeries'
    ds.attrs['Conventions'] = 'CF-1.8'
    ds.attrs['station_id'] = stationid
    ds.attrs['source'] = "surface observation"
    ds.attrs['conventionsURL'] = ('http://www.unidata.ucar.edu/packages/'
                                  'netcdf/conventions.html')
    ds.attrs['download_site'] = "ftp://ftp.bsrn.awi.de/"
    ds.attrs['station'] = ds['station'].values[0]
    ds.attrs['creation_date'] = 'transformation to netCDF: ' + now
    # Record the source file's modification time so provenance survives in
    # the produced netCDF file.
    mtime = dt.datetime.fromtimestamp(
        osp.getmtime(source)).isoformat()
    ds.attrs['modification_date'] = (
        "Modification date of source file %s: %s" % (
            osp.basename(source), mtime))
    ds.attrs['history'] = '\n'.join(
        [ds.attrs['creation_date'], ds.attrs['modification_date']])
    ds.attrs['references'] = (
        "Driemel, A., Augustine, J., Behrens, K., Colle, S., Cox, C., "
        "Cuevas-Agulló, E., Denn, F. M., Duprat, T., Fukuda, M., "
        "Grobe, H., Haeffelin, M., Hodges, G., Hyett, N., Ijima, O., "
        "Kallis, A., Knap, W., Kustov, V., Long, C. N., Longenecker, D., "
        "Lupi, A., Maturilli, M., Mimouni, M., Ntsangwane, L., "
        "Ogihara, H., Olano, X., Olefs, M., Omori, M., Passamani, L., "
        "Pereira, E. B., Schmithüsen, H., Schumacher, S., Sieger, R., "
        "Tamlyn, J., Vogt, R., Vuilleumier, L., Xia, X., Ohmura, A., and "
        "König-Langlo, G.: Baseline Surface Radiation Network (BSRN): "
        "structure and data description (1992–2017), "
        "Earth Syst. Sci. Data, 10, 1491-1501, "
        "doi:10.5194/essd-10-1491-2018, 2018.")
    # Prefer values stored in the dataset itself over the defaults set above.
    if 'institution' in ds:
        ds.attrs['institution'] = ds['institution'].values[0]
    if 'station_name' in ds:
        ds.attrs['station'] = ds['station_name'].values[0]
    if 'time' in ds:
        # Ensure proper datetime values; move a textual 'units' attribute
        # into the encoding so xarray performs CF time encoding on write.
        ds['time'] = ds['time'].copy(data=pd.to_datetime(ds['time']))
        if 'units' in ds['time'].attrs:
            ds['time'].encoding['units'] = ds['time'].attrs.pop('units')
    return ds
args = parser.parse_args()
with open(args.input) as f:
......@@ -95,104 +171,65 @@ ids = defaultdict(list)
# Timestamp recorded once so every file produced by this run shares the same
# creation date.
now = dt.datetime.now().isoformat()
# NOTE(review): the next two 'for' headers are the OLD and NEW sides of a
# rendered diff — only one of them exists in the real file (the second,
# plain-enumerate form is the post-commit version).
for i, (path, source) in log_progress(
enumerate(zip(files, source_files)), len(files)):
for i, (path, source) in enumerate(zip(files, source_files)):
# read BSRN data file
base = osp.splitext(path)[0]
# Station id is the part of the basename before the first underscore.
stationid = osp.basename(base.split('_')[0])
output = base + '.nc'
# NOTE(review): duplicated diff lines — the tuple append is the newer side;
# the post-commit code groups (path, source, base, output) per station.
ids[stationid].append(output)
ids[stationid].append((path, source, base, output))
# Process the files of each station in turn, converting each to netCDF and
# accumulating a combined DataFrame per station.
for stationid, files in ids.items():
full_df = None
# Resolve the human-readable station name from the metadata dataset.
name = meta.isel(
stationid=np.where(meta.station == stationid)[0][0]).station.values[()]
print("Processing %i files of station %s" % (len(files), name))
files.sort()
for path, source, base, output in log_progress(files):
# NOTE(review): diff artifact — in the post-commit version this guard
# appears later (around the to_netcdf call), not here.
if not args.combine_only and (args.overwrite or not osp.exists(output)):
df = pd.read_csv(path, '\t')
# Accumulate all of this station's data for the combined output file.
full_df = df if full_df is None else pd.concat(
[full_df, df], ignore_index=True, sort=False)
ds = df.to_xarray()
# NOTE(review): everything from here down to the format_dataset() call is
# the REMOVED (pre-commit) inline formatting code that the commit
# "Outsourced formatting into separate function" moved into
# format_dataset(); it is shown only because this is a rendered diff.
for key in list(ds):
var = ds[key]
var.attrs['bsrn_name'] = key
try:
row = mapping.loc[key]
except KeyError:
if key not in ['station', 'stationid']:
del ds[key]
else:
for attr, val in row[row.notnull()].items():
if attr != 'name':
var.attrs[attr] = val
ds = ds.rename({key: row['name']})
ds = ds.set_index(index='time').rename(index='time')
ds = ds.expand_dims('stationid')
station_meta = meta.sel(stationid=[stationid])
for key in list(station_meta):
if station_meta[key].isnull().all():
del station_meta[key]
ds.update(station_meta)
ds.attrs['featureType'] = 'timeSeries'
ds.attrs['Conventions'] = 'CF-1.8'
ds.attrs['station_id'] = stationid
ds.attrs['source'] = "surface observation"
ds.attrs['conventionsURL'] = ('http://www.unidata.ucar.edu/packages/'
'netcdf/conventions.html')
ds.attrs['download_site'] = "ftp://ftp.bsrn.awi.de/"
ds.attrs['station'] = ds['station'].values[0]
ds.attrs['creation_date'] = 'transformation to netCDF: ' + now
mtime = dt.datetime.fromtimestamp(
osp.getmtime(source)).isoformat()
ds.attrs['modification_date'] = (
"Modification date of source file %s: %s" % (
osp.basename(source), mtime))
ds.attrs['history'] = '\n'.join(
[ds.attrs['creation_date'], ds.attrs['modification_date']])
ds.attrs['references'] = (
"Driemel, A., Augustine, J., Behrens, K., Colle, S., Cox, C., "
"Cuevas-Agulló, E., Denn, F. M., Duprat, T., Fukuda, M., "
"Grobe, H., Haeffelin, M., Hodges, G., Hyett, N., Ijima, O., "
"Kallis, A., Knap, W., Kustov, V., Long, C. N., Longenecker, D., "
"Lupi, A., Maturilli, M., Mimouni, M., Ntsangwane, L., "
"Ogihara, H., Olano, X., Olefs, M., Omori, M., Passamani, L., "
"Pereira, E. B., Schmithüsen, H., Schumacher, S., Sieger, R., "
"Tamlyn, J., Vogt, R., Vuilleumier, L., Xia, X., Ohmura, A., and "
"König-Langlo, G.: Baseline Surface Radiation Network (BSRN): "
"structure and data description (1992–2017), "
"Earth Syst. Sci. Data, 10, 1491-1501, "
"doi:10.5194/essd-10-1491-2018, 2018.")
if 'institution' in ds:
ds.attrs['institution'] = ds['institution'].values[0]
# NOTE(review): nested diff — the loop below is a newer replacement for
# the two 'if' blocks around it.
for attr in ['institution', 'station']:
if attr in ds:
ds.attrs[attr] = str(ds[attr].values[0])
if 'time' in ds:
ds['time'] = ds['time'].copy(data=pd.to_datetime(ds['time']))
if 'units' in ds['time'].attrs:
ds['time'].encoding['units'] = ds['time'].attrs.pop('units')
# Post-commit version: all of the formatting above is replaced by this call.
ds = format_dataset(ds, source)
# NOTE(review): diff artifact — the plain to_netcdf below is the OLD write
# path; the guarded write with explicit float time dtype that follows is
# the NEW one. Only one belongs in the real file.
ds.to_netcdf(
output,
encoding={key: {'zlib': True, 'complevel': 4}
for key in ds.variables})
if not args.combine_only and (
args.overwrite or not osp.exists(output)):
# Compress all variables; force 'time' to be written as float.
encoding = {key: {'zlib': True, 'complevel': 4}
for key in ds.variables}
encoding['time']['dtype'] = float
ds.to_netcdf(
output, encoding=encoding)
# Write one combined netCDF file per station covering all of its input files.
if full_df is not None:
ds = full_df.to_xarray()
ds = format_dataset(ds, source)
# First/last dates of the record — used only in the output filename.
# (min/max are order-independent, so computing them before sortby is fine.)
t0 = str(ds.time.min().dt.strftime('%Y-%m-%d').values)
t1 = str(ds.time.max().dt.strftime('%Y-%m-%d').values)
# Filename token from the station name, e.g. "Ny-Alesund, NO" -> "ny-alesund".
name = ds['station_name'].values[0].split(',')[0].replace(
' ', '_').lower()
ds = ds.sortby('time')
full_output = osp.join(outdir, f'BSRN-stationdata.{name}.{t0}-{t1}.nc')
encoding = {key: {'zlib': True, 'complevel': 4}
for key in ds.variables}
# NOTE(review): assumes 'time' is among ds.variables — KeyError otherwise.
encoding['time']['dtype'] = float
ds.to_netcdf(
full_output, encoding=encoding)
# NOTE(review): this whole loop is the REMOVED (pre-commit) side of the diff:
# it concatenated the per-file netCDF outputs with open_mfdataset instead of
# building the combined DataFrame above. It does not exist in the new version.
for stationid, files in ids.items():
print(f"Concatenating files for {stationid}")
ds = xr.open_mfdataset(files, combine='nested', join='override',
data_vars='minimal', coords='minimal',
concat_dim='time')
name = meta.sel(stationid=stationid).station.values[()]
output = osp.join(outdir, name + '.nc')
if not osp.exists(output) or args.overwrite:
ds.to_netcdf(output)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment