Commit d796f8a0 authored by Philipp Sommer
Browse files

Outsourced formatting into separate function

parent c819fe02
......@@ -74,41 +74,7 @@ def log_progress(iterator, total=None):
print(f"\nFinished at {t1}. Time needed: {td:1.3f} minutes")
args = parser.parse_args()
with open(args.input) as f:
files = sorted(map(str.strip, f.readlines()))
outdir = args.output
mapping = pd.read_csv(args.mapping, index_col=0)
meta = xr.open_dataset(args.meta)
if args.source is not None:
with open(args.source) as f:
source_files = list(map(str.strip, f.readlines()))
else:
source_files = files[:]
ids = defaultdict(list)
now = dt.datetime.now().isoformat()
for i, (path, source) in log_progress(
enumerate(zip(files, source_files)), len(files)):
# read BSRN data file
base = osp.splitext(path)[0]
stationid = osp.basename(base.split('_')[0])
output = base + '.nc'
ids[stationid].append(output)
if not args.combine_only and (args.overwrite or not osp.exists(output)):
df = pd.read_csv(path, '\t')
ds = df.to_xarray()
def format_dataset(ds, source):
for key in list(ds):
var = ds[key]
var.attrs['bsrn_name'] = key
......@@ -125,9 +91,10 @@ for i, (path, source) in log_progress(
ds = ds.set_index(index='time').rename(index='time')
ds = ds.expand_dims('stationid')
ds = ds.expand_dims('stationid', axis=1)
station_meta = meta.sel(stationid=[stationid])
station_meta = meta.isel(
stationid=np.where(meta.station == stationid)[0])
for key in list(station_meta):
if station_meta[key].isnull().all():
del station_meta[key]
......@@ -172,27 +139,97 @@ for i, (path, source) in log_progress(
if 'institution' in ds:
ds.attrs['institution'] = ds['institution'].values[0]
for attr in ['institution', 'station']:
if attr in ds:
ds.attrs[attr] = str(ds[attr].values[0])
if 'station_name' in ds:
ds.attrs['station'] = ds['station_name'].values[0]
if 'time' in ds:
ds['time'] = ds['time'].copy(data=pd.to_datetime(ds['time']))
if 'units' in ds['time'].attrs:
ds['time'].encoding['units'] = ds['time'].attrs.pop('units')
ds.to_netcdf(
output,
encoding={key: {'zlib': True, 'complevel': 4}
for key in ds.variables})
return ds
args = parser.parse_args()
with open(args.input) as f:
files = sorted(map(str.strip, f.readlines()))
outdir = args.output
mapping = pd.read_csv(args.mapping, index_col=0)
meta = xr.open_dataset(args.meta)
if args.source is not None:
with open(args.source) as f:
source_files = list(map(str.strip, f.readlines()))
else:
source_files = files[:]
ids = defaultdict(list)
now = dt.datetime.now().isoformat()
for i, (path, source) in enumerate(zip(files, source_files)):
# read BSRN data file
base = osp.splitext(path)[0]
stationid = osp.basename(base.split('_')[0])
output = base + '.nc'
ids[stationid].append((path, source, base, output))
for stationid, files in ids.items():
print(f"Concatenating files for {stationid}")
ds = xr.open_mfdataset(files, combine='nested', join='override',
data_vars='minimal', coords='minimal',
concat_dim='time')
name = meta.sel(stationid=stationid).station.values[()]
output = osp.join(outdir, name + '.nc')
if not osp.exists(output) or args.overwrite:
ds.to_netcdf(output)
full_df = None
name = meta.isel(
stationid=np.where(meta.station == stationid)[0][0]).station.values[()]
print("Processing %i files of station %s" % (len(files), name))
files.sort()
for path, source, base, output in log_progress(files):
df = pd.read_csv(path, '\t')
full_df = df if full_df is None else pd.concat(
[full_df, df], ignore_index=True, sort=False)
ds = df.to_xarray()
ds = format_dataset(ds, source)
if not args.combine_only and (
args.overwrite or not osp.exists(output)):
encoding = {key: {'zlib': True, 'complevel': 4}
for key in ds.variables}
encoding['time']['dtype'] = float
ds.to_netcdf(
output, encoding=encoding)
if full_df is not None:
ds = full_df.to_xarray()
ds = format_dataset(ds, source)
t0 = str(ds.time.min().dt.strftime('%Y-%m-%d').values)
t1 = str(ds.time.max().dt.strftime('%Y-%m-%d').values)
name = ds['station_name'].values[0].split(',')[0].replace(
' ', '_').lower()
ds = ds.sortby('time')
full_output = osp.join(outdir, f'BSRN-stationdata.{name}.{t0}-{t1}.nc')
encoding = {key: {'zlib': True, 'complevel': 4}
for key in ds.variables}
encoding['time']['dtype'] = float
ds.to_netcdf(
full_output, encoding=encoding)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment