# coding: utf-8
"""Convert the BSRN station metadata tables into a NetCDF file.

The script reads ``bsrn_stations.tab`` (stations), ``bsrn_staffs.tab``
(staff) and ``colnames-mapping-BG.csv`` (column name/attribute mapping),
expands the free-text *Comment* column of the stations table into separate
columns, and writes the result to ``BSRN_Metadaten_Geographie.nc``.
"""
import re

import numpy as np
import pandas as pd

# read BSRN meta data file
# NOTE: ``sep`` must be passed by keyword; pandas 2.x rejects it positionally.
df = pd.read_csv('bsrn_stations.tab', sep='\t')
staff = pd.read_csv('bsrn_staffs.tab', sep='\t')
mapping = pd.read_csv('colnames-mapping-BG.csv', index_col=0)

# Index the staff table by full name so that a station scientist found in a
# comment can be looked up with ``staff.loc[...]``.
staff.index = staff['First name'] + ' ' + staff['Last name']

candidate_patt = re.compile(r'\s*BSRN Candidate\.\s*')
email_patt = re.compile(r'\s*\(.+@.+\).*')

# (misspelled column, canonical column) pairs, applied in this order.
COLUMN_TYPO_FIXES = (
    ('Surface type', 'Surface Type'),
    ('BSRN station no', 'BSRN Station no'),
    ('. Station scientist', 'Station scientist'),
    ('Plain terrain, Soil ', 'Plain terrain, Soil'),
    ('Plain terrain, Soil type', 'Plain terrain, Soil'),
    ('Station no', 'BSRN Station no'),
)


def add_comment(d, val):
    """Append *val* to the ``'Unsorted Comments'`` entry of *d* (in place).

    Multiple values are joined with ``'; '``; the entry is created if it
    does not exist yet.
    """
    if 'Unsorted Comments' in d:
        d['Unsorted Comments'] += '; ' + val
    else:
        d['Unsorted Comments'] = val


def create_dict(s):
    """Transform a *Comment* string into a :class:`dict`

    The string is split at ``;``. Every item of the form ``key: value``
    becomes one entry; items without ``': '`` are collected under
    ``'Unsorted Comments'``. If a station scientist is given and found in
    the module-level ``staff`` table, the columns of that staff row are
    merged into the result.

    Parameters
    ----------
    s: str
        The comment to parse. Anything that is not a string (e.g. the NaN
        pandas uses for a missing comment) yields an empty dict.

    Returns
    -------
    dict
        Mapping from the keys found in the comment to their values.
    """
    if not isinstance(s, str):
        # Missing comment: ``re.split`` would raise a TypeError here.
        return {}
    d = {}
    for item in re.split(r';\s*', s):
        try:
            key, val = item.split(': ', 1)
        except ValueError:
            # No ``key: value`` structure -> keep the text as plain comment
            add_comment(d, item)
        else:
            if candidate_patt.search(key):
                # The candidate marker is glued to the key; record it as a
                # comment and strip it from the key.
                add_comment(d, 'BSRN Candidate')
                key = candidate_patt.sub('', key)
            d[key] = val
    if 'Station scientist' in d or '. Station scientist' in d:
        # Drop the trailing "(mail@address)..." part to obtain the name
        scientist = email_patt.sub(
            '', d.get('Station scientist', d.get('. Station scientist')))
        try:
            row = staff.loc[scientist]
        except KeyError:
            # Scientist not listed in the staff table; nothing to merge.
            pass
        else:
            d.update(row.to_dict())
    return d


def _merge_column(frame, wrong, right):
    """Move the non-null values of column *wrong* into *right*, drop *wrong*.

    Does nothing if *frame* has no column *wrong*. Modifies *frame* in place.
    """
    if wrong not in frame:
        return
    filled = frame[wrong].notnull()
    frame.loc[filled, right] = frame.loc[filled, wrong]
    del frame[wrong]


dicts = df.Comment.apply(create_dict).values.tolist()
comments = pd.DataFrame(dicts)
joined = df.join(comments)

# Manual fixes of some typos in the comments
for wrong_name, right_name in COLUMN_TYPO_FIXES:
    _merge_column(joined, wrong_name, right_name)

insert_station_no_fill = False
if 'BSRN Station no' in joined:
    try:
        joined['BSRN Station no'] = joined['BSRN Station no'].astype(int)
    except ValueError:
        # Conversion failed (e.g. missing values): use -9999 as a fill
        # value and remember to declare it in the output encoding.
        joined['BSRN Station no'] = joined['BSRN Station no'].fillna(
            '-9999').astype(int)
        insert_station_no_fill = True

# 1-based running station id as the (named) index / dimension
joined.index = pd.Index(np.arange(1, len(joined) + 1), name='stationid')

ds = joined.to_xarray()

# Attach the attributes from the mapping table and rename the variables.
# ``list(...)`` takes a snapshot because ``ds`` is rebound inside the loop.
for key in list(ds.variables):
    var = ds[key]
    var.attrs['bsrn_name'] = key
    try:
        row = mapping.loc[key]
    except KeyError:
        # No mapping for this variable: keep its name, no extra attributes
        pass
    else:
        for attr, val in row[row.notnull()].items():
            if attr != 'name':
                var.attrs[attr] = val
        ds = ds.rename({key: row['name']})

new_names = {}
patt = re.compile(r'[_\/\-\s]')
for key, var in ds.variables.items():
    # Variables that are not a column of the stations table are marked as
    # derived from the comment.
    if var.attrs['bsrn_name'] not in df:
        var.attrs['derived_from'] = 'Comment'
    else:
        var.attrs['derived_from'] = var.attrs['bsrn_name']
    new_names[key] = patt.sub('_', key.lower()).replace(',', '')

if insert_station_no_fill:
    # Look the variable up through its original name: the mapping loop above
    # may already have renamed it, so ``ds['BSRN Station no']`` could fail.
    for var in ds.variables.values():
        if var.attrs.get('bsrn_name') == 'BSRN Station no':
            var.encoding['_FillValue'] = -9999

ds = ds.rename(new_names)

ds.to_netcdf('BSRN_Metadaten_Geographie.nc')