convert-metadata.py 4.03 KB
Newer Older
Philipp Sommer's avatar
Philipp Sommer committed
1
2
# coding: utf-8
import pandas as pd
Philipp Sommer's avatar
Philipp Sommer committed
3
import numpy as np
Philipp Sommer's avatar
Philipp Sommer committed
4
5
6
import re

# read BSRN meta data file
Philipp Sommer's avatar
Philipp Sommer committed
7
8
# Both BSRN tables are tab-separated. The separator has to be given by
# keyword: passing it positionally was deprecated in pandas 1.3 and raises a
# TypeError since pandas 2.0, where every argument after the path is
# keyword-only.
df = pd.read_csv('bsrn_stations.tab', sep='\t')
staff = pd.read_csv('bsrn_staffs.tab', sep='\t')
Philipp Sommer's avatar
Philipp Sommer committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# Patterns applied to the fragments of the *Comment* column:
# ``email_patt`` matches a parenthesised e-mail address and everything that
# follows it, ``candidate_patt`` matches the "BSRN Candidate." marker together
# with the surrounding whitespace.
email_patt = re.compile(r'\s*\(.+@.+\).*')
candidate_patt = re.compile(r'\s*BSRN Candidate\.\s*')

# Lookup table indexed by its first column; the ``name`` column holds the new
# variable name, the remaining columns are copied into the variable attributes
mapping = pd.read_csv('colnames-mapping-BG.csv', index_col=0)

# Label the staff rows by "<First name> <Last name>" so that a person can be
# selected through the full name
staff.index = staff['First name'] + ' ' + staff['Last name']


def add_comment(d, val):
    """Append *val* to the ``'Unsorted Comments'`` entry of *d*

    The entry is created if it does not exist yet, otherwise *val* is
    appended to the existing text, separated by ``'; '``. The dictionary is
    modified in place and nothing is returned."""
    entry = 'Unsorted Comments'
    if entry not in d:
        d[entry] = val
        return
    d[entry] += '; ' + val


def create_dict(s):
    """Transform a *Comment* string into a :class:`dict`

    The string is split at semicolons. Every ``key: value`` fragment becomes
    an item of the returned dictionary; fragments without ``': '`` are
    collected under ``'Unsorted Comments'``. A ``BSRN Candidate.`` marker
    inside a key is removed from the key and noted in the unsorted comments
    instead. If a station scientist is named and found in the index of the
    ``staff`` table, the columns of that staff row are merged into the
    result.

    Parameters
    ----------
    s: str
        The comment to parse. Anything that is not a string (e.g. the NaN
        pandas uses for a missing cell) is treated as an empty comment

    Returns
    -------
    dict
        The parsed items, empty if there was nothing to parse"""
    # a missing comment would otherwise raise a TypeError in ``re.split``
    if not isinstance(s, str):
        return {}
    d = {}
    for item in re.split(r';\s*', s):
        if not item:
            # empty fragment from a trailing or doubled semicolon; it must
            # not end up as an empty entry in the unsorted comments
            continue
        try:
            key, val = item.split(': ', 1)
        except ValueError:
            # no "key: value" structure
            add_comment(d, item)
        else:
            if candidate_patt.search(key):
                add_comment(d, 'BSRN Candidate')
                key = candidate_patt.sub('', key)
            d[key] = val
    # '. Station scientist' is a misspelled variant of the key that occurs in
    # the comments; the correctly spelled key takes precedence
    scientist = d.get('Station scientist', d.get('. Station scientist'))
    if scientist is not None:
        # drop the e-mail address in parentheses and whatever follows it
        scientist = email_patt.sub('', scientist)
        try:
            row = staff.loc[scientist]
        except KeyError:
            # scientist is not listed in the staff table
            pass
        else:
            d.update(row.to_dict())
    return d


def _merge_column(frame, misspelled, target):
    """Move the non-null values of column *misspelled* into column *target*

    Nothing happens if *misspelled* is not a column of *frame*. Otherwise its
    non-null values are written into *target* (which is created if necessary)
    and the *misspelled* column is deleted. *frame* is modified in place."""
    if misspelled not in frame:
        return
    filled = frame[misspelled].notnull()
    frame.loc[filled, target] = frame.loc[filled, misspelled]
    del frame[misspelled]


# Parse the comment of every station and attach the parsed items as
# additional columns to the station table
dicts = df.Comment.apply(create_dict).values.tolist()
comments = pd.DataFrame(dicts)
joined = df.join(comments)

# Manual fixes of some typos in the comments. The pairs are processed in the
# given order, two of them end up in the same target column.
for misspelled, target in [
        ('Surface type', 'Surface Type'),
        ('BSRN station no', 'BSRN Station no'),
        ('. Station scientist', 'Station scientist'),
        ('Plain terrain, Soil ', 'Plain terrain, Soil'),
        ('Plain terrain, Soil type', 'Plain terrain, Soil'),
]:
    _merge_column(joined, misspelled, target)

Philipp Sommer's avatar
Philipp Sommer committed
85
86
87
88
# Values found under 'Station no' are moved into 'BSRN Station no' and the
# 'Station no' column is dropped afterwards
if 'Station no' in joined:
    has_station_no = joined['Station no'].notnull()
    joined.loc[has_station_no, 'BSRN Station no'] = joined.loc[
        has_station_no, 'Station no']
    del joined['Station no']
Philipp Sommer's avatar
Philipp Sommer committed
89

Philipp Sommer's avatar
Philipp Sommer committed
90
91
92
93
94
95
96
97
98
99
100
101
# Store the station number as integer. If the plain conversion fails with a
# ValueError, missing values are replaced by -9999 first and
# ``insert_station_no_fill`` records that this placeholder is in use.
insert_station_no_fill = False
if 'BSRN Station no' in joined:
    station_no = joined['BSRN Station no']
    try:
        station_no = station_no.astype(int)
    except ValueError:
        station_no = station_no.fillna('-9999').astype(int)
        insert_station_no_fill = True
    joined['BSRN Station no'] = station_no


# Number the stations from 1 on and use that number as the ``stationid``
# dimension of the dataset
station_ids = np.arange(len(joined)) + 1
joined.index = pd.Index(station_ids, name='stationid')
ds = joined.to_xarray()
Philipp Sommer's avatar
Philipp Sommer committed
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123

# Remember the original BSRN name of every variable and, where the mapping
# table has an entry for it, copy the entry into the attributes and rename
# the variable
for bsrn_name in list(ds.variables):
    data_array = ds[bsrn_name]
    data_array.attrs['bsrn_name'] = bsrn_name
    try:
        entry = mapping.loc[bsrn_name]
    except KeyError:
        # no entry in the mapping table: the variable keeps its name
        continue
    # every non-null column except ``name`` becomes an attribute
    data_array.attrs.update(
        (attr, val) for attr, val in entry.dropna().items()
        if attr != 'name')
    ds = ds.rename({bsrn_name: entry['name']})

# characters that are replaced by an underscore in the final variable names
patt = re.compile(r'[_\/\-\s]')

# Record where each variable comes from and collect its final name: lower
# case, separators replaced by underscores, commas removed
new_names = {}
for name, variable in ds.variables.items():
    source = variable.attrs['bsrn_name']
    # columns that are not in the station table are labelled 'Comment'
    variable.attrs['derived_from'] = source if source in df else 'Comment'
    new_names[name] = patt.sub('_', name.lower()).replace(',', '')


if insert_station_no_fill:
    # Missing station numbers were replaced by -9999 before the integer
    # conversion, so declare this value as the fill value of the variable.
    # The variable is identified through its ``bsrn_name`` attribute and not
    # through ``ds['BSRN Station no']``: if the column-name mapping contains
    # an entry for it, it has already been renamed at this point and the
    # lookup by name would raise a KeyError.
    for station_no_var in ds.variables.values():
        if station_no_var.attrs.get('bsrn_name') == 'BSRN Station no':
            station_no_var.encoding['_FillValue'] = -9999

Philipp Sommer's avatar
Philipp Sommer committed
130
131
132
133


# Switch to the normalised variable names collected in ``new_names`` and
# write the dataset to disk
ds = ds.rename(name_dict=new_names)
ds.to_netcdf('BSRN_Metadaten_Geographie.nc')