output and cdo_post

60b12023 · Aaron Spring · 2c5808cb · 60b12023 · 60b12023 · 2c5808cb
Commit 60b12023 authored 5 years ago by Aaron Spring
--- a/.coveragerc
+++ b/.coveragerc
+[run]
+omit =
+    pymistral/tests/*.py
+    setup.py
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
+repos:
+
+  -   repo: https://github.com/pre-commit/pre-commit-hooks
+      rev: v2.2.3
+      hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-docstring-first
+      - id: check-yaml
+      - id: double-quote-string-fixer
+      - id: no-commit-to-branch
+      - id: debug-statements
+      - id: check-merge-conflict
+
+  -   repo: https://github.com/ambv/black
+      rev: 19.3b0
+      hooks:
+      - id: black
+        args: ["--line-length", "79", "--skip-string-normalization"]
+
+  -   repo: https://github.com/pre-commit/pre-commit-hooks
+      rev: v2.2.3
+      hooks:
+      - id: flake8
+        args: ["--max-line-length=79", "--exclude=__init__.py", "--ignore=C901,W605,W503,F722"]
+
+  -   repo: https://github.com/asottile/blacken-docs
+      rev: v1.0.0
+      hooks:
+      - id: blacken-docs
--- a/.unibeautifyrc.yml
+++ b/.unibeautifyrc.yml
---
-Markdown:
-  beautifiers:
-    - Prettier
-  indent_size: 4
-  indent_char: " "
-  wrap_line_length: 80
-Python:
-  beautifiers:
-    - ["Black"]
-  indent_size: 2
-  indent_char: " "
-  indent_space: "space"
-  wrap_line_length: 80
--- a/MPI-ESM-1-2-LR_output.csv
+++ b/MPI-ESM-1-2-LR_output.csv
--- a/intake/obs.yml
+++ b/intake/obs.yml
@@ -189,13 +189,13 @@ sources:
      version:
        description: version name
        type: str
-        default: 's85_v4.2'
-        allowed: ['prior_v4.2','s76_v4.2','s85_v4.2','s93_v4.2']
+        default: s85_v4.2
+        allowed: [sEXTocNEET_v4.3, prior_v4.2, s76_v4.2, s81oc_v4.3, s85_v4.2, s93_v4.2, s99oc_v4.3, s04oc_v4.3, s10oc_v4.3]
      t_res:
        description: temporal resolution
        type: str
-        default: 'monmean'
-        allowed: ['daily','monmean','yearmonmean']
+        default: yearmonmean
+        allowed: [daily, monmean, yearmonmean]
    args:
      urlpath: '/work/mh0727/data/Jena_CarboScope/atmospheric_CO2_inversion/{{version}}_{{t_res}}.nc'

@@ -566,6 +566,7 @@ by Grigory Monterey, Sydney Levitus'
        allowed: [3,3v]
    args:
      urlpath: /pool/data/ICDC/atmosphere/hadcrut/DATA/hadcrut{{version}}/HadCRUT{{version}}.nc
+
  HadCRUT4:
    driver: netcdf
    description: HadCRUT Surface (Air) Temperature Anomalies

--- a/pymistral.yml
+++ b/pymistral.yml
@@ -32,8 +32,9 @@ dependencies:
  - xrviz
  - xhistogram
  - pip:
-    - git+https://github.com/aaronspring/cmip6_preprocessing.git
+    - git+https://github.com/jbusecke/cmip6_preprocessing.git
    - git+https://github.com/xgcm/xgcm.git
    - git+https://github.com/xgcm/xrft.git
    - git+https://github.com/mathause/regionmask.git
    - git+https://github.com/ncar/intake-esm.git
+    - pre-commit
--- a/pymistral/__init__.py
+++ b/pymistral/__init__.py
@@ -4,11 +4,15 @@ pymistral

 A package for analyzing, processing, and mapping ESM output on mistral.

-Available Modules:
+Available Modules
 -----------------
-
+- setup
+- plot: Cartopy Mapping for MPIOM curvilinear grids
+- hamocc: HAMOCC-specific helper functions
+- slurm_post: write python code to file and send to SLURM (experimental)
+- testing: FDR for xarray masked
+- cdo_post: postprocessing with CDO into xarray
 """

-
-from . import hamocc, plot, setup, slurm_post
+from . import cdo_post, hamocc, plot, setup, slurm_post, testing
 from .setup import cdo
--- a/pymistral/cdo_post.py
+++ b/pymistral/cdo_post.py
+import glob
+import os
+import warnings
+
+import pandas as pd
+import xarray as xr
+
+import cdo
+
+warnings.simplefilter('ignore')
+xr.set_options(keep_attrs=True)
+
+cdo = cdo.Cdo()
+cdo.forceOutput = True
+
+
+expid = 'asp_esmControl_PMassim_3014_TSDICALK_over_3170'
+exppath = '/work/bm1124/m300524/experiments'
+year = 3171  # year to get output labels from
+outpath = os.path.expanduser('~/pymistral/')  # folder to save output_df to
+
+
+def find_all_outdatatypes_in_exp(expid=expid, exppath=exppath, year=year):
+    """Find all outdatatypes (tracer,atm,data_2d,...) from experiment `expid`
+     from path `exppath` of a given `year`."""
+    outdatatypes = []
+    for model in ['echam6', 'jsbach', 'mpiom', 'hamocc']:
+        path = f'{exppath}/{expid}/outdata/{model}/{expid}_{model}_*_{year}*'
+        paths = glob.glob(path)
+        for f in paths:
+            file = os.path.basename(f)
+            # remove expid
+            for s in expid.split('_'):
+                file = file.strip(s + '_')
+            file = file.strip(str(year))
+            # print(file)
+            parts = file.split('_')
+            model = parts[0]
+            # print(parts)
+            if len(parts[1:]) < 2:
+                # print(parts[1:])
+                outdatatype = parts[1].split('.')[0]
+            else:
+                outdatatype = '_'.join(
+                    ('_'.join(parts[1:-1]), parts[-1].split('.')[0])
+                )
+            outdatatype = outdatatype.strip(str(year))
+            outdatatype = outdatatype.strip(f'{year}0101_{year}1231')
+            if outdatatype == 'co':
+                outdatatype = 'co2'
+            ending = parts[-1].split('.')[-1]
+            print(
+                f'Found file: model={model} outdatatype={outdatatype} '
+                f'ending={ending}.'
+            )
+            outdatatypes.append(f'{model}_{outdatatype}')
+    return outdatatypes
+
+
+def read_all_outdatatype_files_to_ds(
+    outdatatypes, expid=expid, exppath=exppath, year=year
+):
+    """Read all outdatatypes from experiment `expid` from path `exppath` of a
+     given `year` and return xr.Dataset."""
+    ds_list = []
+    for outdatatype_id in outdatatypes:
+        print(f'Read {outdatatype_id} to xr.Dataset.')
+        parts = outdatatype_id.split('_')
+        model = parts[0]
+        outdatatype = '_'.join(parts[1:])
+        path = (
+            f'{exppath}/{expid}/outdata/{model}/'
+            + f'{expid}_{model}_{outdatatype}_{year}*'
+        )
+        if model in ['jsbach', 'echam6']:
+            options = ' '
+            if (
+                'BOT' in outdatatype
+                or 'ATM' in outdatatype
+                or 'LOG' in outdatatype
+            ):
+                options += ' -t echam6'
+            else:
+                table = (
+                    f'{exppath}/{expid}/log/'
+                    + f'{expid}_{model}_{outdatatype[:2]}*.codes'
+                )
+                options += ' -t ' + table
+        else:
+            options = ''
+        ds = cdo.copy(input=path, options=options, returnXDataset=True)
+        if 'time' not in ds.dims:
+            ds = ds.expand_dims('time')
+        ds.to_netcdf(f'{exppath}/sample_files/{model}_{outdatatype}.nc')
+        # add outdatatype
+        for v in ds.data_vars:
+            ds[v].attrs['outdatatype'] = outdatatype
+            ds[v].attrs['model'] = model
+            ds[v].attrs['dims'] = list(ds[v].dims)
+        ds_list.append(ds.isel(time=0).mean())
+    return xr.merge(ds_list, compat='override')
+
+
+def create_dataframe_of_output_info(ds, outpath=outpath):
+    """Create pd.Dataframe about output from `ds` and save to `outpath`."""
+    df = pd.DataFrame(
+        index=ds.data_vars,
+        columns=[
+            'varname',
+            'long_name',
+            'code',
+            'table',
+            'units',
+            'model',
+            'outdatatype',
+            'dims',
+        ],
+    )
+    for v in list(ds.data_vars):
+        for c in list(df.columns):
+            df[c].loc[v] = ds[v].attrs[c] if c in ds[v].attrs else ''
+    df['stream'] = df['model'] + '_' + df['outdatatype']
+    df.to_csv(outpath + 'MPI-ESM-1-2-LR_output.csv')
+    return df
+
+
+def generate_output_df(
+    expid=expid, exppath=exppath, year=year, outpath=outpath, recalc=False
+):
+    """Combine all functions above to generate output from `expid` or just load
+     if not `recalc`."""
+    path = outpath + 'MPI-ESM-1-2-LR_output.csv'
+    if not recalc and os.path.exists(path):
+        print(f'Read df from path: {path}')
+        output_df = (
+            pd.read_csv(path, index_col='varname')
+            .rename(columns={'Unnamed: 0': 'varname'})
+            .set_index('varname')
+        )
+    else:
+        outdatatypes = find_all_outdatatypes_in_exp(
+            expid=expid, exppath=exppath, year=year
+        )
+        ds = read_all_outdatatype_files_to_ds(
+            outdatatypes, expid=expid, exppath=exppath, year=year
+        )
+        output_df = create_dataframe_of_output_info(ds, outpath=outpath)
+    return output_df
+
+
+output_df = generate_output_df(recalc=False)
+
+
+def get_model_outdatatype_from_var(
+    var, output_df=output_df, expid=expid, exppath=exppath
+):
+    if isinstance(var, list):  # if list check first
+        var = var[0]
+    if not isinstance(output_df, pd.DataFrame):
+        raise ValueError(
+            f'df needs to be pd.Dataframe, found {type(output_df)}'
+        )
+    model = output_df.T[var]['model']
+    outdatatype = output_df.T[var]['outdatatype']
+    if model in ['mpiom', 'hamocc']:
+        ending = 'nc'
+        options = ''
+    else:
+        ending = 'grb'
+        options = ' -f nc '
+        if (
+            'BOT' in outdatatype
+            or 'ATM' in outdatatype
+            or 'LOG' in outdatatype
+        ):
+            options += ' -t echam6 '
+        else:
+            table = f'{exppath}/{expid}/log/{expid}_{model}_{outdatatype[:2]}*.codes'
+            options += '-t ' + table + ' '
+    return model, outdatatype, options, ending
+
+
+def load_var_cdo(
+    var, expid=expid, exppath=exppath, output_df=output_df, timestr='', sel=''
+):
+    """Load variable `var` from experiment_id `expid` with cdo into xarray."""
+    model, outdatatype, options, ending = get_model_outdatatype_from_var(
+        var, output_df=output_df
+    )
+
+    def load_cdo(var, path, options, sel=sel):
+        print(f'cdo {options} -select,name={var}{sel} {path}')
+        return cdo.select(
+            f'name={var}{sel}',
+            input=path,
+            returnXDataset=True,
+            options=options,
+        )
+
+    path = f'{exppath}/{expid}/outdata/{model}/{expid}_{model}_{outdatatype}_{timestr}*'
+    ds = load_cdo(var, path, options, sel=sel).squeeze()
+
+    return ds
--- a/pymistral/plot.py
+++ b/pymistral/plot.py
-import cartopy as cp
-import cartopy.crs as ccrs
 import matplotlib.pyplot as plt
 import numpy as np
 import xarray as xr
-from cartopy.mpl.ticker import LatitudeFormatter, LongitudeFormatter
 from matplotlib.ticker import MaxNLocator

+import cartopy as cp
+import cartopy.crs as ccrs
+from cartopy.mpl.ticker import LatitudeFormatter, LongitudeFormatter
+

 @xr.register_dataarray_accessor('plot_map')
 class CartopyMap(object):
    """
-    Plot the given 2D array on a cartopy axes with ('xc','lon','longitude') assumed as Longitude and ('yc','lat','latitude') assumed as Latitude.
+    Plot the given 2D array on a cartopy axes with
+     ('xc','lon','longitude') assumed as Longitude and
+     ('yc','lat','latitude') assumed as Latitude.
    The default projection is PlateCarree, but can be:
        cartopy.crs.<ProjectionName>()
    If you would like to create a figure with multiple subplots
@@ -27,10 +30,36 @@ class CartopyMap(object):
    def __init__(self, xarray_obj):
        self._obj = xarray_obj

-    def __call__(self, ax=None, proj=ccrs.PlateCarree(), plot_lon_lat_axis=True, feature='land', plot_type='pcolormesh', rm_cyclic=True, **kwargs):
-        return self._cartopy(ax=ax, proj=proj, feature=feature, plot_lon_lat_axis=plot_lon_lat_axis, plot_type=plot_type, rm_cyclic=rm_cyclic, **kwargs)
-
-    def _cartopy(self, ax=None, proj=ccrs.PlateCarree(), feature='land', plot_lon_lat_axis=True, plot_type='pcolormesh', rm_cyclic=True, **kwargs):
+    def __call__(
+        self,
+        ax=None,
+        proj=ccrs.PlateCarree(),
+        plot_lon_lat_axis=True,
+        feature='land',
+        plot_type='pcolormesh',
+        rm_cyclic=True,
+        **kwargs,
+    ):
+        return self._cartopy(
+            ax=ax,
+            proj=proj,
+            feature=feature,
+            plot_lon_lat_axis=plot_lon_lat_axis,
+            plot_type=plot_type,
+            rm_cyclic=rm_cyclic,
+            **kwargs,
+        )
+
+    def _cartopy(
+        self,
+        ax=None,
+        proj=ccrs.PlateCarree(),
+        feature='land',
+        plot_lon_lat_axis=True,
+        plot_type='pcolormesh',
+        rm_cyclic=True,
+        **kwargs,
+    ):

        # TODO: CESM2 and GFDL-ESM4 have lon issue

@@ -41,18 +70,26 @@ class CartopyMap(object):
                xda = xda[xda.data_vars[0]]
            else:
                raise ValueError(
-                    f'Please provide xr.DataArray, found {type(xda)}')
-
-        assert (xda.ndim == 2) or (xda.ndim == 3 and 'col' in kwargs or 'row' in kwargs) or (
-            xda.ndim == 4 and 'col' in kwargs and 'row' in kwargs)
+                    f'Please provide xr.DataArray, found {type(xda)}'
+                )
+
+        assert (
+            (xda.ndim == 2)
+            or (xda.ndim == 3 and 'col' in kwargs or 'row' in kwargs)
+            or (xda.ndim == 4 and 'col' in kwargs and 'row' in kwargs)
+        )
        single_plot = True if xda.ndim == 2 else False

-        stereo_maps = (ccrs.Stereographic,
-                       ccrs.NorthPolarStereo,
-                       ccrs.SouthPolarStereo)
+        stereo_maps = (
+            ccrs.Stereographic,
+            ccrs.NorthPolarStereo,
+            ccrs.SouthPolarStereo,
+        )
        if isinstance(proj, stereo_maps):
            raise ValueError(
-                'Not implemented, see https://github.com/luke-gregor/xarray_tools/blob/master/accessors.py#L222')
+                'Not implemented, see'
+                'https://github.com/luke-gregor/xarray_tools/accessors.py#L222'
+            )

        # find whether curv or not
        curv = False
@@ -65,13 +102,13 @@ class CartopyMap(object):
                    lon = c
                if c in ['yc', 'lat', 'latitude']:
                    lat = c
-        assert lon != None
-        assert lat != None
+        assert lon is not None
+        assert lat is not None

        if proj in [ccrs.Robinson()]:
            plot_lon_lat_axis = False

-        if plot_type is 'contourf':
+        if plot_type == 'contourf':
            rm_cyclic = False
        if curv and rm_cyclic:
            xda = _rm_singul_lon(xda, lon=lon, lat=lat)
@@ -79,18 +116,29 @@ class CartopyMap(object):
        if 'robust' not in kwargs:
            kwargs['robust'] = True
        if 'cbar_kwargs' not in kwargs:
-            kwargs['cbar_kwargs'] = {'shrink': .6}
+            kwargs['cbar_kwargs'] = {'shrink': 0.6}

        if ax is None:
            if single_plot:
                axm = getattr(xda.plot, plot_type)(
-                    lon, lat, ax=plt.axes(projection=proj), transform=ccrs.PlateCarree(), **kwargs)
+                    lon,
+                    lat,
+                    ax=plt.axes(projection=proj),
+                    transform=ccrs.PlateCarree(),
+                    **kwargs,
+                )
            else:
                axm = getattr(xda.plot, plot_type)(
-                    lon, lat, subplot_kws={'projection': proj}, transform=ccrs.PlateCarree(), **kwargs)
+                    lon,
+                    lat,
+                    subplot_kws={'projection': proj},
+                    transform=ccrs.PlateCarree(),
+                    **kwargs,
+                )
        else:
            axm = getattr(xda.plot, plot_type)(
-                lon, lat, ax=ax, transform=ccrs.PlateCarree(), **kwargs)
+                lon, lat, ax=ax, transform=ccrs.PlateCarree(), **kwargs
+            )

        def work_on_axes(axes):
            if 'coastline_color' in kwargs:
@@ -99,11 +147,15 @@ class CartopyMap(object):
                coastline_color = 'gray'
            axes.coastlines(color=coastline_color, linewidth=1.5)
            if feature is not None:
-                axes.add_feature(getattr(cp.feature, feature.upper()),
-                                 zorder=100, edgecolor='k')
+                axes.add_feature(
+                    getattr(cp.feature, feature.upper()),
+                    zorder=100,
+                    edgecolor='k',
+                )

            if plot_lon_lat_axis:
                _set_lon_lat_axis(axes, proj)
+
        if single_plot:
            if ax is None:
                ax = plt.gca()
@@ -122,41 +174,12 @@ def _rm_singul_lon(ds, lon='lon', lat='lat'):
    lons = ds[lon].values
    fixed_lons = lons.copy()
    for i, start in enumerate(np.argmax(np.abs(np.diff(lons)) > 180, axis=1)):
-        fixed_lons[i, start + 1:] += 360
+        fixed_lons[i, start + 1 :] += 360
    lons_da = xr.DataArray(fixed_lons, ds[lat].coords)
    ds = ds.assign_coords({lon: lons_da})
    return ds


-def my_facetgrid(da,
-                 projection=ccrs.PlateCarree(),
-                 coastline_color='gray',
-                 curv=False,
-                 col='year',
-                 col_wrap=2,
-                 **kwargs):
-    """Wrap xr.facetgrid plot with cartopy"""
-    transform = ccrs.PlateCarree()
-    p = da.plot.pcolormesh(
-        'lon',
-        'lat',
-        transform=transform,
-        col=col,
-        col_wrap=col_wrap,
-        subplot_kws={'projection': projection},
-        **kwargs)
-    for ax in p.axes.flat:
-        if curv:
-            ax.add_feature(cp.feature.LAND, zorder=100, edgecolor='k')
-        if projection == ccrs.PlateCarree():
-            _set_lon_lat_axis(ax, projection)
-            ax.set_xlabel('')
-            ax.set_ylabel('')
-        ax.coastlines()
-        # ax.set_extent([-160, -30, 5, 75])
-        # ax.set_aspect('equal', 'box-forced')
-
-
 def _set_lon_lat_axis(ax, projection, talk=False):
    """Add longitude and latitude coordinates to cartopy plots."""
    ax.set_xticks([-180, -120, -60, 0, 60, 120, 180], crs=projection)

--- a/pymistral/testing.py
+++ b/pymistral/testing.py
+import numpy as np
+from statsmodels.stats.multitest import multipletests
+
+
+def xr_multipletest(p, alpha=0.05, method='fdr_bh', **multipletests_kwargs):
+    """Apply statsmodels.stats.multitest.multipletests for multi-dimensional
+     xr.objects.
+
+    Args:
+        p (xr.object): uncorrected p-values.
+        alpha (type): FWER, family-wise error rate. Defaults to 0.05.
+        method (str): Method used for testing and adjustment of pvalues. Can be
+            either the full name or initial letters.  Available methods are:
+            - bonferroni : one-step correction
+            - sidak : one-step correction
+            - holm-sidak : step down method using Sidak adjustments
+            - holm : step-down method using Bonferroni adjustments
+            - simes-hochberg : step-up method (independent)
+            - hommel : closed method based on Simes tests (non-negative)
+            - fdr_bh : Benjamini/Hochberg (non-negative)
+            - fdr_by : Benjamini/Yekutieli (negative)
+            - fdr_tsbh : two stage fdr correction (non-negative)
+            - fdr_tsbky : two stage fdr correction (non-negative)
+           Defaults to 'fdr_bh'.
+        **multipletests_kwargs (optional dict): is_sorted, returnsorted
+           see statsmodels.stats.multitest.multitest
+
+    Returns:
+        reject (xr.object): true for hypothesis that can be rejected for given
+            alpha
+        pvals_corrected (xr.object): p-values corrected for multiple tests
+
+    Example:
+        reject, xpvals_corrected = xr_multipletest(p)
+    """
+    # stack all to 1d array
+    p_stacked = p.stack(s=p.dims)
+    # mask only where not nan:
+    # https://github.com/statsmodels/statsmodels/issues/2899
+    mask = np.isfinite(p_stacked)
+    pvals_corrected = np.full(p_stacked.shape, np.nan)
+    reject = np.full(p_stacked.shape, np.nan)
+    # apply test where mask
+    reject[mask] = multipletests(
+        p_stacked[mask], alpha=alpha, method=method, **multipletests_kwargs
+    )[0]
+    pvals_corrected[mask] = multipletests(
+        p_stacked[mask], alpha=alpha, method=method, **multipletests_kwargs
+    )[1]
+
+    def unstack(reject, p_stacked):
+        """Exchange values from p_stacked w/ reject (1d array) and unstack."""
+        xreject = p_stacked.copy()
+        xreject.values = reject
+        xreject = xreject.unstack()
+        return xreject
+
+    reject = unstack(reject, p_stacked)
+    pvals_corrected = unstack(pvals_corrected, p_stacked)
+    return reject, pvals_corrected