Commit ddb2b754 authored by Fabian Wachsmann

Moved all functions into a class

parent f0f333bc
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import zarr
from zarrswift import SwiftStore
import xarray
import os
import shutil
from swiftenv import *
import math
from tqdm import tqdm
globalverbose=False
class Tzis:
def __init__(self,
mf_dset=None,
varname=None,
verbose=False,
os_token=None,
os_url=None,
os_container=None,
os_name=None,
) :
self.verbose=verbose
#
self.mf_dset=self.__open_mf_dataset__(mf_dset)
self.varname=self.__get_varname__(varname)
self.store = self.open_store(os_container, os_name, os_url, os_token)
def __get_varname__(self,
varname=None):
if not varname:
varname=list(self.mf_dset.data_vars)[0]
if self.verbose :
print("We use variable {0} in case we need to rechunk.".format(varname))
return varname
def __open_mf_dataset__(self, mf):
#if type(mf) != list and type(mf) != str :
# raise ValueError("Dataset '{0}' must either be a string or a list of strings")
return xarray.open_mfdataset(mf,
decode_cf=True,
use_cftime=True,
concat_dim="time",
data_vars='minimal',
coords='minimal',
compat='override')
# `get_size_of_var_timeseries`
# returns the size of the variable `varname` of the entire dataset `ds`, which is used for chunking. The dataset can span multiple files.
@@ -27,7 +56,7 @@ globalverbose=False
# In[3]:
def get_size_of_var_timeseries(ds, varname):
def __get_size_of_var_timeseries__(self, ds, varname):
return ds[varname].nbytes
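# A minimal illustration of what this method relies on (the dataset and variable below are
# hypothetical, not part of this module): xarray reports the in-memory size of an array via `.nbytes`.
import numpy
example_ds = xarray.Dataset({"tas": (("time", "cell"), numpy.zeros((10, 4)))})
print(example_ds["tas"].nbytes)  # 10 * 4 float64 values * 8 bytes each = 320 bytes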
@@ -48,8 +77,8 @@ def get_size_of_var_timeseries(ds, varname):
# In[4]:
def calc_chunk_bytes(ds, varname, chunkdim, target_mb):
n_bytes = get_size_of_var_timeseries(ds, varname)
def __calc_chunk_bytes__(self, ds, varname, chunkdim, target_mb):
n_bytes = self.__get_size_of_var_timeseries__(ds, varname)
return math.ceil(len(ds[chunkdim]) / math.ceil(n_bytes / (target_mb* (2 ** 20))))
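# A worked instance of the formula above (all numbers are illustrative): a 12 GiB variable
# with 3650 time steps and target_mb=1000 is cut into ceil(12288/1000) = 13 pieces,
# so each chunk spans ceil(3650/13) = 281 time steps.
example_bytes = 12 * 2 ** 30
example_pieces = math.ceil(example_bytes / (1000 * 2 ** 20))  # 13 pieces of roughly 1000 MB
print(math.ceil(3650 / example_pieces))                       # 281 time steps per chunk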
@@ -69,11 +98,11 @@ def calc_chunk_bytes(ds, varname, chunkdim, target_mb):
# In[5]:
def rechunk(ds, varname, chunkdim, target_mb):
chunk_length = calc_chunk_bytes(ds, varname, chunkdim, target_mb)
def __rechunk__(self, ds, varname, chunkdim, target_mb):
chunk_length = self.__calc_chunk_bytes__(ds, varname, chunkdim, target_mb)
chunk_rule = {chunkdim: chunk_length}
if globalverbose:
if self.verbose:
print("Chunking into chunks of {0} {1} steps".format(chunk_length, chunkdim))
chunked_ds = ds.chunk(chunk_rule)
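# Illustrative effect of the ds.chunk() call above (hypothetical in-memory dataset, requires dask):
# the chunk rule splits the time dimension into blocks of the computed length.
import numpy
demo = xarray.Dataset({"tas": (("time",), numpy.arange(1000.0))})
print(demo.chunk({"time": 281}).chunks["time"])  # (281, 281, 281, 157)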
@@ -95,7 +124,7 @@ def rechunk(ds, varname, chunkdim, target_mb):
# In[6]:
def drop_vars_without_chunkdim(ds, chunkdim):
def __drop_vars_without_chunkdim__(self,ds, chunkdim):
droplist=[var for var in ds.variables if chunkdim not in ds[var].coords]
if "time_bnds" in ds.variables:
droplist+=["time_bnds"]
@@ -117,7 +146,7 @@ def drop_vars_without_chunkdim(ds, chunkdim):
# In[7]:
def sel_range_for_chunk_by_time(ds, starttimeindex, endtimeindex):
def __sel_range_for_chunk_by_time__(self, ds, starttimeindex, endtimeindex):
return ds.isel(time=slice(starttimeindex,endtimeindex))
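# Illustrative positional selection (hypothetical dataset): isel with a slice picks the
# half-open index range [starttimeindex, endtimeindex) that forms one chunk.
import numpy
demo = xarray.Dataset({"tas": (("time",), numpy.arange(1000.0))})
print(demo.isel(time=slice(0, 281)).sizes["time"])  # 281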
@@ -139,9 +168,9 @@ def sel_range_for_chunk_by_time(ds, starttimeindex, endtimeindex):
# In[8]:
def sel_range_for_chunk(ds, startindex, endindex, chunkdim):
def __sel_range_for_chunk__(self, ds, startindex, endindex, chunkdim):
if chunkdim == "time" :
return sel_range_for_chunk_by_time(ds, startindex, endindex)
return self.__sel_range_for_chunk_by_time__(ds, startindex, endindex)
else:
raise ValueError('Other chunk dimensions than "time" are not supported yet.')
@@ -163,7 +192,7 @@ def sel_range_for_chunk(ds, startindex, endindex, chunkdim):
# In[9]:
def write_chunk_by_region(towrite, store, chunkdim, startindex, endindex):
def __write_chunk_by_region__(self, towrite, store, chunkdim, startindex, endindex):
try:
towrite.to_zarr(store=store,region={chunkdim: slice(startindex, endindex)})
towrite.close()
@@ -171,14 +200,7 @@ def write_chunk_by_region(towrite, store, chunkdim, startindex, endindex):
except:
return chunk_no
# `write_by_region` writes chunk-wise data into an initialized dataset in swift.
# 'Initialized' means that a first `to_zarr` call has been executed which writes all coordinates and metadata for the dataset into the store. The subsequent `to_zarr` calls performed by `write_by_region` use this information so that they know how the chunks have to be named. If a region has already been written, it will be overwritten by `write_by_region`.
# In[10]:
def open_or_initialize_swift_dset(store, ds, chunkdim):
def __open_or_initialize_swift_dset__(self, store, ds, chunkdim):
try:
return xarray.open_zarr(store, consolidated=True, decode_cf=True, use_cftime=True)
except:
@@ -187,129 +209,89 @@ def open_or_initialize_swift_dset(store, ds, chunkdim):
except:
print("Could not initialize dataset.")
def open_store(self, os_container, os_name, os_url, os_token):
auth = {
"preauthurl": os_url,
"preauthtoken": os_token,
}
store = SwiftStore(container=os_container, prefix=os_name, storage_options=auth)
return store
# In[11]:
# `write_by_region` writes chunk-wise data into an initialized dataset in swift.
# 'Initialized' means that a first `to_zarr` call has been executed which writes all coordinates and metadata for the dataset into the store. The subsequent `to_zarr` calls performed by `write_by_region` use this information so that they know how the chunks have to be named. If a region has already been written, it will be overwritten by `write_by_region`.
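# A minimal sketch of this initialize-then-fill pattern with xarray's zarr backend
# (the local path, variable name and chunk length are hypothetical stand-ins for the Swift store):
import numpy
demo = xarray.Dataset({"tas": (("time",), numpy.arange(1000.0))}).chunk({"time": 281})
demo.to_zarr("demo.zarr", compute=False, consolidated=True)  # write metadata and coordinates only
demo.isel(time=slice(0, 281)).to_zarr("demo.zarr", mode="a", region={"time": slice(0, 281)})  # fill one region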
def write_by_region(chunked_ds, store, startchunk, validity_check, chunkdim, varname):
already = open_or_initialize_swift_dset(store, chunked_ds, chunkdim)
def write_by_region(self, chunked_ds, store, startchunk, validity_check, chunkdim, varname):
already = self.__open_or_initialize_swift_dset__(store, chunked_ds, chunkdim)
try:
already=drop_vars_without_chunkdim(already, chunkdim)
already=self.__drop_vars_without_chunkdim__(already, chunkdim)
except:
print("Could not drop vars without chunkdim.",
" This is not an issue if you initialized the dataset in the cloud.")
chunked_ds=drop_vars_without_chunkdim(chunked_ds, chunkdim) #
chunked_ds=self.__drop_vars_without_chunkdim__(chunked_ds, chunkdim) #
all_chunks=chunked_ds.chunks[chunkdim]
chunksum=0
for chunk_no in tqdm(range(0,len(all_chunks))):
chunksum+=all_chunks[chunk_no]
if chunk_no < startchunk :
continue
towrite = sel_range_for_chunk(chunked_ds, chunksum-all_chunks[chunk_no],chunksum, chunkdim) #.load()
incloud = sel_range_for_chunk(already, chunksum-all_chunks[chunk_no],chunksum, chunkdim) #.load()
towrite = self.__sel_range_for_chunk__(chunked_ds, chunksum-all_chunks[chunk_no],chunksum, chunkdim) #.load()
incloud = self.__sel_range_for_chunk__(already, chunksum-all_chunks[chunk_no],chunksum, chunkdim) #.load()
#if towrite.broadcast_equals(incloud):
#if towrite[varname].size == incloud[varname].size :
if towrite[varname].identical(incloud[varname]):
if globalverbose:
if self.verbose:
print("datasets for chunk {0} are equal".format(chunk_no+1))
continue
elif validity_check :
print("Datasets at chunk {0} of {1} are different!".format(chunk_no, len(all_chunks)))
return chunk_no
incloud.close()
write_status = write_chunk_by_region(towrite, store, chunkdim, chunksum-all_chunks[chunk_no], chunksum)
write_status = self.__write_chunk_by_region__(towrite, store, chunkdim, chunksum-all_chunks[chunk_no], chunksum)
if write_status > 0 :
return write_status
return 0
def write_directly(dset=None, store=None):
dset.to_zarr(store=store, mode='w', consolidated=True)
# In[12]:
def write_directly(ds,store):
ds.to_zarr(store=store, mode='w', consolidated=True)
# In[13]:
def write_with_validation_and_retries(ds, varname, chunkdim, target_mb, store, startchunk, validity_check, maxretries):
chunked_ds = rechunk(ds, varname, chunkdim, target_mb)
def write_with_validation_and_retries(self, ds, varname, store, chunkdim, target_mb, startchunk, validity_check, maxretries):
chunked_ds = self.__rechunk__(ds, varname, chunkdim, target_mb)
#
retries = 0
success = -1
if startchunk != 0:
success = startchunk
while ( success != 0 and retries < maxretries ) :
success = write_by_region(chunked_ds, store, success, validity_check, chunkdim, varname)
success = self.write_by_region(chunked_ds, store, success, validity_check, chunkdim, varname)
retries += 1
if globalverbose and success != 0:
if self.verbose and success != 0:
print("Write by region failed. Now retry number {}.".format(retries))
if success != 0 :
raise RuntimeError("Max retries {0} all failed at chunk no {1}".format(maxretries, success))
if globalverbose:
if self.verbose:
print("Start validation of write process")
if write_by_region(chunked_ds, store, startchunk, True, chunkdim, varname) != 0:
if self.write_by_region(chunked_ds, store, startchunk, True, chunkdim, varname) != 0:
raise RuntimeError("Validation failed.")
# In[14]:
def open_store(outid=None):
auth = {
"preauthurl": OS_STORAGE_URL,
"preauthtoken": OS_AUTH_TOKEN,
}
store = SwiftStore(container='cordex-zarr', prefix=outid, storage_options=auth)
return store
# In[15]:
def open_mf_dataset(mf):
#if type(mf) != list and type(mf) != str :
# raise ValueError("Dataset '{0}' must either be a string or a list of strings")
return xarray.open_mfdataset(mf,
decode_cf=True,
use_cftime=True,
concat_dim="time",
data_vars='minimal',
coords='minimal',
compat='override')
# In[16]:
def write_to_swift(mfs=None,
outid=None,
varname=None,
def write_to_swift(self,
chunkdim="time",
target_mb=1000,
startchunk=0,
validity_check=False,
chunkdim="time",
maxretries=3,
verbose=False):
global globalverbose
ds = open_mf_dataset(mfs)
if verbose:
globalverbose=True
if not varname:
varname=ds.variables[0]
if globalverbose :
print("We use variable {0} in case we need to rechunk.".format(varname))
store = open_store(outid)
#
timevars=[var for var in ds.variables if "time" in ds[var].coords]
maxretries=3):
timevars=[var for var in self.mf_dset.variables if "time" in self.mf_dset[var].coords]
if len(timevars) > 0:
write_with_validation_and_retries(ds, varname, chunkdim, target_mb, store, startchunk, validity_check, maxretries)
self.write_with_validation_and_retries(self.mf_dset,
self.varname,
self.store,
chunkdim,
target_mb,
startchunk,
validity_check,
maxretries)
else:
write_directly(ds, store)
ds.close()
self.write_directly(self.mf_dset, self.store)
self.mf_dset.close()
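# Hypothetical usage of the class above (the file pattern, token, container and prefix are
# placeholders; OS_STORAGE_URL and OS_AUTH_TOKEN would come from swiftenv):
# tzis = Tzis("tas_day_*.nc", varname="tas", verbose=True,
#             os_token=OS_AUTH_TOKEN, os_url=OS_STORAGE_URL,
#             os_container="cordex-zarr", os_name="tas-example")
# tzis.write_to_swift(chunkdim="time", target_mb=1000)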
# In[17]: