Commit 91406c52 authored by katharina.berger

add mod_time

parent 13328159
......@@ -6,7 +6,7 @@ from . import query_solr, get_list_of_shards, query_for_facet
from ESGConfigParser import SectionParser, interpolate
def check_dataset_consistent(log, config, input_mapfile, root, output_directory, datasets_incomplete_file=None, delete=False):
def check_dataset_consistent(log, config, input_mapfile, root, output_directory, datasets_incomplete_file=None, delete=False, non_local=False):
filesystem_checksums = set()
search_api_checksums = set()
fields = ['checksum']
......@@ -19,6 +19,8 @@ def check_dataset_consistent(log, config, input_mapfile, root, output_directory,
try:
esgf_index_node_url = 'https://esgf-node.llnl.gov/esg-search/search/'
index_nodes, master_solr_dict = get_list_of_shards(log, config, esgf_index_node_url)
if non_local:
index_nodes.remove('esgf-data.dkrz.de')
index_node, data_node = query_for_facet(log, index_nodes, master_solr_dict, instance_id,
['index_node', 'data_node'])
......@@ -40,13 +42,13 @@ def check_dataset_consistent(log, config, input_mapfile, root, output_directory,
log.warning('Checksum mismatch for: %s' % instance_id)
if datasets_incomplete_file:
datasets_incomplete_file.write('Checksum mismatch: %s \n' % instance_id)
else: # copy mapfile
else: # move mapfile
dirs = '/'.join(instance_id.split('.')[:-3])
fin_outpath = os.path.join(output_directory, dirs)
if not os.path.exists(fin_outpath):
os.makedirs(fin_outpath, 0755)
ffp_dest = os.path.join(fin_outpath, os.path.basename(input_mapfile))
shutil.copy(ffp_source, ffp_dest)
shutil.move(ffp_source, ffp_dest)
else:
log.warning('Number of files mismatch for: %s' % instance_id)
if datasets_incomplete_file:
......
......@@ -23,7 +23,7 @@ esgf_index_node_url = 'https://esgf-node.llnl.gov/esg-search/search/'
# by default, the script will check data that changed in the last N days
LAST_NUMBER_OF_DAYS = 7
EXCLUDE_INSTITUTES = ['AWI', 'DKRZ', 'INM', 'MPI-M']
EXCLUDE_INSTITUTES = ['AER', 'AWI', 'DKRZ', 'DWD', 'HAMMOZ-Consortium', 'INM', 'MPI-M', 'RTE-RRTMGP-Consortium', 'UHH']
usage = """
Usage:
......@@ -57,13 +57,16 @@ usage = """
If not set, use the default location '/esg/log/update_replica_log/[date].log'.
-o, --output output_directory:
Location to produce all output.
--mod-date mod_date
Check only data modified after mod_date (in unix epoch time)
"""
def do_all(log, project, config, project_config, scan_directory, outdir):
def do_all(log, project, config, project_config, config_dir, scan_directory, mod_date, outdir):
drspat = project_config.translate('directory_format')
facets = project_config.get_facets('dataset_id')
esgmapfile_logfile = open(os.path.join(outdir, 'esgmapfile.log'), 'w')
approved_outdir = os.path.join(outdir, 'approved')
outdir = os.path.join(outdir, 'not_approved')
......@@ -71,14 +74,24 @@ def do_all(log, project, config, project_config, scan_directory, outdir):
dirs = root.split('/')
if len(dirs) >= 13 and dirs[-1]: # process directory per variant_label
if files:
datasets_to_process = get_datasets_to_process(log, root, files, project_config, local_master_solr_url, EXCLUDE_INSTITUTES)
to_add = False
if not mod_date:
to_add = True
for f in files:
ffp = os.path.join(root, f)
if mod_date:
if os.stat(ffp).st_mtime > mod_date:
to_add = True
if to_add:
datasets_to_process = get_datasets_to_process(log, root, files, project_config, local_master_solr_url, EXCLUDE_INSTITUTES)
for dataset in datasets_to_process:
run_esgmapfile(project, dataset.split(',')[0], 'esgmapfile.log', outdir=outdir)
run_esgmapfile(project, dataset.split(',')[0], esgmapfile_logfile, config=config_dir, outdir=outdir)
for mapfile_root, _, mapfile_files in os.walk(outdir):
for map in mapfile_files:
input_mapfile = os.path.join(mapfile_root, map)
check_dataset_consistent(log, config, input_mapfile, root, approved_outdir, delete=True)
check_dataset_consistent(log, config, input_mapfile, root, approved_outdir, delete=True, non_local=True)
esgmapfile_logfile.close()
def check_replicas(log,
......@@ -453,7 +466,8 @@ def main(argv):
args, lastargs = getopt.getopt(argv, "p:l:c:o:", ['list-retracted-version', 'config=', 'create-new-mapfiles=',
'project=', 'start-date=', 'stop-date=', 'dry-run',
'send-mail=', 'log=', 'get-list-of-new-mapfiles=', 'output=',
'check-consistency=', 'unpublish-retracted', 'all-in-one='])
'check-consistency=', 'unpublish-retracted', 'all-in-one=',
'mod-date='])
except getopt.error:
print sys.exc_value
print usage
......@@ -474,6 +488,7 @@ def main(argv):
output_folder = '/tmp'
do_unpublication_retracted = False
all_in_one = False
mod_date = None
for flag, arg in args:
if flag == '--list-retracted-version':
......@@ -508,6 +523,8 @@ def main(argv):
output_folder = arg
elif flag == '--unpublish-retracted':
do_unpublication_retracted = True
elif flag == '--mod-date':
mod_date = arg
if not project:
print usage
......@@ -539,7 +556,7 @@ def main(argv):
if all_in_one:
project_config = SectionParser(section='project:%s' % project.lower(), directory=config_dir)
do_all(log, project, config, project_config, scan_directory, output_folder)
do_all(log, project, config, project_config, config_dir, scan_directory, start_date, output_folder)
elif list_retracted:
check_replicas(log, config, project, start_datetime=start_datetime, stop_datetime=stop_datetime, dry_run=dry_run, recipients=recipients)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment