Commit 13328159 authored by katharina.berger

added all-in-one

parent 6f593817
__init__.py

@@ -4,3 +4,4 @@ from solr_utils import query_solr, update_solr, query_esgf, get_list_of_shards,
from mapfile_utils import run_esgmapfile
from publish_utils import run_esgunpublish
from filesystem_utils import change_permissions
from consistency_checker_utils import check_dataset_consistent, get_datasets_to_process
consistency_checker_utils.py

import os
import re
import shutil

from . import query_solr, get_list_of_shards, query_for_facet
from ESGConfigParser import SectionParser, interpolate

def check_dataset_consistent(log, config, input_mapfile, root, output_directory, datasets_incomplete_file=None, delete=False):
    filesystem_checksums = set()
    search_api_checksums = set()
    fields = ['checksum']
    instance_id = input_mapfile.split('.map')[0].split('/')[-1]

    # Collect the checksums recorded in the mapfile on the filesystem.
    with open(os.path.join(root, input_mapfile)) as mapfile:
        for line in mapfile:
            filesystem_checksums.add(line.split('checksum=')[-1].split('|')[0].strip())
    number_of_files_fs = len(filesystem_checksums)

    try:
        # Locate the index node and data node that hold the original record.
        esgf_index_node_url = 'https://esgf-node.llnl.gov/esg-search/search/'
        index_nodes, master_solr_dict = get_list_of_shards(log, config, esgf_index_node_url)
        index_node, data_node = query_for_facet(log, index_nodes, master_solr_dict, instance_id,
                                                ['index_node', 'data_node'])
        # Prefer the local DKRZ Solr slave port if one mirrors this index node.
        if index_node in master_solr_dict:
            remote_slave_solr_url = 'http://esgf-data.dkrz.de:%s/solr' % master_solr_dict[index_node]
        else:
            remote_slave_solr_url = 'https://%s/solr' % index_node

        log.info("Querying Solr=%s for file checksums for dataset %s " % (remote_slave_solr_url, instance_id))
        dataset_id = '%s|%s' % (instance_id, data_node)
        query1 = 'dataset_id:%s' % dataset_id
        docs1 = query_solr(log, query1, fields, solr_url=remote_slave_solr_url, solr_core='files')
        number_of_files_sa = len(docs1)
        ffp_source = os.path.join(root, input_mapfile)

        if number_of_files_fs == number_of_files_sa:
            for doc1 in docs1:
                search_api_checksums.add(doc1['checksum'][0])
            if filesystem_checksums != search_api_checksums:
                log.warning('Checksum mismatch for: %s' % instance_id)
                if datasets_incomplete_file:
                    datasets_incomplete_file.write('Checksum mismatch: %s \n' % instance_id)
            else:
                # Dataset is consistent: copy the mapfile into the output tree.
                dirs = '/'.join(instance_id.split('.')[:-3])
                fin_outpath = os.path.join(output_directory, dirs)
                if not os.path.exists(fin_outpath):
                    os.makedirs(fin_outpath, 0755)
                ffp_dest = os.path.join(fin_outpath, os.path.basename(input_mapfile))
                shutil.copy(ffp_source, ffp_dest)
        else:
            log.warning('Number of files mismatch for: %s' % instance_id)
            if datasets_incomplete_file:
                datasets_incomplete_file.write('Incomplete: %s \n' % instance_id)
            if delete:
                os.remove(ffp_source)
    except Exception:
        log.warning('Not found on index: %s' % instance_id)
        if datasets_incomplete_file:
            datasets_incomplete_file.write('Not found: %s \n' % instance_id)
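A minimal driver sketch for `check_dataset_consistent`, assuming a standard `logging` logger; the paths, the mapfile name, and the `config` placeholder (whatever `get_list_of_shards` expects at your site) are all hypothetical:

```python
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger('consistency_checker')

config = None  # placeholder: site-specific object consumed by get_list_of_shards
root = '/esgf/mapfiles'                      # hypothetical mapfile directory
output_directory = '/esgf/mapfiles_checked'  # verified mapfiles are copied here
input_mapfile = 'CMIP6.CMIP.MPI-M.MPI-ESM1-2-HR.historical.r1i1p1f1.Amon.tas.gn.v20190710.map'

with open('datasets_incomplete.txt', 'a') as incomplete:
    check_dataset_consistent(log, config, input_mapfile, root, output_directory,
                             datasets_incomplete_file=incomplete, delete=False)
```

With delete=True, the mapfile of a dataset whose file count does not match the index is removed instead of kept for a later retry.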
def get_datasets_to_process(log, root, files, project_config, local_master_solr_url, exclude_institutes, output_file=None):
    drspat = project_config.translate('directory_format')
    facets = project_config.get_facets('dataset_id')
    dataset_list = set()
    kv = {}

    # One file per version directory is enough to derive the DRS facets.
    f = files[0]
    if f == '.swp':
        return 0
    ffp = os.path.join(root, f)

    try:
        # Extract the facet values from the file path using the project's DRS pattern.
        res = re.search(drspat, ffp)
        for key in facets:
            kv[key] = res.group(key)
        kv['version'] = os.path.basename(os.path.dirname(ffp))

        # Exclude datasets published as original data on the DKRZ data node.
        if kv['institution_id'] not in exclude_institutes:
            master_id = interpolate(project_config.get('dataset_id', raw=True), kv)
            instance_id = '%s.%s' % (master_id, kv['version'])

            # Query local Solr for an already published replica of the data.
            log.info(
                "\tChecking local Solr=%s for replica of dataset=%s" % (local_master_solr_url, instance_id))
            replica_query = 'instance_id:%s&replica:true&retracted:false' % instance_id
            num_found = query_solr(log, replica_query, None, solr_url=local_master_solr_url,
                                   solr_core='datasets')
            if num_found == 0:
                if output_file:
                    output_file.write('%s,%s\n' % (root, instance_id))
                else:
                    dataset_list.add('%s,%s\n' % (root, instance_id))
            else:
                log.info("\tReplica already published for dataset=%s" % instance_id)
    except Exception:
        log.info("\tCheck failed for=%s" % ffp)
    return dataset_list
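A sketch of how `get_datasets_to_process` might be driven from a directory walk; the DRS root, the Solr URL, the institute list, and the `SectionParser` setup are hypothetical, site-specific values:

```python
import os
import logging
from ESGConfigParser import SectionParser

logging.basicConfig(level=logging.INFO)
log = logging.getLogger('consistency_checker')

project_config = SectionParser('project:cmip6', directory='/esg/config/esgcet')  # hypothetical
local_master_solr_url = 'http://localhost:8984/solr'  # hypothetical
exclude_institutes = ['MPI-M']                        # hypothetical

datasets = set()
for root, dirs, files in os.walk('/data/CMIP6'):      # hypothetical DRS root
    if files:
        found = get_datasets_to_process(log, root, files, project_config,
                                        local_master_solr_url, exclude_institutes)
        if found:  # the function returns 0 for editor swap files
            datasets |= found
```

Note the mixed return type: 0 for a swap file, otherwise a set of "root,instance_id" lines (empty when an output_file was passed and written instead).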
mapfile_utils.py

@@ -14,6 +14,7 @@ def run_esgmapfile(project, path, esgmapfile_logfile, outdir='.', config='/etc/e
        esgmapfile_logfile.write('============ STDOUT ============')
        esgmapfile_logfile.write('\n%s\n' % str(stdout))
    except:
        # Fall back to printing if the log file cannot be written.
        print '============ STDOUT ============'
        print stdout
    if stderr:
@@ -21,4 +22,5 @@ def run_esgmapfile(project, path, esgmapfile_logfile, outdir='.', config='/etc/e
        esgmapfile_logfile.write('============ STDERR ============')
        esgmapfile_logfile.write('\n%s\n' % str(stderr))
    except:
        # Fall back to printing if the log file cannot be written.
        print '============ STDERR ============'
        print stderr
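Only the stdout/stderr fallback of `run_esgmapfile` is visible in these hunks; a hypothetical invocation, with the log file opened by the caller, might look like:

```python
# All values are placeholders; the caller owns the log file handle.
with open('/tmp/esgmapfile.log', 'a') as esgmapfile_logfile:
    run_esgmapfile('cmip6', '/data/CMIP6/CMIP/MPI-M', esgmapfile_logfile,
                   outdir='/esgf/mapfiles')
```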
solr_utils.py

@@ -15,12 +15,24 @@ ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
def query_for_facet(log, index_nodes, master_solr_dict, instance_id, fields):
    return_list = []
    remote_slave_solr_urls_external = set()

    # First pass: query only the index nodes mirrored locally at DKRZ;
    # collect the remaining nodes for a second, external pass.
    for index_node in index_nodes:
        try:
            if index_node in master_solr_dict:
                remote_slave_solr_url = 'http://esgf-data.dkrz.de:%s/solr' % master_solr_dict[index_node]
                query1 = 'replica:false&retracted:false&instance_id:%s' % instance_id
                docs1 = query_solr(log, query1, fields, solr_url=remote_slave_solr_url, solr_core='datasets')
                for field in fields:
                    return_list.append(docs1[0][field])
                return return_list
            else:
                remote_slave_solr_url = 'https://%s/solr' % index_node
                remote_slave_solr_urls_external.add(remote_slave_solr_url)
        except Exception:
            pass

    # Second pass: fall back to querying the external index nodes directly.
    for remote_slave_solr_url in remote_slave_solr_urls_external:
        try:
            query1 = 'replica:false&retracted:false&instance_id:%s' % instance_id
            docs1 = query_solr(log, query1, fields, solr_url=remote_slave_solr_url, solr_core='datasets')
            for field in fields:
                return_list.append(docs1[0][field])
            return return_list
        except Exception:
            pass
    return return_list
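The two passes give the local DKRZ Solr mirrors priority and only fall back to external index nodes, returning on the first hit. A usage sketch, pairing it with `get_list_of_shards` as `check_dataset_consistent` does above (the `config` placeholder and the instance_id are hypothetical):

```python
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger('solr_utils')

config = None  # placeholder: site-specific object consumed by get_list_of_shards
instance_id = 'CMIP6.CMIP.MPI-M.MPI-ESM1-2-HR.historical.r1i1p1f1.Amon.tas.gn.v20190710'

index_nodes, master_solr_dict = get_list_of_shards(
    log, config, 'https://esgf-node.llnl.gov/esg-search/search/')
# NOTE: unpacking raises ValueError if the dataset is on no index node,
# because query_for_facet then returns an empty list.
index_node, data_node = query_for_facet(
    log, index_nodes, master_solr_dict, instance_id, ['index_node', 'data_node'])
log.info('dataset %s indexed on %s, data hosted on %s' % (instance_id, index_node, data_node))
```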