Commit 61ce2dd6 authored by katharina.berger

added find_retracted

parent 54b1f1d6
......@@ -3,7 +3,7 @@ import urllib2
import requests
from config import File
from utils import FilePID, DatasetPID
from pkg_utils import FilePID, DatasetPID
from pyesgf.search import SearchConnection
......
......@@ -24,6 +24,7 @@ class Config:
except:
return None
class DBCon:
def __init__(self, db_path):
......
......@@ -2,4 +2,4 @@ from pid_utils import DatasetPID, FilePID
from mail_utils import send_mail
from solr_utils import query_solr, update_solr, query_esgf, get_list_of_shards, query_for_facet
from mapfile_utils import run_esgmapfile
from publish_utils import run_esgunpublish
\ No newline at end of file
from publish_utils import run_esgunpublish
......@@ -11,7 +11,7 @@ import os
from pyesgf.search import SearchConnection
from checks import pid_check_dataset, check_citation
from config import Config, DBCon, Log, Dataset
from utils import send_mail
from pkg_utils import send_mail
usage = """
Usage:
......
......@@ -8,7 +8,7 @@ import sys
import os
import re
from utils import query_solr, update_solr, query_esgf, send_mail, get_list_of_shards, run_esgmapfile, query_for_facet, run_esgunpublish
from pkg_utils import query_solr, update_solr, query_esgf, send_mail, get_list_of_shards, run_esgmapfile, query_for_facet, run_esgunpublish
from config import Log, Config
from ESGConfigParser import SectionParser, interpolate
......
#!/usr/bin/env python
import datetime
import getopt
import shutil
import time
import sys
import os
import re
from pkg_utils import query_solr, update_solr, query_esgf, send_mail, get_list_of_shards, run_esgmapfile, query_for_facet, run_esgunpublish
from config import Log, Config
from ESGConfigParser import SectionParser, interpolate
from pyesgf.search import SearchConnection
# Module-level ESGF Search API connection; get_deleted_and_retracted() opens a
# new search context on it for every dataset it checks.
# NOTE(review): distrib=True presumably enables a distributed search across
# index nodes - confirm against the pyesgf documentation.
conn = SearchConnection('http://esgf-data.dkrz.de/esg-search', distrib=True)
# Datasets whose 'institution_id' facet is one of these values are skipped by
# get_deleted_and_retracted().
EXCLUDE_INSTITUTES = ['AWI', 'DKRZ', 'INM', 'MPI-M']
# Help text printed by main() whenever the command line is invalid.
# The defaults documented here mirror the values main() actually falls back to.
# NOTE(review): --send-mail is parsed by main(), but nothing in this script
# calls send_mail - confirm whether mailing the logfile is still planned.
usage = """
Usage:
    find_retracted [options] path_to_scan

Options:
    -c, --config config_directory:
        Directory containing the configuration file 'replica_check.conf'.
        Use the default '/esg/config/esgcet/', if not specified.

    -p, --project project:
        Check only data for the specified project, this option is mandatory.

    --send-mail email_address[es]
        If logfile is not empty send the file as attachment to the specified email addresses.
        If more than one recipient, the email addresses have to be separated with a ','.

    -l, --log log_directory:
        Directory in which the logfile '[project]_check_[date].log' is created.
        If not set, use the default location '/tmp'.

    -o, --output output_directory:
        Location to produce all output ('deleted_datasets.txt' and 'retracted_datasets.txt').
        If not set, use the default location '/tmp'.
"""
def get_deleted_and_retracted(log,
                              project,
                              project_config,
                              scan_directory,
                              retracted_list,
                              deleted_list,
                              recipients=None):
    """
    Record every dataset found on disk that is retracted in, or missing from, ESGF.

    Walks ``scan_directory``; for each directory that contains files, the path
    of its first file is matched against the project's ``directory_format``
    pattern to extract the ``dataset_id`` facets. The resulting instance id is
    looked up through the module-level search connection ``conn``:

    * first ``retracted`` facet value is ``'true'`` -> id written to ``retracted_list``
    * no (or an empty) ``retracted`` facet value    -> id written to ``deleted_list``

    Datasets whose ``institution_id`` is listed in ``EXCLUDE_INSTITUTES`` are
    skipped. Any failure while checking a directory is logged and the scan
    continues with the next directory.

    :param log: logger object; failures are reported through ``log.info``.
    :param project: project name (not used by this function).
    :param project_config: project section parser providing ``translate``,
        ``get_facets`` and ``get``.
    :param scan_directory: root directory to walk.
    :param retracted_list: writable file-like object, one instance id per line.
    :param deleted_list: writable file-like object, one instance id per line.
    :param recipients: not used by this function.
    """
    drspat = project_config.translate('directory_format')
    facets = project_config.get_facets('dataset_id')

    # get a list of all datasets on filesystem
    for root, _, files in os.walk(scan_directory):
        if not files:
            continue
        # Only the first file is inspected: one path per directory is enough
        # to derive the dataset the directory belongs to.
        first_file = files[0]
        if first_file == '.swp':
            continue
        ffp = os.path.join(root, first_file)
        try:
            res = re.search(drspat, ffp)
            if res is None:
                # Path does not follow the project's directory structure.
                log.info("\tCheck failed for=%s" % ffp)
                log.info("\t\tReason: path does not match the directory format")
                continue

            kv = {}
            for key in facets:
                kv[key] = res.group(key)
            kv['version'] = os.path.basename(os.path.dirname(ffp))
            if kv['institution_id'] in EXCLUDE_INSTITUTES:
                continue

            master_id = interpolate(project_config.get('dataset_id', raw=True), kv)
            instance_id = '%s.%s' % (master_id, kv['version'])

            # check SearchAPI for dataset
            ctx = conn.new_context(replica=False, instance_id=instance_id, facets='retracted')
            # list(...) works for Python 2 and 3; an empty list means the
            # search returned no 'retracted' value at all for this instance id.
            retracted_values = list(ctx.facet_counts['retracted'])
            retracted = retracted_values[0] if retracted_values else None

            if not retracted:
                deleted_list.write('%s\n' % instance_id)
            elif retracted == 'true':
                retracted_list.write('%s\n' % instance_id)
        except Exception as err:
            # Best effort: report the failure (with its cause) and keep scanning.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            log.info("\tCheck failed for=%s" % ffp)
            log.info("\t\tReason: %s" % err)
def main(argv):
    """
    Command line entry point: parse options, set up logging and run the scan.

    Writes 'deleted_datasets.txt' and 'retracted_datasets.txt' into the output
    directory.

    :param argv: command line arguments without the program name.

    Exits with status 2 on a usage error (bad option, missing or invalid
    path_to_scan, missing project) and with status 1 if the configuration
    file does not exist.
    """
    try:
        args, lastargs = getopt.getopt(argv, "p:l:c:o:", ['config=', 'project=', 'send-mail=', 'log=', 'output='])
    except getopt.error as err:
        print(err)
        print(usage)
        sys.exit(2)

    project = None
    recipients = None
    logfile_path = '/tmp'
    config_dir = '/esg/config/esgcet/'
    config_filename = 'replica_check.conf'
    output_folder = '/tmp'

    for flag, arg in args:
        if flag in ['-p', '--project']:
            project = arg
        elif flag in ['-c', '--config']:
            # The argument is used as the directory holding config_filename.
            config_dir = arg
        elif flag == '--send-mail':
            recipients = arg
        elif flag in ['-l', '--log']:
            logfile_path = arg
        elif flag in ['-o', '--output']:
            output_folder = arg

    if not lastargs or not os.path.isdir(lastargs[0]):
        print(usage)
        sys.exit(2)
    path_to_scan = lastargs[0]

    if not project:
        print(usage)
        sys.exit(2)

    configfile = os.path.join(config_dir, config_filename)
    if not os.path.isfile(configfile):
        print("Missing configfile %s" % configfile)
        sys.exit(1)
    config = Config(configfile)

    logfile_name = '%s/%s_check_%s.log' % (logfile_path, project, datetime.datetime.now().strftime('%Y%m%d'))
    # Fall back to 'INFO' when the configuration defines no log level.
    log = Log(logfile_name, config.loglevel or 'INFO')
    print('Logging information will be written to %s' % logfile_name)

    project_config = SectionParser(section='project:%s' % project.lower(), directory=config_dir)

    # Context managers guarantee both output files are flushed and closed even
    # if the scan raises.
    with open(os.path.join(output_folder, 'deleted_datasets.txt'), 'w') as deleted_list:
        with open(os.path.join(output_folder, 'retracted_datasets.txt'), 'w') as retracted_list:
            get_deleted_and_retracted(log,
                                      project,
                                      project_config,
                                      path_to_scan,
                                      retracted_list,
                                      deleted_list,
                                      recipients=recipients)
# Run the command line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main(sys.argv[1:])
......@@ -8,7 +8,7 @@ import os
from checks import pid_check_dataset
from config import Log, Dataset, Config
from utils import send_mail
from pkg_utils import send_mail
usage = """
Usage:
......
......@@ -12,7 +12,7 @@ if not os.path.exists(conf_final_location):
shutil.copyfile('./config.cfg', conf_final_location)
setup(
name='esgf-utils',
name='esgf-pkg_utils',
version=VERSION,
description='ESGF Utils',
author='Katharina Berger',
......@@ -29,7 +29,8 @@ setup(
scripts=[
'scripts/check_publication',
'scripts/recheck_from_log',
'scripts/check_replica'
'scripts/check_replica',
'scripts/find_retracted'
],
zip_safe=False, # Migration repository must be a directory
)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment