Commit 63720ca1 authored by Merret Buurman

Improving parsing of file handles.

parent 76157fe6
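
For orientation, the URL parsing is now split into three small helpers: extract the requested attributes from each <location> element, rename 'href' to 'url', and drop duplicate URLs. A minimal sketch of the flow as driven by get_urls_original_from_record(), using a made-up field string (the exact XML layout of the URL_ORIGINAL_DATA field is an assumption here; the code only relies on <location> child elements):

    # Hypothetical field value; attribute names 'href' and 'host' match the calls below.
    field_string = ('<locations>'
                    '<location href="http://example.org/a.nc" host="example.org"/>'
                    '<location href="http://example.org/b.nc" host="example.org"/>'
                    '</locations>')

    parsed = _extract_url_info_from_field(field_string, 'href', 'host')
    # [{'href': 'http://example.org/a.nc', 'host': 'example.org'},
    #  {'href': 'http://example.org/b.nc', 'host': 'example.org'}]
    parsed = _rename_href_to_url(parsed)
    parsed = _remove_duplicate_urls(parsed)
    # [{'host': 'example.org', 'url': 'http://example.org/a.nc'},
    #  {'host': 'example.org', 'url': 'http://example.org/b.nc'}]
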
@@ -47,11 +47,43 @@ def get_checksum_method_from_record(json_record):
 def get_urls_original_from_record(json_record):
     field_string = get_value_from_record(json_record, 'URL_ORIGINAL_DATA')
-    return _extract_url_info_from_field(field_string)
+    parsed = _extract_url_info_from_field(field_string, 'href', 'host')
+    if parsed is not None:
+        parsed = _rename_href_to_url(parsed)
+        parsed = _remove_duplicate_urls(parsed)
+    return parsed
 
 def get_urls_replicas_from_record(json_record):
     field_string = get_value_from_record(json_record, 'URL_REPLICAS')
-    return _extract_url_info_from_field(field_string)
+    parsed = _extract_url_info_from_field(field_string, 'href', 'host')
+    if parsed is not None:
+        parsed = _rename_href_to_url(parsed)
+        parsed = _remove_duplicate_urls(parsed)
+    return parsed
+
+def _rename_href_to_url(list_of_dicts):
+    ''' Helper for the list of dicts that is the result of parsing
+    a locations xml snippet.'''
+    for item in list_of_dicts:
+        item['url'] = item['href']
+        del item['href']
+    return list_of_dicts
+
+def _remove_duplicate_urls(list_of_dicts):
+    ''' Helper for the list of dicts that is the result of parsing
+    a locations xml snippet.
+    This removes any item whose url was already seen,
+    no matter whether those items differ in other aspects.
+    '''
+    new_list_of_dicts = []
+    temp_list_of_urls = []
+    for item in list_of_dicts:
+        url = item['url']
+        if url not in temp_list_of_urls:
+            temp_list_of_urls.append(url)
+            new_list_of_dicts.append(item)
+    return new_list_of_dicts
 
 def _get_aggregation_field_from_record(json_record):
     field_string = get_value_from_record(json_record, 'IS_PART_OF')
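
To make the two new helpers above concrete: _rename_href_to_url() modifies the dicts in place, and _remove_duplicate_urls() keeps only the first item for a given url, even when later items differ in other keys such as host. A small sketch with invented values:

    items = [{'href': 'http://example.org/a.nc', 'host': 'example.org'},
             {'href': 'http://example.org/a.nc', 'host': 'mirror.example.org'}]
    items = _rename_href_to_url(items)
    # [{'host': 'example.org', 'url': 'http://example.org/a.nc'},
    #  {'host': 'mirror.example.org', 'url': 'http://example.org/a.nc'}]
    items = _remove_duplicate_urls(items)
    # [{'host': 'example.org', 'url': 'http://example.org/a.nc'}]
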
@@ -67,26 +99,21 @@ def get_list_of_aggregation_handles_from_record(json_record):
     field_string = _get_aggregation_field_from_record(json_record)
     return field_string.split(';')
 
-def _extract_url_info_from_field(field_string):
+def _extract_url_info_from_field(field_string, *attributes):
     if field_string is None:
         return None
     else:
         field_xml = ET.fromstring(field_string)
-        list_of_originals = []
-        temp_url_list = []
+        list_of_items = []
         locations = field_xml.findall('location')
         for location in locations:
-            url = location.get('href')
-            if url is not None and url not in temp_url_list:
-                temp_url_list.append(url)
-                host = location.get('host')
-                if host is None:
-                    host = ''
-                list_of_originals.append(dict(
-                    host=host,
-                    url=url
-                ))
-        return list_of_originals
+            temp = {}
+            for attr in attributes:
+                val = location.get(attr)
+                if val is not None:
+                    temp[attr] = val
+            list_of_items.append(temp)
+        return list_of_items
 
 def get_list_of_aggregation_records_from_record(list_of_aggregation_handles):
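
The generalised _extract_url_info_from_field() now takes the attribute names to read from each <location> element; attributes that are missing are simply left out of the resulting dict (the old version filled a missing host with an empty string instead). A hypothetical snippet:

    field_string = ('<locations>'
                    '<location href="http://example.org/b.nc"/>'
                    '<location href="http://example.org/c.nc" host="example.org"/>'
                    '</locations>')
    _extract_url_info_from_field(field_string, 'href', 'host')
    # [{'href': 'http://example.org/b.nc'},
    #  {'href': 'http://example.org/c.nc', 'host': 'example.org'}]
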
@@ -119,7 +146,7 @@ def _extract_info_on_one_aggregation_record(handle):
     info_to_return = dict(
         handle = handle,
         level = aggregation_level,
-        drs = drs_id,
+        title = drs_id,
         version = vers_num,
         replaced = newer_version
     )
@@ -31,8 +31,8 @@
 calls to retrieve handle records are needed. Maybe one day this could be done on user's request only.
 Every dict needs:
 * level (e.g. "dataset") [*]
-* version [*]
-* drs [*]
+* version
+* title [*] (can be drs-name, can be file name... depends on the level)
 * handle [*]
 * replaced (True or False) [*]
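
For reference, one aggregation entry as the template below consumes it might look roughly like this (all values invented; note that version may be absent, which the template change below now handles):

    item = dict(
        handle = 'prefix/suffix',         # handle string, value invented
        level = 'dataset',
        title = 'some-drs-or-file-name',  # drs name or file name, depending on the level
        version = 'some-version',         # may be missing
        replaced = False
    )
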
@@ -121,7 +121,10 @@
 <tr>
 <td id="leftcol">{{item.level}}</td>
 <td>
-{{ item.drs }} (version {{ item.version }})
+{{ item.title }}
+{% if item.version %}
+(version {{ item.version }})
+{% endif %}
 <br/><a href="http://hdl.handle.net/{{ item.handle }}/">hdl:{{ item.handle }}</a>
 {% if item.replaced %}
 <br/><span class="attention">Attention, this {{item.level}} was replaced by a newer version!*</span>