Parsers: parsing green and golden access from a larger set
ErnestaP committed Jun 26, 2024
1 parent 82dad35 commit 4f062d0
Showing 5 changed files with 217 additions and 40 deletions.
7 changes: 2 additions & 5 deletions dags/open_access/constants.py
@@ -2,11 +2,8 @@
    r"not+540__a:'CC+BY'+not+540__a:'CC-BY'+" + r"not+540__f:Bronze+not+540__3:preprint"
)
BRONZE_ACCESS = r"540__f:'Bronze'"
-GREEN_ACCESS = (
-    r"not+540__a:'CC+BY'+not+540__a:'CC-BY'+not+540__a:"
-    + r"'arXiv+nonexclusive-distrib'+not+540__f:'Bronze'"
-)
-GOLD_ACCESS = r"540__3:'publication'+and+" + r"(540__a:'CC-BY'+OR++540__a:'CC+BY')"
GREEN_ACCESS = r""
GOLD_ACCESS = r""

CERN_READ_AND_PUBLISH = r"540__f:'CERN-RP"
CERN_INDIVIDUAL_APCS = r"540__f:'CERN-APC'"
11 changes: 4 additions & 7 deletions dags/open_access/open_access.py
@@ -20,22 +20,19 @@ def oa_dag():
    @task(executor_config=kubernetes_executor_config)
    def fetch_data_task(query, **kwargs):
        year = kwargs["params"].get("year")
-        cds_token = os.environ.get("CDS_TOKEN")
-        if not cds_token:
-            logging.warning("cds token is not set!")
        base_query = (
            r"(affiliation:CERN+or+595:'For+annual+report')"
            + rf"and+year:{year}+not+980:ConferencePaper+"
            + r"not+980:BookChapter"
        )
        type_of_query = [*query][0]
-        url = utils.get_url(query=f"{base_query}+{query[type_of_query]}")
-        data = request_again_if_failed(url=url, cds_token=cds_token)
        url = utils.get_url(query=f"{base_query}")
        data = request_again_if_failed(url=url)
        total = get_total_results_count(data.text)
        if type_of_query == "gold":
-            total = utils.get_gold_access_count(total, url)
            total = utils.get_golden_access_count(total, url)
        if type_of_query == "green":
-            total = total - utils.get_gold_access_count(total, url)
            total = utils.get_green_access_count(total, url)
        return {type_of_query: total}

    @task(multiple_outputs=True, executor_config=kubernetes_executor_config)
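A quick illustration of the dispatch above (not part of the commit): fetch_data_task receives a one-entry dict mapping the access type to its query string, and [*query][0] simply pulls out that key. A standalone sketch with a stubbed counting helper, so the names and totals below are made up:

# Illustrative sketch only: mirrors how fetch_data_task picks the access type
# from a single-entry dict; count_by_type stands in for the real
# utils.get_golden_access_count / utils.get_green_access_count.
def fetch_data_sketch(query, year, count_by_type):
    base_query = (
        r"(affiliation:CERN+or+595:'For+annual+report')"
        + rf"and+year:{year}+not+980:ConferencePaper+"
        + r"not+980:BookChapter"
    )
    type_of_query = [*query][0]  # first (and only) key, e.g. "gold" or "green"
    total = count_by_type[type_of_query](base_query)
    return {type_of_query: total}


if __name__ == "__main__":
    stub_counts = {"gold": lambda q: 42, "green": lambda q: 17}
    print(fetch_data_sketch({"gold": ""}, 2023, stub_counts))  # {'gold': 42}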
139 changes: 119 additions & 20 deletions dags/open_access/parsers.py
@@ -1,3 +1,4 @@
import re
import xml.etree.ElementTree as ET
from io import StringIO

@@ -13,25 +14,123 @@ def parse_without_names_spaces(xml):
    return root


-def get_golden_access_records_ids(data):
-    xml = parse_without_names_spaces(data)
-    records = xml.findall(".record")
-    golden_access = []
def is_correct_value(value):
    match value.text.lower():
        case "accepted manuscript":
            return True
        case "preprint":
            return True
        case _:
            return False


def field_has_cc_by(field_value):
    # TODO: does "CC BY-SA 4.0" fall under the condition "contains 'CC-BY' or 'CC BY'"?
    pattern = re.compile(r"CC(\s|-)?BY(\s|-)?4\.0", flags=re.I)
    return bool(pattern.match(field_value))


def parse_subset_856(datafields_856):
    at_least_one_found = False
    for datafield in datafields_856:
        subfield = datafield.find("subfield[@code='y']")
        try:
            is_subfield_y_wanted_value = is_correct_value(subfield)
            if not at_least_one_found:
                at_least_one_found = is_subfield_y_wanted_value
        except AttributeError:
            pass
    return at_least_one_found


def parse_subset_540_preprint(datafields_540):
    at_least_one_found = False
    for datafield in datafields_540:
        subfield_3 = datafield.find("subfield[@code='3']")
        try:
            is_subfield_3_wanted_value = subfield_3.text.lower() == "preprint"
            if not at_least_one_found:
                at_least_one_found = is_subfield_3_wanted_value
        except AttributeError:
            pass
    return at_least_one_found


def parse_subset_540_publication(datafields_540):
    at_least_one_found = False
    for datafield in datafields_540:
        subfield_3 = datafield.find("subfield[@code='3']")
        subfield_a = datafield.find("subfield[@code='a']")
        try:
            is_subfield_wanted_3_value = subfield_3.text.lower() == "publication"
            is_subfield_a_wanted_value = field_has_cc_by(subfield_a.text)
            if not at_least_one_found:
                at_least_one_found = bool(
                    is_subfield_wanted_3_value and is_subfield_a_wanted_value
                )
        except AttributeError:
            pass
    return at_least_one_found


def parse_subset_green_access(records):
    filtered_records = []
    for record in records:
        datafields_856 = record.findall("datafield[@tag='856'][@ind1='4'][@ind2=' ']")
        datafields_540 = record.findall("datafield/[@tag='540']")
        if datafields_856 is None:
            continue
        if datafields_540 is None:
            continue
        is_it_wanted_record_by_856 = parse_subset_856(datafields_856)
        is_it_wanted_record_by_540_preprint = parse_subset_540_preprint(datafields_540)
        is_it_wanted_record_by_540_publication = not parse_subset_540_publication(
            datafields_540
        )

        if (
            is_it_wanted_record_by_856
            or is_it_wanted_record_by_540_preprint
            or is_it_wanted_record_by_540_publication
        ):
            filtered_records.append(record)

    return filtered_records


def parse_subset_golden_access(records):
    filtered_records = []
    for record in records:
-        datafields = record.findall("datafield/[@tag='540']")
-        if datafields is None:
        datafields_540 = record.findall("datafield/[@tag='540']")
        if datafields_540 is None:
            continue
-        for datafield in datafields:
-            record_type = datafield.find("subfield/[@code='3']")
-            license = datafield.find("subfield/[@code='a']")
-            if record_type is not None and license is not None:
-                if (
-                    "CC" in license.text
-                    and "BY" in license.text
-                    and record_type.text == "publication"
-                ):
-                    record_id = record.find("controlfield/[@tag='001']")
-                    if record_id is not None:
-                        doi = record_id.text
-                        golden_access.append(doi)
-    return golden_access
        is_it_wanted_record_by_540_publication = parse_subset_540_publication(
            datafields_540
        )

        if is_it_wanted_record_by_540_publication:
            filtered_records.append(record)
    return filtered_records


def get_records_ids(data, record_filter):
    xml = parse_without_names_spaces(data)
    records = xml.findall(".record")
    filtered_records = record_filter(records)
    ids = []
    for record in filtered_records:
        record_id = record.find("controlfield/[@tag='001']")
        if record_id is not None:
            ids.append(record_id.text)
    return ids


def get_golden_access_records_ids(data):
    return get_records_ids(data, parse_subset_golden_access)


def get_green_access_records_ids(data):
    return get_records_ids(data, parse_subset_green_access)
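For reference, a minimal hand-written MARCXML document (not taken from search.xml) showing the record shape these filters expect: controlfield 001 carries the record id, datafield 540 the licence, and datafield 856 4_ subfield y the file role. With these two records, only 1111111 should satisfy the golden filter (a 540 publication field with a CC BY 4.0 licence) and only 2222222 the green one (an 856 field flagged as an accepted manuscript):

from open_access.parsers import (get_golden_access_records_ids,
                                 get_green_access_records_ids)

SAMPLE = """
<collection>
  <record>
    <controlfield tag="001">1111111</controlfield>
    <datafield tag="540" ind1=" " ind2=" ">
      <subfield code="3">publication</subfield>
      <subfield code="a">CC BY 4.0</subfield>
    </datafield>
  </record>
  <record>
    <controlfield tag="001">2222222</controlfield>
    <datafield tag="856" ind1="4" ind2=" ">
      <subfield code="y">Accepted manuscript</subfield>
    </datafield>
  </record>
</collection>
"""

print(get_golden_access_records_ids(SAMPLE))  # expected: ['1111111']
print(get_green_access_records_ids(SAMPLE))   # expected: ['2222222']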
17 changes: 12 additions & 5 deletions dags/open_access/utils.py
@@ -2,23 +2,30 @@
import math

from common.utils import request_again_if_failed
-from open_access.parsers import get_golden_access_records_ids
from open_access.parsers import (get_golden_access_records_ids,
                                 get_green_access_records_ids)


-def get_gold_access_count(total, url):
def get_count(total, url, record_extractor):
    iterations = math.ceil(total / 100.0)
    records_ids_count = 0
    for i in range(0, iterations):
        jrec = (i * 100) + 1
        full_url = f"{url}&jrec={jrec}"
        response = request_again_if_failed(full_url)
-        records_ids_count = records_ids_count + len(
-            get_golden_access_records_ids(response.text)
-        )
        records_ids_count = records_ids_count + len(record_extractor(response.text))
    logging.info(f"In total, {records_ids_count} records were found")
    return records_ids_count


def get_golden_access_count(total, url):
    return get_count(total, url, get_golden_access_records_ids)


def get_green_access_count(total, url):
    return get_count(total, url, get_green_access_records_ids)


def get_url(query, current_collection="Published+Articles"):
    url = (
        rf"https://cds.cern.ch/search?ln=en&cc={current_collection}&p={query}"
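The paging in get_count leans on CDS returning at most 100 records per request: it issues ceil(total / 100) requests and steps the jrec parameter (the 1-based offset of the first record on a page) through 1, 101, 201, and so on. A small standalone check of that arithmetic, with made-up totals:

import math


def page_offsets(total, page_size=100):
    # Reproduces the jrec values get_count would request for a given total.
    return [i * page_size + 1 for i in range(math.ceil(total / page_size))]


assert page_offsets(250) == [1, 101, 201]  # 250 results -> 3 requests
assert page_offsets(100) == [1]            # exactly one full page
assert page_offsets(0) == []               # nothing to fetch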
83 changes: 80 additions & 3 deletions tests/open_access/test_parser.py
@@ -1,6 +1,10 @@
-from open_access.parsers import get_golden_access_records_ids
from open_access.parsers import (get_golden_access_records_ids,
                                 get_green_access_records_ids,
                                 parse_subset_540_preprint,
                                 parse_subset_540_publication, parse_subset_856,
                                 parse_without_names_spaces)

-expected = [
expected_golden = [
"2894668",
"2891488",
"2888511",
Expand All @@ -11,6 +15,28 @@
"2882429",
"2882335",
"2882328",
"2882324",
"2882322",
"2882311",
"2882298",
]

expected_green = [
    "2894668",
    "2891489",
    "2891488",
    "2891487",
    "2888511",
    "2888151",
    "2886038",
    "2884472",
    "2884471",
    "2884470",
    "2884469",
    "2883672",
    "2882429",
    "2882335",
    "2882328",
    "2882327",
    "2882324",
    "2882322",
Expand All @@ -22,4 +48,55 @@
def test_get_golden_access_records_dois(shared_datadir):
    with open(shared_datadir / "search.xml") as file:
        records_ids = get_golden_access_records_ids(file.read())
-        assert records_ids == expected
        assert records_ids == expected_golden


def test_parse_subset_856(shared_datadir):
    with open(shared_datadir / "search.xml") as file:
        filtered_records_count = 0
        parsed_records = parse_without_names_spaces(file.read())
        records = parsed_records.findall(".record")
        for record in records:
            datafields_856 = record.findall(
                "datafield[@tag='856'][@ind1='4'][@ind2=' ']"
            )
            is_it_wanted_record_by_856 = parse_subset_856(datafields_856)
            if is_it_wanted_record_by_856:
                filtered_records_count = filtered_records_count + 1
        assert filtered_records_count == 0


def test_parse_subset_540_preprint(shared_datadir):
    with open(shared_datadir / "search.xml") as file:
        filtered_records_count = 0
        parsed_records = parse_without_names_spaces(file.read())
        records = parsed_records.findall(".record")
        for record in records:
            datafields_540 = record.findall(
                "datafield[@tag='540'][@ind1=' '][@ind2=' ']"
            )
            is_it_wanted_record_by_540 = parse_subset_540_preprint(datafields_540)
            if is_it_wanted_record_by_540:
                filtered_records_count = filtered_records_count + 1
        assert filtered_records_count == 20


def test_parse_subset_540_publications(shared_datadir):
    with open(shared_datadir / "search.xml") as file:
        filtered_records_count = 0
        parsed_records = parse_without_names_spaces(file.read())
        records = parsed_records.findall(".record")
        for record in records:
            datafields_540 = record.findall(
                "datafield[@tag='540'][@ind1=' '][@ind2=' ']"
            )
            is_it_wanted_record_by_540 = parse_subset_540_publication(datafields_540)
            if is_it_wanted_record_by_540:
                filtered_records_count = filtered_records_count + 1
        assert filtered_records_count == 14


def test_get_green_access_records_dois(shared_datadir):
    with open(shared_datadir / "search.xml") as file:
        records_ids = get_green_access_records_ids(file.read())
        assert records_ids == expected_green
