diff --git a/dags/open_access/constants.py b/dags/open_access/constants.py
index 294509c..1b68abe 100644
--- a/dags/open_access/constants.py
+++ b/dags/open_access/constants.py
@@ -2,11 +2,8 @@
     r"not+540__a:'CC+BY'+not+540__a:'CC-BY'+"
     + r"not+540__f:Bronze+not+540__3:preprint"
 )
 BRONZE_ACCESS = r"540__f:'Bronze'"
-GREEN_ACCESS = (
-    r"not+540__a:'CC+BY'+not+540__a:'CC-BY'+not+540__a:"
-    + r"'arXiv+nonexclusive-distrib'+not+540__f:'Bronze'"
-)
-GOLD_ACCESS = r"540__3:'publication'+and+" + r"(540__a:'CC-BY'+OR++540__a:'CC+BY')"
+GREEN_ACCESS = r""
+GOLD_ACCESS = r""
 CERN_READ_AND_PUBLISH = r"540__f:'CERN-RP"
 CERN_INDIVIDUAL_APCS = r"540__f:'CERN-APC'"
diff --git a/dags/open_access/open_access.py b/dags/open_access/open_access.py
index 9d221de..4619948 100644
--- a/dags/open_access/open_access.py
+++ b/dags/open_access/open_access.py
@@ -20,22 +20,19 @@ def oa_dag():
     @task(executor_config=kubernetes_executor_config)
     def fetch_data_task(query, **kwargs):
         year = kwargs["params"].get("year")
-        cds_token = os.environ.get("CDS_TOKEN")
-        if not cds_token:
-            logging.warning("cds token is not set!")
         base_query = (
             r"(affiliation:CERN+or+595:'For+annual+report')"
             + rf"and+year:{year}+not+980:ConferencePaper+"
             + r"not+980:BookChapter"
         )
         type_of_query = [*query][0]
-        url = utils.get_url(query=f"{base_query}+{query[type_of_query]}")
-        data = request_again_if_failed(url=url, cds_token=cds_token)
+        url = utils.get_url(query=f"{base_query}")
+        data = request_again_if_failed(url=url)
         total = get_total_results_count(data.text)
         if type_of_query == "gold":
-            total = utils.get_gold_access_count(total, url)
+            total = utils.get_golden_access_count(total, url)
         if type_of_query == "green":
-            total = total - utils.get_gold_access_count(total, url)
+            total = utils.get_green_access_count(total, url)
         return {type_of_query: total}
 
     @task(multiple_outputs=True, executor_config=kubernetes_executor_config)
diff --git a/dags/open_access/parsers.py b/dags/open_access/parsers.py
index 55da0ca..65b7d33 100644
--- a/dags/open_access/parsers.py
+++ b/dags/open_access/parsers.py
@@ -1,3 +1,4 @@
+import re
 import xml.etree.ElementTree as ET
 from io import StringIO
 
@@ -13,25 +14,123 @@ def parse_without_names_spaces(xml):
     return root
 
 
-def get_golden_access_records_ids(data):
-    xml = parse_without_names_spaces(data)
-    records = xml.findall(".record")
-    golden_access = []
+def is_correct_value(value):
+    match value.text.lower():
+        case "accepted manuscript":
+            return True
+        case "preprint":
+            return True
+        case _:
+            return False
+
+
+def field_has_cc_by(field_value):
+    # Does CC BY-SA 4.0 fall under the condition of "contains ‘CC-BY’ or ‘CC BY’"?
+    #
+    pattern = re.compile(r"CC(\s|-)?BY(\s|-)?4\.0", flags=re.I)
+    return bool(pattern.match(field_value))
+
+
+def parse_subset_856(datafields_856):
+    at_least_one_found = False
+    for datafield in datafields_856:
+        subfield = datafield.find("subfield[@code='y']")
+        try:
+            is_subfield_y_wanted_value = is_correct_value(subfield)
+            if not at_least_one_found:
+                at_least_one_found = is_subfield_y_wanted_value
+        except AttributeError:
+            pass
+    return at_least_one_found
+
+
+def parse_subset_540_preprint(datafields_540):
+    at_least_one_found = False
+    for datafield in datafields_540:
+        subfield_3 = datafield.find("subfield[@code='3']")
+        try:
+            is_subfield_3_wanted_value = subfield_3.text.lower() == "preprint"
+            if not at_least_one_found:
+                at_least_one_found = is_subfield_3_wanted_value
+        except AttributeError:
+            pass
+    return at_least_one_found
+
+
+def parse_subset_540_publication(datafields_540):
+    at_least_one_found = False
+    for datafield in datafields_540:
+        subfield_3 = datafield.find("subfield[@code='3']")
+        subfield_a = datafield.find("subfield[@code='a']")
+        try:
+            is_subfield_3_wanted_value = subfield_3.text.lower() == "publication"
+            is_subfield_a_wanted_value = field_has_cc_by(subfield_a.text)
+            if not at_least_one_found:
+                at_least_one_found = bool(
+                    is_subfield_3_wanted_value and is_subfield_a_wanted_value
+                )
+        except AttributeError:
+            pass
+    return at_least_one_found
+
+
+def parse_subset_green_access(records):
+    filtered_records = []
+    for record in records:
+        datafields_856 = record.findall("datafield[@tag='856'][@ind1='4'][@ind2=' ']")
+        datafields_540 = record.findall("datafield/[@tag='540']")
+        if datafields_856 is None:
+            continue
+        if datafields_540 is None:
+            continue
+        is_it_wanted_record_by_856 = parse_subset_856(datafields_856)
+        is_it_wanted_record_by_540_preprint = parse_subset_540_preprint(datafields_540)
+        is_it_wanted_record_by_540_publication = not parse_subset_540_publication(
+            datafields_540
+        )
+
+        if (
+            is_it_wanted_record_by_856
+            or is_it_wanted_record_by_540_preprint
+            or is_it_wanted_record_by_540_publication
+        ):
+            filtered_records.append(record)
+
+    return filtered_records
+
+
+def parse_subset_golden_access(records):
+    filtered_records = []
     for record in records:
-        datafields = record.findall("datafield/[@tag='540']")
-        if datafields is None:
+        datafields_540 = record.findall("datafield/[@tag='540']")
+        if datafields_540 is None:
             continue
-        for datafield in datafields:
-            record_type = datafield.find("subfield/[@code='3']")
-            license = datafield.find("subfield/[@code='a']")
-            if record_type is not None and license is not None:
-                if (
-                    "CC" in license.text
-                    and "BY" in license.text
-                    and record_type.text == "publication"
-                ):
-                    record_id = record.find("controlfield/[@tag='001']")
-                    if record_id is not None:
-                        doi = record_id.text
-                        golden_access.append(doi)
-    return golden_access
+        is_it_wanted_record_by_540_publication = parse_subset_540_publication(
+            datafields_540
+        )
+
+        if is_it_wanted_record_by_540_publication:
+            filtered_records.append(record)
+    return filtered_records
+
+
+def get_records_ids(data, record_filter):
+    xml = parse_without_names_spaces(data)
+    records = xml.findall(".record")
+    filtered_records = record_filter(records)
+    record_ids = []
+    for record in filtered_records:
+        record_id = record.find("controlfield/[@tag='001']")
+        if record_id is not None:
+            record_ids.append(record_id.text)
+    return record_ids
+
+
+def get_golden_access_records_ids(data):
+    return get_records_ids(data, parse_subset_golden_access)
+
+
+def get_green_access_records_ids(data):
+    return get_records_ids(data, parse_subset_green_access)
diff --git a/dags/open_access/utils.py b/dags/open_access/utils.py
index e2d101f..b682beb 100644
--- a/dags/open_access/utils.py
+++ b/dags/open_access/utils.py
@@ -2,23 +2,30 @@
 import math
 
 from common.utils import request_again_if_failed
-from open_access.parsers import get_golden_access_records_ids
+from open_access.parsers import (get_golden_access_records_ids,
+                                 get_green_access_records_ids)
 
 
-def get_gold_access_count(total, url):
+def get_count(total, url, record_extractor):
     iterations = math.ceil(total / 100.0)
     records_ids_count = 0
     for i in range(0, iterations):
         jrec = (i * 100) + 1
         full_url = f"{url}&jrec={jrec}"
         response = request_again_if_failed(full_url)
-        records_ids_count = records_ids_count + len(
-            get_golden_access_records_ids(response.text)
-        )
+        records_ids_count = records_ids_count + len(record_extractor(response.text))
     logging.info(f"In total was found {records_ids_count} golden access records")
     return records_ids_count
 
 
+def get_golden_access_count(total, url):
+    return get_count(total, url, get_golden_access_records_ids)
+
+
+def get_green_access_count(total, url):
+    return get_count(total, url, get_green_access_records_ids)
+
+
 def get_url(query, current_collection="Published+Articles"):
     url = (
         rf"https://cds.cern.ch/search?ln=en&cc={current_collection}&p={query}"
diff --git a/tests/open_access/test_parser.py b/tests/open_access/test_parser.py
index 599a76d..d1ff1da 100644
--- a/tests/open_access/test_parser.py
+++ b/tests/open_access/test_parser.py
@@ -1,6 +1,10 @@
-from open_access.parsers import get_golden_access_records_ids
+from open_access.parsers import (get_golden_access_records_ids,
+                                 get_green_access_records_ids,
+                                 parse_subset_540_preprint,
+                                 parse_subset_540_publication, parse_subset_856,
+                                 parse_without_names_spaces)
 
-expected = [
+expected_golden = [
     "2894668",
     "2891488",
     "2888511",
@@ -11,6 +15,28 @@
     "2882429",
     "2882335",
     "2882328",
+    "2882324",
+    "2882322",
+    "2882311",
+    "2882298",
+]
+
+expected_green = [
+    "2894668",
+    "2891489",
+    "2891488",
+    "2891487",
+    "2888511",
+    "2888151",
+    "2886038",
+    "2884472",
+    "2884471",
+    "2884470",
+    "2884469",
+    "2883672",
+    "2882429",
+    "2882335",
+    "2882328",
     "2882327",
     "2882324",
     "2882322",
@@ -22,4 +48,55 @@
 def test_get_golden_access_records_dois(shared_datadir):
     with open(shared_datadir / "search.xml") as file:
         records_ids = get_golden_access_records_ids(file.read())
-    assert records_ids == expected
+    assert records_ids == expected_golden
+
+
+def test_parse_subset_856(shared_datadir):
+    with open(shared_datadir / "search.xml") as file:
+        filtered_records_count = 0
+        parsed_records = parse_without_names_spaces(file.read())
+        records = parsed_records.findall(".record")
+        for record in records:
+            datafields_856 = record.findall(
+                "datafield[@tag='856'][@ind1='4'][@ind2=' ']"
+            )
+            is_it_wanted_record_by_856 = parse_subset_856(datafields_856)
+            if is_it_wanted_record_by_856:
+                filtered_records_count = filtered_records_count + 1
+        assert filtered_records_count == 0
+
+
+def test_parse_subset_540_preprint(shared_datadir):
+    with open(shared_datadir / "search.xml") as file:
+        filtered_records_count = 0
+        parsed_records = parse_without_names_spaces(file.read())
+        records = parsed_records.findall(".record")
+        for record in records:
+            datafields_540 = record.findall(
+                "datafield[@tag='540'][@ind1=' '][@ind2=' ']"
+            )
+            is_it_wanted_record_by_540 = parse_subset_540_preprint(datafields_540)
+            if is_it_wanted_record_by_540:
+                filtered_records_count = filtered_records_count + 1
+        assert filtered_records_count == 20
+
+
+def test_parse_subset_540_publications(shared_datadir):
+    with open(shared_datadir / "search.xml") as file:
+        filtered_records_count = 0
+        parsed_records = parse_without_names_spaces(file.read())
+        records = parsed_records.findall(".record")
+        for record in records:
+            datafields_540 = record.findall(
+                "datafield[@tag='540'][@ind1=' '][@ind2=' ']"
+            )
+            is_it_wanted_record_by_540 = parse_subset_540_publication(datafields_540)
+            if is_it_wanted_record_by_540:
+                filtered_records_count = filtered_records_count + 1
+        assert filtered_records_count == 14
+
+
+def test_get_green_access_records_dois(shared_datadir):
+    with open(shared_datadir / "search.xml") as file:
+        records_ids = get_green_access_records_ids(file.read())
+    assert records_ids == expected_green