From 19fe1d2877c0bf6f38e5ad1058f4b2b73d1a131f Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 8 Oct 2024 23:16:13 +0300 Subject: [PATCH] Update for 2024 and remove invalid records - Update range for the 2024 Crossref dataset - Obtain distinct records - Remove records for which CD index cannot be calculated --- examples/cdindex/cdindex-db.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/examples/cdindex/cdindex-db.py b/examples/cdindex/cdindex-db.py index 16199d0b..a01689c9 100644 --- a/examples/cdindex/cdindex-db.py +++ b/examples/cdindex/cdindex-db.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# Calculate the CD5 index of Crossref works published 1945-2021 +# Calculate the CD5 index based on Crossref works published 1945-2023 # from a previously populated database # @@ -28,7 +28,7 @@ RANDOM_POPULATION_SIZE = 1000000 random.seed("xyzzy") -RANGE = "published_year BETWEEN 1945 and 2021" +RANGE = "published_year BETWEEN 1945 and 2023" debug.set_flags(["perf"]) debug.set_output(sys.stderr) @@ -61,7 +61,7 @@ def add_vertices(db, graph): counter = 0 for (doi, year, month, day) in db.execute( f""" - SELECT doi, published_year, + SELECT DISTINCT doi, published_year, Coalesce(published_month, 1), Coalesce(published_day, 1) FROM works WHERE {RANGE}""" @@ -151,4 +151,19 @@ def process_batch(start): perf.log("Calculate CD index") db.commit() + + db.execute(""" + -- Works and references + ATTACH 'cdindex.db' AS wr; + + DELETE FROM cdindex + WHERE doi NOT IN ( + SELECT cdindex.doi FROM cdindex + INNER JOIN wr.works USING(doi) + WHERE works.published_year <= 2018 OR + (SELECT 1 FROM work_references WHERE work_id == works.id) + ); + """) + perf.log("Remove invalid records") + db.close()