From 10d178bfb6e87b1425d95ffb4ea72cd00e1e8802 Mon Sep 17 00:00:00 2001 From: Keegan Smith Date: Tue, 2 Jul 2024 13:10:29 +0800 Subject: [PATCH] Subtitle and subjects update (#241) --- .../onix_workflow/onix_workflow.py | 6 ++ .../onix_workflow/schema/book_list.json | 64 ++++++++++++++--- .../onix_workflow/schema/book_metrics.json | 62 ++++++++++++++-- .../schema/book_metrics_city.json | 62 ++++++++++++++-- .../schema/book_metrics_country.json | 62 ++++++++++++++-- .../schema/book_metrics_events.json | 62 ++++++++++++++-- .../schema/book_metrics_institution.json | 62 ++++++++++++++-- .../onix_workflow/schema/book_product.json | 70 ++++++++++++++++++- .../sql/book_institution_list.sql.jinja2 | 4 +- .../onix_workflow/sql/book_list.sql.jinja2 | 8 +-- .../onix_workflow/sql/book_metrics.sql.jinja2 | 8 +-- .../sql/book_metrics_author.sql.jinja2 | 2 +- .../sql/book_metrics_city.sql.jinja2 | 8 +-- .../sql/book_metrics_country.sql.jinja2 | 9 ++- .../sql/book_metrics_events.sql.jinja2 | 8 +-- .../sql/book_metrics_institution.sql.jinja2 | 8 +-- .../sql/book_metrics_subject_bic.sql.jinja2 | 2 +- .../sql/book_metrics_subject_bisac.sql.jinja2 | 2 +- .../sql/book_metrics_subject_thema.sql.jinja2 | 2 +- .../onix_workflow/sql/book_product.sql.jinja2 | 49 +++++++++++++ .../fixtures/e2e_inputs/onix.jsonl | 4 +- .../fixtures/e2e_outputs/book_list.json | 4 +- .../fixtures/e2e_outputs/book_list_dry.json | 3 - .../fixtures/e2e_outputs/book_product.json | 4 +- .../e2e_outputs/book_product_dry.json | 3 - tests/onix_workflow/test_onix_workflow.py | 10 +-- 26 files changed, 491 insertions(+), 97 deletions(-) delete mode 100644 tests/onix_workflow/fixtures/e2e_outputs/book_list_dry.json delete mode 100644 tests/onix_workflow/fixtures/e2e_outputs/book_product_dry.json diff --git a/dags/oaebu_workflows/onix_workflow/onix_workflow.py b/dags/oaebu_workflows/onix_workflow/onix_workflow.py index 5d78cf99..0904cb5a 100644 --- a/dags/oaebu_workflows/onix_workflow/onix_workflow.py +++ 
b/dags/oaebu_workflows/onix_workflow/onix_workflow.py @@ -686,6 +686,9 @@ def create_book_product_table(release: dict, **context) -> None: ) # Render the SQL + bic_table_id = bq_table_id(bq_subject_project_id, bq_subject_dataset_id, "bic_lookup") + thema_table_id = bq_table_id(bq_subject_project_id, bq_subject_dataset_id, "thema_lookup") + bisac_table_id = bq_table_id(bq_subject_project_id, bq_subject_dataset_id, "bisac_lookup") env = create_data_partner_env( main_template=os.path.join(sql_folder(workflow_module="onix_workflow"), "book_product.sql.jinja2"), data_partners=data_partners, @@ -698,6 +701,9 @@ def create_book_product_table(release: dict, **context) -> None: workid_table_id=workid_table_id, workfamilyid_table_id=workfamilyid_table_id, ga3_views_field=ga3_views_field, + bic_table_id=bic_table_id, + thema_table_id=thema_table_id, + bisac_table_id=bisac_table_id, **dp_tables, ) logging.info(f"Book Product SQL:\n{sql}") diff --git a/dags/oaebu_workflows/onix_workflow/schema/book_list.json b/dags/oaebu_workflows/onix_workflow/schema/book_list.json index 2052236e..702ef67a 100644 --- a/dags/oaebu_workflows/onix_workflow/schema/book_list.json +++ b/dags/oaebu_workflows/onix_workflow/schema/book_list.json @@ -57,7 +57,19 @@ "mode": "NULLABLE", "name": "title", "type": "STRING", - "description": "The Books Title" + "description": "The Book's Title" + }, + { + "mode": "NULLABLE", + "name": "subtitle", + "type": "STRING", + "description": "Subtitle of the Book" + }, + { + "mode": "NULLABLE", + "name": "title_subtitle", + "type": "STRING", + "description": "The concatenated title and subtitle of the Book" }, { "mode": "REPEATED", @@ -69,27 +81,63 @@ "fields": [ { "mode": "REPEATED", - "name": "bic", + "name": "bic_codes", + "type": "STRING", + "description": "A list of BIC subject codes" + }, + { + "mode": "REPEATED", + "name": "bic_top", + "type": "STRING", + "description": "A list of BIC subject codes for the top level subject" + }, + { + "mode": "REPEATED", + 
"name": "bic_names", + "type": "STRING", + "description": "A list of BIC subject names" + }, + { + "mode": "REPEATED", + "name": "bisac_codes", + "type": "STRING", + "description": "A list of BISAC subject codes" + }, + { + "mode": "REPEATED", + "name": "bisac_top", + "type": "STRING", + "description": "A list of BISAC subject codes for the top level subject" + }, + { + "mode": "REPEATED", + "name": "bisac_names", + "type": "STRING", + "description": "A list of BISAC subject names" + }, + { + "mode": "REPEATED", + "name": "thema_codes", "type": "STRING", - "description": "BIC subject codes" + "description": "A list of Thema subject codes" }, { "mode": "REPEATED", - "name": "bisac", + "name": "thema_top", "type": "STRING", - "description": "BISAC subject codes" + "description": "A list of Thema subject codes for the top level subject" }, { "mode": "REPEATED", - "name": "thema", + "name": "thema_names", "type": "STRING", - "description": "THEMA subject codes" + "description": "A list of Thema subject names" } ], "mode": "NULLABLE", "name": "subjects", "type": "RECORD", - "description": "Subject codes" + "description": "Subjects associated with this product" }, { "fields": [ diff --git a/dags/oaebu_workflows/onix_workflow/schema/book_metrics.json b/dags/oaebu_workflows/onix_workflow/schema/book_metrics.json index 0a351fbc..dba84459 100644 --- a/dags/oaebu_workflows/onix_workflow/schema/book_metrics.json +++ b/dags/oaebu_workflows/onix_workflow/schema/book_metrics.json @@ -23,31 +23,79 @@ "type": "STRING", "description": "The title of the book" }, + { + "mode": "NULLABLE", + "name": "subtitle", + "type": "STRING", + "description": "Subtitle of the Book" + }, + { + "mode": "NULLABLE", + "name": "title_subtitle", + "type": "STRING", + "description": "The concatenated title and subtitle of the Book" + }, { "fields": [ { "mode": "REPEATED", - "name": "bic", + "name": "bic_codes", + "type": "STRING", + "description": "A list of BIC subject codes" + }, + { + "mode": 
"REPEATED", + "name": "bic_top", + "type": "STRING", + "description": "A list of BIC subject codes for the top level subject" + }, + { + "mode": "REPEATED", + "name": "bic_names", + "type": "STRING", + "description": "A list of BIC subject names" + }, + { + "mode": "REPEATED", + "name": "bisac_codes", + "type": "STRING", + "description": "A list of BISAC subject codes" + }, + { + "mode": "REPEATED", + "name": "bisac_top", + "type": "STRING", + "description": "A list of BISAC subject codes for the top level subject" + }, + { + "mode": "REPEATED", + "name": "bisac_names", + "type": "STRING", + "description": "A list of BISAC subject names" + }, + { + "mode": "REPEATED", + "name": "thema_codes", "type": "STRING", - "description": "BIC subject codes" + "description": "A list of Thema subject codes" }, { "mode": "REPEATED", - "name": "bisac", + "name": "thema_top", "type": "STRING", - "description": "BISAC subject codes" + "description": "A list of Thema subject codes for the top level subject" }, { "mode": "REPEATED", - "name": "thema", + "name": "thema_names", "type": "STRING", - "description": "THEMA subject codes" + "description": "A list of Thema subject names" } ], "mode": "NULLABLE", "name": "subjects", "type": "RECORD", - "description": "Subject codes" + "description": "Subjects associated with this product" }, { "fields": [ diff --git a/dags/oaebu_workflows/onix_workflow/schema/book_metrics_city.json b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_city.json index ddfa4dcb..f4dd2122 100644 --- a/dags/oaebu_workflows/onix_workflow/schema/book_metrics_city.json +++ b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_city.json @@ -23,6 +23,18 @@ "type": "STRING", "description": "The title of the book" }, + { + "mode": "NULLABLE", + "name": "subtitle", + "type": "STRING", + "description": "Subtitle of the Book" + }, + { + "mode": "NULLABLE", + "name": "title_subtitle", + "type": "STRING", + "description": "The concatenated title and subtitle of the 
Book" + }, { "mode": "NULLABLE", "name": "published_year", @@ -63,27 +75,63 @@ "fields": [ { "mode": "REPEATED", - "name": "bic", + "name": "bic_codes", + "type": "STRING", + "description": "A list of BIC subject codes" + }, + { + "mode": "REPEATED", + "name": "bic_top", + "type": "STRING", + "description": "A list of BIC subject codes for the top level subject" + }, + { + "mode": "REPEATED", + "name": "bic_names", + "type": "STRING", + "description": "A list of BIC subject names" + }, + { + "mode": "REPEATED", + "name": "bisac_codes", + "type": "STRING", + "description": "A list of BISAC subject codes" + }, + { + "mode": "REPEATED", + "name": "bisac_top", + "type": "STRING", + "description": "A list of BISAC subject codes for the top level subject" + }, + { + "mode": "REPEATED", + "name": "bisac_names", + "type": "STRING", + "description": "A list of BISAC subject names" + }, + { + "mode": "REPEATED", + "name": "thema_codes", "type": "STRING", - "description": "BIC subject codes" + "description": "A list of Thema subject codes" }, { "mode": "REPEATED", - "name": "bisac", + "name": "thema_top", "type": "STRING", - "description": "BISAC subject codes" + "description": "A list of Thema subject codes for the top level subject" }, { "mode": "REPEATED", - "name": "thema", + "name": "thema_names", "type": "STRING", - "description": "THEMA subject codes" + "description": "A list of Thema subject names" } ], "mode": "NULLABLE", "name": "subjects", "type": "RECORD", - "description": "Subject codes" + "description": "Subjects associated with this product" }, { "fields": [ diff --git a/dags/oaebu_workflows/onix_workflow/schema/book_metrics_country.json b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_country.json index a8b04d64..be932a00 100644 --- a/dags/oaebu_workflows/onix_workflow/schema/book_metrics_country.json +++ b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_country.json @@ -23,6 +23,18 @@ "type": "STRING", "description": "The title of the book" }, 
+ { + "mode": "NULLABLE", + "name": "subtitle", + "type": "STRING", + "description": "Subtitle of the Book" + }, + { + "mode": "NULLABLE", + "name": "title_subtitle", + "type": "STRING", + "description": "The concatenated title and subtitle of the Book" + }, { "mode": "NULLABLE", "name": "published_year", @@ -51,27 +63,63 @@ "fields": [ { "mode": "REPEATED", - "name": "bic", + "name": "bic_codes", + "type": "STRING", + "description": "A list of BIC subject codes" + }, + { + "mode": "REPEATED", + "name": "bic_top", + "type": "STRING", + "description": "A list of BIC subject codes for the top level subject" + }, + { + "mode": "REPEATED", + "name": "bic_names", + "type": "STRING", + "description": "A list of BIC subject names" + }, + { + "mode": "REPEATED", + "name": "bisac_codes", + "type": "STRING", + "description": "A list of BISAC subject codes" + }, + { + "mode": "REPEATED", + "name": "bisac_top", + "type": "STRING", + "description": "A list of BISAC subject codes for the top level subject" + }, + { + "mode": "REPEATED", + "name": "bisac_names", + "type": "STRING", + "description": "A list of BISAC subject names" + }, + { + "mode": "REPEATED", + "name": "thema_codes", "type": "STRING", - "description": "BIC subject codes" + "description": "A list of Thema subject codes" }, { "mode": "REPEATED", - "name": "bisac", + "name": "thema_top", "type": "STRING", - "description": "BISAC subject codes" + "description": "A list of Thema subject codes for the top level subject" }, { "mode": "REPEATED", - "name": "thema", + "name": "thema_names", "type": "STRING", - "description": "THEMA subject codes" + "description": "A list of Thema subject names" } ], "mode": "NULLABLE", "name": "subjects", "type": "RECORD", - "description": "Subject codes" + "description": "Subjects associated with this product" }, { "fields": [ diff --git a/dags/oaebu_workflows/onix_workflow/schema/book_metrics_events.json b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_events.json index 
dea37b58..40140bdf 100644 --- a/dags/oaebu_workflows/onix_workflow/schema/book_metrics_events.json +++ b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_events.json @@ -23,6 +23,18 @@ "type": "STRING", "description": "The title of the book" }, + { + "mode": "NULLABLE", + "name": "subtitle", + "type": "STRING", + "description": "Subtitle of the Book" + }, + { + "mode": "NULLABLE", + "name": "title_subtitle", + "type": "STRING", + "description": "The concatenated title and subtitle of the Book" + }, { "mode": "NULLABLE", "name": "published_year", @@ -57,27 +69,63 @@ "fields": [ { "mode": "REPEATED", - "name": "bic", + "name": "bic_codes", + "type": "STRING", + "description": "A list of BIC subject codes" + }, + { + "mode": "REPEATED", + "name": "bic_top", + "type": "STRING", + "description": "A list of BIC subject codes for the top level subject" + }, + { + "mode": "REPEATED", + "name": "bic_names", + "type": "STRING", + "description": "A list of BIC subject names" + }, + { + "mode": "REPEATED", + "name": "bisac_codes", + "type": "STRING", + "description": "A list of BISAC subject codes" + }, + { + "mode": "REPEATED", + "name": "bisac_top", + "type": "STRING", + "description": "A list of BISAC subject codes for the top level subject" + }, + { + "mode": "REPEATED", + "name": "bisac_names", + "type": "STRING", + "description": "A list of BISAC subject names" + }, + { + "mode": "REPEATED", + "name": "thema_codes", "type": "STRING", - "description": "BIC subject codes" + "description": "A list of Thema subject codes" }, { "mode": "REPEATED", - "name": "bisac", + "name": "thema_top", "type": "STRING", - "description": "BISAC subject codes" + "description": "A list of Thema subject codes for the top level subject" }, { "mode": "REPEATED", - "name": "thema", + "name": "thema_names", "type": "STRING", - "description": "THEMA subject codes" + "description": "A list of Thema subject names" } ], "mode": "NULLABLE", "name": "subjects", "type": "RECORD", - "description": 
"Subject codes" + "description": "Subjects associated with this product" }, { "fields": [ diff --git a/dags/oaebu_workflows/onix_workflow/schema/book_metrics_institution.json b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_institution.json index 7988e348..c4f9b9cd 100644 --- a/dags/oaebu_workflows/onix_workflow/schema/book_metrics_institution.json +++ b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_institution.json @@ -11,6 +11,18 @@ "type": "STRING", "description": "The title of the book" }, + { + "mode": "NULLABLE", + "name": "subtitle", + "type": "STRING", + "description": "Subtitle of the Book" + }, + { + "mode": "NULLABLE", + "name": "title_subtitle", + "type": "STRING", + "description": "The concatenated title and subtitle of the Book" + }, { "mode": "NULLABLE", "name": "published_year", @@ -45,27 +57,63 @@ "fields": [ { "mode": "REPEATED", - "name": "bic", + "name": "bic_codes", + "type": "STRING", + "description": "A list of BIC subject codes" + }, + { + "mode": "REPEATED", + "name": "bic_top", + "type": "STRING", + "description": "A list of BIC subject codes for the top level subject" + }, + { + "mode": "REPEATED", + "name": "bic_names", + "type": "STRING", + "description": "A list of BIC subject names" + }, + { + "mode": "REPEATED", + "name": "bisac_codes", + "type": "STRING", + "description": "A list of BISAC subject codes" + }, + { + "mode": "REPEATED", + "name": "bisac_top", + "type": "STRING", + "description": "A list of BISAC subject codes for the top level subject" + }, + { + "mode": "REPEATED", + "name": "bisac_names", + "type": "STRING", + "description": "A list of BISAC subject names" + }, + { + "mode": "REPEATED", + "name": "thema_codes", "type": "STRING", - "description": "BIC subject codes" + "description": "A list of Thema subject codes" }, { "mode": "REPEATED", - "name": "bisac", + "name": "thema_top", "type": "STRING", - "description": "BISAC subject codes" + "description": "A list of Thema subject codes for the top level 
subject" }, { "mode": "REPEATED", - "name": "thema", + "name": "thema_names", "type": "STRING", - "description": "THEMA subject codes" + "description": "A list of Thema subject names" } ], "mode": "NULLABLE", "name": "subjects", "type": "RECORD", - "description": "Subject codes" + "description": "Subjects associated with this product" }, { "fields": [ diff --git a/dags/oaebu_workflows/onix_workflow/schema/book_product.json b/dags/oaebu_workflows/onix_workflow/schema/book_product.json index c0c3c10b..65e9918f 100644 --- a/dags/oaebu_workflows/onix_workflow/schema/book_product.json +++ b/dags/oaebu_workflows/onix_workflow/schema/book_product.json @@ -31,6 +31,12 @@ "type": "STRING", "description": "The Book's Title" }, + { + "mode": "NULLABLE", + "name": "subtitle", + "type": "STRING", + "description": "The Book's Subtitle" + }, { "mode": "NULLABLE", "name": "published_year", @@ -119,6 +125,69 @@ "type": "RECORD", "description": "Fields Pulled from the ONIX Record for this Book Product" }, + { + "fields": [ + { + "mode": "REPEATED", + "name": "bic_codes", + "type": "STRING", + "description": "A list of BIC subject codes" + }, + { + "mode": "REPEATED", + "name": "bic_top", + "type": "STRING", + "description": "A list of BIC subject codes for the top level subject" + }, + { + "mode": "REPEATED", + "name": "bic_names", + "type": "STRING", + "description": "A list of BIC subject names" + }, + { + "mode": "REPEATED", + "name": "bisac_codes", + "type": "STRING", + "description": "A list of BISAC subject codes" + }, + { + "mode": "REPEATED", + "name": "bisac_top", + "type": "STRING", + "description": "A list of BISAC subject codes for the top level subject" + }, + { + "mode": "REPEATED", + "name": "bisac_names", + "type": "STRING", + "description": "A list of BISAC subject names" + }, + { + "mode": "REPEATED", + "name": "thema_codes", + "type": "STRING", + "description": "A list of Thema subject codes" + }, + { + "mode": "REPEATED", + "name": "thema_top", + "type": 
"STRING", + "description": "A list of Thema subject codes for the top level subject" + }, + { + "mode": "REPEATED", + "name": "thema_names", + "type": "STRING", + "description": "A list of Thema subject names" + } + ], + "mode": "NULLABLE", + "name": "subjects", + "type": "RECORD", + "description": "Subjects associated with this product" + }, + { "mode": "NULLABLE", "name": "work_id", @@ -270,4 +339,3 @@ "description": "Linked Metrics from all sources, organised by month of occurance" } ] - diff --git a/dags/oaebu_workflows/onix_workflow/sql/book_institution_list.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_institution_list.sql.jinja2 index 22079c49..fb6ad0e2 100644 --- a/dags/oaebu_workflows/onix_workflow/sql/book_institution_list.sql.jinja2 +++ b/dags/oaebu_workflows/onix_workflow/sql/book_institution_list.sql.jinja2 @@ -15,11 +15,11 @@ # Author: Richard Hosking #} {# The purpose of this script it to export the unique list of institutions from the book_product table -Primarily, the goal is to create a flat structure which is suitable for graphing in Kibana +Primarily, the goal is to create a flat structure #} SELECT institution.institution, FROM `{{ book_product_table_id }}`, UNNEST(months) as month, UNNEST(month.jstor_institution) as institution WHERE ARRAY_LENGTH(month.jstor_institution) > 0 -GROUP BY institution \ No newline at end of file +GROUP BY institution diff --git a/dags/oaebu_workflows/onix_workflow/sql/book_list.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_list.sql.jinja2 index 89cc0897..226c0d50 100644 --- a/dags/oaebu_workflows/onix_workflow/sql/book_list.sql.jinja2 +++ b/dags/oaebu_workflows/onix_workflow/sql/book_list.sql.jinja2 @@ -21,6 +21,7 @@ SELECT ISBN13 as product_id, work_id, work_family_id, + subjects, onix.ProductForm, (ARRAY_LENGTH(months) > 0) as usage_flag, onix.EditionNumber, @@ -28,6 +29,8 @@ SELECT onix.published_date as published_date, (SELECT p.publisher_name as publisher_name FROM UNNEST(onix.publisher) 
as p WHERE p.publishing_role = "Publisher" LIMIT 1) as publisher_name, onix.title, + onix.subtitle, + IFNULL(CONCAT(onix.title, ": ", onix.subtitle), onix.title) as title_subtitle, onix.keywords, ARRAY( SELECT STRUCT( @@ -36,9 +39,4 @@ SELECT a.ORCID as ORCID ) FROM UNNEST(onix.authors) as a) as authors, - STRUCT( - onix.bic_subjects as bic, - onix.bisac_subjects as bisac, - onix.thema_subjects as thema - ) as subjects FROM `{{ book_product_table_id }}` diff --git a/dags/oaebu_workflows/onix_workflow/sql/book_metrics.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_metrics.sql.jinja2 index 2fe7b77f..c59c0019 100644 --- a/dags/oaebu_workflows/onix_workflow/sql/book_metrics.sql.jinja2 +++ b/dags/oaebu_workflows/onix_workflow/sql/book_metrics.sql.jinja2 @@ -19,9 +19,12 @@ The purpose of this script it to export the book metrics section from the book_p SELECT ISBN13 as product_id, + subjects, work_id, work_family_id, onix.title, + onix.subtitle, + IFNULL(CONCAT(onix.title, ": ", onix.subtitle), onix.title) as title_subtitle, CAST(onix.published_year as INT64) as published_year, onix.published_date as published_date, ARRAY( @@ -31,11 +34,6 @@ SELECT a.ORCID as ORCID ) FROM UNNEST(onix.authors) as a) as authors, - STRUCT( - onix.bic_subjects as bic, - onix.bisac_subjects as bisac, - onix.thema_subjects as thema - ) as subjects, (SELECT p.publisher_name as publisher_name FROM UNNEST(onix.publisher) as p WHERE p.publishing_role = "Publisher" LIMIT 1) as publisher_name, -- pull the publisher name from the onix.publisher field month.month, {% for dp in data_partners | selectattr("export_book_metrics", "equalto", True) %} diff --git a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_author.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_author.sql.jinja2 index 3615e85a..9c496172 100644 --- a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_author.sql.jinja2 +++ b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_author.sql.jinja2 @@ -15,7 
+15,7 @@ # Author: Richard Hosking, Keegan Smith #} {# The purpose of this script it to export the book author metrics from the book_product table -Primarily, the goal is to create a flat structure which is suitable for graphing in Kibana +Primarily, the goal is to create a flat structure #} CREATE TEMP FUNCTION group_counts(counts ARRAY>) AS ( diff --git a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_city.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_city.sql.jinja2 index 8ef5d7e6..89897cb5 100644 --- a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_city.sql.jinja2 +++ b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_city.sql.jinja2 @@ -46,16 +46,14 @@ body as ( MAX(work_id) as work_id, MAX(work_family_id) as work_family_id, MAX(onix.title) as title, + MAX(onix.subtitle) as subtitle, + IFNULL(MAX(CONCAT(onix.title, ": ", onix.subtitle)), MAX(onix.title)) as title_subtitle, CAST(MAX(onix.published_year) as INT64) as published_year, MAX(onix.published_date) as published_date, month.month, city.city, CONCAT(CAST(MAX(city.latitude) as STRING), ", ", CAST(MAX(city.longitude) as STRING)) as coordinates, - STRUCT( - ARRAY_CONCAT_AGG(onix.bic_subjects) as bic, - ARRAY_CONCAT_AGG(onix.bisac_subjects) as bisac, - ARRAY_CONCAT_AGG(onix.thema_subjects) as thema - ) as subjects, + ANY_VALUE(subjects) as subjects, -- All subject structs created are the same for the group so take any value {# Currently only used by IRUS OAPEN #} STRUCT( SUM(city.title_requests) as title_requests, diff --git a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_country.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_country.sql.jinja2 index f2ebc78a..d9551934 100644 --- a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_country.sql.jinja2 +++ b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_country.sql.jinja2 @@ -32,6 +32,8 @@ month_country as ( work_id, work_family_id, onix.title, + onix.subtitle, + IFNULL(CONCAT(onix.title, ": ", 
onix.subtitle), onix.title) as title_subtitle, (SELECT p.publisher_name as publisher_name FROM UNNEST(onix.publisher) as p WHERE p.publishing_role = "Publisher" LIMIT 1) as publisher_name, -- pull the publisher name from the onix.publisher field CAST(onix.published_year as INT64) as published_year, onix.published_date as published_date, @@ -42,11 +44,7 @@ month_country as ( a.ORCID as ORCID ) FROM UNNEST(onix.authors) as a) as authors, - STRUCT( - onix.bic_subjects as bic, - onix.bisac_subjects as bisac, - onix.thema_subjects as thema - ) as subjects, + subjects, month, alpha2, iso_name as country_name, @@ -71,6 +69,7 @@ SELECT month_country.work_id, month_country.work_family_id, month_country.title, + month_country.subtitle, month_country.authors as authors, month_country.subjects as subjects, month_country.published_year, diff --git a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_events.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_events.sql.jinja2 index be881297..76dec6c9 100644 --- a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_events.sql.jinja2 +++ b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_events.sql.jinja2 @@ -46,15 +46,13 @@ body as ( MAX(work_id) as work_id, MAX(work_family_id) as work_family_id, MAX(onix.title) as title, + MAX(onix.subtitle) as subtitle, + IFNULL(MAX(CONCAT(onix.title, ": ", onix.subtitle)), MAX(onix.title)) as title_subtitle, CAST(MAX(onix.published_year) as INT64) as published_year, MAX(onix.published_date) as published_date, month.month, events.source as event_source, - STRUCT( - ARRAY_CONCAT_AGG(onix.bic_subjects) as bic, - ARRAY_CONCAT_AGG(onix.bisac_subjects) as bisac, - ARRAY_CONCAT_AGG(onix.thema_subjects) as thema - ) as subjects, + ANY_VALUE(subjects) as subjects, -- All subject structs created are the same for the group so take any value STRUCT( SUM(events.count) as count ) as crossref_events, diff --git a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_institution.sql.jinja2 
b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_institution.sql.jinja2 index e1a76d0e..2f044181 100644 --- a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_institution.sql.jinja2 +++ b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_institution.sql.jinja2 @@ -20,6 +20,8 @@ The purpose of this script it to export the book institutional metrics section f SELECT ISBN13 as product_id, onix.title as title, + onix.subtitle as subtitle, + IFNULL(CONCAT(onix.title, ": ", onix.subtitle), onix.title) as title_subtitle, (SELECT p.publisher_name as publisher_name FROM UNNEST(onix.publisher) as p WHERE p.publishing_role = "Publisher" LIMIT 1) as publisher_name, -- pull the publisher name from the onix.publisher field CAST(onix.published_year as INT64) as published_year, onix.published_date as published_date, @@ -30,11 +32,7 @@ SELECT a.ORCID as ORCID ) FROM UNNEST(onix.authors) as a) as authors, - STRUCT( - onix.bic_subjects as bic, - onix.bisac_subjects as bisac, - onix.thema_subjects as thema - ) as subjects, + subjects, month.month, institution.institution, {# Currently only used by jstor #} diff --git a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_bic.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_bic.sql.jinja2 index ebe929c7..9f3ae2ac 100644 --- a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_bic.sql.jinja2 +++ b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_bic.sql.jinja2 @@ -15,7 +15,7 @@ # Author: Richard Hosking, Keegan Smith #} {# The purpose of this script it to export the BIC subject metrics section from the book_product table -Primarily, the goal is to create a flat structure which is suitable for graphing in Kibana +Primarily, the goal is to create a flat structure #} # Helper Function: Sum up the total number of values for the given bucket diff --git a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_bisac.sql.jinja2 
b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_bisac.sql.jinja2 index a6a55e4b..e35bc0f3 100644 --- a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_bisac.sql.jinja2 +++ b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_bisac.sql.jinja2 @@ -15,7 +15,7 @@ # Author: Richard Hosking, Keegan Smith #} {# The purpose of this script it to export the BISAC subject metrics section from the book_product table -Primarily, the goal is to create a flat structure which is suitable for graphing in Kibana +Primarily, the goal is to create a flat structure #} # Helper Function: Sum up the total number of values for the given bucket diff --git a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_thema.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_thema.sql.jinja2 index 8b0610cc..7dc0e076 100644 --- a/dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_thema.sql.jinja2 +++ b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_thema.sql.jinja2 @@ -15,7 +15,7 @@ # Author: Richard Hosking, Keegan Smith #} {# The purpose of this script it to export the Thema subject metrics section from the book_product table -Primarily, the goal is to create a flat structure which is suitable for graphing in Kibana +Primarily, the goal is to create a flat structure #} # Helper Function: Sum up the total number of values for the given bucket diff --git a/dags/oaebu_workflows/onix_workflow/sql/book_product.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_product.sql.jinja2 index c201d2c2..8fec7bb9 100644 --- a/dags/oaebu_workflows/onix_workflow/sql/book_product.sql.jinja2 +++ b/dags/oaebu_workflows/onix_workflow/sql/book_product.sql.jinja2 @@ -14,6 +14,20 @@ # Author: Richard Hosking, Keegan Smith #} +# Helper Function: Get just the unique set of top level BISAC Subjects from the provided list +CREATE TEMP FUNCTION top_level_subjects_bisac(subjects ARRAY<STRING>) AS ( + ARRAY(SELECT + DISTINCT(SUBSTRING(subject, 0, 
3)) as top_level + FROM UNNEST(subjects) as subject) +); +# +# Helper Function: Get just the unique set of top level Thema/BIC Subjects from the provided list +CREATE TEMP FUNCTION top_level_subjects_bic_thema(subjects ARRAY<STRING>) AS ( + ARRAY(SELECT + DISTINCT(SUBSTRING(subject, 0, 1)) as top_level + FROM UNNEST(subjects) as subject) +); + # Add data partner temp functions {% for dp in data_partners %} {% if dp.book_product_functions %} @@ -60,6 +74,7 @@ onix_ebook_titles_raw as ( onix.TitleDetails[SAFE_OFFSET(0)].TitleElements[SAFE_OFFSET(0)].TitleText is not null, onix.TitleDetails[SAFE_OFFSET(0)].TitleElements[SAFE_OFFSET(0)].TitleText, onix.TitleDetails[SAFE_OFFSET(0)].TitleElements[SAFE_OFFSET(0)].TitleWithoutPrefix) as title, + onix.TitleDetails[SAFE_OFFSET(0)].TitleElements[SAFE_OFFSET(0)].Subtitle as subtitle, ARRAY(SELECT SUBSTRING(CAST(dates.Date as STRING), 0, 4) FROM UNNEST(onix.PublishingDates) as dates @@ -129,6 +144,28 @@ onix_ebook_titles as ( ) ), +subjects AS ( + SELECT + ISBN13, + onix.bic_subjects AS bic_codes, + top_level_subjects_bic_thema(onix.bic_subjects) AS bic_top, + ARRAY( + SELECT l.name FROM UNNEST(top_level_subjects_bic_thema(onix.bic_subjects)) AS c JOIN `{{ bic_table_id }}` AS l on c = l.code + ) AS bic_names, + onix.thema_subjects AS thema_codes, + top_level_subjects_bic_thema(onix.thema_subjects) AS thema_top, + ARRAY( + SELECT l.name FROM UNNEST(top_level_subjects_bic_thema(onix.thema_subjects)) AS c JOIN `{{ thema_table_id }}` AS l on c = l.code + ) AS thema_names, + onix.bisac_subjects AS bisac_codes, + top_level_subjects_bisac(onix.bisac_subjects) AS bisac_top, + ARRAY( + SELECT l.name FROM UNNEST(top_level_subjects_bisac(onix.bisac_subjects)) AS c JOIN `{{ bisac_table_id }}` AS l on c = l.code + ) AS bisac_names + + FROM + onix_ebook_titles), + crossref_events as ( SELECT public_data.isbn as ISBN13, @@ -209,6 +246,17 @@ Then we are able to pull in specific metadata from across all the various source #} SELECT onix_ebook_titles.*, 
+ STRUCT( + s.bic_codes as bic_codes, + s.bic_top as bic_top, + s.bic_names as bic_names, + s.thema_codes as thema_codes, + s.thema_top as thema_top, + s.thema_names as thema_names, + s.bisac_codes as bisac_codes, + s.bisac_top as bisac_top, + s.bisac_names as bisac_names + ) as subjects, work_ids.work_id, work_family_ids.work_family_id, STRUCT( @@ -221,6 +269,7 @@ SELECT FROM onix_ebook_titles LEFT JOIN `{{ workid_table_id }}` AS work_ids ON work_ids.isbn13 = onix_ebook_titles.ISBN13 LEFT JOIN `{{ workfamilyid_table_id }}` AS work_family_ids ON work_family_ids.isbn13 = onix_ebook_titles.ISBN13 +LEFT JOIN subjects AS s ON s.ISBN13 = onix_ebook_titles.ISBN13 LEFT JOIN metrics AS metrics ON metrics.ISBN13 = onix_ebook_titles.ISBN13 {% for dp in data_partners | selectattr("has_metadata", "equalto", True) %} LEFT JOIN {{ dp.type_id + "_metadata" }} ON {{ dp.type_id + "_metadata.ISBN13" }} = onix_ebook_titles.ISBN13 diff --git a/tests/onix_workflow/fixtures/e2e_inputs/onix.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/onix.jsonl index 1cce45e1..302ae62e 100644 --- a/tests/onix_workflow/fixtures/e2e_inputs/onix.jsonl +++ b/tests/onix_workflow/fixtures/e2e_inputs/onix.jsonl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:684c681c865b39032e93cb6bbb8059e0f0d4876eef8731e27c824bd22af86348 -size 2102 +oid sha256:514bd6a58d11ba856b0f1c4f12106e8734bc5e357165b0df679f6584a3cbb7c4 +size 2866 diff --git a/tests/onix_workflow/fixtures/e2e_outputs/book_list.json b/tests/onix_workflow/fixtures/e2e_outputs/book_list.json index fe9a64bb..8c785b7c 100644 --- a/tests/onix_workflow/fixtures/e2e_outputs/book_list.json +++ b/tests/onix_workflow/fixtures/e2e_outputs/book_list.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33f19a86e950bc667b0d4c679223cf978f33362e77dda1c7d49c91cba6af2380 -size 1894 +oid sha256:9d4819c22beb2be183463f88a09c552529cee4855e94baecf67a4ab64427a184 +size 2985 diff --git 
a/tests/onix_workflow/fixtures/e2e_outputs/book_list_dry.json b/tests/onix_workflow/fixtures/e2e_outputs/book_list_dry.json deleted file mode 100644 index f58a635d..00000000 --- a/tests/onix_workflow/fixtures/e2e_outputs/book_list_dry.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1891adc6f68df647a8ecb6869efb61511402dc0114413795832a2e7f68752be8 -size 1951 diff --git a/tests/onix_workflow/fixtures/e2e_outputs/book_product.json b/tests/onix_workflow/fixtures/e2e_outputs/book_product.json index 42e6123d..aea51d4b 100644 --- a/tests/onix_workflow/fixtures/e2e_outputs/book_product.json +++ b/tests/onix_workflow/fixtures/e2e_outputs/book_product.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55d015ed064994cc17efa0348f92637e38bdc6eff2c2a53733363adec10927b6 -size 42024 +oid sha256:85c83efcf63955cb6c4b9884bb950bc3c4d4fa94421825dfe39fa528ac25a799 +size 43233 diff --git a/tests/onix_workflow/fixtures/e2e_outputs/book_product_dry.json b/tests/onix_workflow/fixtures/e2e_outputs/book_product_dry.json deleted file mode 100644 index ab9dc83b..00000000 --- a/tests/onix_workflow/fixtures/e2e_outputs/book_product_dry.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:69224857af88d8c8e75ca281b35b92ace971b9d0198a73d59d266974a6212558 -size 4213 diff --git a/tests/onix_workflow/test_onix_workflow.py b/tests/onix_workflow/test_onix_workflow.py index a748437f..b1076402 100644 --- a/tests/onix_workflow/test_onix_workflow.py +++ b/tests/onix_workflow/test_onix_workflow.py @@ -1192,9 +1192,9 @@ def vcr_ignore_condition(request): ("book_metrics_author", 3), ("book_metrics_city", 39), ("book_metrics_events", 3), - ("book_metrics_subject_bic", 0), - ("book_metrics_subject_bisac", 0), - ("book_metrics_subject_thema", 0), + ("book_metrics_subject_bic", 2), + ("book_metrics_subject_bisac", 1), + ("book_metrics_subject_thema", 2), ] # Create the export tables @@ -1305,8 +1305,8 @@ def 
vcr_ignore_condition(request): "dag_id": dag_id, "entity_id": "onix_workflow", "dag_run_id": release.run_id, - "created": datetime_normalise(now), - "modified": datetime_normalise(now), + "created": datetime_normalise(now), + "modified": datetime_normalise(now), "data_interval_start": "2021-05-17T00:00:00+00:00", "data_interval_end": "2021-05-24T00:00:00+00:00", "snapshot_date": "2021-05-24T00:00:00+00:00",