From 357735b3d711659733fbe6a2ff8a2b08e7d26566 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 9 Sep 2024 12:47:48 +0200 Subject: [PATCH 1/7] reformatting fix --- mardi_importer/mardi_importer/zbmath/ZBMathSource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mardi_importer/mardi_importer/zbmath/ZBMathSource.py b/mardi_importer/mardi_importer/zbmath/ZBMathSource.py index 63ffb06..d7bc791 100644 --- a/mardi_importer/mardi_importer/zbmath/ZBMathSource.py +++ b/mardi_importer/mardi_importer/zbmath/ZBMathSource.py @@ -283,7 +283,7 @@ def process_data(self): if record: for key, value in record.items(): if isinstance(value, str): - record[key] = value.replace("\t", " ").replace("\n", " ") + record[key] = value.replace("\t", "TAB").replace("\n", "NEWLINE").replace("\r", "CARRIAGE_RETURN") outfile.write( "\t".join(str(x) for x in record.values()) + "\n" ) From 96e4682fd65db6d0fa241418243dad02d1fb9fb4 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 12 Sep 2024 09:45:48 +0200 Subject: [PATCH 2/7] fix reference problem --- mardi_importer/mardi_importer/zbmath/ZBMathSource.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mardi_importer/mardi_importer/zbmath/ZBMathSource.py b/mardi_importer/mardi_importer/zbmath/ZBMathSource.py index d7bc791..8ae429d 100644 --- a/mardi_importer/mardi_importer/zbmath/ZBMathSource.py +++ b/mardi_importer/mardi_importer/zbmath/ZBMathSource.py @@ -276,9 +276,9 @@ def process_data(self): for d in literal_eval(row["editorial_contributions"]): if d["contribution_type"] == "review": review_text = d["text"] - row["review_text"] = review_text - row["review_sign"] = d["reviewer"]["name"] - row["reviewer_id"] = d["reviewer"]["author_code"] + record["review_text"] = review_text + record["review_sign"] = d["reviewer"]["name"] + record["reviewer_id"] = d["reviewer"]["author_code"] break if record: for key, value in record.items(): From 33e3c41de4d63f5c25cd8d4e9c4a6329b34265fd Mon Sep 17 00:00:00 2001 From: 
Ubuntu Date: Thu, 12 Sep 2024 10:25:10 +0200 Subject: [PATCH 3/7] correct empty field problem --- .../mardi_importer/zbmath/ZBMathSource.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/mardi_importer/mardi_importer/zbmath/ZBMathSource.py b/mardi_importer/mardi_importer/zbmath/ZBMathSource.py index 8ae429d..84842e9 100644 --- a/mardi_importer/mardi_importer/zbmath/ZBMathSource.py +++ b/mardi_importer/mardi_importer/zbmath/ZBMathSource.py @@ -252,6 +252,8 @@ def process_data(self): record["classifications"] = ";".join(msc) if literal_eval(row["language"])["languages"]: record["language"] = literal_eval(row["language"])["languages"][0] + else: + record["language"] = None links = [] doi = None for d in literal_eval(row["links"]): @@ -268,18 +270,26 @@ def process_data(self): record["publication_year"] = row["year"] if literal_eval(row["source"])["series"]: record["serial"] = literal_eval(row["source"])["series"][0]["title"] + else: + record["serial"] = None record["zbl_id"] = row["identifier"] ref_ids = [] for d in literal_eval(row["references"]): ref_ids.append(str(d["zbmath"]["document_id"])) record["references"] = ";".join(ref_ids) + review_text = None + review_sign = None + reviewer_id = None for d in literal_eval(row["editorial_contributions"]): if d["contribution_type"] == "review": review_text = d["text"] - record["review_text"] = review_text - record["review_sign"] = d["reviewer"]["name"] - record["reviewer_id"] = d["reviewer"]["author_code"] + review_sign = d["reviewer"]["name"] + reviewer_id = d["reviewer"]["author_code"] break + record["review_text"] = review_text + record["review_sign"] = review_sign + record["reviewer_id"] = reviewer_id + if record: for key, value in record.items(): if isinstance(value, str): @@ -420,8 +430,8 @@ def push(self): info_dict = dict(zip(headers, split_line)) # this part is for continuing at a certain position if the import failed # if not found: - # if info_dict["de_number"].strip() != 
" ": - # if info_dict["document_title"] != "Unimodular supergravity": + # if info_dict["de_number"].strip() != "49686": + # #if info_dict["document_title"] != "Unimodular supergravity": # continue # else: # found = True From 5d11efc657afc41aac837ec7e542b54160eca16a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 25 Sep 2024 11:16:36 +0200 Subject: [PATCH 4/7] minor --- mardi_importer/mardi_importer/scripts/import.py | 2 +- mardi_importer/mardi_importer/zbmath/new_entities.json | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/mardi_importer/mardi_importer/scripts/import.py b/mardi_importer/mardi_importer/scripts/import.py index b689328..a6755da 100644 --- a/mardi_importer/mardi_importer/scripts/import.py +++ b/mardi_importer/mardi_importer/scripts/import.py @@ -42,7 +42,7 @@ def main(**args): processed_dump_path=conf["processed_dump_path"], ) importer = Importer(data_source) - importer.import_all(pull=True, push=False) + importer.import_all(pull=False, push=True) elif args["mode"] == "OpenML": # if args["conf_path"] is None: diff --git a/mardi_importer/mardi_importer/zbmath/new_entities.json b/mardi_importer/mardi_importer/zbmath/new_entities.json index b7572bb..790fcc9 100644 --- a/mardi_importer/mardi_importer/zbmath/new_entities.json +++ b/mardi_importer/mardi_importer/zbmath/new_entities.json @@ -14,12 +14,21 @@ "label": "zbMATH Keywords", "description": "keyword string from zbMATH", "datatype": "string" + }, + { + "label": "MaRDI profile type", + "description": "defines the types of MaRDI profiles expected to work for this item", + "datatype": "wikibase-item" } ], "items": [ { "label": "MaRDI person profile", "description": "type of MaRDI profile" + }, + { + "label": "MaRDI publication profile", + "description": "type of MaRDI profile" } ] } \ No newline at end of file From 8e884eb9fb25979db511a431786eba28f0a692d5 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 1 Oct 2024 12:23:07 +0200 Subject: [PATCH 5/7] fix bug with long labels 
--- .../mardi_importer/integrator/MardiEntities.py | 12 ++++++++++-- mardi_importer/mardi_importer/zbmath/ZBMathSource.py | 10 +++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/mardi_importer/mardi_importer/integrator/MardiEntities.py b/mardi_importer/mardi_importer/integrator/MardiEntities.py index 57d12ae..ab32404 100644 --- a/mardi_importer/mardi_importer/integrator/MardiEntities.py +++ b/mardi_importer/mardi_importer/integrator/MardiEntities.py @@ -1,6 +1,6 @@ import re import sqlalchemy as db -from sqlalchemy import and_ +from sqlalchemy import and_, case from mardiclient import MardiItem, MardiProperty from wikibaseintegrator.wbi_exceptions import ModificationFailed @@ -28,6 +28,11 @@ def get_QID(self, alias=False): label = "" if 'en' in self.labels.values: label = self.labels.values['en'].value + label = bytes(label, "utf-8") + is_truncated = False + if len(label) > 250: + label = label[:250] + is_truncated = True def query_wikidata_table(field_type): # field_type = 1 : Label @@ -53,7 +58,10 @@ def query_wikidata_table(field_type): .join(wbt_term_in_lang, wbt_item_terms.columns.wbit_term_in_lang_id == wbt_term_in_lang.columns.wbtl_id) .join(wbt_text_in_lang, wbt_term_in_lang.columns.wbtl_text_in_lang_id == wbt_text_in_lang.columns.wbxl_id) .join(wbt_text, wbt_text.columns.wbx_id == wbt_text_in_lang.columns.wbxl_text_id) - .where(and_(wbt_text.columns.wbx_text == bytes(label, "utf-8"), + .where(and_( + case( + (is_truncated, wbt_text.columns.wbx_text.like(label + b"%")), + else_=wbt_text.columns.wbx_text == label), wbt_term_in_lang.columns.wbtl_type_id == field_type, wbt_text_in_lang.columns.wbxl_language == bytes("en", "utf-8")))) results = connection.execute(query).fetchall() diff --git a/mardi_importer/mardi_importer/zbmath/ZBMathSource.py b/mardi_importer/mardi_importer/zbmath/ZBMathSource.py index 84842e9..71e78a5 100644 --- a/mardi_importer/mardi_importer/zbmath/ZBMathSource.py +++ 
b/mardi_importer/mardi_importer/zbmath/ZBMathSource.py @@ -255,7 +255,7 @@ def process_data(self): else: record["language"] = None links = [] - doi = None + dois = None for d in literal_eval(row["links"]): if "type" not in d: continue @@ -432,10 +432,10 @@ def push(self): # if not found: # if info_dict["de_number"].strip() != "49686": # #if info_dict["document_title"] != "Unimodular supergravity": - # continue - # else: - # found = True - # continue + # continue + # else: + # found = True + # continue # if there is not title, don't add if self.conflict_string in info_dict["document_title"]: if ( From 9bd0fa9adf5265cb494ab39c425f0fd5231af294 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 1 Oct 2024 12:24:52 +0200 Subject: [PATCH 6/7] fix typo --- mardi_importer/mardi_importer/zbmath/ZBMathSource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mardi_importer/mardi_importer/zbmath/ZBMathSource.py b/mardi_importer/mardi_importer/zbmath/ZBMathSource.py index 71e78a5..352d7f7 100644 --- a/mardi_importer/mardi_importer/zbmath/ZBMathSource.py +++ b/mardi_importer/mardi_importer/zbmath/ZBMathSource.py @@ -255,7 +255,7 @@ def process_data(self): else: record["language"] = None links = [] - dois = None + doi = None for d in literal_eval(row["links"]): if "type" not in d: continue From 47a4df534d55922c018cb3614e4a31c2bf7ab6f3 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 1 Oct 2024 12:57:17 +0200 Subject: [PATCH 7/7] change formatting --- mardi_importer/mardi_importer/zbmath/ZBMathSource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mardi_importer/mardi_importer/zbmath/ZBMathSource.py b/mardi_importer/mardi_importer/zbmath/ZBMathSource.py index 352d7f7..0aec33b 100644 --- a/mardi_importer/mardi_importer/zbmath/ZBMathSource.py +++ b/mardi_importer/mardi_importer/zbmath/ZBMathSource.py @@ -293,7 +293,7 @@ def process_data(self): if record: for key, value in record.items(): if isinstance(value, str): - record[key] = 
value.replace("\t", "TAB").replace("\n", "NEWLINE").replace("\r", "CARRIAGE_RETURN") + record[key] = value.replace("\t", "\\T").replace("\n", "\\N").replace("\r", "\\R") outfile.write( "\t".join(str(x) for x in record.values()) + "\n" )