From 1a8bb75cb5dcd5828352b16b7d5ce9104cd35141 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 16 Nov 2023 16:54:22 +0100 Subject: [PATCH 1/6] create scripts for openml import --- .../mardi_importer/openml/OpenMLDataset.py | 342 ++++++++++++++++++ .../openml/OpenMLPublication.py | 55 +++ .../mardi_importer/openml/OpenMLSource.py | 171 +++++++++ .../mardi_importer/openml/__init__.py | 1 + mardi_importer/mardi_importer/openml/misc.py | 21 ++ .../mardi_importer/openml/new_entities.json | 93 +++++ .../openml/wikidata_entities.txt | 4 + .../mardi_importer/scripts/import.py | 13 +- mardi_importer/setup.py | 1 + 9 files changed, 700 insertions(+), 1 deletion(-) create mode 100644 mardi_importer/mardi_importer/openml/OpenMLDataset.py create mode 100644 mardi_importer/mardi_importer/openml/OpenMLPublication.py create mode 100644 mardi_importer/mardi_importer/openml/OpenMLSource.py create mode 100644 mardi_importer/mardi_importer/openml/__init__.py create mode 100644 mardi_importer/mardi_importer/openml/misc.py create mode 100644 mardi_importer/mardi_importer/openml/new_entities.json create mode 100644 mardi_importer/mardi_importer/openml/wikidata_entities.txt diff --git a/mardi_importer/mardi_importer/openml/OpenMLDataset.py b/mardi_importer/mardi_importer/openml/OpenMLDataset.py new file mode 100644 index 0000000..c7125b0 --- /dev/null +++ b/mardi_importer/mardi_importer/openml/OpenMLDataset.py @@ -0,0 +1,342 @@ +import re +import sys + +semantic_tags = [ +"Agriculture", +"Astronomy", +"Chemistry", +"Computational Universe", +"Computer Systems", +"Culture", +"Demographics", +"Earth Science", +"Economics", +"Education", +"Geography", +"Government", +"Health", +"History", +"Human Activities", +"Images", +"Language", +"Life Science", +"Machine Learning", +"Manufacturing", +"Mathematics", +"Medicine", +"Meteorology", +"Physical Sciences", +"Politics", +"Social Media", +"Sociology", +"Statistics", +"Text & Literature", +"Transportation"] + +class OpenMLDataset: + def __init__( + self, + integrator, + name, + description, + dataset_id, + version, + creators, + contributors, + collection_date, + upload_date, + license, + url, + default_target_attribute, + row_id_attribute, + tags, + original_data_url, + paper_url, + md5_checksum, + features, + num_binary_features, + num_classes, + num_features, + num_instances, + num_instances_missing_vals, + num_missing_vals, + num_numeric_features, + num_symbolic_features, + format + ): + self.api = integrator + self.name = name #done + self.dataset_id = dataset_id #done + self.version = version #done + self.creators = creators + self.contributors = contributors + self.collection_date = collection_date + self.upload_date = upload_date + self.license = license + self.url = url + self.default_target_attribute = default_target_attribute + self.row_id_attribute = row_id_attribute + self.tags = tags + self.original_data_url = original_data_url + self.paper_url = paper_url + self.md5_checksum = md5_checksum + self.features = features + self.num_binary_features = num_binary_features + self.num_classes = num_classes + self.num_features = num_features + self.num_instances = num_instances + self.num_instances_missing_vals = num_instances_missing_vals + self.num_missing_vals = num_missing_vals + self.num_numeric_features = num_numeric_features + self.num_symbolic_features = num_symbolic_features + self.format = format + self.QID = None + self.item = self.init_item() + + def init_item(self): + item = self.api.item.new() + item.labels.set(language="en", value=self.name) + 
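        # the description embeds the OpenML id: Wikibase requires the
        # label/description pair of an item to be unique per language, so this
        # keeps identically named datasets from colliding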
item.descriptions.set(language="en", value=f"OpenML dataset with id {self.dataset_id}") + return(item) + + def create(self): + self.item.add_claim("wdt:P31", "wd:Q1172284") + self.insert_claims() + dataset_id = self.item.write().id + return(dataset_id) + + def insert_claims(self): + self.item.add_claim("wdt:P11238", self.dataset_id) + if self.version is not None and self.version != "None": + prop_nr = self.api.get_local_id_by_label("dataset version", "property") + self.item.add_claim(prop_nr, str(self.version)) + if self.creators and self.creators != "None": + creator_claims = [] + prop_nr = self.api.get_local_id_by_label("author name string", "property") + if not isinstance(object, list): + self.creators = [self.creators] + for c in self.creators: + claim = self.api.get_claim(prop_nr, c) + creator_claims.append(claim) + self.item.add_claims(creator_claims) + if self.contributors and self.contributors != "None": + contributor_claims = [] + prop_nr = self.api.get_local_id_by_label("author name string", "property") + if not isinstance(object, list): + self.contributors = [self.contributors] + for c in self.contributors: + claim = self.api.get_claim(prop_nr, c) + contributor_claims.append(claim) + self.item.add_claims(contributor_claims) + if self.collection_date and self.collection_date != "None": + prop_nr = self.api.get_local_id_by_label("dataset version", "property") + self.item.add_claim(prop_nr, str(self.collection_date)) + if self.upload_date and self.upload_date != "None": + prop_nr = self.api.get_local_id_by_label("upload date", "property") + self.item.add_claim(prop_nr, self.upload_date) + if self.license and self.license != "None": + claims = self.process_licenses() + self.item.add_claims(claims) + url_claims = [] + if self.url and self.url != "None": + claim = self.api.get_claim("wdt:P953", self.url) + url_claims.append(claim) + if self.original_data_url and self.original_data_url != "None": + claim = self.api.get_claim("wdt:P953", self.original_data_url) + url_claims.append(claim) + if url_claims: + self.item.add_claims(url_claims) + if self.default_target_attribute and self.default_target_attribute != "None": + prop_nr = self.api.get_local_id_by_label("default target attribute", "property") + self.item.add_claim(prop_nr, self.default_target_attribute) + if self.row_id_attribute and self.row_id_attribute != "None": + prop_nr = self.api.get_local_id_by_label("row id attribute", "property") + self.item.add_claim(prop_nr, self.row_id_attribute) + if self.tags and self.tags != "None": + valid_tags = [] + for t in self.tags: + if t in semantic_tags: + valid_tags.append(t) + if valid_tags: + prop_nr = self.api.get_local_id_by_label("OpenML semantic tag", "property") + tag_claims = [] + for vt in valid_tags: + claim = self.api.get_claim(prop_nr, vt) + tag_claims.append(claim) + self.item.add_claims(tag_claims) + if self.paper_url and self.paper_url != "None": + identifier, identifier_type = self.get_identifier() + publication = OpenMLPublication(integrator=self.api, identifier=identifier, + identifier_type=identifier_type) + paper_qid = publication.exists() + if not paper_qid: + paper_qid = publication.create() + self.item.add_claim("wdt:P2860", paper_qid) + if self.md5_checksum and self.md5_checksum != "None": + qualifier = [self.api.get_claim("wdt:P459", "wd:Q185235")] + self.item.add_claims(self.api.get_claim("wdt:P4092", self.md5_checksum, qualifiers=qualifier)) + if self.features and self.features != "None": + for _, v in self.features.items(): + full_feature = str(v).split(" - 
")[1][:-1] + match = re.match(r'^(.*?)\s*\(([^()]+)\)$', full_feature) + if match: + feature = match.group(1).strip() + feature_type = match.group(2).strip() + if feature_type not in ["numeric", "nominal", "string", "date"]: + sys.exit("Incorrect feature type {feature_type}") + data_type_prop_nr = self.api.get_local_id_by_label("data type", "property") + qualifier = [self.api.get_claim(data_type_prop_nr, feature_type)] + feature_prop_nr = self.api.get_local_id_by_label("has feature", "property") + self.item.add_claims(self.api.get_claim(feature_prop_nr, feature, qualifiers=qualifier)) + if self.num_binary_features is not None and self.num_binary_features != "None": + prop_nr = self.api.get_local_id_by_label("number of binary features", "property") + self.item.add_claim(prop_nr, int(self.num_binary_features)) + if self.num_classes is not None and self.num_classes != "None": + prop_nr = self.api.get_local_id_by_label("number of classes", "property") + self.item.add_claim(prop_nr, int(self.num_classes)) + if self.num_features is not None and self.num_features != "None": + prop_nr = self.api.get_local_id_by_label("number of features", "property") + self.item.add_claim(prop_nr, int(self.num_features)) + if self.num_instances is not None and self.num_instances != "None": + prop_nr = self.api.get_local_id_by_label("number of instances", "property") + self.item.add_claim(prop_nr, int(self.num_instances)) + if self.num_instances_missing_vals is not None and self.num_instances_missing_vals != "None": + prop_nr = self.api.get_local_id_by_label("number of instances with missing values", "property") + self.item.add_claim(prop_nr, int(self.num_instances_missing_vals)) + if self.num_missing_vals is not None and self.num_missing_vals != "None": + prop_nr = self.api.get_local_id_by_label("number of missing values", "property") + self.item.add_claim(prop_nr, int(self.num_missing_vals)) + if self.num_numeric_features is not None and self.num_numeric_features != "None": + prop_nr = self.api.get_local_id_by_label("number of numeric features", "property") + self.item.add_claim(prop_nr, int(self.num_numeric_features)) + if self.num_symbolic_features is not None and self.num_symbolic_features != "None": + prop_nr = self.api.get_local_id_by_label("number of symbolic features", "property") + self.item.add_claim(prop_nr, int(self.num_symbolic_features)) + if self.format and self.format != "None": + if self.format.lower() == "arff": + self.item.add_claim("wdt:P2701", "wd:Q4489412") + elif self.format.lower() == "sparse_arff": + qid = self.api.get_local_id_by_label("Sparse ARFF", "item") + self.item.add_claim("wdt:P2701", qid) + else: + sys.exit(f"Invalid file format {self.format}") + + def exists(self): + """Checks if a WB item corresponding to the dataset already exists. + Searches for a WB item with the package label in the SQL Wikibase + tables and returns **True** if a matching result is found. + It uses for that the :meth:`mardi_importer.wikibase.WBItem.instance_exists()` + method. + Returns: + String: Entity ID + """ + if self.QID: + return self.QID + # instance of scholarly article + self.QID = self.item.is_instance_of_with_property( + "wd:Q1172284", "wdt:P11238", self.dataset_id + ) + return self.QID + + def update(self): + """ + Update existing item. 
+ """ + self.item = self.api.item.get(entity_id=self.QID) + + self.insert_claims() + self.item.write() + + if self.QID: + print(f"Dataset with ID {self.QID} has been updated.") + return self.QID + else: + print(f"Dataset could not be updated.") + return None + + + def process_licenses(self): + """Processes the license string and adds the corresponding statements. + + The concrete License is identified and linked to the corresponding + item that has previously been imported from Wikidata. Further license + information, when provided between round or square brackets, is added + as a qualifier. + + If a file license is mentioned, the linked to the file license + in CRAN is added as a qualifier. + + Args: + item (WBItem): + Item representing the R package to which the statement must be added. + """ + claims = [] + license_qualifier = "" + if re.findall("\(.*?\)", license_str): + qualifier_groups = re.search("\((.*?)\)", license_str) + license_qualifier = qualifier_groups.group(1) + license_aux = re.sub("\(.*?\)", "", license_str) + if re.findall("\[.*?\]", license_aux): + qualifier_groups = re.search("\[(.*?)\]", license_str) + license_qualifier = qualifier_groups.group(1) + license_str = re.sub("\[.*?\]", "", license_aux) + else: + license_str = license_aux + elif re.findall("\[.*?\]", license_str): + qualifier_groups = re.search("\[(.*?)\]", license_str) + license_qualifier = qualifier_groups.group(1) + license_str = re.sub("\[.*?\]", "", license_str) + license_str = license_str.strip() + license_QID = self.get_license_QID(license_str) + if license_QID: + if license_qualifier: + qualifier = [self.api.get_claim("wdt:P9767", license_qualifier)] + claims.append(self.api.get_claim("wdt:P275", license_QID, qualifiers=qualifier)) + else: + claims.append(self.api.get_claim("wdt:P275", license_QID)) + return claims + + def get_identifier(self): + if self.paper_url is None or self.paper_url == "None": + return(None, None) + elif "http" not in self.paper_url: + return(None,None) + elif "dl.acm.org" in self.paper_url: + return("/".join(self.paper_url.split("/")[-2:]), "doi") + elif "doi=" in self.paper_url: + doi = self.paper_url.split("doi=")[-1] + if "&" in doi: + doi = doi.split("&")[0] + return(doi, "doi") + elif "link.springer" in self.paper_url: + doi = "/".join(self.paper_url.split("/")[-2:]) + if "%" in doi: + return(None, None) + else: + return(doi, "doi") + elif "wiley" in self.paper_url: + doi = "/".join(self.paper_url.split("/")[-2:]) + if "?" 
in doi: + doi = doi.split("?")[0] + return(doi, "doi") + elif "biomedcentral" in self.paper_url: + doi = "/".join(self.paper_url.split("/")[-2:]) + return(doi, "doi") + elif "tandfonline" in self.paper_url: + doi = "/".join(self.paper_url.split("/")[-2:]) + return(doi, "doi") + elif "arxiv" in self.paper_url: + arxiv_id = self.paper_url.split("/")[-1] + return(arxiv_id, "arxiv") + elif "royalsociety" in self.paper_url: + doi = "/".join(self.paper_url.split("/")[-2:]) + return(doi, "doi") + elif "sagepub" in self.paper_url: + doi = "/".join(self.paper_url.split("/")[-2:]) + return(doi, "doi") + elif "science.org" in self.paper_url: + doi = "/".join(self.paper_url.split("/")[-2:]) + return(doi, "doi") + else: + return(None, None) + \ No newline at end of file diff --git a/mardi_importer/mardi_importer/openml/OpenMLPublication.py b/mardi_importer/mardi_importer/openml/OpenMLPublication.py new file mode 100644 index 0000000..88e49ee --- /dev/null +++ b/mardi_importer/mardi_importer/openml/OpenMLPublication.py @@ -0,0 +1,55 @@ +import sys + +class OpenMLPublication: + """ Class to manage OpenML publications in the local Wikibase instance. + If there is already an item with this doi or arxiv id, it gets fetched. + Attributes: + integrator: + MardiIntegrator instance + identifier: + arxiv id or doi + identifier_type: + 'arxiv' or 'doi' + """ + + def __init__( + self, + integrator, + identifier, + identifier_type, + ): + self.api = integrator + self.identifier = identifier + self.identifier_type = identifier_type + self.item = self.api.item.new() + + def exists(self): + """Checks if there is an item with that identifier in the local wikibase instance. + Returns: + String: Entity ID + """ + if self.identifier_type == "doi": + QID_list = self.api.search_entity_by_value( + "wdt:P356", self.identifier + ) + elif self.identifier_type == "arxiv": + QID_list = self.api.search_entity_by_value( + "wdt:P818", self.identifier + ) + else: + sys.exit("Invalid identifier type") + if not QID_list: + self.QID = None + else: + self.QID = QID_list[0] + + def create(self): + self.item.add_claim("wdt:P31", "wd:Q13442814") + if self.identifier_type == "doi": + self.item.add_claim("wdt:P356", self.identifier) + elif self.identifier_type == "arxiv": + self.item.add_claim("wdt:P356", self.identifier) + self.item.descriptions.set(language="en", value=f"scientific article about an OpenML dataset") + publication_id = self.item.write().id + return publication_id + diff --git a/mardi_importer/mardi_importer/openml/OpenMLSource.py b/mardi_importer/mardi_importer/openml/OpenMLSource.py new file mode 100644 index 0000000..3d8abd9 --- /dev/null +++ b/mardi_importer/mardi_importer/openml/OpenMLSource.py @@ -0,0 +1,171 @@ +from mardi_importer.importer import ADataSource +import openml +from mardi_importer.integrator import MardiIntegrator +from .OpenMLDataset import OpenMLDataset +import os +import json + +class OpenMLSource(ADataSource): + def __init__(self): + self.integrator = MardiIntegrator() + self.filepath = os.path.realpath(os.path.dirname(__file__)) + def setup(self): + """Create all necessary properties and entities for zbMath""" + # Import entities from Wikidata + filename = self.filepath + "/wikidata_entities.txt" + self.integrator.import_entities(filename=filename) + self.create_local_entities() + # self.de_number_prop = self.integrator.get_local_id_by_label( + # "zbMATH DE Number", "property" + # ) + # self.keyword_prop = self.integrator.get_local_id_by_label( + # "zbMATH keyword string", "property" + # ) + + def 
create_local_entities(self): + filename = self.filepath + "/new_entities.json" + f = open(filename) + entities = json.load(f) + + for prop_element in entities["properties"]: + prop = self.integrator.property.new() + prop.labels.set(language="en", value=prop_element["label"]) + prop.descriptions.set(language="en", value=prop_element["description"]) + prop.datatype = prop_element["datatype"] + if not prop.exists(): + prop.write() + + for item_element in entities["items"]: + item = self.integrator.item.new() + item.labels.set(language="en", value=item_element["label"]) + item.descriptions.set(language="en", value=item_element["description"]) + for key, value in item_element["claims"].items(): + item.add_claim(key, value=value) + if not item.exists(): + item.write() + + def pull(self): + dataset_dict = {"name": [], "dataset_id": [], "version": [], "creators": [], + "contributors": [], "collection_date": [], "upload_date": [], + "license": [], "url":[], "default_target_attribute":[], "row_id_attribute":[], + "tags":[], "original_data_url":[], "paper_url":[], + "md5_checksum": [], "features": [], "num_binary_features":[], + "num_classes":[], "num_features":[], "num_instances":[], "num_instances_missing_vals":[], + "num_missing_vals":[], "num_numeric_features":[], "num_symbolic_features":[], + "format":[]} + dataset_df = openml.datasets.list_datasets(output_format="dataframe") + did_list = dataset_df["did"].unique() + for did in did_list: + ds = openml.datasets.get_dataset(int(did), download_data=False) + dataset_dict["name"].append(ds.name) + dataset_dict["dataset_id"].append(did) + dataset_dict["version"].append(ds.version) + dataset_dict["creators"].append(ds.creator) + dataset_dict["contributors"].append(ds.contributor) + dataset_dict["collection_date"].append(ds.collection_date) + dataset_dict["upload_date"].append(ds.upload_date) + dataset_dict["license"].append(ds.licence) + dataset_dict["url"].append(ds.url) + dataset_dict["default_target_attribute"].append(ds.default_target_attribute) + dataset_dict["row_id_attribute"].append(ds.row_id_attribute) + dataset_dict["tags"].append(ds.tag) + dataset_dict["original_data_url"].append(ds.original_data_url) + dataset_dict["paper_url"].append(ds.paper_url) + dataset_dict["md5_checksum"].append(ds.md5_checksum) + dataset_dict["features"].append(ds.features) + try: + qualities = ds.qualities + except: + dataset_dict["num_binary_features"].append(None) + dataset_dict["num_classes"].append(None) + dataset_dict["num_features"].append(None) + dataset_dict["num_instances"].append(None) + dataset_dict["num_instances_missing_vals"].append(None) + dataset_dict["num_missing_vals"].append(None) + dataset_dict["num_numeric_features"].append(None) + dataset_dict["num_symbolic_features"].append(None) + continue + if 'NumberOfBinaryFeatures' in ds.qualities: + dataset_dict["num_binary_features"].append(ds.qualities['NumberOfBinaryFeatures']) + else: + dataset_dict["num_binary_features"].append(None) + if 'NumberOfClasses' in ds.qualities: + dataset_dict["num_classes"].append(ds.qualities['NumberOfClasses']) + else: + dataset_dict["num_classes"].append(None) + if 'NumberOfFeatures' in ds.qualities: + dataset_dict["num_features"].append(ds.qualities['NumberOfFeatures']) + else: + dataset_dict["num_features"].append(None) + if 'NumberOfInstances' in ds.qualities: + dataset_dict["num_instances"].append(ds.qualities['NumberOfInstances']) + else: + dataset_dict["num_instances"].append(None) + if 'NumberOfInstancesWithMissingValues' in ds.qualities: + 
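                # as with the other qualities above: any individual key may be
                # absent for a dataset, hence the per-key None fallback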
dataset_dict["num_instances_missing_vals"].append(ds.qualities['NumberOfInstancesWithMissingValues']) + else: + dataset_dict["num_instances_missing_vals"].append(None) + if 'NumberOfMissingValues' in ds.qualities: + dataset_dict["num_missing_vals"].append(ds.qualities['NumberOfMissingValues']) + else: + dataset_dict["num_missing_vals"].append(None) + if 'NumberOfNumericFeatures' in ds.qualities: + dataset_dict["num_numeric_features"].append(ds.qualities['NumberOfNumericFeatures']) + else: + dataset_dict["num_numeric_features"].append(None) + if 'NumberOfSymbolicFeatures' in ds.qualities: + dataset_dict["num_symbolic_features"].append(ds.qualities['NumberOfSymbolicFeatures']) + else: + dataset_dict["num_symbolic_features"].append(None) + return(dataset_dict) + + def push(self): + dataset_dict = {'name': ['kr-vs-kp'], + 'description': ['Author: Alen Shapiro\nSource: [UCI](https://archive.ics.uci.edu/ml/datasets/Chess+(King-Rook+vs.+King-Pawn))\nPlease cite: [UCI citation policy](https://archive.ics.uci.edu/ml/citation_policy.html)\n\n1. Title: Chess End-Game -- King+Rook versus King+Pawn on a7\n(usually abbreviated KRKPA7). The pawn on a7 means it is one square\naway from queening. It is the King+Rook\'s side (white) to move.\n\n2. Sources:\n(a) Database originally generated and described by Alen Shapiro.\n(b) Donor/Coder: Rob Holte (holte@uottawa.bitnet). The database\nwas supplied to Holte by Peter Clark of the Turing Institute\nin Glasgow (pete@turing.ac.uk).\n(c) Date: 1 August 1989\n\n3. Past Usage:\n- Alen D. Shapiro (1983,1987), "Structured Induction in Expert Systems",\nAddison-Wesley. This book is based on Shapiro\'s Ph.D. thesis (1983)\nat the University of Edinburgh entitled "The Role of Structured\nInduction in Expert Systems".\n- Stephen Muggleton (1987), "Structuring Knowledge by Asking Questions",\npp.218-229 in "Progress in Machine Learning", edited by I. Bratko\nand Nada Lavrac, Sigma Press, Wilmslow, England SK9 5BB.\n- Robert C. Holte, Liane Acker, and Bruce W. Porter (1989),\n"Concept Learning and the Problem of Small Disjuncts",\nProceedings of IJCAI. Also available as technical report AI89-106,\nComputer Sciences Department, University of Texas at Austin,\nAustin, Texas 78712.\n\n4. Relevant Information:\nThe dataset format is described below. Note: the format of this\ndatabase was modified on 2/26/90 to conform with the format of all\nthe other databases in the UCI repository of machine learning databases.\n\n5. Number of Instances: 3196 total\n\n6. Number of Attributes: 36\n\n7. Attribute Summaries:\nClasses (2): -- White-can-win ("won") and White-cannot-win ("nowin").\nI believe that White is deemed to be unable to win if the Black pawn\ncan safely advance.\nAttributes: see Shapiro\'s book.\n\n8. Missing Attributes: -- none\n\n9. Class Distribution:\nIn 1669 of the positions (52%), White can win.\nIn 1527 of the positions (48%), White cannot win.\n\nThe format for instances in this database is a sequence of 37 attribute values.\nEach instance is a board-descriptions for this chess endgame. The first\n36 attributes describe the board. The last (37th) attribute is the\nclassification: "win" or "nowin". There are 0 missing values.\nA typical board-description is\n\nf,f,f,f,f,f,f,f,f,f,f,f,l,f,n,f,f,t,f,f,f,f,f,f,f,t,f,f,f,f,f,f,f,t,t,n,won\n\nThe names of the features do not appear in the board-descriptions.\nInstead, each feature correponds to a particular position in the\nfeature-value list. For example, the head of this list is the value\nfor the feature "bkblk". 
The following is the list of features, in\nthe order in which their values appear in the feature-value list:\n\n[bkblk,bknwy,bkon8,bkona,bkspr,bkxbq,bkxcr,bkxwp,blxwp,bxqsq,cntxt,dsopp,dwipd,\nhdchk,katri,mulch,qxmsq,r2ar8,reskd,reskr,rimmx,rkxwp,rxmsq,simpl,skach,skewr,\nskrxp,spcop,stlmt,thrsk,wkcti,wkna8,wknck,wkovl,wkpos,wtoeg]\n\nIn the file, there is one instance (board position) per line.\n\n\nNum Instances: 3196\nNum Attributes: 37\nNum Continuous: 0 (Int 0 / Real 0)\nNum Discrete: 37\nMissing values: 0 / 0.0%'],
            'dataset_id': [3],
            'version': [1],
            'creators': ['Alen Shapiro'],
            'contributors': ['Rob Holte'],
            'collection_date': ['1989-08-01'],
            'upload_date': ['2014-04-06T23:19:28'],
            'license': ['Public'],
            'url': ['https://api.openml.org/data/v1/download/3/kr-vs-kp.arff'],
            'default_target_attribute': ['class'],
            'row_id_attribute': [None],
            'tags': [['Machine Learning',
            'Mathematics',
            'mythbusting_1',
            'OpenML-CC18',
            'OpenML100',
            'study_1',
            'study_123',
            'study_14',
            'study_144',
            'uci']],
            'original_data_url': ['https://archive.ics.uci.edu/ml/datasets/Chess+(King-Rook+vs.+King-Pawn)'],
            'paper_url': ['https://dl.acm.org/doi/abs/10.5555/32231'],
            'md5_checksum': ['ad6eb32b7492524d4382a40e23cdbb8e'],
            'features': [{0: ["0 - bkblk (nominal)"],
            1: ["1 - bknwy (nominal)"],
            2: ["2 - bkon8 (nominal)"],
            3: ["3 - bkona (nominal)"],
            36: ["36 - class (nominal)"]}],
            'num_binary_features': [35.0],
            'num_classes': [2.0],
            'num_features': [37.0],
            'num_instances': [3196.0],
            'num_instances_missing_vals': [0.0],
            'num_missing_vals': [0.0],
            'num_numeric_features': [0.0],
            'num_symbolic_features': [37.0],
            'format': ['ARFF']}
        for elements in zip(*dataset_dict.values()):
            lookup_dict = dict(zip(dataset_dict.keys(), elements))
            dataset = OpenMLDataset(
                integrator = self.integrator,
                **lookup_dict
            )
            if not dataset.exists():
                dataset.create()
            else:
                dataset.update()
\ No newline at end of file
diff --git a/mardi_importer/mardi_importer/openml/__init__.py b/mardi_importer/mardi_importer/openml/__init__.py
new file mode 100644
index 0000000..543ce7a
--- /dev/null
+++ b/mardi_importer/mardi_importer/openml/__init__.py
@@ -0,0 +1 @@
+from .OpenMLSource import OpenMLSource
\ No newline at end of file
diff --git a/mardi_importer/mardi_importer/openml/misc.py b/mardi_importer/mardi_importer/openml/misc.py
new file mode 100644
index 0000000..4430d9d
--- /dev/null
+++ b/mardi_importer/mardi_importer/openml/misc.py
@@ -0,0 +1,21 @@
+from datetime import datetime

def convert_time_to_iso(time_string):
    try:
        # Attempt to parse the input time string
        dt = datetime.strptime(time_string, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        try:
            dt = datetime.strptime(time_string, "%Y-%m-%dT%H:%M:%S")
        except ValueError:
            try:
                dt = datetime.strptime(time_string, "%Y-%m-%d")
            except ValueError:
                # Handle other formats as needed
                print(f"Unrecognized time format: {time_string}")
                return None

    # Convert to ISO format
    iso_format = dt.isoformat() + "Z"
    return iso_format
\ No newline at end of file
diff --git a/mardi_importer/mardi_importer/openml/new_entities.json b/mardi_importer/mardi_importer/openml/new_entities.json
new file mode 100644
index 0000000..d450ba7
--- /dev/null
+++ b/mardi_importer/mardi_importer/openml/new_entities.json
@@ -0,0 +1,93 @@
{
    "properties": [
        {
            "label": "dataset version",
            "description": "Version of a dataset",
            "datatype": "string"
        },
        {
            "label": "date string",
            "description": "date as a string",
            "datatype": 
"string" + }, + { + "label": "upload date", + "description": "upload date of file", + "datatype": "time" + }, + { + "label": "default target attribute", + "description": "the default target attribute", + "datatype": "string" + }, + { + "label": "row id attribute", + "description": "the row id attribute", + "datatype": "string" + }, + { + "label": "OpenML semantic tag", + "description": "the OpenML semantic tag", + "datatype": "string" + }, + { + "label": "has feature", + "description": "it has feature", + "datatype": "string" + }, + { + "label": "data type", + "description": "the data type", + "datatype": "string" + }, + { + "label": "number of binary features", + "description": "the number of binary features", + "datatype": "quantity" + }, + { + "label": "number of classes", + "description": "the number of classes", + "datatype": "quantity" + }, + { + "label": "number of features", + "description": "the number of features", + "datatype": "quantity" + }, + { + "label": "number of instances", + "description": "the number of instances", + "datatype": "quantity" + }, + { + "label": "number of instances with missing values", + "description": "the number of instances with missing values", + "datatype": "quantity" + }, + { + "label": "number of missing values", + "description": "the number of missing values", + "datatype": "quantity" + }, + { + "label": "number of numeric features", + "description": "the number of numeric features", + "datatype": "quantity" + }, + { + "label": "number of symbolic features", + "description": "the number of symbolic features", + "datatype": "quantity" + } + ], + "items": [ + { + "label": "Sparse ARFF", + "description": "File format", + "claims": { + "wdt:P31": "wd:Q235557" + } + } + ] +} \ No newline at end of file diff --git a/mardi_importer/mardi_importer/openml/wikidata_entities.txt b/mardi_importer/mardi_importer/openml/wikidata_entities.txt new file mode 100644 index 0000000..2ac3c04 --- /dev/null +++ b/mardi_importer/mardi_importer/openml/wikidata_entities.txt @@ -0,0 +1,4 @@ +Q1172284 +P2701 +P4092 +Q185235 \ No newline at end of file diff --git a/mardi_importer/mardi_importer/scripts/import.py b/mardi_importer/mardi_importer/scripts/import.py index 0c89ddf..9c13e41 100644 --- a/mardi_importer/mardi_importer/scripts/import.py +++ b/mardi_importer/mardi_importer/scripts/import.py @@ -4,6 +4,7 @@ from argparse import ArgumentParser from mardi_importer.importer import Importer from mardi_importer.zbmath import ZBMathSource, ZBMathConfigParser +from mardi_importer.openml import OpenMLSource from mardi_importer.cran import CRANSource from mardi_importer.polydb import PolyDBSource @@ -11,7 +12,7 @@ def get_parser(): """Get arguments parser""" parser = ArgumentParser() parser.add_argument( - "--mode", type=str, required=True, choices=["ZBMath", "CRAN", "polydb"] + "--mode", type=str, required=True, choices=["ZBMath", "CRAN", "polydb", "OpenML"] ) parser.add_argument("--conf_path", required=False) parser.add_argument("--wikidata_id_file_path", required=False) @@ -40,6 +41,16 @@ def main(**args): importer = Importer(data_source) importer.import_all(pull=False, push=True) + elif args["mode"] == "OpenML": + # if args["conf_path"] is None: + # sys.exit("--conf_path is required for --mode OpenML") + #conf_parser = OpenMLConfigParser(args["conf_path"]) + #conf = conf_parser.parse_config() + + data_source = OpenMLSource() + importer = Importer(data_source) + importer.import_all(pull=False, push=True) + elif args["mode"] == "CRAN": data_source = CRANSource() importer = 
Importer(data_source)
diff --git a/mardi_importer/setup.py b/mardi_importer/setup.py
index f5f089f..8a670a5 100644
--- a/mardi_importer/setup.py
+++ b/mardi_importer/setup.py
@@ -25,6 +25,7 @@
         "lxml",
         "mysql-connector-python",
         "nameparser",
+        "openml",
         "pandas",
         "sickle",
         "sparqlwrapper",

From eb1b44c0ac09e64b4ea57f945b071ab0cfadda7b Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Tue, 19 Dec 2023 14:31:45 +0100
Subject: [PATCH 2/6] prototype finished

---
 .../mardi_importer/openml/OpenMLDataset.py    | 95 ++++++++++++++++++-
 .../openml/OpenMLPublication.py               |  1 +
 .../mardi_importer/openml/OpenMLSource.py     |  2 +-
 .../mardi_importer/openml/new_entities.json   | 12 ++-
 .../openml/wikidata_entities.txt              | 11 ++-
 5 files changed, 114 insertions(+), 7 deletions(-)

diff --git a/mardi_importer/mardi_importer/openml/OpenMLDataset.py b/mardi_importer/mardi_importer/openml/OpenMLDataset.py
index c7125b0..75f0619 100644
--- a/mardi_importer/mardi_importer/openml/OpenMLDataset.py
+++ b/mardi_importer/mardi_importer/openml/OpenMLDataset.py
@@ -1,5 +1,6 @@
 import re
 import sys
+from .OpenMLPublication import OpenMLPublication
 
 semantic_tags = [
 "Agriculture",
@@ -106,7 +107,8 @@ def create(self):
         return(dataset_id)
 
     def insert_claims(self):
-        self.item.add_claim("wdt:P11238", self.dataset_id)
+        prop_nr = self.api.get_local_id_by_label("OpenML dataset ID", "property")
+        self.item.add_claim(prop_nr, str(self.dataset_id))
         if self.version is not None and self.version != "None":
             prop_nr = self.api.get_local_id_by_label("dataset version", "property")
             self.item.add_claim(prop_nr, str(self.version))
@@ -129,11 +131,12 @@ def insert_claims(self):
             contributor_claims.append(claim)
             self.item.add_claims(contributor_claims)
         if self.collection_date and self.collection_date != "None":
-            prop_nr = self.api.get_local_id_by_label("dataset version", "property")
+            prop_nr = self.api.get_local_id_by_label("collection date", "property")
             self.item.add_claim(prop_nr, str(self.collection_date))
         if self.upload_date and self.upload_date != "None":
             prop_nr = self.api.get_local_id_by_label("upload date", "property")
-            self.item.add_claim(prop_nr, self.upload_date)
+            date = self.upload_date.split("T")[0] + "T00:00:00Z"
+            self.item.add_claim(prop_nr, date)
         if self.license and self.license != "None":
             claims = self.process_licenses()
             self.item.add_claims(claims)
@@ -270,6 +273,7 @@ def process_licenses(self):
             list: License claims (wdt:P275), optionally qualified with the
             additional license information (wdt:P9767).
         """
+        license_str = self.license
         claims = []
         license_qualifier = ""
@@ -339,4 +343,87 @@ def get_identifier(self):
             return(doi, "doi")
         else:
             return(None, None)
-    
\ No newline at end of file
+
+    def get_license_QID(self, license_str: str) -> str:
+        """Returns the Wikidata item ID corresponding to a software license.
+
+        The same license is often referenced in OpenML under different names
+        (e.g. *Artistic-2.0* and *Artistic License 2.0* both refer to the
+        same license, corresponding to item *Q14624826*). This function
+        returns the Wikidata item ID of the single underlying license for
+        each of these names.
+
+        Args:
+            license_str (str): License string as provided by OpenML.
+
+        Returns:
+            (str): Wikidata item ID. 
+ """ + def get_license(label: str) -> str: + license_item = self.api.item.new() + license_item.labels.set(language="en", value=label) + return license_item.is_instance_of("wd:Q207621") + + license_mapping = { + "ACM": get_license("ACM Software License Agreement"), + "AGPL":"wd:Q28130012", + "AGPL-3": "wd:Q27017232", + "Apache License": "wd:Q616526", + "Apache License 2.0": "wd:Q13785927", + "Apache License version 1.1": "wd:Q17817999", + "Apache License version 2.0": "wd:Q13785927", + "Artistic-2.0": "wd:Q14624826", + "Artistic License 2.0": "wd:Q14624826", + "BSD 2-clause License": "wd:Q18517294", + "BSD 3-clause License": "wd:Q18491847", + "BSD_2_clause": "wd:Q18517294", + "BSD_3_clause": "wd:Q18491847", + "BSL": "wd:Q2353141", + "BSL-1.0": "wd:Q2353141", + "CC0": "wd:Q6938433", + "CC BY 4.0": "wd:Q20007257", + "CC BY-SA 4.0": "wd:Q18199165", + "CC BY-NC 4.0": "wd:Q34179348", + "CC BY-NC-SA 4.0": "wd:Q42553662", + "CeCILL": "wd:Q1052189", + "CeCILL-2": "wd:Q19216649", + "Common Public License Version 1.0": "wd:Q2477807", + "CPL-1.0": "wd:Q2477807", + "Creative Commons Attribution 4.0 International License": "wd:Q20007257", + "EPL": "wd:Q1281977", + "EUPL": "wd:Q1376919", + "EUPL-1.1": "wd:Q1376919", + "file LICENCE": get_license("File License"), + "file LICENSE": get_license("File License"), + "FreeBSD": "wd:Q34236", + "GNU Affero General Public License": "wd:Q1131681", + "GNU General Public License": "wd:Q7603", + "GNU General Public License version 2": "wd:Q10513450", + "GNU General Public License version 3": "wd:Q10513445", + "GPL": "wd:Q7603", + "GPL-2": "wd:Q10513450", + "GPL-3": "wd:Q10513445", + "LGPL": "wd:Q192897", + "LGPL-2": "wd:Q23035974", + "LGPL-2.1": "wd:Q18534390", + "LGPL-3": "wd:Q18534393", + "Lucent Public License": "wd:Q6696468", + "MIT": "wd:Q334661", + "MIT License": "wd:Q334661", + "Mozilla Public License 1.1": "wd:Q26737735", + "Mozilla Public License 2.0": "wd:Q25428413", + "Mozilla Public License Version 2.0": "wd:Q25428413", + "MPL": "wd:Q308915", + "MPL version 1.0": "wd:Q26737738", + "MPL version 1.1": "wd:Q26737735", + "MPL version 2.0": "wd:Q25428413", + "MPL-1.1": "wd:Q26737735", + "MPL-2.0": "wd:Q25428413", + "Unlimited": get_license("Unlimited License"), + } + + license_info = license_mapping.get(license_str) + if callable(license_info): + return license_info() + else: + return license_info \ No newline at end of file diff --git a/mardi_importer/mardi_importer/openml/OpenMLPublication.py b/mardi_importer/mardi_importer/openml/OpenMLPublication.py index 88e49ee..61899f4 100644 --- a/mardi_importer/mardi_importer/openml/OpenMLPublication.py +++ b/mardi_importer/mardi_importer/openml/OpenMLPublication.py @@ -42,6 +42,7 @@ def exists(self): self.QID = None else: self.QID = QID_list[0] + return(self.QID) def create(self): self.item.add_claim("wdt:P31", "wd:Q13442814") diff --git a/mardi_importer/mardi_importer/openml/OpenMLSource.py b/mardi_importer/mardi_importer/openml/OpenMLSource.py index 3d8abd9..f34e334 100644 --- a/mardi_importer/mardi_importer/openml/OpenMLSource.py +++ b/mardi_importer/mardi_importer/openml/OpenMLSource.py @@ -128,7 +128,7 @@ def push(self): 'contributors': ['Rob Holte'], 'collection_date': ['1989-08-01'], 'upload_date': ['2014-04-06T23:19:28'], - 'license': ['Public'], + 'license': ['CC0'], 'url': ['https://api.openml.org/data/v1/download/3/kr-vs-kp.arff'], 'default_target_attribute': ['class'], 'row_id_attribute': [None], diff --git a/mardi_importer/mardi_importer/openml/new_entities.json 
b/mardi_importer/mardi_importer/openml/new_entities.json index d450ba7..f80094a 100644 --- a/mardi_importer/mardi_importer/openml/new_entities.json +++ b/mardi_importer/mardi_importer/openml/new_entities.json @@ -6,7 +6,7 @@ "datatype": "string" }, { - "label": "date string", + "label": "collection date", "description": "date as a string", "datatype": "string" }, @@ -50,6 +50,11 @@ "description": "the number of classes", "datatype": "quantity" }, + { + "label": "author name string", + "description": "name string of the author", + "datatype": "string" + }, { "label": "number of features", "description": "the number of features", @@ -79,6 +84,11 @@ "label": "number of symbolic features", "description": "the number of symbolic features", "datatype": "quantity" + }, + { + "label": "OpenML dataset ID", + "description": "identifier for a dataset in the OpenML database of open datasets for machine learning", + "datatype": "string" } ], "items": [ diff --git a/mardi_importer/mardi_importer/openml/wikidata_entities.txt b/mardi_importer/mardi_importer/openml/wikidata_entities.txt index 2ac3c04..728dc2c 100644 --- a/mardi_importer/mardi_importer/openml/wikidata_entities.txt +++ b/mardi_importer/mardi_importer/openml/wikidata_entities.txt @@ -1,4 +1,13 @@ Q1172284 P2701 P4092 -Q185235 \ No newline at end of file +Q185235 +Q4489412 +P356 +P818 +Q28130012 +Q27017232 +Q6938433 +Q13442814 +Q185235 +P459 \ No newline at end of file From a3dc04e80cae6495def6420db360d3f3669e3c93 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 25 Jan 2024 12:25:19 +0100 Subject: [PATCH 3/6] add property for citation text --- mardi_importer/mardi_importer/openml/OpenMLDataset.py | 4 ++++ mardi_importer/mardi_importer/openml/new_entities.json | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/mardi_importer/mardi_importer/openml/OpenMLDataset.py b/mardi_importer/mardi_importer/openml/OpenMLDataset.py index 75f0619..cbe8575 100644 --- a/mardi_importer/mardi_importer/openml/OpenMLDataset.py +++ b/mardi_importer/mardi_importer/openml/OpenMLDataset.py @@ -168,6 +168,7 @@ def insert_claims(self): tag_claims.append(claim) self.item.add_claims(tag_claims) if self.paper_url and self.paper_url != "None": + #create item for this identifier, identifier_type = self.get_identifier() publication = OpenMLPublication(integrator=self.api, identifier=identifier, identifier_type=identifier_type) @@ -175,6 +176,9 @@ def insert_claims(self): if not paper_qid: paper_qid = publication.create() self.item.add_claim("wdt:P2860", paper_qid) + #create item for string + prop_nr = self.api.get_local_id_by_label("citation text", "property") + self.item.add_claim(prop_nr, self.paper_url) if self.md5_checksum and self.md5_checksum != "None": qualifier = [self.api.get_claim("wdt:P459", "wd:Q185235")] self.item.add_claims(self.api.get_claim("wdt:P4092", self.md5_checksum, qualifiers=qualifier)) diff --git a/mardi_importer/mardi_importer/openml/new_entities.json b/mardi_importer/mardi_importer/openml/new_entities.json index f80094a..9dd0b96 100644 --- a/mardi_importer/mardi_importer/openml/new_entities.json +++ b/mardi_importer/mardi_importer/openml/new_entities.json @@ -89,6 +89,11 @@ "label": "OpenML dataset ID", "description": "identifier for a dataset in the OpenML database of open datasets for machine learning", "datatype": "string" + }, + { + "label": "citation text", + "description": "free-form text about citation", + "datatype": "string" } ], "items": [ From 8ae36d452057c9b5c92a987370eedc9056e1b5f3 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 
Feb 2024 09:21:50 +0100 Subject: [PATCH 4/6] correct code --- .../mardi_importer/openml/OpenMLDataset.py | 49 +++++---- .../openml/OpenMLPublication.py | 8 +- .../mardi_importer/openml/OpenMLSource.py | 99 +++++++++++-------- .../openml/wikidata_entities.txt | 3 +- 4 files changed, 93 insertions(+), 66 deletions(-) diff --git a/mardi_importer/mardi_importer/openml/OpenMLDataset.py b/mardi_importer/mardi_importer/openml/OpenMLDataset.py index cbe8575..106cdd4 100644 --- a/mardi_importer/mardi_importer/openml/OpenMLDataset.py +++ b/mardi_importer/mardi_importer/openml/OpenMLDataset.py @@ -1,6 +1,7 @@ import re import sys from .OpenMLPublication import OpenMLPublication +import validators semantic_tags = [ "Agriculture", @@ -39,7 +40,6 @@ def __init__( self, integrator, name, - description, dataset_id, version, creators, @@ -67,7 +67,7 @@ def __init__( ): self.api = integrator self.name = name #done - self.dataset_id = dataset_id #done + self.dataset_id = str(dataset_id) #done self.version = version #done self.creators = creators self.contributors = contributors @@ -104,18 +104,19 @@ def create(self): self.item.add_claim("wdt:P31", "wd:Q1172284") self.insert_claims() dataset_id = self.item.write().id + print(f"Dataset with id {dataset_id} created") return(dataset_id) def insert_claims(self): prop_nr = self.api.get_local_id_by_label("OpenML dataset ID", "property") - self.item.add_claim(prop_nr, str(self.dataset_id)) + self.item.add_claim(prop_nr, self.dataset_id) if self.version is not None and self.version != "None": prop_nr = self.api.get_local_id_by_label("dataset version", "property") self.item.add_claim(prop_nr, str(self.version)) if self.creators and self.creators != "None": creator_claims = [] prop_nr = self.api.get_local_id_by_label("author name string", "property") - if not isinstance(object, list): + if not isinstance(self.creators, list): self.creators = [self.creators] for c in self.creators: claim = self.api.get_claim(prop_nr, c) @@ -124,7 +125,7 @@ def insert_claims(self): if self.contributors and self.contributors != "None": contributor_claims = [] prop_nr = self.api.get_local_id_by_label("author name string", "property") - if not isinstance(object, list): + if not isinstance(self.contributors, list): self.contributors = [self.contributors] for c in self.contributors: claim = self.api.get_claim(prop_nr, c) @@ -141,10 +142,10 @@ def insert_claims(self): claims = self.process_licenses() self.item.add_claims(claims) url_claims = [] - if self.url and self.url != "None": + if self.url and self.url != "None" and validators.url(self.url): claim = self.api.get_claim("wdt:P953", self.url) url_claims.append(claim) - if self.original_data_url and self.original_data_url != "None": + if self.original_data_url and self.original_data_url != "None" and validators.url(self.original_data_url): claim = self.api.get_claim("wdt:P953", self.original_data_url) url_claims.append(claim) if url_claims: @@ -170,12 +171,13 @@ def insert_claims(self): if self.paper_url and self.paper_url != "None": #create item for this identifier, identifier_type = self.get_identifier() - publication = OpenMLPublication(integrator=self.api, identifier=identifier, + if identifier: + publication = OpenMLPublication(integrator=self.api, identifier=identifier, identifier_type=identifier_type) - paper_qid = publication.exists() - if not paper_qid: - paper_qid = publication.create() - self.item.add_claim("wdt:P2860", paper_qid) + paper_qid = publication.exists() + if not paper_qid: + paper_qid = publication.create() + 
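                    # P2860 ("cites work") links the dataset item to the
                    # publication item found or created above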
self.item.add_claim("wdt:P2860", paper_qid) #create item for string prop_nr = self.api.get_local_id_by_label("citation text", "property") self.item.add_claim(prop_nr, self.paper_url) @@ -227,6 +229,9 @@ def insert_claims(self): self.item.add_claim("wdt:P2701", qid) else: sys.exit(f"Invalid file format {self.format}") + profile_prop = self.api.get_local_id_by_label("MaRDI profile type", "property") + profile_target = self.api.get_local_id_by_label("MaRDI dataset profile", "property") + self.item.add_claim(profile_prop, profile_target) def exists(self): """Checks if a WB item corresponding to the dataset already exists. @@ -243,6 +248,8 @@ def exists(self): self.QID = self.item.is_instance_of_with_property( "wd:Q1172284", "wdt:P11238", self.dataset_id ) + if self.QID: + print(f"Dataset exists with QID {self.QID}") return self.QID def update(self): @@ -310,41 +317,41 @@ def get_identifier(self): elif "http" not in self.paper_url: return(None,None) elif "dl.acm.org" in self.paper_url: - return("/".join(self.paper_url.split("/")[-2:]), "doi") + return("/".join(self.paper_url.split("/")[-2:]).lower(), "doi") elif "doi=" in self.paper_url: doi = self.paper_url.split("doi=")[-1] if "&" in doi: doi = doi.split("&")[0] - return(doi, "doi") + return(doi.lower(), "doi") elif "link.springer" in self.paper_url: doi = "/".join(self.paper_url.split("/")[-2:]) if "%" in doi: return(None, None) else: - return(doi, "doi") + return(doi.lower(), "doi") elif "wiley" in self.paper_url: doi = "/".join(self.paper_url.split("/")[-2:]) if "?" in doi: doi = doi.split("?")[0] - return(doi, "doi") + return(doi.lower(), "doi") elif "biomedcentral" in self.paper_url: doi = "/".join(self.paper_url.split("/")[-2:]) - return(doi, "doi") + return(doi.lower(), "doi") elif "tandfonline" in self.paper_url: doi = "/".join(self.paper_url.split("/")[-2:]) - return(doi, "doi") + return(doi.lower(), "doi") elif "arxiv" in self.paper_url: arxiv_id = self.paper_url.split("/")[-1] return(arxiv_id, "arxiv") elif "royalsociety" in self.paper_url: doi = "/".join(self.paper_url.split("/")[-2:]) - return(doi, "doi") + return(doi.lower(), "doi") elif "sagepub" in self.paper_url: doi = "/".join(self.paper_url.split("/")[-2:]) - return(doi, "doi") + return(doi.lower(), "doi") elif "science.org" in self.paper_url: doi = "/".join(self.paper_url.split("/")[-2:]) - return(doi, "doi") + return(doi.lower(), "doi") else: return(None, None) diff --git a/mardi_importer/mardi_importer/openml/OpenMLPublication.py b/mardi_importer/mardi_importer/openml/OpenMLPublication.py index 61899f4..fa46ed1 100644 --- a/mardi_importer/mardi_importer/openml/OpenMLPublication.py +++ b/mardi_importer/mardi_importer/openml/OpenMLPublication.py @@ -37,11 +37,13 @@ def exists(self): "wdt:P818", self.identifier ) else: - sys.exit("Invalid identifier type") + sys.exit(f"Invalid identifier type {self.identifier_type}") if not QID_list: self.QID = None else: self.QID = QID_list[0] + if self.QID: + print(f"Publication with qid {self.QID} exists") return(self.QID) def create(self): @@ -50,7 +52,11 @@ def create(self): self.item.add_claim("wdt:P356", self.identifier) elif self.identifier_type == "arxiv": self.item.add_claim("wdt:P356", self.identifier) + profile_prop = self.api.get_local_id_by_label("MaRDI profile type", "property") + profile_target = self.api.get_local_id_by_label("MaRDI publication profile", "property") + self.item.add_claim(profile_prop, profile_target) self.item.descriptions.set(language="en", value=f"scientific article about an OpenML dataset") 
publication_id = self.item.write().id + print(f"Publication with the qid {publication_id} has been created.") return publication_id diff --git a/mardi_importer/mardi_importer/openml/OpenMLSource.py b/mardi_importer/mardi_importer/openml/OpenMLSource.py index f34e334..c93b114 100644 --- a/mardi_importer/mardi_importer/openml/OpenMLSource.py +++ b/mardi_importer/mardi_importer/openml/OpenMLSource.py @@ -4,6 +4,7 @@ from .OpenMLDataset import OpenMLDataset import os import json +from itertools import zip_longest class OpenMLSource(ADataSource): def __init__(self): @@ -56,7 +57,13 @@ def pull(self): dataset_df = openml.datasets.list_datasets(output_format="dataframe") did_list = dataset_df["did"].unique() for did in did_list: - ds = openml.datasets.get_dataset(int(did), download_data=False) + try: + ds = openml.datasets.get_dataset(int(did), download_data=False) + except Exception as e: + try: + ds = openml.datasets.get_dataset(int(did), download_data=False, download_qualities=False) + except Exception as e: + ds = openml.datasets.get_dataset(int(did), download_data=False, download_qualities=False, download_features_meta_data=False) dataset_dict["name"].append(ds.name) dataset_dict["dataset_id"].append(did) dataset_dict["version"].append(ds.version) @@ -72,7 +79,10 @@ def pull(self): dataset_dict["original_data_url"].append(ds.original_data_url) dataset_dict["paper_url"].append(ds.paper_url) dataset_dict["md5_checksum"].append(ds.md5_checksum) - dataset_dict["features"].append(ds.features) + try: + dataset_dict["features"].append(ds.features) + except: + dataset_dict["features"].append(None) try: qualities = ds.qualities except: @@ -120,47 +130,50 @@ def pull(self): return(dataset_dict) def push(self): - dataset_dict = {'name': ['kr-vs-kp'], - 'description': ['Author: Alen Shapiro\nSource: [UCI](https://archive.ics.uci.edu/ml/datasets/Chess+(King-Rook+vs.+King-Pawn))\nPlease cite: [UCI citation policy](https://archive.ics.uci.edu/ml/citation_policy.html)\n\n1. Title: Chess End-Game -- King+Rook versus King+Pawn on a7\n(usually abbreviated KRKPA7). The pawn on a7 means it is one square\naway from queening. It is the King+Rook\'s side (white) to move.\n\n2. Sources:\n(a) Database originally generated and described by Alen Shapiro.\n(b) Donor/Coder: Rob Holte (holte@uottawa.bitnet). The database\nwas supplied to Holte by Peter Clark of the Turing Institute\nin Glasgow (pete@turing.ac.uk).\n(c) Date: 1 August 1989\n\n3. Past Usage:\n- Alen D. Shapiro (1983,1987), "Structured Induction in Expert Systems",\nAddison-Wesley. This book is based on Shapiro\'s Ph.D. thesis (1983)\nat the University of Edinburgh entitled "The Role of Structured\nInduction in Expert Systems".\n- Stephen Muggleton (1987), "Structuring Knowledge by Asking Questions",\npp.218-229 in "Progress in Machine Learning", edited by I. Bratko\nand Nada Lavrac, Sigma Press, Wilmslow, England SK9 5BB.\n- Robert C. Holte, Liane Acker, and Bruce W. Porter (1989),\n"Concept Learning and the Problem of Small Disjuncts",\nProceedings of IJCAI. Also available as technical report AI89-106,\nComputer Sciences Department, University of Texas at Austin,\nAustin, Texas 78712.\n\n4. Relevant Information:\nThe dataset format is described below. Note: the format of this\ndatabase was modified on 2/26/90 to conform with the format of all\nthe other databases in the UCI repository of machine learning databases.\n\n5. Number of Instances: 3196 total\n\n6. Number of Attributes: 36\n\n7. 
Attribute Summaries:\nClasses (2): -- White-can-win ("won") and White-cannot-win ("nowin").\nI believe that White is deemed to be unable to win if the Black pawn\ncan safely advance.\nAttributes: see Shapiro\'s book.\n\n8. Missing Attributes: -- none\n\n9. Class Distribution:\nIn 1669 of the positions (52%), White can win.\nIn 1527 of the positions (48%), White cannot win.\n\nThe format for instances in this database is a sequence of 37 attribute values.\nEach instance is a board-descriptions for this chess endgame. The first\n36 attributes describe the board. The last (37th) attribute is the\nclassification: "win" or "nowin". There are 0 missing values.\nA typical board-description is\n\nf,f,f,f,f,f,f,f,f,f,f,f,l,f,n,f,f,t,f,f,f,f,f,f,f,t,f,f,f,f,f,f,f,t,t,n,won\n\nThe names of the features do not appear in the board-descriptions.\nInstead, each feature correponds to a particular position in the\nfeature-value list. For example, the head of this list is the value\nfor the feature "bkblk". The following is the list of features, in\nthe order in which their values appear in the feature-value list:\n\n[bkblk,bknwy,bkon8,bkona,bkspr,bkxbq,bkxcr,bkxwp,blxwp,bxqsq,cntxt,dsopp,dwipd,\nhdchk,katri,mulch,qxmsq,r2ar8,reskd,reskr,rimmx,rkxwp,rxmsq,simpl,skach,skewr,\nskrxp,spcop,stlmt,thrsk,wkcti,wkna8,wknck,wkovl,wkpos,wtoeg]\n\nIn the file, there is one instance (board position) per line.\n\n\nNum Instances: 3196\nNum Attributes: 37\nNum Continuous: 0 (Int 0 / Real 0)\nNum Discrete: 37\nMissing values: 0 / 0.0%'], - 'dataset_id': [3], - 'version': [1], - 'creators': ['Alen Shapiro'], - 'contributors': ['Rob Holte'], - 'collection_date': ['1989-08-01'], - 'upload_date': ['2014-04-06T23:19:28'], - 'license': ['CC0'], - 'url': ['https://api.openml.org/data/v1/download/3/kr-vs-kp.arff'], - 'default_target_attribute': ['class'], - 'row_id_attribute': [None], - 'tags': [['Machine Learning', - 'Mathematics', - 'mythbusting_1', - 'OpenML-CC18', - 'OpenML100', - 'study_1', - 'study_123', - 'study_14', - 'study_144', - 'uci']], - 'original_data_url': ['https://archive.ics.uci.edu/ml/datasets/Chess+(King-Rook+vs.+King-Pawn)'], - 'paper_url': ['https://dl.acm.org/doi/abs/10.5555/32231'], - 'md5_checksum': ['ad6eb32b7492524d4382a40e23cdbb8e'], - 'features': [{0: ["0 - bkblk (nominal)"], - 1: ["1 - bknwy (nominal)"], - 2: ["2 - bkon8 (nominal)"], - 3: ["3 - bkona (nominal)"], - 36: ["36 - class (nominal)"]}], - 'num_binary_features': [35.0], - 'num_classes': [2.0], - 'num_features': [37.0], - 'num_instances': [3196.0], - 'num_instances_missing_vals': [0.0], - 'num_missing_vals': [0.0], - 'num_numeric_features': [0.0], - 'num_symbolic_features': [37.0], - 'format': ['ARFF']} - for elements in zip(*dataset_dict.values()): - lookup_dict = dict(zip(dataset_dict.keys(), elements)) + import pickle + with open('/data/dataset_dict.p', 'rb') as handle: + dataset_dict = pickle.load(handle) + # dataset_dict = {'name': ['kr-vs-kp'], + # 'description': ['Author: Alen Shapiro\nSource: [UCI](https://archive.ics.uci.edu/ml/datasets/Chess+(King-Rook+vs.+King-Pawn))\nPlease cite: [UCI citation policy](https://archive.ics.uci.edu/ml/citation_policy.html)\n\n1. Title: Chess End-Game -- King+Rook versus King+Pawn on a7\n(usually abbreviated KRKPA7). The pawn on a7 means it is one square\naway from queening. It is the King+Rook\'s side (white) to move.\n\n2. Sources:\n(a) Database originally generated and described by Alen Shapiro.\n(b) Donor/Coder: Rob Holte (holte@uottawa.bitnet). 
The database\nwas supplied to Holte by Peter Clark of the Turing Institute\nin Glasgow (pete@turing.ac.uk).\n(c) Date: 1 August 1989\n\n3. Past Usage:\n- Alen D. Shapiro (1983,1987), "Structured Induction in Expert Systems",\nAddison-Wesley. This book is based on Shapiro\'s Ph.D. thesis (1983)\nat the University of Edinburgh entitled "The Role of Structured\nInduction in Expert Systems".\n- Stephen Muggleton (1987), "Structuring Knowledge by Asking Questions",\npp.218-229 in "Progress in Machine Learning", edited by I. Bratko\nand Nada Lavrac, Sigma Press, Wilmslow, England SK9 5BB.\n- Robert C. Holte, Liane Acker, and Bruce W. Porter (1989),\n"Concept Learning and the Problem of Small Disjuncts",\nProceedings of IJCAI. Also available as technical report AI89-106,\nComputer Sciences Department, University of Texas at Austin,\nAustin, Texas 78712.\n\n4. Relevant Information:\nThe dataset format is described below. Note: the format of this\ndatabase was modified on 2/26/90 to conform with the format of all\nthe other databases in the UCI repository of machine learning databases.\n\n5. Number of Instances: 3196 total\n\n6. Number of Attributes: 36\n\n7. Attribute Summaries:\nClasses (2): -- White-can-win ("won") and White-cannot-win ("nowin").\nI believe that White is deemed to be unable to win if the Black pawn\ncan safely advance.\nAttributes: see Shapiro\'s book.\n\n8. Missing Attributes: -- none\n\n9. Class Distribution:\nIn 1669 of the positions (52%), White can win.\nIn 1527 of the positions (48%), White cannot win.\n\nThe format for instances in this database is a sequence of 37 attribute values.\nEach instance is a board-descriptions for this chess endgame. The first\n36 attributes describe the board. The last (37th) attribute is the\nclassification: "win" or "nowin". There are 0 missing values.\nA typical board-description is\n\nf,f,f,f,f,f,f,f,f,f,f,f,l,f,n,f,f,t,f,f,f,f,f,f,f,t,f,f,f,f,f,f,f,t,t,n,won\n\nThe names of the features do not appear in the board-descriptions.\nInstead, each feature correponds to a particular position in the\nfeature-value list. For example, the head of this list is the value\nfor the feature "bkblk". 
The following is the list of features, in\nthe order in which their values appear in the feature-value list:\n\n[bkblk,bknwy,bkon8,bkona,bkspr,bkxbq,bkxcr,bkxwp,blxwp,bxqsq,cntxt,dsopp,dwipd,\nhdchk,katri,mulch,qxmsq,r2ar8,reskd,reskr,rimmx,rkxwp,rxmsq,simpl,skach,skewr,\nskrxp,spcop,stlmt,thrsk,wkcti,wkna8,wknck,wkovl,wkpos,wtoeg]\n\nIn the file, there is one instance (board position) per line.\n\n\nNum Instances: 3196\nNum Attributes: 37\nNum Continuous: 0 (Int 0 / Real 0)\nNum Discrete: 37\nMissing values: 0 / 0.0%'], + # 'dataset_id': [3], + # 'version': [1], + # 'creators': ['Alen Shapiro'], + # 'contributors': ['Rob Holte'], + # 'collection_date': ['1989-08-01'], + # 'upload_date': ['2014-04-06T23:19:28'], + # 'license': ['CC0'], + # 'url': ['https://api.openml.org/data/v1/download/3/kr-vs-kp.arff'], + # 'default_target_attribute': ['class'], + # 'row_id_attribute': [None], + # 'tags': [['Machine Learning', + # 'Mathematics', + # 'mythbusting_1', + # 'OpenML-CC18', + # 'OpenML100', + # 'study_1', + # 'study_123', + # 'study_14', + # 'study_144', + # 'uci']], + # 'original_data_url': ['https://archive.ics.uci.edu/ml/datasets/Chess+(King-Rook+vs.+King-Pawn)'], + # 'paper_url': ['https://dl.acm.org/doi/abs/10.5555/32231'], + # 'md5_checksum': ['ad6eb32b7492524d4382a40e23cdbb8e'], + # 'features': [{0: ["0 - bkblk (nominal)"], + # 1: ["1 - bknwy (nominal)"], + # 2: ["2 - bkon8 (nominal)"], + # 3: ["3 - bkona (nominal)"], + # 36: ["36 - class (nominal)"]}], + # 'num_binary_features': [35.0], + # 'num_classes': [2.0], + # 'num_features': [37.0], + # 'num_instances': [3196.0], + # 'num_instances_missing_vals': [0.0], + # 'num_missing_vals': [0.0], + # 'num_numeric_features': [0.0], + # 'num_symbolic_features': [37.0], + # 'format': ['ARFF']} + for items in zip_longest(*[dataset_dict[key] for key in dataset_dict], fillvalue=None): + lookup_dict = dict(zip(dataset_dict.keys(), items)) dataset = OpenMLDataset( integrator = self.integrator, **lookup_dict diff --git a/mardi_importer/mardi_importer/openml/wikidata_entities.txt b/mardi_importer/mardi_importer/openml/wikidata_entities.txt index 728dc2c..48f047d 100644 --- a/mardi_importer/mardi_importer/openml/wikidata_entities.txt +++ b/mardi_importer/mardi_importer/openml/wikidata_entities.txt @@ -10,4 +10,5 @@ Q27017232 Q6938433 Q13442814 Q185235 -P459 \ No newline at end of file +P459 +P11238 \ No newline at end of file From b15d60bfe0762ffe3abb9c4a2d74e69f5622e012 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 21 Mar 2024 10:36:05 +0100 Subject: [PATCH 5/6] add python package import --- mardi_importer/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mardi_importer/setup.py b/mardi_importer/setup.py index 8a670a5..48504a9 100644 --- a/mardi_importer/setup.py +++ b/mardi_importer/setup.py @@ -30,6 +30,7 @@ "sickle", "sparqlwrapper", "sqlalchemy", + "validators", "wikibaseintegrator" ], # entry_points={"console_scripts": ["import = scripts.main:main"]}, From dc6e4419d964d4b848e30cfeaf4050b41428286d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 21 Mar 2024 13:55:38 +0100 Subject: [PATCH 6/6] fix unnecessary creation --- mardi_importer/mardi_importer/openml/OpenMLDataset.py | 9 +++------ mardi_importer/mardi_importer/openml/new_entities.json | 10 ---------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/mardi_importer/mardi_importer/openml/OpenMLDataset.py b/mardi_importer/mardi_importer/openml/OpenMLDataset.py index 106cdd4..3c8e0da 100644 --- a/mardi_importer/mardi_importer/openml/OpenMLDataset.py +++ 
b/mardi_importer/mardi_importer/openml/OpenMLDataset.py @@ -108,27 +108,24 @@ def create(self): return(dataset_id) def insert_claims(self): - prop_nr = self.api.get_local_id_by_label("OpenML dataset ID", "property") - self.item.add_claim(prop_nr, self.dataset_id) + self.item.add_claim("wdt:P11238", self.dataset_id) if self.version is not None and self.version != "None": prop_nr = self.api.get_local_id_by_label("dataset version", "property") self.item.add_claim(prop_nr, str(self.version)) if self.creators and self.creators != "None": creator_claims = [] - prop_nr = self.api.get_local_id_by_label("author name string", "property") if not isinstance(self.creators, list): self.creators = [self.creators] for c in self.creators: - claim = self.api.get_claim(prop_nr, c) + claim = self.api.get_claim("wdt:P2093", c) creator_claims.append(claim) self.item.add_claims(creator_claims) if self.contributors and self.contributors != "None": contributor_claims = [] - prop_nr = self.api.get_local_id_by_label("author name string", "property") if not isinstance(self.contributors, list): self.contributors = [self.contributors] for c in self.contributors: - claim = self.api.get_claim(prop_nr, c) + claim = self.api.get_claim("wdt:P2093", c) contributor_claims.append(claim) self.item.add_claims(contributor_claims) if self.collection_date and self.collection_date != "None": diff --git a/mardi_importer/mardi_importer/openml/new_entities.json b/mardi_importer/mardi_importer/openml/new_entities.json index 9dd0b96..eb2d0c0 100644 --- a/mardi_importer/mardi_importer/openml/new_entities.json +++ b/mardi_importer/mardi_importer/openml/new_entities.json @@ -50,11 +50,6 @@ "description": "the number of classes", "datatype": "quantity" }, - { - "label": "author name string", - "description": "name string of the author", - "datatype": "string" - }, { "label": "number of features", "description": "the number of features", @@ -85,11 +80,6 @@ "description": "the number of symbolic features", "datatype": "quantity" }, - { - "label": "OpenML dataset ID", - "description": "identifier for a dataset in the OpenML database of open datasets for machine learning", - "datatype": "string" - }, { "label": "citation text", "description": "free-form text about citation",
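
---

Note on the zip() -> zip_longest() change in PATCH 4: pull() never appends to
dataset_dict["format"], so the column lists can end up with unequal lengths,
and plain zip(*dataset_dict.values()) silently truncates to the shortest
column (here: zero rows). A minimal sketch of the row-wise iteration, assuming
the same column-dict layout that OpenMLSource.push() uses:

    from itertools import zip_longest

    # toy column dict; "format" is empty, as happens in pull()
    dataset_dict = {"name": ["kr-vs-kp"], "dataset_id": [3], "format": []}

    # zip() would yield no rows here; zip_longest pads the short columns with
    # None so every dataset still produces one row
    for row in zip_longest(*dataset_dict.values(), fillvalue=None):
        lookup_dict = dict(zip(dataset_dict.keys(), row))
        print(lookup_dict)
    # -> {'name': 'kr-vs-kp', 'dataset_id': 3, 'format': None}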