Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimization of some functions #462

Merged
merged 40 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
ea3bdd2
Some optimizations
hrshdhgd Nov 13, 2023
d47bf81
poetry update
hrshdhgd Nov 13, 2023
2b93a34
formatted
hrshdhgd Nov 13, 2023
0a32cf8
cleaned list comprehension
hrshdhgd Nov 13, 2023
8a8c0fe
pandas apply() over iterrows()
hrshdhgd Nov 14, 2023
62f7691
filter notnull() rows out before entering the for loop
hrshdhgd Nov 14, 2023
48225df
rollback prev commit
hrshdhgd Nov 14, 2023
e2e7288
check value first and then check if k is multivalued
hrshdhgd Nov 14, 2023
4c738f5
switched to dict comprehension from for loop
hrshdhgd Nov 14, 2023
4ab2660
dict comprehension instead of for loop
hrshdhgd Nov 14, 2023
1618b52
Use sets instead of lists
hrshdhgd Nov 14, 2023
77e02ab
corrected previous commit
hrshdhgd Nov 14, 2023
14b0475
Use sets instead of lists
hrshdhgd Nov 14, 2023
d74e72a
Improved SchemaView instantiation and persistence
hrshdhgd Nov 14, 2023
52b2e12
log warnings only if bad_attrs exists
hrshdhgd Nov 14, 2023
f2fb5b5
avoid unnecessary variable assignments
hrshdhgd Nov 14, 2023
c12dc61
move line into try except
hrshdhgd Nov 14, 2023
c24e0a8
updated lock file
hrshdhgd Nov 14, 2023
a9ed963
Merge branch 'master' into optimize
hrshdhgd Nov 14, 2023
5d2f327
use of @ cached_property
hrshdhgd Nov 14, 2023
964f032
shortened code
hrshdhgd Nov 14, 2023
d5390fb
formatted
hrshdhgd Nov 14, 2023
c8e4ea5
Added test for get_dict_from_mapping()
hrshdhgd Nov 15, 2023
a666ea1
Added extra comments
hrshdhgd Nov 15, 2023
6af4979
Merge branch 'master' into optimize
hrshdhgd Nov 16, 2023
788cc7e
extended prefix map change effect
hrshdhgd Nov 16, 2023
f3d16ae
Revert inclusion in try/except
cthoyt Nov 20, 2023
4ec556e
Revert style change
cthoyt Nov 20, 2023
8681b81
Cache reused constants
cthoyt Nov 20, 2023
92e8158
Add TODOs
cthoyt Nov 20, 2023
558ca15
Reduce diff
cthoyt Nov 20, 2023
2b2be7b
Fix list
cthoyt Nov 20, 2023
aa8a45c
changed assertion test
hrshdhgd Nov 20, 2023
2fe6430
Merge branch 'optimize' of https://github.com/mapping-commons/sssom-p…
hrshdhgd Nov 20, 2023
964aac8
added real orcids
hrshdhgd Nov 20, 2023
036f66e
orcid updated
hrshdhgd Nov 20, 2023
d2338f0
removed comments
hrshdhgd Nov 20, 2023
7cae890
moved constants to the SSSOMSchemaView class as attributes
hrshdhgd Nov 20, 2023
fbeeb88
docstring update and mypy check fix
hrshdhgd Nov 20, 2023
6f8a87f
Merge branch 'master' into optimize
matentzn Nov 20, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

37 changes: 15 additions & 22 deletions src/sssom/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import pathlib
import uuid
from enum import Enum
from functools import lru_cache
from typing import Any, Dict, List, Literal
from functools import cached_property, lru_cache
from typing import Any, Dict, List, Literal, Set

import pkg_resources
import yaml
Expand Down Expand Up @@ -213,48 +213,41 @@ class SSSOMSchemaView(object):
Implemented via PR: https://github.com/mapping-commons/sssom-py/pull/323
"""

_view = None
_dict = None

def __new__(cls):
"""Create a instance of the SSSOM schema view if non-existent."""
if not hasattr(cls, "instance"):
cls.instance = super(SSSOMSchemaView, cls).__new__(cls)
return cls.instance
return cls.instance

@property
@cached_property
def view(self) -> SchemaView:
"""Return SchemaView object."""
if self._view is None:
self._view = SchemaView(SCHEMA_YAML)
return self._view
return SchemaView(SCHEMA_YAML)

@property
@cached_property
def dict(self) -> dict:
"""Return SchemaView as a dictionary."""
if self._dict is None:
self._dict = schema_as_dict(self.view.schema)
return self._dict
return schema_as_dict(self.view.schema)

@property
@cached_property
def mapping_slots(self) -> List[str]:
"""Return list of mapping slots."""
return self.view.get_class("mapping").slots

@property
@cached_property
def mapping_set_slots(self) -> List[str]:
"""Return list of mapping set slots."""
return self.view.get_class("mapping set").slots

@property
def multivalued_slots(self) -> List[str]:
@cached_property
def multivalued_slots(self) -> Set[str]:
"""Return list of multivalued slots."""
return [c for c in self.view.all_slots() if self.view.get_slot(c).multivalued]
return {c for c in self.view.all_slots() if self.view.get_slot(c).multivalued}

@property
def entity_reference_slots(self) -> List[str]:
@cached_property
def entity_reference_slots(self) -> Set[str]:
"""Return list of entity reference slots."""
return [c for c in self.view.all_slots() if self.view.get_slot(c).range == ENTITY_REFERENCE]
return {c for c in self.view.all_slots() if self.view.get_slot(c).range == ENTITY_REFERENCE}
hrshdhgd marked this conversation as resolved.
Show resolved Hide resolved


@lru_cache(1)
Expand Down
47 changes: 30 additions & 17 deletions src/sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ def _get_prefix_map_and_metadata(


def _address_multivalued_slot(k: str, v: Any) -> Union[str, List[str]]:
if is_multivalued_slot(k) and v is not None and isinstance(v, str):
if isinstance(v, str) and is_multivalued_slot(k):
hrshdhgd marked this conversation as resolved.
Show resolved Hide resolved
# IF k is multivalued, then v = List[values]
return [s.strip() for s in v.split("|")]
else:
Expand All @@ -329,20 +329,28 @@ def _init_mapping_set(meta: Optional[MetadataType]) -> MappingSet:
return mapping_set


MAPPING_SLOTS = set(_get_sssom_schema_object().mapping_slots)


def _get_mapping_dict(row: pd.Series, bad_attrs: Counter) -> Dict[str, Any]:
mdict = {}
sssom_schema_object = _get_sssom_schema_object()
for k, v in row.items():
if not v or pd.isna(v):
continue
k = cast(str, k)
if k in sssom_schema_object.mapping_slots:
mdict[k] = _address_multivalued_slot(k, v)
else:
# There's the possibility that the key is in
# sssom_schema_object.mapping_set_slots, but
# this is skipped for now
bad_attrs[k] += 1
"""Generate a mapping dictionary from a given row of data.

It also updates the 'bad_attrs' counter for keys that are not present
in the sssom_schema_object's mapping_slots.
"""
# Populate the mapping dictionary with key-value pairs from the row,
# only if the value exists, is not NaN, and the key is in the schema's mapping slots.
# The value could be a string or a list and is handled accordingly via _address_multivalued_slot().
mdict = {
k: _address_multivalued_slot(k, v)
for k, v in row.items()
if v and pd.notna(v) and k in MAPPING_SLOTS
}

# Update bad_attrs for keys not in mapping_slots
bad_keys = set(row.keys()) - MAPPING_SLOTS
for bad_key in bad_keys:
bad_attrs[bad_key] += 1
return mdict


Expand Down Expand Up @@ -795,9 +803,14 @@ def to_mapping_set_document(msdf: MappingSetDataFrame) -> MappingSetDocument:
def _get_mapping_set_from_df(df: pd.DataFrame, meta: Optional[MetadataType] = None) -> MappingSet:
mapping_set = _init_mapping_set(meta)
bad_attrs: Counter = Counter()
for _, row in df.iterrows():
mapping_dict = _get_mapping_dict(row, bad_attrs)
_add_valid_mapping_to_list(mapping_dict, mapping_set.mappings)

df.apply(
lambda row: _add_valid_mapping_to_list(
_get_mapping_dict(row, bad_attrs), mapping_set.mappings
),
axis=1,
)

hrshdhgd marked this conversation as resolved.
Show resolved Hide resolved
for k, v in bad_attrs.items():
logging.warning(f"No attr for {k} [{v} instances]")
return mapping_set
Expand Down
105 changes: 51 additions & 54 deletions src/sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,14 +156,13 @@ def from_mapping_set_document(cls, doc: MappingSetDocument) -> "MappingSetDataFr
df.replace("", np.nan, inplace=True)
df.dropna(axis=1, how="all", inplace=True) # remove columns with all row = 'None'-s.

slots = _get_sssom_schema_object().dict["slots"]
slots_with_double_as_range = {
slot
for slot, slot_metadata in _get_sssom_schema_object().dict["slots"].items()
if slot_metadata["range"] == "double"
slot for slot, slot_metadata in slots.items() if slot_metadata["range"] == "double"
}
non_double_cols = df.loc[:, ~df.columns.isin(slots_with_double_as_range)]
non_double_cols = non_double_cols.replace(np.nan, "")
df[non_double_cols.columns] = non_double_cols
non_double_cols.replace(np.nan, "", inplace=True)
df.update(non_double_cols)

df = sort_df_rows_columns(df)
return cls.with_converter(df=df, converter=doc.converter, metadata=meta)
Expand Down Expand Up @@ -1036,6 +1035,11 @@ def to_mapping_set_dataframe(doc: MappingSetDocument) -> MappingSetDataFrame:
return MappingSetDataFrame.from_mapping_set_document(doc)


DICT_FROM_MAPPING_ENUM_KEYS = set(_get_sssom_schema_object().dict["enums"].keys())
SLOTS = _get_sssom_schema_object().dict["slots"]
DOUBLE_SLOTS = {k for k, v in SLOTS.items() if v["range"] == "double"}
hrshdhgd marked this conversation as resolved.
Show resolved Hide resolved


def get_dict_from_mapping(map_obj: Union[Any, Dict[Any, Any], SSSOM_Mapping]) -> dict:
"""
Get information for linkml objects (MatchTypeEnum, PredicateModifierEnum) from the Mapping object and return the dictionary form of the object.
Expand All @@ -1044,46 +1048,36 @@ def get_dict_from_mapping(map_obj: Union[Any, Dict[Any, Any], SSSOM_Mapping]) ->
:return: Dictionary
"""
map_dict = {}
slots_with_double_as_range = [
s
for s in _get_sssom_schema_object().dict["slots"].keys()
if _get_sssom_schema_object().dict["slots"][s]["range"] == "double"
]
for property in map_obj:
if map_obj[property] is not None:
hrshdhgd marked this conversation as resolved.
Show resolved Hide resolved
if isinstance(map_obj[property], list):
# IF object is an enum
if (
_get_sssom_schema_object().dict["slots"][property]["range"]
in _get_sssom_schema_object().dict["enums"].keys()
):
# IF object is a multivalued enum
if _get_sssom_schema_object().dict["slots"][property]["multivalued"]:
map_dict[property] = "|".join(
enum_value.code.text for enum_value in map_obj[property]
)
# If object is NOT multivalued BUT an enum.
else:
map_dict[property] = map_obj[property].code.text
# IF object is NOT an enum but a list
else:
map_dict[property] = "|".join(enum_value for enum_value in map_obj[property])
# IF object NOT a list
mapping_property = map_obj[property]
if mapping_property is None:
map_dict[property] = np.nan if property in DOUBLE_SLOTS else ""
continue

slot_of_interest = SLOTS[property]
is_enum = slot_of_interest["range"] in DICT_FROM_MAPPING_ENUM_KEYS

# Check if the mapping_property is a list
if isinstance(mapping_property, list):
# If the property is an enumeration and it allows multiple values
if is_enum and slot_of_interest["multivalued"]:
# FIXME needs test
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

needs test

# Join all the enum values into a string separated by '|'
map_dict[property] = "|".join(
enum_value.code.text for enum_value in mapping_property
)
else:
# IF object is an enum
if (
_get_sssom_schema_object().dict["slots"][property]["range"]
in _get_sssom_schema_object().dict["enums"].keys()
):
map_dict[property] = map_obj[property].code.text
else:
map_dict[property] = map_obj[property]
# If the property is not an enumeration or doesn't allow multiple values,
# join all the values into a string separated by '|'
map_dict[property] = "|".join(enum_value for enum_value in mapping_property)
elif is_enum:
# FIXME needs test
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

needs test

# Assign the text of the enumeration code to the property in the dictionary
map_dict[property] = mapping_property.code.text
else:
# IF map_obj[property] is None:
if property in slots_with_double_as_range:
map_dict[property] = np.nan
else:
map_dict[property] = ""
# If the mapping_property is neither a list nor an enumeration,
# assign the value directly to the property in the dictionary
map_dict[property] = mapping_property

return map_dict

Expand Down Expand Up @@ -1139,18 +1133,21 @@ def get_prefixes_used_in_table(df: pd.DataFrame, converter: Converter) -> Set[st
prefixes = set(SSSOM_BUILT_IN_PREFIXES)
if df.empty:
return prefixes
for col in _get_sssom_schema_object().entity_reference_slots:
if col not in df.columns:
continue
prefixes.update(
converter.parse_curie(row).prefix
for row in df[col]
# we don't use the converter here since get_prefixes_used_in_table
# is often used to identify prefixes that are not properly registered
# in the converter
if not _is_iri(row) and _is_curie(row)
)
return set(prefixes)
sssom_schema_object = _get_sssom_schema_object()
entity_reference_slots = sssom_schema_object.entity_reference_slots & set(df.columns)
new_prefixes = {
converter.parse_curie(row).prefix
for col in entity_reference_slots
for row in df[col]
if not _is_iri(row) and _is_curie(row)
# we don't use the converter here since get_prefixes_used_in_table
# is often used to identify prefixes that are not properly registered
# in the converter
}

prefixes.update(new_prefixes)

return prefixes


def get_prefixes_used_in_metadata(meta: MetadataType) -> Set[str]:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def test_parse_obographs(self):
self.assertEqual(
# this number went up from 8099 when the curies.Converter was introduced
# since it was able to handle CURIE prefix and URI prefix synonyms
8488,
cthoyt marked this conversation as resolved.
Show resolved Hide resolved
8489,
len(msdf.df),
f"{self.obographs_file} has the wrong number of mappings.",
)
Expand Down
Loading
Loading