From 1d5ee8893b442a9ea3fa379875be686d13614434 Mon Sep 17 00:00:00 2001 From: Zairon Jacobs Date: Thu, 6 Jul 2023 14:48:11 +0200 Subject: [PATCH 1/2] cache instruments --- setup.py | 69 ++++++++++++---------- src/harmony/parsing/wrapper_all_parsers.py | 29 +++++++-- src/harmony/services/__init__.py | 0 src/harmony/services/instruments_cache.py | 40 +++++++++++++ src/harmony/util/cache_heper.py | 7 +++ src/harmony/util/singleton_meta.py | 11 ++++ 6 files changed, 121 insertions(+), 35 deletions(-) create mode 100644 src/harmony/services/__init__.py create mode 100644 src/harmony/services/instruments_cache.py create mode 100644 src/harmony/util/cache_heper.py create mode 100644 src/harmony/util/singleton_meta.py diff --git a/setup.py b/setup.py index 0e43acd..b34aafe 100644 --- a/setup.py +++ b/setup.py @@ -1,48 +1,55 @@ import setuptools -with open('README.md', 'r', encoding='utf-8') as fh: +with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() setuptools.setup( - name='harmonydata', - author='Thomas Wood', - author_email='thomas@fastdatascience.com', - description='Harmony Tool for Retrospective Data Harmonisation', - keywords='harmony, harmonisation, harmonization, harmonise, harmonize', + name="harmonydata", + author="Thomas Wood", + author_email="thomas@fastdatascience.com", + description="Harmony Tool for Retrospective Data Harmonisation", + keywords="harmony, harmonisation, harmonization, harmonise, harmonize", long_description=long_description, - long_description_content_type='text/markdown', - url='https://github.com/harmonydata/harmony', + long_description_content_type="text/markdown", + url="https://github.com/harmonydata/harmony", project_urls={ - 'Documentation': 'https://harmonydata.org/', - 'Bug Reports': - 'https://github.com/harmonydata/harmony/issues', - 'Source Code': 'https://github.com/harmonydata/harmony', + "Documentation": "https://harmonydata.org/", + "Bug Reports": "https://github.com/harmonydata/harmony/issues", + "Source Code": "https://github.com/harmonydata/harmony", # 'Funding': '', # 'Say Thanks!': '', }, - package_dir={'': 'src'}, - packages=setuptools.find_packages(where='src'), + package_dir={"": "src"}, + packages=setuptools.find_packages(where="src"), classifiers=[ # see https://pypi.org/classifiers/ - 'Development Status :: 5 - Production/Stable', - - 'Intended Audience :: Developers', - 'Topic :: Software Development :: Build Tools', - - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3 :: Only', - 'License :: OSI Approved :: MIT License', - 'Operating System :: OS Independent', + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Topic :: Software Development :: Build Tools", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3 :: Only", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires=">=3.6", + install_requires=[ + "pydantic==1.10.7", + "pandas==2.0.0", + "tika==2.6.0", + "lxml==4.9.2", + "langdetect==1.0.9", + "XlsxWriter==3.0.9", + "openpyxl==3.1.2", + "spacy==3.5.3", + "azure-storage-blob==12.16.0", ], - python_requires='>=3.6', - install_requires=['pydantic==1.10.7','pandas==2.0.0','tika==2.6.0','lxml==4.9.2','langdetect==1.0.9','XlsxWriter==3.0.9','openpyxl==3.1.2','spacy==3.5.3'], extras_require={ - 'dev': ['check-manifest'], + "dev": ["check-manifest"], # 'test': ['coverage'], }, # entry_points={ diff --git a/src/harmony/parsing/wrapper_all_parsers.py b/src/harmony/parsing/wrapper_all_parsers.py index 4fe40ce..db809ca 100644 --- a/src/harmony/parsing/wrapper_all_parsers.py +++ b/src/harmony/parsing/wrapper_all_parsers.py @@ -1,4 +1,3 @@ -import os from typing import List from harmony.parsing.excel_parser import convert_excel_to_instruments @@ -6,6 +5,8 @@ from harmony.parsing.text_parser import convert_text_to_instruments from harmony.schemas.enums.file_types import FileType from harmony.schemas.requests.text import RawFile, Instrument +from harmony.services.instruments_cache import InstrumentsCache +from harmony.util import cache_heper def _get_instruments_from_file(file): @@ -19,11 +20,31 @@ def _get_instruments_from_file(file): instruments_from_this_file = [] return instruments_from_this_file + def convert_files_to_instruments(files: List[RawFile]) -> List[Instrument]: - instruments = [] + """Get cached instruments of files or convert files to instruments""" + + instruments_cache = InstrumentsCache() + + instruments: List[Instrument] = [] + + # A list of files whose instruments are not cached + files_with_no_cached_instruments = [] for file in files: - instruments_from_this_file = _get_instruments_from_file(file) - instruments.extend(instruments_from_this_file) + hash_value = cache_heper.get_hash_value(file.content) + if instruments_cache.has(hash_value): + # If instruments are cached + instruments.extend(instruments_cache.get(hash_value)) + else: + # If instruments are not cached + files_with_no_cached_instruments.append(file) + + # Get instruments that aren't cached yet and cache them + for file_with_no_cached_instruments in files_with_no_cached_instruments: + new_instruments = _get_instruments_from_file(file_with_no_cached_instruments) + hash_value = cache_heper.get_hash_value(file_with_no_cached_instruments.content) + instruments_cache.set(hash_value, new_instruments) + instruments.extend(new_instruments) return instruments diff --git a/src/harmony/services/__init__.py b/src/harmony/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/harmony/services/instruments_cache.py b/src/harmony/services/instruments_cache.py new file mode 100644 index 0000000..3d54ff6 --- /dev/null +++ b/src/harmony/services/instruments_cache.py @@ -0,0 +1,40 @@ +from typing import List + +from harmony.schemas.requests.text import Instrument +from harmony.util.singleton_meta import SingletonMeta + + +class InstrumentsCache(metaclass=SingletonMeta): + """ + This class is responsible for caching instruments (Singleton class) + """ + + def __init__(self): + self.__cache: dict[str, List[Instrument]] = {} + + self.__load() + + def __load(self): + """Load cache""" + + self.__cache = {} + + def set(self, key: str, value: List[Instrument]): + """Set key value pair""" + + self.__cache[key] = value + + def get(self, key: str) -> List[Instrument]: + """Get value by key""" + + return self.__cache.get(key) + + def has(self, key: str) -> bool: + """Check if key is in cache""" + + return key in self.__cache + + def get_all(self) -> dict[str, List[Instrument]]: + """Get the whole cache""" + + return self.__cache diff --git a/src/harmony/util/cache_heper.py b/src/harmony/util/cache_heper.py new file mode 100644 index 0000000..b587fca --- /dev/null +++ b/src/harmony/util/cache_heper.py @@ -0,0 +1,7 @@ +from hashlib import sha256 + + +def get_hash_value(text: str) -> str: + """Get hash value""" + + return sha256(text.encode()).hexdigest() diff --git a/src/harmony/util/singleton_meta.py b/src/harmony/util/singleton_meta.py new file mode 100644 index 0000000..9964543 --- /dev/null +++ b/src/harmony/util/singleton_meta.py @@ -0,0 +1,11 @@ +"""Singleton meta class""" + + +class SingletonMeta(type): + _instances = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + instance = super().__call__(*args, **kwargs) + cls._instances[cls] = instance + return cls._instances[cls] From 2efab1fd4f986dbb21a6a5d2d763c0f3c8de7dd4 Mon Sep 17 00:00:00 2001 From: Zairon Jacobs Date: Fri, 7 Jul 2023 12:32:19 +0200 Subject: [PATCH 2/2] undo cache in convert_files_to_instruments --- src/harmony/parsing/wrapper_all_parsers.py | 25 ++------------ src/harmony/services/instruments_cache.py | 40 ---------------------- src/harmony/util/cache_heper.py | 7 ---- src/harmony/util/singleton_meta.py | 11 ------ 4 files changed, 3 insertions(+), 80 deletions(-) delete mode 100644 src/harmony/services/instruments_cache.py delete mode 100644 src/harmony/util/cache_heper.py delete mode 100644 src/harmony/util/singleton_meta.py diff --git a/src/harmony/parsing/wrapper_all_parsers.py b/src/harmony/parsing/wrapper_all_parsers.py index db809ca..b541f53 100644 --- a/src/harmony/parsing/wrapper_all_parsers.py +++ b/src/harmony/parsing/wrapper_all_parsers.py @@ -5,8 +5,6 @@ from harmony.parsing.text_parser import convert_text_to_instruments from harmony.schemas.enums.file_types import FileType from harmony.schemas.requests.text import RawFile, Instrument -from harmony.services.instruments_cache import InstrumentsCache -from harmony.util import cache_heper def _get_instruments_from_file(file): @@ -24,27 +22,10 @@ def _get_instruments_from_file(file): def convert_files_to_instruments(files: List[RawFile]) -> List[Instrument]: """Get cached instruments of files or convert files to instruments""" - instruments_cache = InstrumentsCache() - - instruments: List[Instrument] = [] - - # A list of files whose instruments are not cached - files_with_no_cached_instruments = [] + instruments = [] for file in files: - hash_value = cache_heper.get_hash_value(file.content) - if instruments_cache.has(hash_value): - # If instruments are cached - instruments.extend(instruments_cache.get(hash_value)) - else: - # If instruments are not cached - files_with_no_cached_instruments.append(file) - - # Get instruments that aren't cached yet and cache them - for file_with_no_cached_instruments in files_with_no_cached_instruments: - new_instruments = _get_instruments_from_file(file_with_no_cached_instruments) - hash_value = cache_heper.get_hash_value(file_with_no_cached_instruments.content) - instruments_cache.set(hash_value, new_instruments) - instruments.extend(new_instruments) + instruments_from_this_file = _get_instruments_from_file(file) + instruments.extend(instruments_from_this_file) return instruments diff --git a/src/harmony/services/instruments_cache.py b/src/harmony/services/instruments_cache.py deleted file mode 100644 index 3d54ff6..0000000 --- a/src/harmony/services/instruments_cache.py +++ /dev/null @@ -1,40 +0,0 @@ -from typing import List - -from harmony.schemas.requests.text import Instrument -from harmony.util.singleton_meta import SingletonMeta - - -class InstrumentsCache(metaclass=SingletonMeta): - """ - This class is responsible for caching instruments (Singleton class) - """ - - def __init__(self): - self.__cache: dict[str, List[Instrument]] = {} - - self.__load() - - def __load(self): - """Load cache""" - - self.__cache = {} - - def set(self, key: str, value: List[Instrument]): - """Set key value pair""" - - self.__cache[key] = value - - def get(self, key: str) -> List[Instrument]: - """Get value by key""" - - return self.__cache.get(key) - - def has(self, key: str) -> bool: - """Check if key is in cache""" - - return key in self.__cache - - def get_all(self) -> dict[str, List[Instrument]]: - """Get the whole cache""" - - return self.__cache diff --git a/src/harmony/util/cache_heper.py b/src/harmony/util/cache_heper.py deleted file mode 100644 index b587fca..0000000 --- a/src/harmony/util/cache_heper.py +++ /dev/null @@ -1,7 +0,0 @@ -from hashlib import sha256 - - -def get_hash_value(text: str) -> str: - """Get hash value""" - - return sha256(text.encode()).hexdigest() diff --git a/src/harmony/util/singleton_meta.py b/src/harmony/util/singleton_meta.py deleted file mode 100644 index 9964543..0000000 --- a/src/harmony/util/singleton_meta.py +++ /dev/null @@ -1,11 +0,0 @@ -"""Singleton meta class""" - - -class SingletonMeta(type): - _instances = {} - - def __call__(cls, *args, **kwargs): - if cls not in cls._instances: - instance = super().__call__(*args, **kwargs) - cls._instances[cls] = instance - return cls._instances[cls]