From 758821cae2858263ba9c2667dea18d1972cc29bd Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Tue, 16 Jan 2024 08:48:28 +0800 Subject: [PATCH] Add migrate_cache() and remove deprecated funcs (#56) * feat: add migrate_cache(); * refactor: remove deprecated functions; * fix: update workflow; --- .github/workflows/testing_ci.yml | 6 +- .github/workflows/testing_daily.yml | 2 +- tests/test_tsdb.py | 3 +- tsdb/__init__.py | 25 ++--- tsdb/config.ini | 2 +- tsdb/data_processing.py | 152 +++++++--------------------- tsdb/database.py | 33 ------ tsdb/utils/config.py | 32 ++++++ tsdb/utils/file.py | 99 ++++++++++++------ 9 files changed, 149 insertions(+), 205 deletions(-) create mode 100644 tsdb/utils/config.py diff --git a/.github/workflows/testing_ci.yml b/.github/workflows/testing_ci.yml index f9222d2..bee3c39 100644 --- a/.github/workflows/testing_ci.yml +++ b/.github/workflows/testing_ci.yml @@ -31,16 +31,16 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - pip install pytest + pip install pytest pytest-cov pip install coverage - name: Test with pytest run: | - coverage run --source=tsdb -m pytest + python -m pytest -rA tests/test_tsdb.py -s --cov=tsdb - name: Write the LCOV report run: | - coverage lcov + python -m coverage lcov - name: Submit report uses: coverallsapp/github-action@master diff --git a/.github/workflows/testing_daily.yml b/.github/workflows/testing_daily.yml index 3334f71..0da30a5 100644 --- a/.github/workflows/testing_daily.yml +++ b/.github/workflows/testing_daily.yml @@ -40,7 +40,7 @@ jobs: - name: Test with pytest run: | - python -m pytest --cov=tsdb + python -m pytest -rA tests/test_tsdb.py -s --cov=tsdb - name: Generate the LCOV report run: | diff --git a/tests/test_tsdb.py b/tests/test_tsdb.py index ce64de5..abc7e23 100644 --- a/tests/test_tsdb.py +++ b/tests/test_tsdb.py @@ -47,13 +47,14 @@ def test_3_dataset_purging(self): cached_datasets = tsdb.list_cache() assert isinstance(cached_datasets, list) tsdb.delete_cache("physionet_2012") # delete single - tsdb.delete_cache() # delete all def test_4_migrate(self): os.makedirs("dir_for_migration") with open("dir_for_migration/test.txt", "a") as f: f.write("hello world") tsdb.migrate("dir_for_migration", "new_dir/put_it_here") + tsdb.migrate_cache("new_cache_dir") + tsdb.delete_cache() # delete all datasets def test_5_logging(self): # different level logging diff --git a/tsdb/__init__.py b/tsdb/__init__.py index f28f876..d5ecb1c 100644 --- a/tsdb/__init__.py +++ b/tsdb/__init__.py @@ -21,25 +21,22 @@ # # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. 
# 'X.Y.dev0' is the canonical version of 'X.Y.dev' -__version__ = "0.3" +__version__ = "0.3.1" -from .utils.file import migrate from .data_processing import ( + CACHED_DATASET_DIR, list, load, download_and_extract, list_cache, delete_cache, +) +from .utils.file import ( purge_path, - CACHED_DATASET_DIR, pickle_dump, pickle_load, - # below are deprecated functions, import for now, will be removed in v0.2 - list_database, - list_available_datasets, - list_cached_data, - load_dataset, - delete_cached_data, + migrate, + migrate_cache, ) __all__ = [ @@ -49,15 +46,11 @@ "download_and_extract", "list_cache", "delete_cache", - "purge_path", "CACHED_DATASET_DIR", + # file + "purge_path", "pickle_dump", "pickle_load", "migrate", - # below are deprecated functions, import for now, will be removed in v0.2 - "list_database", - "list_available_datasets", - "list_cached_data", - "load_dataset", - "delete_cached_data", + "migrate_cache", ] diff --git a/tsdb/config.ini b/tsdb/config.ini index ce896fc..d4f20cb 100644 --- a/tsdb/config.ini +++ b/tsdb/config.ini @@ -1,2 +1,2 @@ [path] -data_home = .tsdb +data_home = ~/.tsdb diff --git a/tsdb/data_processing.py b/tsdb/data_processing.py index 0ab99d4..09c93c8 100644 --- a/tsdb/data_processing.py +++ b/tsdb/data_processing.py @@ -7,10 +7,9 @@ import os import shutil -import sys import warnings -from .database import AVAILABLE_DATASETS, CACHED_DATASET_DIR +from .database import AVAILABLE_DATASETS from .loading_funcs import ( load_physionet2012, load_physionet2019, @@ -21,9 +20,11 @@ load_ais, ) from .utils.downloading import download_and_extract -from .utils.file import purge_path, pickle_load, pickle_dump +from .utils.file import purge_path, pickle_load, pickle_dump, determine_data_home from .utils.logging import logger +CACHED_DATASET_DIR = determine_data_home() + def list() -> list: """List the database. @@ -146,122 +147,39 @@ def list_cache() -> list: return dir_content -def delete_cache(dataset_name=None) -> None: - """Delete CACHED_DATASET_DIR if exists.""" - # if CACHED_DATASET_DIR does not exist, abort - if not os.path.exists(CACHED_DATASET_DIR): - logger.info("No cached data. Operation aborted.") - sys.exit() - # if CACHED_DATASET_DIR exists, then purge - if dataset_name is not None: - assert ( - dataset_name in AVAILABLE_DATASETS - ), f"{dataset_name} is not available in TSDB, so it has no cache. Please check your dataset name." - dir_to_delete = os.path.join(CACHED_DATASET_DIR, dataset_name) - if not os.path.exists(dir_to_delete): - logger.info(f"Dataset {dataset_name} is not cached. Operation aborted.") - sys.exit() - logger.info(f"Purging cached dataset {dataset_name} under {dir_to_delete}...") - else: - dir_to_delete = CACHED_DATASET_DIR - logger.info(f"Purging all cached data under {CACHED_DATASET_DIR}...") - purge_path(dir_to_delete) - - -# deprecated functions below - - -def list_available_datasets(): - """List all available datasets. - - Returns - ------- - AVAILABLE_DATASETS : list - A list contains all datasets' names. - - Warnings - -------- - The method list_available_datasets is deprecated. Please use ``list()`` instead. - - """ - logger.warning( - "🚨DeprecationWarning: The method list_available_datasets is deprecated. Please use `list()` instead." - ) - return list() - - -def list_database(): - """List the database. - - Returns - ------- - DATABASE : dict - A dict contains all datasets' names and download links. - - Warnings - -------- - The method list_available_datasets is deprecated. Please use `list()` instead. 
-
-    """
-    logger.warning(
-        "🚨DeprecationWarning: The method list_available_datasets is deprecated. Please use `list()` instead."
-    )
-    return list()
-
-
-def list_cached_data():
-    """List names of all cached datasets.
-
-    Returns
-    -------
-    list,
-        A list contains all cached datasets' names.
-
-    Warnings
-    --------
-    The method list_cached_data is deprecated. Please use `list_cache()` instead.
-
-    """
-    logger.warning(
-        "🚨DeprecationWarning: The method list_cached_data is deprecated. Please use `list_cache()` instead."
-    )
-    return list_cache()
-
-
-def load_dataset(dataset_name, use_cache=True):
-    """Load dataset with given name.
+def delete_cache(dataset_name: str = None) -> None:
+    """Delete CACHED_DATASET_DIR if it exists.
 
     Parameters
     ----------
-    dataset_name : str,
+    dataset_name : str, optional
         The name of the specific dataset in database.DATABASE.
-
-    use_cache : bool,
-        Whether to use cache (including data downloading and processing)
-
-    Returns
-    -------
-    result:
-        Loaded dataset in a Python dict.
-
-    Warnings
-    --------
-    The method load_dataset is deprecated. Please use `load()` instead.
+        If the given dataset is not cached, the operation will be aborted.
+        All cached datasets will be deleted if dataset_name is left as None.
 
     """
-    logger.warning(
-        "🚨DeprecationWarning: The method load_dataset is deprecated. Please use `load()` instead."
-    )
-    return load(dataset_name, use_cache)
-
-
-def delete_cached_data(dataset_name=None):
-    """Delete CACHED_DATASET_DIR if exists.
-
-    Warnings
-    --------
-    The method delete_cached_data is deprecated. Please use `delete_cache()` instead.
-    """
-    logger.warning(
-        "🚨DeprecationWarning: The method delete_cached_data is deprecated. Please use `delete_cache()` instead."
-    )
-    delete_cache(dataset_name)
+    # if CACHED_DATASET_DIR does not exist, abort
+    if not os.path.exists(CACHED_DATASET_DIR):
+        logger.error("❌ No cached data. Operation aborted.")
+    else:
+        # if CACHED_DATASET_DIR exists, then execute purging procedure
+        if dataset_name is None:  # if dataset_name is not given, then purge all
+            logger.info(
+                f"`dataset_name` not given. Purging all cached data under {CACHED_DATASET_DIR}..."
+            )
+            purge_path(CACHED_DATASET_DIR)
+            os.makedirs(CACHED_DATASET_DIR)
+        else:
+            assert (
+                dataset_name in AVAILABLE_DATASETS
+            ), f"{dataset_name} is not available in TSDB, so it has no cache. Please check your dataset name."
+            dir_to_delete = os.path.join(CACHED_DATASET_DIR, dataset_name)
+            if not os.path.exists(dir_to_delete):
+                logger.error(
+                    f"❌ Dataset {dataset_name} is not cached. Operation aborted."
+                )
+                return
+            else:
+                logger.info(
+                    f"Purging cached dataset {dataset_name} under {dir_to_delete}..."
+                )
+                purge_path(dir_to_delete)
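
For quick reference, the reworked cache API above composes as follows (a minimal sketch; the dataset name is borrowed from the test suite in this patch, and the list_cache() output shown is only illustrative):

    import tsdb

    data = tsdb.load("physionet_2012")   # downloads on the first call, then reads from the cache
    print(tsdb.list_cache())             # e.g. ['physionet_2012']
    tsdb.delete_cache("physionet_2012")  # purge a single cached dataset
    tsdb.delete_cache()                  # purge everything under CACHED_DATASET_DIR

Note that delete_cache() with no argument now recreates the empty cache dir after purging instead of calling sys.exit(), so it stays safe to call from library code.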
" - "You can migrate downloaded datasets by invoking the new function " - "tsdb.migrate(old='~/.tsdb_cached_datasets', new='~/.tsdb')" - ) - CACHED_DATASET_DIR = old_cached_dataset_dir -elif os.path.exists(data_home_path): - # use the path directly, may be in a portable disk - CACHED_DATASET_DIR = data_home_path -else: - # use the default path - default_path = os.path.abspath("~/.tsdb") - CACHED_DATASET_DIR = default_path - logger.warning( - f"‼️ The preset data_home path '{data_home_path}' doesn't exist. " - f"Using the default path '{default_path}'." - ) - _DATABASE = { # http://www.physionet.org/challenge/2012 diff --git a/tsdb/utils/config.py b/tsdb/utils/config.py new file mode 100644 index 0000000..9cdd4fb --- /dev/null +++ b/tsdb/utils/config.py @@ -0,0 +1,32 @@ +""" +Config functions for TSDB. +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + +import os +from configparser import ConfigParser + +from .logging import logger + +TSDB_BASE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +TSDB_CONFIG_FILE = os.path.join(TSDB_BASE_PATH, "config.ini") + + +def read_configs(): + config_parser = ConfigParser() + config_parser.read(TSDB_CONFIG_FILE) + return config_parser + + +def write_configs(config_parser, key_value_set): + for section in key_value_set.keys(): + for key in key_value_set[section].keys(): + value = key_value_set[section][key] + config_parser.set(section, key, value) + + with open(TSDB_CONFIG_FILE, "w") as f: + config_parser.write(f) + + logger.info("Wrote new configs to config.ini successfully.") diff --git a/tsdb/utils/file.py b/tsdb/utils/file.py index f8dbdd9..7c03e43 100644 --- a/tsdb/utils/file.py +++ b/tsdb/utils/file.py @@ -10,11 +10,9 @@ import pickle import shutil from typing import Optional -from configparser import ConfigParser - +from .config import read_configs, write_configs from .logging import logger -from ..database import CACHED_DATASET_DIR def pickle_dump(data: object, path: str) -> Optional[str]: @@ -91,15 +89,50 @@ def purge_path(path: str, ignore_errors: bool = True) -> None: if not os.path.exists(path): logger.info(f"Successfully deleted {path}.") else: + cached_dataset_dir = determine_data_home() raise FileExistsError( - f"Deleting operation failed. {CACHED_DATASET_DIR} still exists." + f"Deleting operation failed. {cached_dataset_dir} still exists." ) except shutil.Error: raise shutil.Error("Operation failed.") +def determine_data_home(): + # read data_home from the config file + config = read_configs() + data_home_path = config.get("path", "data_home") + # replace '~' with the absolute path if existing in the path + data_home_path = data_home_path.replace("~", os.path.expanduser("~")) + old_cached_dataset_dir = os.path.join( + os.path.expanduser("~"), ".tsdb_cached_datasets" + ) + + if os.path.exists(old_cached_dataset_dir): + # use the old path and warn the user + logger.warning( + "‼️ Detected the home dir of the old version TSDB. " + "Since v0.3, TSDB has changed the default cache dir to '~/.tsdb'. " + "Auto migrating downloaded datasets to the new path. 
" + ) + cached_dataset_dir = data_home_path + migrate(old_cached_dataset_dir, cached_dataset_dir) + elif os.path.exists(data_home_path): + # use the path directly, may be in a portable disk + cached_dataset_dir = data_home_path + else: + # use the default path + default_path = os.path.join(os.path.expanduser("~"), ".tsdb") + cached_dataset_dir = default_path + if os.path.abspath(data_home_path) != os.path.abspath(default_path): + logger.warning( + f"‼️ The preset data_home path '{data_home_path}' doesn't exist. " + f"Using the default path '{default_path}'." + ) + return cached_dataset_dir + + def migrate(old_path: str, new_path: str) -> None: - """Migrate datasets from old_path to new_path. + """Migrate files in a directory from old_path to new_path. Parameters ---------- @@ -113,40 +146,40 @@ def migrate(old_path: str, new_path: str) -> None: if not os.path.exists(old_path): raise FileNotFoundError(f"Given old_path {old_path} does not exist.") - if os.path.exists(new_path): - logger.warning(f"Please note that new_path {new_path} already exists.") - # if new_path exists, we have to move everything from old_path into it - all_old_files = os.listdir(old_path) - for f in all_old_files: - old_f_path = os.path.join(old_path, f) - if os.path.isdir(old_f_path): - new_f_path = os.path.join(new_path, f) - shutil.copytree(old_f_path, new_f_path) - else: - shutil.move(old_f_path, new_path) - shutil.rmtree(old_path, ignore_errors=True) - else: + if not os.path.exists(new_path): # if new_path does not exist, just rename the old_path into it new_parent_dir = os.path.abspath(os.path.join(new_path, "..")) if not os.path.exists(new_parent_dir): os.makedirs(new_parent_dir, exist_ok=True) - os.rename(old_path, new_path) - - config = ConfigParser() - parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) - tsdb_config_path = os.path.join(parent_dir, "config.ini") - config.read(tsdb_config_path) - if os.path.abspath(old_path) == os.path.abspath(CACHED_DATASET_DIR): - config.set("path", "data_home", new_path) - with open(tsdb_config_path, "w") as f: - config.write(f) - - logger.info( - f"Found the given old_path is the current TSDB dataset cache directory. " - f"Have already set the new cache directory to {new_path}." - ) + logger.warning(f"‼️ Please note that new_path {new_path} already exists.") + # if new_path exists, we have to move everything from old_path into it + all_old_files = os.listdir(old_path) + for f in all_old_files: + old_f_path = os.path.join(old_path, f) + if os.path.isdir(old_f_path): + new_f_path = os.path.join(new_path, f) + shutil.copytree(old_f_path, new_f_path) + else: + shutil.move(old_f_path, new_path) + shutil.rmtree(old_path, ignore_errors=True) logger.info( f"Successfully migrated {old_path} to {new_path}, and deleted {old_path}" ) + + +def migrate_cache(target_path: str) -> None: + """Migrate datasets from old_path to new_path. + + Parameters + ---------- + target_path: + The new path for TSDB to store cached datasets. + + """ + cached_dataset_dir = determine_data_home() + migrate(cached_dataset_dir, target_path) + config_parser = read_configs() + write_configs(config_parser, {"path": {"data_home": target_path}}) + logger.info(f"Have set {target_path} as the default cache dir.")