Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable users to migrate tsdb cache data home #53

Merged
merged 6 commits into from
Dec 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/testing_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macOS-latest]
python-version: ['3.7', '3.10']
python-version: ['3.7', '3.11']

steps:
- uses: actions/checkout@v3
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/testing_daily.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macOS-latest]
python-version: ["3.7", "3.10"]
python-version: ["3.7", "3.11"]

steps:
- name: Check out the repo code
Expand Down
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
include tsdb/config.ini
prune tests
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
numpy
scikit-learn
pandas
scipy
scipy
pyarrow
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ basic =
scikit-learn
pandas
scipy
pyarrow
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
"classification",
"forecasting",
"partially observed",
"irregular sampled",
"irregularly sampled",
"partially-observed time series",
"incomplete time series",
"missing data",
Expand All @@ -48,6 +48,7 @@
"scikit-learn",
"pandas",
"scipy",
"pyarrow",
],
setup_requires=["setuptools>=38.6.0"],
classifiers=[
Expand Down
1 change: 1 addition & 0 deletions tests/environment_for_conda_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ dependencies:
- conda-forge::numpy
- conda-forge::scikit-learn
- conda-forge::pandas
- conda-forge::pyarrow

# test
- conda-forge::pytest-cov
14 changes: 12 additions & 2 deletions tests/test_tsdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,13 @@
from tsdb.utils.logging import Logger

DATASETS_TO_TEST = [
"ucr_uea_Wine",
"physionet_2012",
"physionet_2019",
"beijing_multisite_air_quality",
"electricity_load_diagrams",
"electricity_transformer_temperature",
"vessel_ais",
"ucr_uea_Wine",
]


Expand Down Expand Up @@ -45,7 +49,13 @@ def test_3_dataset_purging(self):
tsdb.delete_cache("physionet_2012") # delete single
tsdb.delete_cache() # delete all

def test_4_logging(self):
def test_4_migrate(self):
    """Check that tsdb.migrate moves a directory to a not-yet-existing target."""
    import shutil  # local import: only needed for the cleanup below

    src_dir = "dir_for_migration"
    dst_root = "new_dir"
    dst_dir = os.path.join(dst_root, "put_it_here")

    # Start from a clean slate so that a previous (possibly failed) run
    # cannot make this test fail with a destination conflict.
    shutil.rmtree(src_dir, ignore_errors=True)
    shutil.rmtree(dst_root, ignore_errors=True)

    try:
        os.makedirs(src_dir, exist_ok=True)
        with open(os.path.join(src_dir, "test.txt"), "a") as f:
            f.write("hello world")

        tsdb.migrate(src_dir, dst_dir)

        # The source must be gone and its content must be at the target.
        assert not os.path.exists(src_dir)
        assert os.path.isfile(os.path.join(dst_dir, "test.txt"))
    finally:
        # Do not leave test artifacts in the working directory.
        shutil.rmtree(src_dir, ignore_errors=True)
        shutil.rmtree(dst_root, ignore_errors=True)

def test_5_logging(self):
# different level logging
self.logger.debug("debug")
self.logger.info("info")
Expand Down
5 changes: 3 additions & 2 deletions tsdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
__version__ = "0.2.1"

__version__ = "0.3"

from .utils.file import migrate
from .data_processing import (
list,
load,
Expand Down Expand Up @@ -53,6 +53,7 @@
"CACHED_DATASET_DIR",
"pickle_dump",
"pickle_load",
"migrate",
# below are deprecated functions, import for now, will be removed in v0.2
"list_database",
"list_available_datasets",
Expand Down
2 changes: 2 additions & 0 deletions tsdb/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[path]
data_home = .tsdb
21 changes: 20 additions & 1 deletion tsdb/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,27 @@
# License: BSD-3-Clause

import os
from configparser import ConfigParser

from .utils.logging import logger

# Load the package-level configuration that is shipped next to this module.
config = ConfigParser()
tsdb_config_path = os.path.join(os.path.dirname(__file__), "config.ini")
config.read(tsdb_config_path)

# Cache directory used by TSDB releases before v0.3.
old_cached_dataset_dir = os.path.join(os.path.expanduser("~"), ".tsdb_cached_datasets")
# `fallback` keeps `import tsdb` working even if config.ini is missing or has
# no [path]/data_home entry; ".tsdb" is the value shipped in config.ini.
CACHED_DATASET_DIR = os.path.join(
    os.path.expanduser("~"),
    config.get("path", "data_home", fallback=".tsdb"),
)
if os.path.exists(old_cached_dataset_dir):
    # The suggested call uses migrate()'s real parameter names and fully
    # expanded, quoted paths so it can be copy-pasted as-is.
    logger.warning(
        "‼️ Detected the home dir of the old version TSDB. "
        "Since v0.3, TSDB has changed the default cache dir to '~/.tsdb'. "
        "You can migrate downloaded datasets by invoking the new function "
        f"tsdb.migrate(old_path={old_cached_dataset_dir!r}, "
        f"new_path={CACHED_DATASET_DIR!r})"
    )
    # Keep using the old directory until the user migrates explicitly.
    CACHED_DATASET_DIR = old_cached_dataset_dir

CACHED_DATASET_DIR = os.path.join(os.path.expanduser("~"), ".tsdb_cached_datasets")

_DATABASE = {
# http://www.physionet.org/challenge/2012
Expand Down
2 changes: 1 addition & 1 deletion tsdb/loading_funcs/electricity_transformer_temperature.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def load_ett(local_path):
file_path = os.path.join(local_path, sub_set)
df = pd.read_csv(file_path, index_col="date")
df.index = pd.to_datetime(df.index)
df_name = sub_set.removesuffix(".csv")
df_name = sub_set.split(".csv")[0]
data[df_name] = df

return data
58 changes: 57 additions & 1 deletion tsdb/utils/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@
import pickle
import shutil
from typing import Optional
from configparser import ConfigParser


from ..database import CACHED_DATASET_DIR
from .logging import logger
from ..database import CACHED_DATASET_DIR


def pickle_dump(data: object, path: str) -> Optional[str]:
Expand Down Expand Up @@ -94,3 +96,57 @@ def purge_path(path: str, ignore_errors: bool = True) -> None:
)
except shutil.Error:
raise shutil.Error("Operation failed.")


def _merge_dir_into(src_dir: str, dst_dir: str) -> None:
    """Move every entry of ``src_dir`` into the existing directory ``dst_dir``.

    Sub-directories that exist on both sides are merged recursively. Any other
    name clash makes ``shutil.move`` raise ``shutil.Error``; nothing in
    ``dst_dir`` is overwritten.
    """
    for name in os.listdir(src_dir):
        src = os.path.join(src_dir, name)
        dst = os.path.join(dst_dir, name)
        if os.path.isdir(src) and os.path.isdir(dst):
            _merge_dir_into(src, dst)
        else:
            # Moving *into* dst_dir makes shutil.move refuse to clobber an
            # existing entry with the same name.
            shutil.move(src, dst_dir)


def migrate(old_path: str, new_path: str) -> None:
    """Migrate datasets from old_path to new_path.

    If ``new_path`` already exists, the content of ``old_path`` is merged into
    it and ``old_path`` is removed afterwards. Otherwise ``old_path`` is moved
    to ``new_path`` (missing parent directories are created). If ``old_path``
    is the current TSDB cache directory, the absolute new location is written
    to the package's ``config.ini``.

    Parameters
    ----------
    old_path:
        The old path of the dataset. A leading ``~`` is expanded.

    new_path:
        The new path of the dataset. A leading ``~`` is expanded.

    Raises
    ------
    FileNotFoundError
        If ``old_path`` does not exist.

    shutil.Error
        If a file in ``old_path`` clashes with an existing entry in
        ``new_path``. Entries moved before the clash stay in ``new_path`` and
        ``old_path`` is not deleted.

    """
    old_path = os.path.expanduser(old_path)
    new_path = os.path.expanduser(new_path)

    if not os.path.exists(old_path):
        raise FileNotFoundError(f"Given old_path {old_path} does not exist.")

    abs_old_path = os.path.abspath(old_path)
    abs_new_path = os.path.abspath(new_path)
    if abs_old_path == abs_new_path:
        # Nothing to move; going on would end up deleting the data itself.
        logger.warning(
            f"old_path and new_path both point to {abs_old_path}, nothing to migrate."
        )
        return

    if os.path.exists(new_path):
        logger.warning(f"Please note that new_path {new_path} already exists.")
        # if new_path exists, we have to move everything from old_path into it
        _merge_dir_into(old_path, new_path)
        # only the emptied directory skeleton is left at this point
        shutil.rmtree(old_path, ignore_errors=True)
    else:
        # if new_path does not exist, just move the old_path to it;
        # unlike os.rename, shutil.move also works across filesystems
        os.makedirs(os.path.dirname(abs_new_path), exist_ok=True)
        shutil.move(old_path, new_path)

    if abs_old_path == os.path.abspath(CACHED_DATASET_DIR):
        config = ConfigParser()
        parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
        tsdb_config_path = os.path.join(parent_dir, "config.ini")
        config.read(tsdb_config_path)

        if not config.has_section("path"):
            config.add_section("path")
        # Store the absolute path: the value is later joined onto the user's
        # home directory, which would silently relocate a relative path.
        config.set("path", "data_home", abs_new_path)
        with open(tsdb_config_path, "w") as f:
            config.write(f)

        logger.info(
            f"Found the given old_path is the current TSDB dataset cache directory. "
            f"Have already set the new cache directory to {new_path}."
        )

    logger.info(
        f"Successfully migrated {old_path} to {new_path}, and deleted {old_path}"
    )