Merge pull request #66 from WenjieDu/dev
Add the PeMS Traffic dataset, and enable deleting only the pickled cache
WenjieDu authored Jun 25, 2024
2 parents 39f8e66 + 4f8a338 commit b24f303
Showing 10 changed files with 128 additions and 35 deletions.
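
For quick orientation, a minimal usage sketch of the two features this merge introduces, based only on the API visible in the diff below (`tsdb.load` and `tsdb.delete_cache`); the shapes in the comments are expectations derived from the loading code, not verified output:

```python
import tsdb

# New dataset: downloads traffic.txt.gz on first use, then reuses the local cache.
data = tsdb.load("pems_traffic")
df = data["X"]  # pandas.DataFrame: a "date" column plus 862 traffic-sensor columns

# New option: drop only the pickled cache and keep the downloaded raw files,
# e.g. after TSDB's preprocessing pipeline changes.
tsdb.delete_cache("physionet_2012", only_pickle=True)  # one dataset
tsdb.delete_cache(only_pickle=True)                     # all cached datasets
```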
4 changes: 2 additions & 2 deletions .github/workflows/testing_ci.yml
@@ -17,8 +17,8 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macOS-13]
python-version: ['3.7', '3.11']
os: [ ubuntu-latest, windows-latest, macOS-latest ]
python-version: [ "3.8","3.11" ]

steps:
- uses: actions/checkout@v3
23 changes: 12 additions & 11 deletions README.md
@@ -116,10 +116,15 @@ That's all. Simple and efficient. Enjoy it! 😃


## ❖ Citing TSDB/PyPOTS
The paper introducing PyPOTS project is available on arXiv at [this URL](https://arxiv.org/abs/2305.18811),
and we are pursuing to publish it in prestigious academic venues, e.g. JMLR (track for
[Machine Learning Open Source Software](https://www.jmlr.org/mloss/)). If you use TSDB in your work,
please cite PyPOTS project as below and 🌟star this repository to make others notice this library. 🤗 Thank you!
The paper introducing PyPOTS is available [on arXiv](https://arxiv.org/abs/2305.18811),
and a short version of it has been accepted by the 9th SIGKDD International Workshop on Mining and Learning from Time Series ([MiLeTS'23](https://kdd-milets.github.io/milets2023/)).
**Additionally**, PyPOTS has been included as a [PyTorch Ecosystem](https://pytorch.org/ecosystem/) project.
We are working to publish it in prestigious academic venues, e.g. JMLR (the track for
[Machine Learning Open Source Software](https://www.jmlr.org/mloss/)). If you use PyPOTS in your work,
please cite it as below and 🌟star this repository to help others notice this library. 🤗

There are scientific research projects that use PyPOTS and reference it in their papers.
Here is [an incomplete list of them](https://scholar.google.com/scholar?as_ylo=2022&q=%E2%80%9CPyPOTS%E2%80%9D&hl=en).

<p align="center">
<a href="https://github.com/WenjieDu/PyPOTS">
@@ -131,18 +136,14 @@ please cite PyPOTS project as below and 🌟star this repository to make others
@article{du2023pypots,
title={{PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series}},
author={Wenjie Du},
journal={arXiv preprint arXiv:2305.18811},
year={2023},
eprint={2305.18811},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2305.18811},
doi={10.48550/arXiv.2305.18811},
}
```

or
> Wenjie Du. (2023).
> PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series.
> arXiv, abs/2305.18811.https://arxiv.org/abs/2305.18811
> arXiv, abs/2305.18811. https://arxiv.org/abs/2305.18811


5 changes: 5 additions & 0 deletions dataset_profiles/pems_traffic/README.md
@@ -0,0 +1,5 @@
# PeMS Traffic

## Citing this dataset 🤗

`https://pems.dot.ca.gov`
2 changes: 2 additions & 0 deletions tests/test_tsdb.py
@@ -46,6 +46,7 @@ def test_2_dataset_loading(self):
def test_3_dataset_purging(self):
cached_datasets = tsdb.list_cache()
assert isinstance(cached_datasets, list)
tsdb.delete_cache("physionet_2012", only_pickle=True)
tsdb.delete_cache("physionet_2012") # delete single

def test_4_migrate(self):
@@ -54,6 +55,7 @@ def test_4_migrate(self):
f.write("hello world")
tsdb.migrate("dir_for_migration", "new_dir/put_it_here")
tsdb.migrate_cache("new_cache_dir")
tsdb.delete_cache(only_pickle=True) # delete all pickle cache
tsdb.delete_cache() # delete all datasets

def test_5_logging(self):
50 changes: 37 additions & 13 deletions tsdb/data_processing.py
@@ -19,6 +19,7 @@
load_ucr_uea_dataset,
load_ais,
load_italy_air_quality,
load_pems_traffic,
)
from .utils.downloading import download_and_extract
from .utils.file import purge_path, pickle_load, pickle_dump, determine_data_home
@@ -105,6 +106,8 @@ def load(dataset_name: str, use_cache: bool = True) -> dict:
result = load_italy_air_quality(dataset_saving_path)
elif dataset_name == "vessel_ais":
result = load_ais(dataset_saving_path)
elif dataset_name == "pems_traffic":
result = load_pems_traffic(dataset_saving_path)
elif "ucr_uea_" in dataset_name:
actual_dataset_name = dataset_name.replace(
"ucr_uea_", ""
@@ -150,7 +153,7 @@ def list_cache() -> list:
return dir_content


def delete_cache(dataset_name: str = None) -> None:
def delete_cache(dataset_name: str = None, only_pickle: bool = False) -> None:
"""Delete CACHED_DATASET_DIR if exists.
Parameters
@@ -159,6 +162,12 @@ def delete_cache(dataset_name: str = None) -> None:
The name of the specific dataset in database.DATABASE.
If dataset is not cached, then abort.
Delete all cached datasets if dataset_name is left as None.
only_pickle : bool,
Whether to delete only the cached pickle files.
When the preprocessing pipeline in TSDB changes, users may want to delete only the cached pickle files
generated by the old pipeline while keeping the downloaded raw data. This option is designed for that purpose.
"""
# if CACHED_DATASET_DIR does not exist, abort
if not os.path.exists(CACHED_DATASET_DIR):
@@ -169,20 +178,35 @@ def delete_cache(dataset_name: str = None) -> None:
logger.info(
f"`dataset_name` not given. Purging all cached data under {CACHED_DATASET_DIR}..."
)
purge_path(CACHED_DATASET_DIR)
os.makedirs(CACHED_DATASET_DIR)
if only_pickle:
for cached_dataset in os.listdir(CACHED_DATASET_DIR):
for file in os.listdir(
os.path.join(CACHED_DATASET_DIR, cached_dataset)
):
if file.endswith(".pkl"):
purge_path(
os.path.join(CACHED_DATASET_DIR, cached_dataset, file)
)
else:
purge_path(CACHED_DATASET_DIR)
os.makedirs(CACHED_DATASET_DIR)
else:
assert (
dataset_name in AVAILABLE_DATASETS
), f"{dataset_name} is not available in TSDB, so it has no cache. Please check your dataset name."
dir_to_delete = os.path.join(CACHED_DATASET_DIR, dataset_name)
if not os.path.exists(dir_to_delete):
logger.error(
f"❌ Dataset {dataset_name} is not cached. Operation aborted."
)
return
if only_pickle:
for file in os.listdir(os.path.join(CACHED_DATASET_DIR, dataset_name)):
if file.endswith(".pkl"):
purge_path(os.path.join(CACHED_DATASET_DIR, dataset_name, file))
else:
logger.info(
f"Purging cached dataset {dataset_name} under {dir_to_delete}..."
)
purge_path(dir_to_delete)
dir_to_delete = os.path.join(CACHED_DATASET_DIR, dataset_name)
if not os.path.exists(dir_to_delete):
logger.error(
f"❌ Dataset {dataset_name} is not cached. Operation aborted."
)
return
else:
logger.info(
f"Purging cached dataset {dataset_name} under {dir_to_delete}..."
)
purge_path(dir_to_delete)
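
In short, when `only_pickle=True` the function removes only files ending in `.pkl` (for one dataset, or for every cached dataset), leaving the downloaded raw data in place. Below is a condensed, self-contained sketch of that behaviour written with `pathlib` purely for illustration; it is not the library's implementation, which uses `os.listdir` and `purge_path` as shown above:

```python
from pathlib import Path

def delete_pickle_cache(cache_dir: str, dataset_name: str = None) -> None:
    # Illustrative equivalent of the `only_pickle` branches above.
    root = Path(cache_dir)
    targets = [root / dataset_name] if dataset_name else [d for d in root.iterdir() if d.is_dir()]
    for dataset_dir in targets:
        for pkl_file in dataset_dir.glob("*.pkl"):
            pkl_file.unlink()  # TSDB routes deletion through purge_path() instead
```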
3 changes: 3 additions & 0 deletions tsdb/database.py
@@ -43,6 +43,9 @@
"https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/ETTh1.csv",
"https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/ETTh2.csv",
],
# https://pems.dot.ca.gov
"pems_traffic": "https://raw.githubusercontent.com/laiguokun/multivariate-time-series-data/master/"
"traffic/traffic.txt.gz",
}


2 changes: 2 additions & 0 deletions tsdb/loading_funcs/__init__.py
@@ -13,6 +13,7 @@
from .physionet_2019 import load_physionet2019
from .ucr_uea_datasets import load_ucr_uea_dataset
from .vessel_ais import load_ais
from .pems_traffic import load_pems_traffic

__all__ = [
"load_beijing_air_quality",
@@ -23,4 +24,5 @@
"load_ais",
"load_ett",
"load_italy_air_quality",
"load_pems_traffic",
]
48 changes: 48 additions & 0 deletions tsdb/loading_funcs/pems_traffic.py
@@ -0,0 +1,48 @@
"""
Scripts related to dataset PeMS Traffic.
For more information please refer to:
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/pems_traffic
"""

# Created by Wenjie Du <wenjay.du@gmail.com>
# License: BSD-3-Clause

import os

import pandas as pd


def load_pems_traffic(local_path):
"""Load dataset PeMS Traffic.
Parameters
----------
local_path : str,
The local path of the dir saving the raw data of PeMS Traffic.
Returns
-------
data : dict
A dictionary containing X:
X : pandas.DataFrame
The time-series data of PeMS Traffic.
"""
dir_path = os.path.join(local_path, "traffic.txt")

# make column names
col_names = [str(i) for i in range(862)]
df = pd.read_csv(dir_path, index_col=None, names=col_names)
date = pd.date_range(
start="2015-01-01 00:00:00",
end="2016-12-31 23:00:00",
freq="H",
)
df["date"] = date
col_names.insert(0, "date")
df = df[col_names]

data = {
"X": df,
}
return data
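
A quick sanity check of the hard-coded index above: from 2015-01-01 00:00 to 2016-12-31 23:00 at hourly frequency there are 365×24 + 366×24 = 17,544 timestamps (2016 is a leap year), so `traffic.txt` is expected to hold exactly 17,544 rows of 862 readings; otherwise the `df["date"] = date` assignment would fail with a length mismatch. A standalone check, assuming only pandas:

```python
import pandas as pd

# Rebuild the same hourly index that load_pems_traffic() attaches to the data.
date = pd.date_range(
    start="2015-01-01 00:00:00",
    end="2016-12-31 23:00:00",
    freq="H",
)

# 365*24 + 366*24 = 17,544 hourly steps (2016 is a leap year).
assert len(date) == 17_544
```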
22 changes: 15 additions & 7 deletions tsdb/utils/downloading.py
@@ -5,15 +5,16 @@
# Created by Wenjie Du <wenjay.du@gmail.com>
# License: BSD-3-Clause

import gzip
import os
import shutil
import tempfile
import urllib.request
import warnings
from typing import Optional

from ..database import DATABASE
from .logging import logger
from ..database import DATABASE


def _download_and_extract(url: str, saving_path: str) -> Optional[str]:
@@ -69,13 +70,20 @@ def _download_and_extract(url: str, saving_path: str) -> Optional[str]:
if suffix in supported_compression_format:
try:
os.makedirs(saving_path, exist_ok=True)
shutil.unpack_archive(raw_data_saving_path, saving_path)
if ".txt.gz" in file_name:
new_name = file_name.split(".txt.gz")[0]
new_name = new_name + ".txt"
saving_path = os.path.join(saving_path, new_name)
with open(raw_data_saving_path, "rb") as f, open(
saving_path, "wb"
) as wf:
wf.write(gzip.decompress(f.read()))
else:
shutil.unpack_archive(raw_data_saving_path, saving_path)
logger.info(f"Successfully extracted data to {saving_path}")
except shutil.Error:
warnings.warn(
"The compressed file is corrupted, aborting.", category=RuntimeWarning
)
return None
except Exception as e:
shutil.rmtree(saving_path, ignore_errors=True)
raise RuntimeError(f"❌ {e}")
finally:
shutil.rmtree(tmp_dir, ignore_errors=True)
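
The `.txt.gz` branch above is needed because `shutil.unpack_archive` only handles multi-file archive formats (zip, tar, tar.gz, ...); a bare `.gz` file such as `traffic.txt.gz` is a single compressed stream, not an archive. A minimal standalone sketch of the same step, shown here with streaming decompression instead of reading the whole file into memory (an alternative illustration, not the committed code):

```python
import gzip
import shutil

def decompress_txt_gz(src_path: str, dst_path: str) -> None:
    # e.g. decompress_txt_gz("traffic.txt.gz", "traffic.txt")
    with gzip.open(src_path, "rb") as src, open(dst_path, "wb") as dst:
        shutil.copyfileobj(src, dst)  # stream decompressed bytes to disk
```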

4 changes: 2 additions & 2 deletions tsdb/utils/file.py
@@ -87,7 +87,7 @@ def purge_path(path: str, ignore_errors: bool = True) -> None:
os.remove(path)
# check if succeed
if not os.path.exists(path):
logger.info(f"Successfully deleted {path}.")
logger.info(f"Successfully deleted {path}")
else:
cached_dataset_dir = determine_data_home()
raise FileExistsError(
@@ -126,7 +126,7 @@ def determine_data_home():
if os.path.abspath(data_home_path) != os.path.abspath(default_path):
logger.warning(
f"‼️ The preset data_home path '{data_home_path}' doesn't exist. "
f"Using the default path '{default_path}'."
f"Using the default path '{default_path}'"
)
return cached_dataset_dir

Expand Down
