diff --git a/README.md b/README.md index 51c5111..2b5fe40 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@
a Python toolbox to ease loading 172 public time-series datasets
* +*load 172 public time-series datasets with a single line of code ;-)
*@@ -67,21 +67,38 @@ if it helps with your research. This really means a lot to our open-source resea ## ❖ Usage Examples -TSDB now is available on Anaconda❗️ +> [!IMPORTANT] +> TSDB is available on both PyPI +> and Anaconda❗️ +> +> Install via pip: +> > pip install tsdb +> +> or install from source code: +> > pip install `https://github.com/WenjieDu/TSDB/archive/main.zip` +> +> or install via conda: +> > conda install tsdb -c conda-forge -Install it with `conda install tsdb `, you may need to specify the channel with option `-c conda-forge` - -or install via PyPI: -> pip install tsdb - -or install from source code: -> pip install `https://github.com/WenjieDu/TSDB/archive/main.zip` ```python import tsdb # list all available datasets in TSDB tsdb.list() +# ['physionet_2012', +# 'physionet_2019', +# 'electricity_load_diagrams', +# 'beijing_multisite_air_quality', +# 'italy_air_quality', +# 'vessel_ais', +# 'electricity_transformer_temperature', +# 'pems_traffic', +# 'solar_alabama', +# 'ucr_uea_ACSF1', +# 'ucr_uea_Adiac', +# ... 
+ # select the dataset you need and load it, TSDB will download, extract, and process it automatically data = tsdb.load('physionet_2012') # if you need the raw data, use download_and_extract() diff --git a/requirements.txt b/requirements.txt index fe720b6..3be5b24 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ +tqdm numpy -scikit-learn -pandas scipy +pandas pyarrow +requests +scikit-learn diff --git a/setup.cfg b/setup.cfg index 5d5f94d..4ef5c4c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,8 +23,10 @@ extend-ignore = # basic dependencies basic = + tqdm numpy - scikit-learn - pandas scipy + pandas pyarrow + requests + scikit-learn diff --git a/setup.py b/setup.py index 5e9741d..3fdc7d8 100644 --- a/setup.py +++ b/setup.py @@ -44,11 +44,13 @@ packages=find_packages(exclude=["tests"]), include_package_data=True, install_requires=[ + "tqdm", "numpy", - "scikit-learn", - "pandas", "scipy", + "pandas", "pyarrow", + "requests", + "scikit-learn", ], setup_requires=["setuptools>=38.6.0"], classifiers=[ diff --git a/tsdb/utils/downloading.py b/tsdb/utils/downloading.py index 0f7a00a..451d84f 100644 --- a/tsdb/utils/downloading.py +++ b/tsdb/utils/downloading.py @@ -9,10 +9,12 @@ import os import shutil import tempfile -import urllib.request import warnings from typing import Optional +import requests +from tqdm import tqdm + from .logging import logger from ..database import DATABASE @@ -54,7 +56,27 @@ def _download_and_extract(url: str, saving_path: str) -> Optional[str]: # download and save the raw dataset try: - urllib.request.urlretrieve(url, raw_data_saving_path) + with requests.get(url, stream=True) as r: + r.raise_for_status() + chunk_size = 8192 + try: + size = int(r.headers["Content-Length"]) + except KeyError: + size = None + + with tqdm( + unit="B", + unit_scale=True, + unit_divisor=1024, + miniters=1, + desc=f"Downloading {file_name}", + total=size, + ) as pbar: + with open(raw_data_saving_path, "wb") as f: + for chunk in 
r.iter_content(chunk_size=chunk_size): + f.write(chunk) + pbar.update(len(chunk)) + except Exception as e: shutil.rmtree(saving_path, ignore_errors=True) shutil.rmtree(raw_data_saving_path, ignore_errors=True)