diff --git a/README.md b/README.md index 5dc8c2e..11f2854 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@

Welcome to TSDB

-*

a Python toolbox to ease loading public time-series datasets

* +*

a Python toolbox to ease loading 172 public time-series datasets

*

@@ -46,7 +46,7 @@

-> 📣 TSDB now supports a total of 1️⃣7️⃣0️⃣ time-series datasets ‼️ +> 📣 TSDB now supports a total of 1️⃣7️⃣2️⃣ time-series datasets ‼️ TSDB is a part of @@ -88,7 +88,9 @@ data = tsdb.load('physionet_2012') tsdb.download_and_extract('physionet_2012', './save_it_here') # datasets you once loaded are cached, and you can check them with list_cached_data() tsdb.list_cache() -# you can delete only one specific dataset and preserve others +# you can delete only one specific dataset's pickled cache +tsdb.delete_cache(dataset_name='physionet_2012', only_pickle=True) +# you can delete only one specific dataset's raw files and preserve others tsdb.delete_cache(dataset_name='physionet_2012') # or you can delete all cache with delete_cached_data() to free disk space tsdb.delete_cache() @@ -112,6 +114,8 @@ That's all. Simple and efficient. Enjoy it! 😃 | [Electricity Load Diagrams](dataset_profiles/electricity_load_diagrams) | Forecasting, Imputation | | [Electricity Transformer Temperature (ETT)](dataset_profiles/electricity_transformer_temperature) | Forecasting, Imputation | | [Vessel AIS](dataset_profiles/vessel_ais) | Forecasting, Imputation, Classification | +| [PeMS Traffic](dataset_profiles/pems_traffic) | Forecasting, Imputation | +| [Solar Alabama](dataset_profiles/solar_alabama) | Forecasting, Imputation | | [UCR & UEA Datasets](dataset_profiles/ucr_uea_datasets) (all 163 datasets) | Classification | diff --git a/dataset_profiles/solar_alabama/README.md b/dataset_profiles/solar_alabama/README.md new file mode 100644 index 0000000..dd05371 --- /dev/null +++ b/dataset_profiles/solar_alabama/README.md @@ -0,0 +1,5 @@ +# Solar Alabama + +## Citing this dataset 🤗 + +`https://www.nrel.gov/grid/solar-power-data.html` diff --git a/setup.py b/setup.py index 3f59c1d..5e9741d 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name="tsdb", version=__version__, - description="TSDB (Time Series Data Beans): a Python toolbox helping load open-source time-series datasets", + 
description="TSDB (Time Series Data Beans): a Python toolbox helping load 172 open-source time-series datasets", long_description=README, long_description_content_type="text/markdown", license="BSD-3-Clause", diff --git a/tests/test_tsdb.py b/tests/test_tsdb.py index 9925829..f09600a 100644 --- a/tests/test_tsdb.py +++ b/tests/test_tsdb.py @@ -16,9 +16,12 @@ "physionet_2012", "physionet_2019", "beijing_multisite_air_quality", + "italy_air_quality", "electricity_load_diagrams", "electricity_transformer_temperature", "vessel_ais", + "pems_traffic", + "solar_alabama", "ucr_uea_Wine", ] diff --git a/tsdb/data_processing.py b/tsdb/data_processing.py index f1b48cb..a9e3514 100644 --- a/tsdb/data_processing.py +++ b/tsdb/data_processing.py @@ -20,6 +20,7 @@ load_ais, load_italy_air_quality, load_pems_traffic, + load_solar_alabama, ) from .utils.downloading import download_and_extract from .utils.file import purge_path, pickle_load, pickle_dump, determine_data_home @@ -108,6 +109,8 @@ def load(dataset_name: str, use_cache: bool = True) -> dict: result = load_ais(dataset_saving_path) elif dataset_name == "pems_traffic": result = load_pems_traffic(dataset_saving_path) + elif dataset_name == "solar_alabama": + result = load_solar_alabama(dataset_saving_path) elif "ucr_uea_" in dataset_name: actual_dataset_name = dataset_name.replace( "ucr_uea_", "" diff --git a/tsdb/database.py b/tsdb/database.py index 5669ff0..80530d7 100644 --- a/tsdb/database.py +++ b/tsdb/database.py @@ -43,9 +43,12 @@ "https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/ETTh1.csv", "https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/ETTh2.csv", ], - # https://pems.dot.ca.gov + # https://pems.dot.ca.gov, https://github.com/laiguokun/multivariate-time-series-data "pems_traffic": "https://raw.githubusercontent.com/laiguokun/multivariate-time-series-data/master/" "traffic/traffic.txt.gz", + # https://www.nrel.gov/grid/solar-power-data.html, 
https://github.com/laiguokun/multivariate-time-series-data + "solar_alabama": "https://raw.githubusercontent.com/laiguokun/multivariate-time-series-data/master/" + "solar-energy/solar_AL.txt.gz", } diff --git a/tsdb/loading_funcs/__init__.py b/tsdb/loading_funcs/__init__.py index ac28687..ddeafd8 100644 --- a/tsdb/loading_funcs/__init__.py +++ b/tsdb/loading_funcs/__init__.py @@ -14,6 +14,7 @@ from .ucr_uea_datasets import load_ucr_uea_dataset from .vessel_ais import load_ais from .pems_traffic import load_pems_traffic +from .solar_alabama import load_solar_alabama __all__ = [ "load_beijing_air_quality", @@ -25,4 +26,5 @@ "load_ett", "load_italy_air_quality", "load_pems_traffic", + "load_solar_alabama", ] diff --git a/tsdb/loading_funcs/solar_alabama.py b/tsdb/loading_funcs/solar_alabama.py new file mode 100644 index 0000000..674cf28 --- /dev/null +++ b/tsdb/loading_funcs/solar_alabama.py @@ -0,0 +1,50 @@ +""" +Scripts related to dataset Solar Alabama. It contains the solar power production records in the year 2006, +which are sampled every 10 minutes from 137 PV plants in Alabama State. +https://www.nrel.gov/grid/solar-power-data.html + +For more information please refer to: +https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/solar_alabama +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + +import os + +import pandas as pd + + +def load_solar_alabama(local_path): + """Load dataset Solar Alabama. + + Parameters + ---------- + local_path : str, + The local path of the directory holding the raw data of Solar Alabama. + + Returns + ------- + data : dict + A dictionary containing X: + X : pandas.DataFrame + The time-series data of Solar Alabama. 
+ """ + dir_path = os.path.join(local_path, "solar_AL.txt") + + # make column names + col_names = [str(i) for i in range(137)] + df = pd.read_csv(dir_path, index_col=None, names=col_names) + date = pd.date_range( + start="2006-01-01 00:00:00", + end="2006-12-31 23:50:00", + freq="10min", + ) + df["date"] = date + col_names.insert(0, "date") + df = df[col_names] + + data = { + "X": df, + } + return data