Skip to content

Commit

Permalink
Merge pull request #61 from WenjieDu/dev
Browse files Browse the repository at this point in the history
Add dataset `Italy Air Quality`
  • Loading branch information
WenjieDu authored May 26, 2024
2 parents 13664f7 + 3d939d1 commit 18264e8
Show file tree
Hide file tree
Showing 9 changed files with 85 additions and 15 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/testing_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macOS-latest]
os: [ubuntu-latest, windows-latest, macOS-13]
python-version: ['3.7', '3.11']

steps:
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
</a>
</p>

> 📣 TSDB now supports a total of 1️⃣6️⃣9️⃣ time-series datasets ‼️
> 📣 TSDB now supports a total of 1️⃣7️⃣0️⃣ time-series datasets ‼️
<a href='https://github.com/WenjieDu/PyPOTS'><img src='https://pypots.com/figs/pypots_logos/PyPOTS/logo_FFBG.svg' width='160' align='left' /></a>
TSDB is a part of
Expand Down Expand Up @@ -108,6 +108,7 @@ That's all. Simple and efficient. Enjoy it! 😃
| [PhysioNet Challenge 2012](dataset_profiles/physionet_2012) | Forecasting, Imputation, Classification |
| [PhysioNet Challenge 2019](dataset_profiles/physionet_2019) | Forecasting, Imputation, Classification |
| [Beijing Multi-Site Air-Quality](dataset_profiles/beijing_multisite_air_quality) | Forecasting, Imputation |
| [Italy Air Quality](dataset_profiles/italy_air_quality) | Forecasting, Imputation |
| [Electricity Load Diagrams](dataset_profiles/electricity_load_diagrams) | Forecasting, Imputation |
| [Electricity Transformer Temperature (ETT)](dataset_profiles/electricity_transformer_temperature) | Forecasting, Imputation |
| [Vessel AIS](dataset_profiles/vessel_ais) | Forecasting, Imputation, Classification |
Expand All @@ -121,7 +122,7 @@ and we are pursuing to publish it in prestigious academic venues, e.g. JMLR (tra
please cite PyPOTS project as below and 🌟star this repository to make others notice this library. 🤗 Thank you!

<p align="center">
<a href="https://pypots.com/ecosystem/">
<a href="https://github.com/WenjieDu/PyPOTS">
<img src="https://pypots.com/figs/pypots_logos/Ecosystem/PyPOTS_Ecosystem_Pipeline.png" width="95%"/>
</a>
</p>
Expand Down
17 changes: 17 additions & 0 deletions dataset_profiles/italy_air_quality/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Italy Air Quality

## Citing this dataset 🤗

`Vito, Saverio. (2016). Air Quality. UCI Machine Learning Repository. https://doi.org/10.24432/C59K5F`

or

```bibtex
@misc{vito2016air,
author = {Vito, Saverio},
title = {{Air Quality}},
year = {2016},
howpublished = {UCI Machine Learning Repository},
note = {{DOI}: https://doi.org/10.24432/C59K5F}
}
```
2 changes: 1 addition & 1 deletion tsdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
__version__ = "0.3.1"
__version__ = "0.4"

from .data_processing import (
CACHED_DATASET_DIR,
Expand Down
3 changes: 3 additions & 0 deletions tsdb/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
load_beijing_air_quality,
load_ucr_uea_dataset,
load_ais,
load_italy_air_quality,
)
from .utils.downloading import download_and_extract
from .utils.file import purge_path, pickle_load, pickle_dump, determine_data_home
Expand Down Expand Up @@ -100,6 +101,8 @@ def load(dataset_name: str, use_cache: bool = True) -> dict:
result = load_ett(dataset_saving_path)
elif dataset_name == "beijing_multisite_air_quality":
result = load_beijing_air_quality(dataset_saving_path)
elif dataset_name == "italy_air_quality":
result = load_italy_air_quality(dataset_saving_path)
elif dataset_name == "vessel_ais":
result = load_ais(dataset_saving_path)
elif "ucr_uea_" in dataset_name:
Expand Down
2 changes: 2 additions & 0 deletions tsdb/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
# https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/beijing_multisite_air_quality
"beijing_multisite_air_quality": "https://archive.ics.uci.edu/ml/machine-learning-databases/00501/"
"PRSA2017_Data_20130301-20170228.zip",
# https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/italy_air_quality
"italy_air_quality": "https://archive.ics.uci.edu/static/public/360/air+quality.zip",
#
# https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/vessel_ais
"vessel_ais": "https://zenodo.org/record/8064564/files/parquets.zip",
Expand Down
4 changes: 3 additions & 1 deletion tsdb/loading_funcs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@

from .beijing_multisite_air_quality import load_beijing_air_quality
from .electricity_load_diagrams import load_electricity
from .electricity_transformer_temperature import load_ett
from .italy_air_quality import load_italy_air_quality
from .physionet_2012 import load_physionet2012
from .physionet_2019 import load_physionet2019
from .ucr_uea_datasets import load_ucr_uea_dataset
from .vessel_ais import load_ais
from .electricity_transformer_temperature import load_ett

__all__ = [
"load_beijing_air_quality",
Expand All @@ -21,4 +22,5 @@
"load_ucr_uea_dataset",
"load_ais",
"load_ett",
"load_italy_air_quality",
]
41 changes: 41 additions & 0 deletions tsdb/loading_funcs/italy_air_quality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""
Scripts related to dataset Italy Air Quality.
For more information please refer to:
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/italy_air_quality
"""

# Created by Wenjie Du <wenjay.du@gmail.com>
# License: BSD-3-Clause

import os

import pandas as pd


def load_italy_air_quality(local_path):
    """Load dataset Italy Air Quality.

    Parameters
    ----------
    local_path : str,
        The local path of dir saving the raw data of Italy Air Quality.

    Returns
    -------
    data : dict
        A dictionary contains X:
        X : pandas.DataFrame
            The time-series data of Italy Air Quality.
    """
    file_path = os.path.join(local_path, "AirQualityUCI.csv")
    # the raw UCI CSV uses ';' as the field separator and ',' as the decimal mark
    df = pd.read_csv(file_path, sep=";", decimal=",")
    # remove empty columns (every data line ends with ';;', so pandas creates
    # two unnamed, all-NaN columns at positions 15 and 16)
    df.drop(columns=["Unnamed: 15", "Unnamed: 16"], inplace=True)
    # remove rows with all NaN, i.e. Date is NaN
    df = df[~df["Date"].isna()]

    data = {
        "X": df,
    }
    return data
24 changes: 14 additions & 10 deletions tsdb/loading_funcs/physionet_2012.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,11 @@ def load_physionet2012(local_path):
) # ensure RecordID's type is int
outcome = outcome.set_index("RecordID")
outcome_collector.append(outcome)
y = pd.concat(outcome_collector)

df_collector = []

# iterate over all samples
set_collector = []
for m_ in time_series_measurements_dir:
df_collector = []
raw_data_dir = os.path.join(local_path, m_)
for filename in os.listdir(raw_data_dir):
recordID = int(filename.split(".txt")[0])
Expand All @@ -80,11 +79,16 @@ def load_physionet2012(local_path):
df_temp["Age"] = df_temp.loc[0, "Age"]
df_temp["Height"] = df_temp.loc[0, "Height"]
df_collector.append(df_temp)

df = pd.concat(df_collector, sort=True)
X = df.reset_index(drop=True)
unique_ids = df["RecordID"].unique()
y = y.loc[unique_ids]

data = {"X": X, "y": y, "static_features": ["Age", "Gender", "ICUType", "Height"]}
df = pd.concat(df_collector, sort=True)
set_collector.append(df)

data = {
"set-a": set_collector[0],
"set-b": set_collector[1],
"set-c": set_collector[2],
"outcomes-a": outcome_collector[0],
"outcomes-b": outcome_collector[1],
"outcomes-c": outcome_collector[2],
"static_features": ["Age", "Gender", "ICUType", "Height"],
}
return data

0 comments on commit 18264e8

Please sign in to comment.