Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dataset Italy Air Quality #61

Merged
merged 6 commits into from
May 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/testing_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macOS-latest]
os: [ubuntu-latest, windows-latest, macOS-13]
python-version: ['3.7', '3.11']

steps:
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
</a>
</p>

> 📣 TSDB now supports a total of 1️⃣6️⃣9️⃣ time-series datasets ‼️
> 📣 TSDB now supports a total of 1️⃣7️⃣0️⃣ time-series datasets ‼️
<a href='https://github.com/WenjieDu/PyPOTS'><img src='https://pypots.com/figs/pypots_logos/PyPOTS/logo_FFBG.svg' width='160' align='left' /></a>
TSDB is a part of
Expand Down Expand Up @@ -108,6 +108,7 @@ That's all. Simple and efficient. Enjoy it! 😃
| [PhysioNet Challenge 2012](dataset_profiles/physionet_2012) | Forecasting, Imputation, Classification |
| [PhysioNet Challenge 2019](dataset_profiles/physionet_2019) | Forecasting, Imputation, Classification |
| [Beijing Multi-Site Air-Quality](dataset_profiles/beijing_multisite_air_quality) | Forecasting, Imputation |
| [Italy Air Quality](dataset_profiles/italy_air_quality) | Forecasting, Imputation |
| [Electricity Load Diagrams](dataset_profiles/electricity_load_diagrams) | Forecasting, Imputation |
| [Electricity Transformer Temperature (ETT)](dataset_profiles/electricity_transformer_temperature) | Forecasting, Imputation |
| [Vessel AIS](dataset_profiles/vessel_ais) | Forecasting, Imputation, Classification |
Expand All @@ -121,7 +122,7 @@ and we are pursuing to publish it in prestigious academic venues, e.g. JMLR (tra
please cite PyPOTS project as below and 🌟star this repository to make others notice this library. 🤗 Thank you!

<p align="center">
<a href="https://pypots.com/ecosystem/">
<a href="https://github.com/WenjieDu/PyPOTS">
<img src="https://pypots.com/figs/pypots_logos/Ecosystem/PyPOTS_Ecosystem_Pipeline.png" width="95%"/>
</a>
</p>
Expand Down
17 changes: 17 additions & 0 deletions dataset_profiles/italy_air_quality/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Italy Air Quality

## Citing this dataset 🤗

`Vito, Saverio. (2016). Air Quality. UCI Machine Learning Repository. https://doi.org/10.24432/C59K5F`

or

```bibtex
@misc{vito2016air,
author = {Vito,Saverio},
title = {{Air Quality}},
year = {2016},
howpublished = {UCI Machine Learning Repository},
note = {{DOI}: https://doi.org/10.24432/C59K5F}
}
```
2 changes: 1 addition & 1 deletion tsdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
__version__ = "0.3.1"
__version__ = "0.4"

from .data_processing import (
CACHED_DATASET_DIR,
Expand Down
3 changes: 3 additions & 0 deletions tsdb/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
load_beijing_air_quality,
load_ucr_uea_dataset,
load_ais,
load_italy_air_quality,
)
from .utils.downloading import download_and_extract
from .utils.file import purge_path, pickle_load, pickle_dump, determine_data_home
Expand Down Expand Up @@ -100,6 +101,8 @@ def load(dataset_name: str, use_cache: bool = True) -> dict:
result = load_ett(dataset_saving_path)
elif dataset_name == "beijing_multisite_air_quality":
result = load_beijing_air_quality(dataset_saving_path)
elif dataset_name == "italy_air_quality":
result = load_italy_air_quality(dataset_saving_path)
elif dataset_name == "vessel_ais":
result = load_ais(dataset_saving_path)
elif "ucr_uea_" in dataset_name:
Expand Down
2 changes: 2 additions & 0 deletions tsdb/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
# https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/beijing_multisite_air_quality
"beijing_multisite_air_quality": "https://archive.ics.uci.edu/ml/machine-learning-databases/00501/"
"PRSA2017_Data_20130301-20170228.zip",
# https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/italy_air_quality
"italy_air_quality": "https://archive.ics.uci.edu/static/public/360/air+quality.zip",
#
# https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/vessel_ais
"vessel_ais": "https://zenodo.org/record/8064564/files/parquets.zip",
Expand Down
4 changes: 3 additions & 1 deletion tsdb/loading_funcs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@

from .beijing_multisite_air_quality import load_beijing_air_quality
from .electricity_load_diagrams import load_electricity
from .electricity_transformer_temperature import load_ett
from .italy_air_quality import load_italy_air_quality
from .physionet_2012 import load_physionet2012
from .physionet_2019 import load_physionet2019
from .ucr_uea_datasets import load_ucr_uea_dataset
from .vessel_ais import load_ais
from .electricity_transformer_temperature import load_ett

__all__ = [
"load_beijing_air_quality",
Expand All @@ -21,4 +22,5 @@
"load_ucr_uea_dataset",
"load_ais",
"load_ett",
"load_italy_air_quality",
]
41 changes: 41 additions & 0 deletions tsdb/loading_funcs/italy_air_quality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""
Scripts related to dataset Italy Air Quality.
For more information please refer to:
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/italy_air_quality
"""

# Created by Wenjie Du <wenjay.du@gmail.com>
# License: BSD-3-Clause

import os

import pandas as pd


def load_italy_air_quality(local_path: str) -> dict:
    """Load dataset Italy Air Quality.

    Reads ``AirQualityUCI.csv`` from ``local_path`` (semicolon-separated,
    with a comma as the decimal mark), removes the empty trailing columns
    and the rows that carry no ``Date`` value.

    Parameters
    ----------
    local_path : str,
        The local path of dir saving the raw data of Italy Air Quality.

    Returns
    -------
    data : dict
        A dictionary containing X:
            X : pandas.DataFrame
                The time-series data of Italy Air Quality.
    """
    file_path = os.path.join(local_path, "AirQualityUCI.csv")
    df = pd.read_csv(file_path, sep=";", decimal=",")

    # Remove the empty columns that pandas creates for the trailing ";;" on
    # each line. `errors="ignore"` keeps the loader working when a copy of the
    # file has no such trailing separators (the columns are then absent).
    df = df.drop(columns=["Unnamed: 15", "Unnamed: 16"], errors="ignore")

    # Remove rows with all NaN, i.e. rows whose Date is NaN.
    df = df.dropna(subset=["Date"])

    data = {
        "X": df,
    }
    return data
24 changes: 14 additions & 10 deletions tsdb/loading_funcs/physionet_2012.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,11 @@ def load_physionet2012(local_path):
) # ensure RecordID's type is int
outcome = outcome.set_index("RecordID")
outcome_collector.append(outcome)
y = pd.concat(outcome_collector)

df_collector = []

# iterate over all samples
set_collector = []
for m_ in time_series_measurements_dir:
df_collector = []
raw_data_dir = os.path.join(local_path, m_)
for filename in os.listdir(raw_data_dir):
recordID = int(filename.split(".txt")[0])
Expand All @@ -80,11 +79,16 @@ def load_physionet2012(local_path):
df_temp["Age"] = df_temp.loc[0, "Age"]
df_temp["Height"] = df_temp.loc[0, "Height"]
df_collector.append(df_temp)

df = pd.concat(df_collector, sort=True)
X = df.reset_index(drop=True)
unique_ids = df["RecordID"].unique()
y = y.loc[unique_ids]

data = {"X": X, "y": y, "static_features": ["Age", "Gender", "ICUType", "Height"]}
df = pd.concat(df_collector, sort=True)
set_collector.append(df)

data = {
"set-a": set_collector[0],
"set-b": set_collector[1],
"set-c": set_collector[2],
"outcomes-a": outcome_collector[0],
"outcomes-b": outcome_collector[1],
"outcomes-c": outcome_collector[2],
"static_features": ["Age", "Gender", "ICUType", "Height"],
}
return data
Loading