datasets.py
"""Module to get datasets in onelens. onelens_pycaret
"""
from typing import Optional
import requests
from onelens.onelens_pycaret.utils._dependencies import _check_soft_dependencies


def get_data(
dataset: str = "index",
folder: Optional[str] = None,
save_copy: bool = False,
profile: bool = False,
verbose: bool = True,
address: Optional[str] = None,
):
"""
    Function to load sample datasets.

    Order of read:
    (1) Tries to read the dataset from the local folder first.
    (2) Then tries to read the dataset from the folder at the GitHub "address" (see below).
    (3) Then tries to read it from sktime (if installed).
    (4) Raises an error if none of these exist.

    The list of available datasets on GitHub can be checked using
    (1) ``get_data('index')`` or
    (2) ``get_data('index', folder='time_series/seasonal')``
    (see the available "folder" options below).
Example
-------
>>> from onelens.onelens_pycaret.datasets import get_data
>>> all_datasets = get_data('index')
>>> juice = get_data('juice')
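
    To list the datasets available in a specific folder (see the "folder"
    options below), point ``get_data('index', ...)`` at that folder:

    >>> ts_index = get_data('index', folder='time_series/seasonal')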
    Parameters
    ----------
    dataset: str, default = 'index'
        Index value of the dataset.

    folder: Optional[str], default = None
        The folder from which to get the data.
        If None, gets it from the "common" folder. Other options are:

        - time_series/seasonal
        - time_series/random_walk
        - time_series/white_noise

    save_copy: bool, default = False
        When set to True, a copy of the dataset is saved in the current working directory.

    profile: bool, default = False
        When set to True, an interactive EDA report is displayed.

    verbose: bool, default = True
        When set to False, the head of the data is not displayed.

    address: Optional[str], default = None
        Download URL of the dataset. Defaults to None, which fetches the dataset from
        "https://raw.githubusercontent.com/pycaret/datasets/main/". Users who have
        difficulty reaching GitHub can change the default address to their own mirror
        (e.g. "https://gitee.com/IncubatorShokuhou/pycaret/raw/master/datasets/").

    Returns
    -------
    pandas.DataFrame

    Warnings
    --------
    - Use of ``get_data`` requires an internet connection.

    Raises
    ------
    ImportError
        When trying to load time series datasets that require sktime,
        but sktime has not been installed.
    ValueError
        When the dataset cannot be found locally, on GitHub, or in sktime.
    """
import os.path
import pandas as pd
from onelens.onelens_pycaret.internal.display import CommonDisplay
extension = ".csv"
filename = str(dataset) + extension
if address is None:
root = "https://raw.githubusercontent.com/pycaret/datasets/main/"
data_dir, meta_dir = "data/", "meta/"
folder = "common" if folder is None else folder
if dataset == "index":
complete_address = root + meta_dir + folder + "/" + filename
else:
complete_address = root + data_dir + folder + "/" + filename
else:
complete_address = address + "/" + filename
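
    # At this point complete_address points at a single CSV file, e.g.
    # "https://raw.githubusercontent.com/pycaret/datasets/main/data/common/juice.csv"
    # when the default root and folder are used.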
sktime_datasets = ["airline", "lynx", "uschange"]
# Read the file name from local folder first
# If it does not exist, then read the file from GitHub
# If that does not exist then read sktime datasets
if os.path.isfile(filename):
data = pd.read_csv(filename)
elif requests.get(complete_address).status_code == 200:
data = pd.read_csv(complete_address)
elif dataset in sktime_datasets:
from sktime.datasets import load_airline, load_lynx, load_uschange
ts_dataset_mapping = {
"airline": load_airline,
"lynx": load_lynx,
"uschange": load_uschange,
}
data = ts_dataset_mapping.get(dataset)()
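        # Some sktime loaders (e.g. load_uschange) return a (y, X) tuple of the
        # target and exogenous variables; combine them into a single DataFrame.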
if isinstance(data, tuple):
y = data[0]
X = data[1]
data = pd.concat([y, X], axis=1)
else:
raise ValueError("Data could not be read. Please check your inputs...")
data = data.infer_objects()
if save_copy:
save_name = filename
data.to_csv(save_name, index=False)
display = CommonDisplay(
verbose=verbose,
html_param=True,
)
if dataset == "index":
display.display(data)
else:
if profile:
_check_soft_dependencies(
"ydata_profiling",
extra="analysis",
severity="error",
install_name="ydata-profiling",
)
import ydata_profiling
# create a copy for pandas profiler
data_for_profiling = data.copy()
pf = ydata_profiling.ProfileReport(data_for_profiling)
display.display(pf)
else:
if verbose:
display.display(data.head())
return data
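

if __name__ == "__main__":
    # Minimal usage sketch (requires an internet connection unless the CSV is
    # already present in the working directory).
    index = get_data("index")                 # list the datasets available on GitHub
    juice = get_data("juice", verbose=False)  # load a sample dataset quietly
    print(juice.shape)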