From 49e019dec569070d57f5cac84056af4125897e94 Mon Sep 17 00:00:00 2001
From: ray310 <64942339+ray310@users.noreply.github.com>
Date: Sun, 14 Jul 2024 12:38:15 -0500
Subject: [PATCH 01/13] Starting new version.
---
pyproject.toml | 4 ++--
src/pandahelper/__init__.py | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index f403170..e6d4f68 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "panda-helper"
-version = "0.1.0"
+version = "0.1.1"
dependencies = [
"beautifulsoup4>=4.12.2",
"numpy>=1.26.0",
@@ -44,7 +44,7 @@ test = ["pytest>=7.4", "pylint>=3.0"]
extend-include = ["*.ipynb"]
[tool.ruff.lint]
-select = ["D", "F", "B"] # pydocstyle, pyflakes, flake8-bugbear, isort
+select = ["D", "F", "B"] # pydocstyle, pyflakes, flake8-bugbear
[tool.ruff.lint.pydocstyle]
convention = "google"
diff --git a/src/pandahelper/__init__.py b/src/pandahelper/__init__.py
index 8b3e9b1..c0600a2 100644
--- a/src/pandahelper/__init__.py
+++ b/src/pandahelper/__init__.py
@@ -5,5 +5,5 @@
from pandahelper.profiles import DataFrameProfile, SeriesProfile
from pandahelper.stats import distribution_stats, frequency_table
-__version__ = "0.1.0"
+__version__ = "0.1.1"
__all__ = ["frequency_table", "distribution_stats", "DataFrameProfile", "SeriesProfile"]
From dbb18aaf38bc48163cafc7bd01b206dfd453a1b5 Mon Sep 17 00:00:00 2001
From: ray310 <64942339+ray310@users.noreply.github.com>
Date: Sun, 14 Jul 2024 14:43:56 -0500
Subject: [PATCH 02/13] Change 'count' to 'number of columns' in nulls per row
DataFrameProfile table.
---
.gitignore | 6 ++--
src/pandahelper/profiles.py | 9 +++++-
tests/test_data/test_df_profile_name.txt | 32 +++++++++----------
tests/test_data/test_df_profile_name_311.txt | 2 +-
tests/test_data/test_df_profile_no_name.txt | 32 +++++++++----------
.../test_data/test_df_profile_no_name_311.txt | 2 +-
6 files changed, 45 insertions(+), 38 deletions(-)
diff --git a/.gitignore b/.gitignore
index b950a07..e73bcbb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,14 +9,14 @@
conda_environment_dev_*
# folders
-.coverage
.idea
-data
-notes
dist
htmlcov
+notebooks
+notes
site
__pycache__
# files
+.coverage
.DS_Store
diff --git a/src/pandahelper/profiles.py b/src/pandahelper/profiles.py
index 1940cd1..40337dc 100644
--- a/src/pandahelper/profiles.py
+++ b/src/pandahelper/profiles.py
@@ -43,9 +43,16 @@ def __init__(self, df: pd.DataFrame, *, name: str = "", fmt: str = "simple"):
self.memory_usage = df.memory_usage(index=True, deep=True) / 1000000 # MB
self.num_duplicates = sum(df.duplicated(keep="first"))
self.nulls_per_row = df.isna().sum(axis=1)
- self.null_stats = phs.dist_stats_dict(self.nulls_per_row)
+ self.null_stats = self.__null_stats()
self._format = fmt
+ def __null_stats(self, delete_key="count"):
+ """Prepare distribution statistics for the number of nulls per row."""
+ stats = phs.dist_stats_dict(self.nulls_per_row)
+ new_stats = {"Number of Columns": self.shape[1]}
+ del stats[delete_key]
+ return new_stats | stats
+
def __create_tables(self, table_fmt: str):
"""Create DataFrameProfile summary tables.
diff --git a/tests/test_data/test_df_profile_name.txt b/tests/test_data/test_df_profile_name.txt
index b71f123..d643fe4 100644
--- a/tests/test_data/test_df_profile_name.txt
+++ b/tests/test_data/test_df_profile_name.txt
@@ -40,19 +40,19 @@ VEHICLE TYPE CODE 5 object 0.006452
FLAG bool 0.0002
Summary of Nulls Per Row
--------------------------- ---------
-count 200
-min 3
-1% 3.99
-5% 6
-25% 7
-50% 8
-75% 10
-95% 12
-99% 14.01
-max 15
-mean 8.71
-standard deviation 2.04863
-median 8
-median absolute deviation 1
-skew 0.36218
+-------------------------- --------
+Number of Columns 30
+min 3
+1% 3.99
+5% 6
+25% 7
+50% 8
+75% 10
+95% 12
+99% 14.01
+max 15
+mean 8.71
+standard deviation 2.04863
+median 8
+median absolute deviation 1
+skew 0.36218
diff --git a/tests/test_data/test_df_profile_name_311.txt b/tests/test_data/test_df_profile_name_311.txt
index 917e60e..16c9c05 100644
--- a/tests/test_data/test_df_profile_name_311.txt
+++ b/tests/test_data/test_df_profile_name_311.txt
@@ -41,7 +41,7 @@ FLAG bool 0.0002
Summary of Nulls Per Row
-------------------------- ---------
-count 200
+Number of Columns 30
min 3
1% 3.99
5% 6
diff --git a/tests/test_data/test_df_profile_no_name.txt b/tests/test_data/test_df_profile_no_name.txt
index b15610c..0f160d4 100644
--- a/tests/test_data/test_df_profile_no_name.txt
+++ b/tests/test_data/test_df_profile_no_name.txt
@@ -39,19 +39,19 @@ VEHICLE TYPE CODE 5 object 0.006452
FLAG bool 0.0002
Summary of Nulls Per Row
--------------------------- ---------
-count 200
-min 3
-1% 3.99
-5% 6
-25% 7
-50% 8
-75% 10
-95% 12
-99% 14.01
-max 15
-mean 8.71
-standard deviation 2.04863
-median 8
-median absolute deviation 1
-skew 0.36218
+-------------------------- --------
+Number of Columns 30
+min 3
+1% 3.99
+5% 6
+25% 7
+50% 8
+75% 10
+95% 12
+99% 14.01
+max 15
+mean 8.71
+standard deviation 2.04863
+median 8
+median absolute deviation 1
+skew 0.36218
diff --git a/tests/test_data/test_df_profile_no_name_311.txt b/tests/test_data/test_df_profile_no_name_311.txt
index 601a43d..150dca7 100644
--- a/tests/test_data/test_df_profile_no_name_311.txt
+++ b/tests/test_data/test_df_profile_no_name_311.txt
@@ -40,7 +40,7 @@ FLAG bool 0.0002
Summary of Nulls Per Row
-------------------------- ---------
-count 200
+Number of Columns 30
min 3
1% 3.99
5% 6
From 9b1121e6dc8b32390bfbe4f9750af4319d50412a Mon Sep 17 00:00:00 2001
From: ray310 <64942339+ray310@users.noreply.github.com>
Date: Sun, 14 Jul 2024 15:07:32 -0500
Subject: [PATCH 03/13] Remove flaky test.
---
tests/test_data/test_df_profile_name_311.txt | 58 -------------------
.../test_data/test_df_profile_no_name_311.txt | 57 ------------------
tests/test_profiles.py | 19 ------
3 files changed, 134 deletions(-)
delete mode 100644 tests/test_data/test_df_profile_name_311.txt
delete mode 100644 tests/test_data/test_df_profile_no_name_311.txt
diff --git a/tests/test_data/test_df_profile_name_311.txt b/tests/test_data/test_df_profile_name_311.txt
deleted file mode 100644
index 16c9c05..0000000
--- a/tests/test_data/test_df_profile_name_311.txt
+++ /dev/null
@@ -1,58 +0,0 @@
-DataFrame-Level Info
----------------------- ---------
-DF Name test_name
-DF Shape (200, 30)
-Duplicated Rows 0
-Memory Usage (MB) 0.200
-
-Series Name Data Type Memory Usage (MB)
------------------------------ ----------- -------------------
-Index int64 0.000132
-CRASH DATE object 0.012785
-CRASH TIME object 0.01235
-BOROUGH object 0.010519
-ZIP CODE float64 0.0016
-LATITUDE float64 0.0016
-LONGITUDE float64 0.0016
-LOCATION object 0.014763
-ON STREET NAME object 0.015064
-CROSS STREET NAME object 0.01001
-OFF STREET NAME object 0.00952
-NUMBER OF PERSONS INJURED int64 0.0016
-NUMBER OF PERSONS KILLED int64 0.0016
-NUMBER OF PEDESTRIANS INJURED int64 0.0016
-NUMBER OF PEDESTRIANS KILLED int64 0.0016
-NUMBER OF CYCLIST INJURED int64 0.0016
-NUMBER OF CYCLIST KILLED int64 0.0016
-NUMBER OF MOTORIST INJURED int64 0.0016
-NUMBER OF MOTORIST KILLED int64 0.0016
-CONTRIBUTING FACTOR VEHICLE 1 object 0.015643
-CONTRIBUTING FACTOR VEHICLE 2 object 0.012727
-CONTRIBUTING FACTOR VEHICLE 3 object 0.007012
-CONTRIBUTING FACTOR VEHICLE 4 object 0.006652
-CONTRIBUTING FACTOR VEHICLE 5 object 0.006436
-COLLISION_ID int64 0.0016
-VEHICLE TYPE CODE 1 object 0.014306
-VEHICLE TYPE CODE 2 object 0.012294
-VEHICLE TYPE CODE 3 object 0.00715
-VEHICLE TYPE CODE 4 object 0.00673
-VEHICLE TYPE CODE 5 object 0.00646
-FLAG bool 0.0002
-
-Summary of Nulls Per Row
--------------------------- ---------
-Number of Columns 30
-min 3
-1% 3.99
-5% 6
-25% 7
-50% 8
-75% 10
-95% 12
-99% 14.01
-max 15
-mean 8.71
-standard deviation 2.04863
-median 8
-median absolute deviation 1
-skew 0.36218
diff --git a/tests/test_data/test_df_profile_no_name_311.txt b/tests/test_data/test_df_profile_no_name_311.txt
deleted file mode 100644
index 150dca7..0000000
--- a/tests/test_data/test_df_profile_no_name_311.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-DataFrame-Level Info
----------------------- ---------
-DF Shape (200, 30)
-Duplicated Rows 0
-Memory Usage (MB) 0.200
-
-Series Name Data Type Memory Usage (MB)
------------------------------ ----------- -------------------
-Index int64 0.000132
-CRASH DATE object 0.012785
-CRASH TIME object 0.01235
-BOROUGH object 0.010519
-ZIP CODE float64 0.0016
-LATITUDE float64 0.0016
-LONGITUDE float64 0.0016
-LOCATION object 0.014763
-ON STREET NAME object 0.015064
-CROSS STREET NAME object 0.01001
-OFF STREET NAME object 0.00952
-NUMBER OF PERSONS INJURED int64 0.0016
-NUMBER OF PERSONS KILLED int64 0.0016
-NUMBER OF PEDESTRIANS INJURED int64 0.0016
-NUMBER OF PEDESTRIANS KILLED int64 0.0016
-NUMBER OF CYCLIST INJURED int64 0.0016
-NUMBER OF CYCLIST KILLED int64 0.0016
-NUMBER OF MOTORIST INJURED int64 0.0016
-NUMBER OF MOTORIST KILLED int64 0.0016
-CONTRIBUTING FACTOR VEHICLE 1 object 0.015643
-CONTRIBUTING FACTOR VEHICLE 2 object 0.012727
-CONTRIBUTING FACTOR VEHICLE 3 object 0.007012
-CONTRIBUTING FACTOR VEHICLE 4 object 0.006652
-CONTRIBUTING FACTOR VEHICLE 5 object 0.006436
-COLLISION_ID int64 0.0016
-VEHICLE TYPE CODE 1 object 0.014306
-VEHICLE TYPE CODE 2 object 0.012294
-VEHICLE TYPE CODE 3 object 0.00715
-VEHICLE TYPE CODE 4 object 0.00673
-VEHICLE TYPE CODE 5 object 0.00646
-FLAG bool 0.0002
-
-Summary of Nulls Per Row
--------------------------- ---------
-Number of Columns 30
-min 3
-1% 3.99
-5% 6
-25% 7
-50% 8
-75% 10
-95% 12
-99% 14.01
-max 15
-mean 8.71
-standard deviation 2.04863
-median 8
-median absolute deviation 1
-skew 0.36218
diff --git a/tests/test_profiles.py b/tests/test_profiles.py
index cde7cf1..e27ecbd 100644
--- a/tests/test_profiles.py
+++ b/tests/test_profiles.py
@@ -35,25 +35,6 @@ def test_dataframe_profile_valid_312(test_df):
assert filecmp.cmp(compare_file, test_file, shallow=False)
-@pytest.mark.skipif(
- not ((3, 11) <= sys.version_info < (3, 12)), reason="Runs on Python 3.11"
-)
-def test_dataframe_profile_valid_311(test_df):
- """Generated DataFrame profile should match test profile (Python 3.11)."""
- compare_profile_name = "test_df_profile_name_311.txt"
- compare_profile_no_name = "test_df_profile_no_name_311.txt"
- compare_files = [
- os.path.join(TEST_DATA_DIR, compare_profile_name),
- os.path.join(TEST_DATA_DIR, compare_profile_no_name),
- ]
- names = ["test_name", ""]
- with tempfile.TemporaryDirectory() as tmp:
- for name, compare_file in zip(names, compare_files):
- test_file = os.path.join(tmp, "temp.txt")
- php.DataFrameProfile(test_df, name=name).save(test_file)
- assert filecmp.cmp(compare_file, test_file, shallow=False)
-
-
def test_dataframe_profile_invalid(non_series_invalid, num_series, cat_like_series):
"""DataFrame profile should not accept invalid data types."""
invalid_types = [*non_series_invalid, num_series, cat_like_series]
From f30f8e7ebb4e38818685c7b058c21c20223464b4 Mon Sep 17 00:00:00 2001
From: ray310 <64942339+ray310@users.noreply.github.com>
Date: Mon, 15 Jul 2024 11:14:28 -0500
Subject: [PATCH 04/13] Lower default value for SeriesProfile frequency table.
Also minor code formatting changes.
---
src/pandahelper/profiles.py | 42 ++++++++++++++++++-------------------
src/pandahelper/stats.py | 4 ++--
2 files changed, 23 insertions(+), 23 deletions(-)
diff --git a/src/pandahelper/profiles.py b/src/pandahelper/profiles.py
index 40337dc..a09f1fa 100644
--- a/src/pandahelper/profiles.py
+++ b/src/pandahelper/profiles.py
@@ -137,7 +137,7 @@ def __init__(
series: pd.Series,
*,
fmt: str = "simple",
- freq_most_least: tuple = (20, 5),
+ freq_most_least: tuple = (10, 5),
):
"""Initialize SeriesProfile.
@@ -233,26 +233,6 @@ def save(self, path):
fh.write(str(self))
-def _format_html_table(table: str, align: str = "left", font: str = "monospace") -> str:
- """Add additional formatting to HTML table prepared by tabulate."""
- soup = bs4.BeautifulSoup(table, "html.parser")
- for row in soup.find_all("tr"):
- tags = row.find_all(["th", "td"]) # row in thead will have 'th'
- for tag in tags:
- tag["style"] = f"font-family: {font}, monospace; text-align: {align};"
- return str(soup)
-
-
-def _decimal_align_col(table: str, col: int):
- """Create decimal-aligned numbers in column of HTML table."""
- soup = bs4.BeautifulSoup(table, "html.parser")
- for row in soup.find_all("tr"):
- tags = row.find_all("td")
- if tags:
- tags[col].string = tags[col].string.replace(" ", "\u2007") # figure space
- return str(soup)
-
-
def _abbreviate_df(df, first=20, last=5):
"""Return a shortened DataFrame or Series.
@@ -282,3 +262,23 @@ def _abbreviate_df(df, first=20, last=5):
else:
abbrev = pd.concat([df.iloc[:first], df.iloc[(len(df) - last) : len(df)]])
return abbrev
+
+
+def _format_html_table(table: str, align: str = "left", font: str = "monospace") -> str:
+ """Add additional formatting to HTML table prepared by tabulate."""
+ soup = bs4.BeautifulSoup(table, "html.parser")
+ for row in soup.find_all("tr"):
+ tags = row.find_all(["th", "td"]) # row in thead will have 'th'
+ for tag in tags:
+ tag["style"] = f"font-family: {font}, monospace; text-align: {align};"
+ return str(soup)
+
+
+def _decimal_align_col(table: str, col: int):
+ """Create decimal-aligned numbers in column of HTML table."""
+ soup = bs4.BeautifulSoup(table, "html.parser")
+ for row in soup.find_all("tr"):
+ tags = row.find_all("td")
+ if tags:
+ tags[col].string = tags[col].string.replace(" ", "\u2007") # figure space
+ return str(soup)
diff --git a/src/pandahelper/stats.py b/src/pandahelper/stats.py
index 25ae251..9f8ee63 100644
--- a/src/pandahelper/stats.py
+++ b/src/pandahelper/stats.py
@@ -31,7 +31,7 @@ def frequency_table(series: pd.Series) -> pd.DataFrame:
return output.sort_values(by="Count", ascending=False)
-def _abbreviate_string(s, limit=60):
+def _abbreviate_string(s, limit=60) -> str:
"""Return first x characters of a string.
Args:
@@ -157,7 +157,7 @@ def _add_quantiles(series: pd.Series, d: dict):
d["99%"] = series.quantile(0.99)
-def _order_stats(stats: dict):
+def _order_stats(stats: dict) -> dict:
"""Sort stats dictionary by order provided in all_stats.
Helper function used in distribution_stats.
From 9ea7a4108996422eaa433e3b86ed20dbbb3c0bdb Mon Sep 17 00:00:00 2001
From: ray310 <64942339+ray310@users.noreply.github.com>
Date: Tue, 16 Jul 2024 00:39:07 -0500
Subject: [PATCH 05/13] SeriesProfile now reports gaps for timeseries data.
Also added time series functions to calculate time gaps.
gh-20
---
docs/api.md | 4 ++
src/pandahelper/__init__.py | 10 ++++-
src/pandahelper/profiles.py | 76 +++++++++++++++++++++++--------------
src/pandahelper/times.py | 50 ++++++++++++++++++++++++
tests/conftest.py | 13 ++++++-
tests/test_profiles.py | 60 +++++++++++++++++++++++++++++
tests/test_times.py | 44 +++++++++++++++++++++
tests/utils.py | 16 ++++++++
8 files changed, 242 insertions(+), 31 deletions(-)
create mode 100644 src/pandahelper/times.py
create mode 100644 tests/test_times.py
create mode 100644 tests/utils.py
diff --git a/docs/api.md b/docs/api.md
index b8e6d06..63313ad 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -8,3 +8,7 @@ description: Panda-Helper API Reference. Detailed description of the Panda-Helpe
::: pandahelper.stats
+
+
+
+::: pandahelper.times
diff --git a/src/pandahelper/__init__.py b/src/pandahelper/__init__.py
index c0600a2..b0a5288 100644
--- a/src/pandahelper/__init__.py
+++ b/src/pandahelper/__init__.py
@@ -4,6 +4,14 @@
from pandahelper.profiles import DataFrameProfile, SeriesProfile
from pandahelper.stats import distribution_stats, frequency_table
+from pandahelper.times import time_diffs, time_diffs_index
__version__ = "0.1.1"
-__all__ = ["frequency_table", "distribution_stats", "DataFrameProfile", "SeriesProfile"]
+__all__ = [
+ "frequency_table",
+ "distribution_stats",
+ "DataFrameProfile",
+ "SeriesProfile",
+ "time_diffs",
+ "time_diffs_index",
+]
diff --git a/src/pandahelper/profiles.py b/src/pandahelper/profiles.py
index a09f1fa..3aea3fa 100644
--- a/src/pandahelper/profiles.py
+++ b/src/pandahelper/profiles.py
@@ -5,6 +5,7 @@
import pandas.api.types as pat
from tabulate import tabulate
import pandahelper.stats as phs
+import pandahelper.times as pht
class DataFrameProfile:
@@ -61,7 +62,6 @@ def __create_tables(self, table_fmt: str):
Returns:
list(str): List of Tabulate tables.
-
"""
df_info = [
("DF Shape", self.shape),
@@ -129,7 +129,10 @@ class SeriesProfile:
num_unique (int): Number of unique values.
num_nulls (int): Number of null values.
frequency (pd.DataFrame): Frequency table with counts and percentage.
- stats (list): Distribution statistics for Series.
+ stats (dict): Distribution statistics for Series.
+ time_diffs (pd.Series): Time diffs (gaps) if series is of type `datetime64`.
+ Alternately, can be time diffs in a Series with a DateTimeIndex if the
+ `time_index` parameter was set to `True` when creating Series Profile.
"""
def __init__(
@@ -138,48 +141,57 @@ def __init__(
*,
fmt: str = "simple",
freq_most_least: tuple = (10, 5),
+ time_index: bool = False,
):
"""Initialize SeriesProfile.
Args:
- series (pd.Series): DataFrame to profile.
- fmt (str: optional): Printed table format. See
- https://github.com/astanin/python-tabulate for options.
+ series (pd.Series): Pandas Series to profile.
+        fmt (str: optional): Printed table format. See:
+            https://github.com/astanin/python-tabulate for options.
freq_most_least (tuple: optional): Tuple (x, y) of the x most common and
y least common values to display in frequency table.
+ time_index (bool: optional): Whether to use the index for calculating time
+ diffs for a `datetime64`-related Pandas Series. Not relevant for
+ non-time related Series.
Raises:
- TypeError: If input is not a pd.Series.
+ TypeError: If input is not a Pandas Series.
"""
if not isinstance(series, pd.Series):
raise TypeError(f"{series}, is not pd.DataFrame")
if freq_most_least[0] < 0 or freq_most_least[1] < 0:
raise ValueError("Tuple values must be >= 0!")
+ self._format = fmt
+ self._freq_table = freq_most_least
self.name = series.name
self.dtype = series.dtype
self.count = series.count() # counts non-null values
self.num_unique = series.nunique()
self.num_nulls = series.size - self.count # NAs, nans, NaT, but not ""
self.frequency = phs.frequency_table(series)
- self.stats = None
- if not (
- pat.is_object_dtype(self.dtype)
- or isinstance(self.dtype, pd.CategoricalDtype)
- ):
- self.stats = phs.dist_stats_dict(series)
- self._format = fmt
- self._freq_table = freq_most_least
+ self.stats = self.__calc_stats(series)
+ self.time_diffs = self.__calc_time_diffs(series, time_index)
- def __create_tables(self, table_fmt: str):
- """Create SeriesProfile summary tables.
-
- Args:
- table_fmt (str): Tabulate table format name.
-
- Returns:
- list(str): List of Tabulate tables.
-
- """
+ def __calc_stats(self, series):
+ """Calculate distribution stats if allowed dtype, else return None."""
+ if pat.is_object_dtype(self.dtype) or isinstance(
+ self.dtype, pd.CategoricalDtype
+ ):
+ return None
+ return phs.dist_stats_dict(series)
+
+ @staticmethod
+ def __calc_time_diffs(series, use_time_index: bool) -> pd.Series or None:
+ """Calculate time diffs for time-indexed series or datetime64 series."""
+ if use_time_index and pat.is_datetime64_any_dtype(series.index):
+ return pht.time_diffs_index(series)
+ if (not use_time_index) and pat.is_datetime64_any_dtype(series):
+ return pht.time_diffs(series)
+ return None
+
+ def __create_tables(self, table_fmt: str) -> list[str]:
+ """Create and return SeriesProfile summary tables."""
series_info = [
("Data Type", self.dtype),
("Count", self.count),
@@ -201,16 +213,22 @@ def __create_tables(self, table_fmt: str):
stats_table = ""
if self.stats is not None:
stats = self.stats
- if pat.is_complex_dtype(
- self.dtype
- ): # tabulate converts complex numbers to real numbers
+ # tabulate casts complex numbers to real numbers, dropping imaginary part
+ if pat.is_complex_dtype(self.dtype):
stats = {k: str(v) for k, v in self.stats.items()}
stats_table = tabulate(
list(stats.items()),
headers=["Statistic", "Value"],
tablefmt=table_fmt,
)
- return [series_table, freq_table, stats_table]
+ time_diffs_table = ""
+ if self.time_diffs is not None:
+ time_diffs_table = tabulate(
+ phs.frequency_table(self.time_diffs),
+ headers=["Time Gaps (Diffs)", "Count", "% of total"],
+ tablefmt=table_fmt,
+ )
+ return [series_table, freq_table, stats_table, time_diffs_table]
def __repr__(self):
"""Printable version of profile."""
@@ -221,7 +239,7 @@ def _repr_html_(self):
"""HTML representation of profile."""
tables = [_format_html_table(t) for t in self.__create_tables("html")]
tables[2] = _decimal_align_col(tables[2], 1)
-        return tables[0] + "<br>" + tables[1] + "<br>" + tables[2]
+        return tables[0] + "<br>" + tables[1] + "<br>" + tables[2] + "<br>" + tables[3]
def save(self, path):
"""Save profile to provided path.
diff --git a/src/pandahelper/times.py b/src/pandahelper/times.py
new file mode 100644
index 0000000..c1aabd1
--- /dev/null
+++ b/src/pandahelper/times.py
@@ -0,0 +1,50 @@
+"""Panda-Helper time-series functions."""
+
+import pandas as pd
+import pandas.api.types as pat
+
+
+def time_diffs(series: pd.Series | pd.DatetimeIndex) -> pd.Series(pd.Timedelta):
+ """Calculate time diffs (gaps) for Pandas Series or Index of timestamps.
+
+ Sorts input by time before calculating diffs.
+
+ Args:
+ series (pd.Series or pd.DatetimeIndex): Pandas Series or DatetimeIndex
+ to calculate time diffs on.
+
+ Returns:
+ Series of diffs (gaps) indexed by the time the diff was calculated.
+
+ Raises:
+ TypeError: If input is not Series of type datetime64 or DatetimeIndex.
+ """
+ if not pat.is_datetime64_any_dtype(series.dtype):
+ raise TypeError("Should be Series of datetime64 dtype.")
+ series = series.sort_values()
+ diffs = pd.Series(series.diff(), name="diffs")
+ diffs.index = series
+ return diffs
+
+
+def time_diffs_index(df: pd.DataFrame | pd.Series) -> pd.Series(pd.Timedelta):
+ """Calculate time diffs (gaps) for time-indexed Pandas Series or Dataframe.
+
+ Sorts input by time before calculating diffs.
+
+ Args:
+ df (pd.Series or pd.DataFrame): Pandas Series or DataFrame with DateTimeIndex
+ to calculate time diffs on.
+
+ Returns:
+ Series of diffs (gaps) indexed by the time the diff was calculated.
+
+ Raises:
+ TypeError: If input does not have a DatetimeIndex.
+ """
+ if isinstance(df.index, pd.DatetimeIndex):
+ df = df.sort_index()
+ diffs = pd.Series(df.index.diff(), name="diffs")
+ diffs.index = df.index
+ return diffs
+ raise TypeError(f"Index should be of type {pd.DatetimeIndex}")
diff --git a/tests/conftest.py b/tests/conftest.py
index e04f82f..f19a424 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,13 +2,14 @@
Note that fixtures with a package-scope are run once and then available as
cached value.
-
"""
+from datetime import datetime
import os
import numpy as np
import pandas as pd
import pytest
+from .utils import make_category_data
TEST_DATA_DIR = "tests/test_data"
TEST_DATA_FILE = "sample_collisions.csv"
@@ -16,6 +17,16 @@
NUM_SERIES = "NUMBER OF PERSONS INJURED"
+@pytest.fixture
+def cat_df(scope="package"): # pylint: disable=W0613
+ """Return test pd.DataFrame."""
+ start = datetime(year=1999, month=1, day=1, hour=0, minute=0)
+ end = start + pd.Timedelta(hours=10)
+ df = make_category_data("Springfield", start, end, freq="h")
+ df = df.sample(frac=1, random_state=2) # index is out of order
+ return df
+
+
@pytest.fixture
def test_df(scope="package"): # pylint: disable=W0613
"""Return test pd.DataFrame."""
diff --git a/tests/test_profiles.py b/tests/test_profiles.py
index e27ecbd..a00b8a1 100644
--- a/tests/test_profiles.py
+++ b/tests/test_profiles.py
@@ -9,6 +9,7 @@
import bs4
import numpy as np
import pandas as pd
+import pandas.api.types as pat
import pytest
import pandahelper.profiles as php
@@ -200,6 +201,65 @@ def test_series_profile_frequency_table(test_df):
assert len(freq_table.find_all("tr")) == v + 1 # +1 for header
+def test_series_profile_time_index_true(cat_df):
+ """time_index=True calculates time diffs for Series with DateTimeIndex."""
+ series = cat_df["category"]
+ profile = php.SeriesProfile(series, time_index=True)
+ assert pat.is_datetime64_any_dtype(series.index)
+ assert profile.time_diffs.iloc[0] is pd.NaT
+ assert all(profile.time_diffs[1:] == pd.Timedelta(hours=1))
+
+
+def test_series_profile_time_index_false(cat_df):
+ """time_index=False does not calculate time diffs for Series with DateTimeIndex."""
+ series = cat_df["category"]
+ profile = php.SeriesProfile(series, time_index=False)
+ assert pat.is_datetime64_any_dtype(series.index)
+ assert profile.time_diffs is None
+
+
+@pytest.fixture
+def ts_timeindex(scope="module"): # pylint: disable=W0613
+ """Return pd.Series of type datetime64 with DatetimeIndex."""
+ start = datetime(year=1999, month=1, day=1, hour=0, minute=0)
+ end = start + pd.Timedelta(hours=40)
+ time_series = pd.Series(pd.date_range(start, end, freq="4h", inclusive="left"))
+ index_end = start + pd.Timedelta(hours=10)
+ time_series.index = pd.date_range(start, index_end, freq="h", inclusive="left")
+ return time_series
+
+
+def test_series_profile_ts_range_index_true(ts_timeindex): # pylint: disable=W0621
+ """time_index=True does not calculate time diffs for Series with RangeIndex."""
+ series = ts_timeindex
+ series.index = range(len(ts_timeindex))
+ profile = php.SeriesProfile(series, time_index=True)
+ assert not pat.is_datetime64_any_dtype(series.index)
+ assert profile.time_diffs is None
+
+
+def test_series_profile_both_time_index_false(ts_timeindex): # pylint: disable=W0621
+ """SeriesProfile should have time diffs from series, (not index).
+
+ Given for Series(datetime64) with TimeIndex and time_index=False.
+ """
+ profile = php.SeriesProfile(ts_timeindex, time_index=False)
+ assert pat.is_datetime64_any_dtype(ts_timeindex.index)
+ assert profile.time_diffs.iloc[0] is pd.NaT
+ assert all(profile.time_diffs[1:] == pd.Timedelta(hours=4))
+
+
+def test_series_profile_both_time_index_true(ts_timeindex): # pylint: disable=W0621
+ """SeriesProfile should have time diffs from index, (not series).
+
+ Given for Series(datetime64) with TimeIndex and time_index=True.
+ """
+ profile = php.SeriesProfile(ts_timeindex, time_index=True)
+ assert pat.is_datetime64_any_dtype(ts_timeindex.index)
+ assert profile.time_diffs.iloc[0] is pd.NaT
+ assert all(profile.time_diffs[1:] == pd.Timedelta(hours=1))
+
+
def test_series_profile_frequency_table_invalid(test_df):
"""Invalid frequency table most_least tuples should raise ValueError."""
invalid_tuples = [(0, -1), (-1, 0), (-1, -1)]
diff --git a/tests/test_times.py b/tests/test_times.py
new file mode 100644
index 0000000..17665ab
--- /dev/null
+++ b/tests/test_times.py
@@ -0,0 +1,44 @@
+"""Tests for functions in times.py."""
+
+import pandas as pd
+import pytest
+import pandahelper.times as pht
+
+
+def test_time_diffs_index(cat_df):
+ """time_diffs_index should work on shuffled pd.Series or pd.DataFrame."""
+ # test DF
+ df_result = pht.time_diffs_index(cat_df)
+ assert df_result.iloc[0] is pd.NaT
+ assert all(df_result[1:] == pd.Timedelta(hours=1))
+ # test Series
+ series_result = pht.time_diffs_index(cat_df["B"])
+ assert series_result.iloc[0] is pd.NaT
+ assert all(series_result[1:] == pd.Timedelta(hours=1))
+
+
+def test_time_diffs_index_exception():
+ """pd.DataFrame and pd.Series without time index raise exception."""
+ data = {"A": list(range(5))}
+ dtypes = [pd.DataFrame(data), pd.Series(data)]
+ for tipo in dtypes:
+ with pytest.raises(TypeError) as exc:
+ pht.time_diffs_index(tipo)
+ assert str(pd.DatetimeIndex) in str(exc)
+
+
+def test_time_diffs(cat_df):
+ """time_diffs should work on shuffled pd.Series or Index of timestamps."""
+ valid = [cat_df.index, pd.Series(cat_df.index)]
+ for v in valid:
+ result = pht.time_diffs(v)
+ assert result.iloc[0] is pd.NaT
+ assert all(result[1:] == pd.Timedelta(hours=1))
+
+
+def test_time_diffs_exception():
+ """Non-datetime64 pd.Series raises exception."""
+ invalid = [pd.Series(list(range(5))), pd.Series([pd.Timedelta(hours=1)] * 2)]
+ for tipo in invalid:
+ with pytest.raises(TypeError):
+ pht.time_diffs(tipo)
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000..52d462c
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,16 @@
+"""Test-related utility functions."""
+
+import pandas as pd
+
+
+def make_category_data(cat_name, start, end, freq):
+ """Return pd.DataFrame of arbitrary data for specified 'category'."""
+ rng = pd.date_range(start, end, freq=freq, inclusive="left")
+ data = {
+ "A": list(range(1, len(rng) + 1, 1)),
+ "B": [chr(ord("A") + (x % 26)) for x in range(0, len(rng), 1)],
+ "C": [float((-1) ** (x % 2) * x) for x in range(0, len(rng), 1)],
+ }
+ df = pd.DataFrame(data, index=rng)
+ df["category"] = cat_name
+ return df
From 4c172d7cb614cada8939ed93bd36b59911b95997 Mon Sep 17 00:00:00 2001
From: ray310 <64942339+ray310@users.noreply.github.com>
Date: Tue, 16 Jul 2024 01:01:57 -0500
Subject: [PATCH 06/13] Fix type hint for Python 3.9.
gh-20
---
src/pandahelper/times.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/pandahelper/times.py b/src/pandahelper/times.py
index c1aabd1..98461a1 100644
--- a/src/pandahelper/times.py
+++ b/src/pandahelper/times.py
@@ -1,10 +1,11 @@
"""Panda-Helper time-series functions."""
+from typing import Union # TODO: Remove when deprecating Python 3.9
import pandas as pd
import pandas.api.types as pat
-def time_diffs(series: pd.Series | pd.DatetimeIndex) -> pd.Series(pd.Timedelta):
+def time_diffs(series: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Timedelta):
"""Calculate time diffs (gaps) for Pandas Series or Index of timestamps.
Sorts input by time before calculating diffs.
@@ -27,7 +28,7 @@ def time_diffs(series: pd.Series | pd.DatetimeIndex) -> pd.Series(pd.Timedelta):
return diffs
-def time_diffs_index(df: pd.DataFrame | pd.Series) -> pd.Series(pd.Timedelta):
+def time_diffs_index(df: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Timedelta):
"""Calculate time diffs (gaps) for time-indexed Pandas Series or Dataframe.
Sorts input by time before calculating diffs.
From cc506d17b0f8da34f5df002d195a755f31b13921 Mon Sep 17 00:00:00 2001
From: ray310 <64942339+ray310@users.noreply.github.com>
Date: Tue, 16 Jul 2024 01:19:41 -0500
Subject: [PATCH 07/13] Update CHANGELOG.
---
CHANGELOG.md | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 271b43b..fce8c94 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,10 +1,14 @@
# Changelog
## Unreleased
+- Add functionality to perform some common data cleaning tasks.
+- Add `geo.py` module and functionality to set 'close' lat-long coordinates to same value.
## 0.1.1 - Unreleased
### Added
-- functionality to detect time series gaps
+- SeriesProfile now reports gaps in pd.Series with type `datetime64` or for Series with `DatetimeIndex`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20)
+- `times.py` module has been added with public functions `time_diffs` and `time_diffs_index`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20)
+- [`freq_most_least` default parameter for SeriesProfile has been changed to `(10, 5)`.](https://github.com/ray310/Panda-Helper/commit/9ea7a4108996422eaa433e3b86ed20dbbb3c0bdb)
____
## 0.1.0 - 2024-07-14
From 7001a4fa2deb1061bae643e2e3b6feae23fe6f8a Mon Sep 17 00:00:00 2001
From: ray310 <64942339+ray310@users.noreply.github.com>
Date: Tue, 16 Jul 2024 23:17:04 -0500
Subject: [PATCH 08/13] Add id_gaps and id_gaps_index functions to times.py.
gh-20
---
CHANGELOG.md | 2 +-
mkdocs.yml | 1 +
src/pandahelper/__init__.py | 4 +-
src/pandahelper/times.py | 122 ++++++++++++++++++++++++++++++++++--
tests/conftest.py | 18 ++++--
tests/test_profiles.py | 11 ----
tests/test_times.py | 65 +++++++++++++++----
7 files changed, 188 insertions(+), 35 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index fce8c94..d373552 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@
## 0.1.1 - Unreleased
### Added
- SeriesProfile now reports gaps in pd.Series with type `datetime64` or for Series with `DatetimeIndex`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20)
-- `times.py` module has been added with public functions `time_diffs` and `time_diffs_index`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20)
+- `times.py` module has been added with public functions `time_diffs`, `time_diffs_index`, `id_gaps`, `id_gaps_index`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20)
- [`freq_most_least` default parameter for SeriesProfile has been changed to `(10, 5)`.](https://github.com/ray310/Panda-Helper/commit/9ea7a4108996422eaa433e3b86ed20dbbb3c0bdb)
____
diff --git a/mkdocs.yml b/mkdocs.yml
index 778b6d1..df63eb4 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -101,6 +101,7 @@ plugins:
python:
paths: [src]
options:
+ members_order: alphabetical
docstring_style: google
docstring_section_style: list
docstring_options:
diff --git a/src/pandahelper/__init__.py b/src/pandahelper/__init__.py
index b0a5288..58894b2 100644
--- a/src/pandahelper/__init__.py
+++ b/src/pandahelper/__init__.py
@@ -4,7 +4,7 @@
from pandahelper.profiles import DataFrameProfile, SeriesProfile
from pandahelper.stats import distribution_stats, frequency_table
-from pandahelper.times import time_diffs, time_diffs_index
+from pandahelper.times import time_diffs, time_diffs_index, id_gaps, id_gaps_index
__version__ = "0.1.1"
__all__ = [
@@ -14,4 +14,6 @@
"SeriesProfile",
"time_diffs",
"time_diffs_index",
+ "id_gaps",
+ "id_gaps_index",
]
diff --git a/src/pandahelper/times.py b/src/pandahelper/times.py
index 98461a1..1a99154 100644
--- a/src/pandahelper/times.py
+++ b/src/pandahelper/times.py
@@ -6,7 +6,7 @@
def time_diffs(series: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Timedelta):
- """Calculate time diffs (gaps) for Pandas Series or Index of timestamps.
+ """Calculate time difference between subsequent observations.
Sorts input by time before calculating diffs.
@@ -19,19 +19,39 @@ def time_diffs(series: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Timed
Raises:
TypeError: If input is not Series of type datetime64 or DatetimeIndex.
+
+ Examples:
+ Calculate time differences between observations on Series of timestamps after
+ it has been randomized:
+
+ >>> import pandahelper as ph
+ >>> import pandas as pd
+ >>>
+ >>> start = pd.Timestamp(year=1999, month=1, day=1)
+ >>> rng = pd.date_range(start, periods=10, freq="D").delete([3, 4, 5, 8])
+ >>> series = pd.Series(rng).sample(frac=1, random_state=3) # randomize order
+
+ >>> ph.time_diffs(series)
+ 1999-01-01 NaT
+ 1999-01-02 1 days
+ 1999-01-03 1 days
+ 1999-01-07 4 days
+ 1999-01-08 1 days
+ 1999-01-10 2 days
+ Name: diffs, dtype: timedelta64[ns]
"""
if not pat.is_datetime64_any_dtype(series.dtype):
- raise TypeError("Should be Series of datetime64 dtype.")
+ raise TypeError("Should be of datetime64 dtype.")
series = series.sort_values()
diffs = pd.Series(series.diff(), name="diffs")
diffs.index = series
return diffs
-def time_diffs_index(df: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Timedelta):
- """Calculate time diffs (gaps) for time-indexed Pandas Series or Dataframe.
+def time_diffs_index(df: Union[pd.Series, pd.DataFrame]) -> pd.Series(pd.Timedelta):
+ """Calculate time difference between subsequent time-indexed observations.
- Sorts input by time before calculating diffs.
+ Sorts input by time index before calculating diffs.
Args:
df (pd.Series or pd.DataFrame): Pandas Series or DataFrame with DateTimeIndex
@@ -42,6 +62,27 @@ def time_diffs_index(df: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Tim
Raises:
TypeError: If input does not have a DatetimeIndex.
+
+ Examples:
+ Calculate time differences between observations on time-indexed DataFrame after
+ it has been randomized:
+
+ >>> import pandahelper as ph
+ >>> import pandas as pd
+ >>>
+ >>> start = pd.Timestamp(year=1999, month=1, day=1)
+ >>> rng = pd.date_range(start, periods=10, freq="D").delete([3, 4, 5, 8])
+ >>> # index by time then randomize order
+ >>> df = pd.DataFrame(range(len(rng)), index=rng).sample(frac=1, random_state=3)
+
+ >>> ph.time_diffs_index(df)
+ 1999-01-01 NaT
+ 1999-01-02 1 days
+ 1999-01-03 1 days
+ 1999-01-07 4 days
+ 1999-01-08 1 days
+ 1999-01-10 2 days
+ Name: diffs, dtype: timedelta64[ns]
"""
if isinstance(df.index, pd.DatetimeIndex):
df = df.sort_index()
@@ -49,3 +90,74 @@ def time_diffs_index(df: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Tim
diffs.index = df.index
return diffs
raise TypeError(f"Index should be of type {pd.DatetimeIndex}")
+
+
+def id_gaps(
+ series: Union[pd.Series, pd.DatetimeIndex], threshold: pd.Timedelta
+) -> pd.DataFrame:
+ """Identify time gaps above `threshold` in datetime64 Series or DatetimeIndex.
+
+ Sorts input by time before calculating gaps.
+
+ Args:
+ series (pd.Series or pd.DatetimeIndex): `datetime64` Series or DatetimeIndex.
+ threshold (pd.Timedelta): Threshold to identify gaps
+ (and not expected time differences).
+
+ Returns:
+ One-column Pandas DataFrame of gaps indexed by when gap was calculated.
+
+ Examples:
+ Identify time gaps on Series of timestamps with a 2 and 4 hour
+ gap after it has been randomized:
+
+ >>> import pandahelper as ph
+ >>> import pandas as pd
+ >>>
+ >>> start = pd.Timestamp(year=1999, month=1, day=1)
+ >>> rng = pd.date_range(start, periods=24, freq="1h").delete([3, 4, 8, 9, 10])
+ >>> series = pd.Series(rng).sample(frac=1, random_state=3) # randomize order
+
+ >>> ph.id_gaps(series, pd.Timedelta(hours=1))
+ diffs
+ 1999-01-01 11:00:00 0 days 04:00:00
+ 1999-01-01 04:00:00 0 days 02:00:00
+ """
+ diffs = time_diffs(series)
+ return diffs[diffs > threshold].sort_values(ascending=False).to_frame()
+
+
+def id_gaps_index(
+ df: Union[pd.Series, pd.DataFrame], threshold: pd.Timedelta
+) -> pd.DataFrame:
+ """Identify time gaps above `threshold` in time-indexed Series or DataFrame.
+
+ Sorts input by time index before calculating diffs.
+
+ Args:
+ df (pd.Series or pd.DataFrame): Time-indexed Series or DataFrame.
+ threshold (pd.Timedelta): Threshold to identify gaps
+ (and not expected time differences).
+
+ Returns:
+ One-column Pandas DataFrame of gaps indexed by when gap was calculated.
+
+ Examples:
+ Identify time gaps on an hourly, time-indexed Series with a 2 and 4 hour
+ gap after it has been randomized:
+
+ >>> import pandahelper as ph
+ >>> import pandas as pd
+ >>>
+ >>> start = pd.Timestamp(year=1999, month=1, day=1)
+ >>> rng = pd.date_range(start, periods=24, freq="1h").delete([3, 8, 9, 10])
+ >>> # index by time then randomize order
+ >>> df = pd.DataFrame(range(len(rng)), index=rng).sample(frac=1, random_state=3)
+
+ >>> ph.id_gaps_index(df, pd.Timedelta(hours=1))
+ diffs
+ 1999-01-01 11:00:00 0 days 04:00:00
+ 1999-01-01 04:00:00 0 days 02:00:00
+ """
+ diffs = time_diffs_index(df)
+ return diffs[diffs > threshold].sort_values(ascending=False).to_frame()
diff --git a/tests/conftest.py b/tests/conftest.py
index f19a424..a7bb3d8 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,7 +4,6 @@
cached value.
"""
-from datetime import datetime
import os
import numpy as np
import pandas as pd
@@ -19,17 +18,28 @@
@pytest.fixture
def cat_df(scope="package"): # pylint: disable=W0613
- """Return test pd.DataFrame."""
- start = datetime(year=1999, month=1, day=1, hour=0, minute=0)
+ """Return test pd.DataFrame with DatetimeIndex."""
+ start = pd.Timestamp(year=1999, month=1, day=1)
end = start + pd.Timedelta(hours=10)
df = make_category_data("Springfield", start, end, freq="h")
df = df.sample(frac=1, random_state=2) # index is out of order
return df
+@pytest.fixture
+def ts_timeindex(scope="package"): # pylint: disable=W0613
+ """Return pd.Series of type datetime64 with DatetimeIndex."""
+ start = pd.Timestamp(year=1999, month=1, day=1)
+ end = start + pd.Timedelta(hours=40)
+ time_series = pd.Series(pd.date_range(start, end, freq="4h", inclusive="left"))
+ index_end = start + pd.Timedelta(hours=10)
+ time_series.index = pd.date_range(start, index_end, freq="h", inclusive="left")
+ return time_series
+
+
@pytest.fixture
def test_df(scope="package"): # pylint: disable=W0613
- """Return test pd.DataFrame."""
+ """Return test pd.DataFrame from sample of NYC collisions dataset."""
return pd.read_csv(os.path.join(TEST_DATA_DIR, TEST_DATA_FILE))
diff --git a/tests/test_profiles.py b/tests/test_profiles.py
index a00b8a1..713b544 100644
--- a/tests/test_profiles.py
+++ b/tests/test_profiles.py
@@ -218,17 +218,6 @@ def test_series_profile_time_index_false(cat_df):
assert profile.time_diffs is None
-@pytest.fixture
-def ts_timeindex(scope="module"): # pylint: disable=W0613
- """Return pd.Series of type datetime64 with DatetimeIndex."""
- start = datetime(year=1999, month=1, day=1, hour=0, minute=0)
- end = start + pd.Timedelta(hours=40)
- time_series = pd.Series(pd.date_range(start, end, freq="4h", inclusive="left"))
- index_end = start + pd.Timedelta(hours=10)
- time_series.index = pd.date_range(start, index_end, freq="h", inclusive="left")
- return time_series
-
-
def test_series_profile_ts_range_index_true(ts_timeindex): # pylint: disable=W0621
"""time_index=True does not calculate time diffs for Series with RangeIndex."""
series = ts_timeindex
diff --git a/tests/test_times.py b/tests/test_times.py
index 17665ab..83dfba2 100644
--- a/tests/test_times.py
+++ b/tests/test_times.py
@@ -5,6 +5,23 @@
import pandahelper.times as pht
+def test_time_diffs(cat_df):
+ """time_diffs should work on shuffled pd.Series or Index of timestamps."""
+ valid = [cat_df.index, pd.Series(cat_df.index)]
+ for v in valid:
+ result = pht.time_diffs(v)
+ assert result.iloc[0] is pd.NaT
+ assert all(result[1:] == pd.Timedelta(hours=1))
+
+
+def test_time_diffs_exception():
+ """Non-datetime64 pd.Series raises exception."""
+ invalid = [pd.Series(list(range(5))), pd.Series([pd.Timedelta(hours=1)] * 2)]
+ for tipo in invalid:
+ with pytest.raises(TypeError):
+ pht.time_diffs(tipo)
+
+
def test_time_diffs_index(cat_df):
"""time_diffs_index should work on shuffled pd.Series or pd.DataFrame."""
# test DF
@@ -27,18 +44,40 @@ def test_time_diffs_index_exception():
assert str(pd.DatetimeIndex) in str(exc)
-def test_time_diffs(cat_df):
- """time_diffs should work on shuffled pd.Series or Index of timestamps."""
- valid = [cat_df.index, pd.Series(cat_df.index)]
- for v in valid:
- result = pht.time_diffs(v)
- assert result.iloc[0] is pd.NaT
- assert all(result[1:] == pd.Timedelta(hours=1))
+def test_id_gaps_index(ts_timeindex):
+ """id_gap_index returns expected gap from time-Series with DatetimeIndex."""
+ result = pht.id_gaps_index(
+ ts_timeindex, pd.Timedelta(minutes=59, microseconds=999999)
+ )
+ expected = pd.DataFrame(
+ [pd.Timedelta(hours=1)] * 9,
+ index=pd.date_range(pd.Timestamp(1999, 1, 1, 1), periods=9, freq="h"),
+ columns=["diffs"],
+ )
+ pd.testing.assert_frame_equal(expected, result, check_index_type=True)
-def test_time_diffs_exception():
- """Non-datetime64 pd.Series raises exception."""
- invalid = [pd.Series(list(range(5))), pd.Series([pd.Timedelta(hours=1)] * 2)]
- for tipo in invalid:
- with pytest.raises(TypeError):
- pht.time_diffs(tipo)
+def test_id_gaps_index_no_gaps(ts_timeindex):
+ """id_gap_index returns empty Dataframe when threshold exceeds diffs."""
+ result = pht.id_gaps_index(ts_timeindex, pd.Timedelta(minutes=60, microseconds=1))
+ assert len(result) == 0
+
+
+def test_id_gaps_(ts_timeindex):
+ """id_gap returns expected gap from time-Series with DatetimeIndex."""
+ result = pht.id_gaps(
+ ts_timeindex, pd.Timedelta(hours=3, minutes=59, microseconds=999999)
+ )
+ expected = pd.DataFrame(
+ [pd.Timedelta(hours=4)] * 9,
+ index=pd.date_range(pd.Timestamp(1999, 1, 1, 4), periods=9, freq="4h"),
+ columns=["diffs"],
+ )
+ expected.index.freq = None # diffs won't have freq set
+ pd.testing.assert_frame_equal(expected, result, check_index_type=True)
+
+
+def test_id_gaps_no_gaps(ts_timeindex):
+ """id_gap_index returns empty Dataframe when threshold exceeds diffs."""
+ result = pht.id_gaps(ts_timeindex, pd.Timedelta(hours=4, microseconds=1))
+ assert len(result) == 0
From 90f236e78e93e07beebe5a484839c1724af6ed8d Mon Sep 17 00:00:00 2001
From: ray310 <64942339+ray310@users.noreply.github.com>
Date: Wed, 17 Jul 2024 01:39:43 -0500
Subject: [PATCH 09/13] Add examples to stats documentation.
gh-2
---
src/pandahelper/stats.py | 61 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 61 insertions(+)
diff --git a/src/pandahelper/stats.py b/src/pandahelper/stats.py
index 9f8ee63..9de9925 100644
--- a/src/pandahelper/stats.py
+++ b/src/pandahelper/stats.py
@@ -18,6 +18,20 @@ def frequency_table(series: pd.Series) -> pd.DataFrame:
Raises:
TypeError: If input is not a Pandas Series.
+
+ Examples:
+ >>> import random
+ >>> import pandahelper as ph
+ >>>
+ >>> random.seed(314)
+ >>> cities = ["Springfield", "Quahog", "Philadelphia", "Shelbyville"]
+ >>> series = pd.Series(random.choices(cities, k = 200))
+ >>> ph.frequency_table(series)
+ Count % of Total
+ Springfield 66 33.00%
+ Quahog 51 25.50%
+ Philadelphia 44 22.00%
+ Shelbyville 39 19.50%
"""
if not isinstance(series, pd.Series):
raise TypeError(f"{series}, is not pd.Series")
@@ -70,6 +84,53 @@ def distribution_stats(series: pd.Series) -> pd.DataFrame:
Raises:
TypeError: If input is not a numeric-like pd.Series.
+
+ Examples:
+ Distribution stats for Pandas Series of type `float64`:
+ >>> from random import seed, gauss, expovariate
+ >>> import pandahelper as ph
+ >>> import pandas as pd
+ >>>
+ >>> seed(314)
+ >>> series = pd.Series([gauss(mu=30, sigma=20) for x in range(200)])
+ >>> ph.distribution_stats(series)
+ Statistic Value
+ count 200.000000
+ min -23.643007
+ 1% -11.918955
+ 5% 2.833604
+ 25% 17.553793
+ 50% 31.420759
+ 75% 42.074998
+ 95% 60.305435
+ 99% 72.028633
+ max 81.547828
+ mean 30.580535
+ standard deviation 18.277706
+ median 31.420759
+ median absolute deviation 12.216607
+ skew -0.020083
+
+ Distribution stats for Pandas Series of type `datetime64`:
+ >>> start = pd.Timestamp(2000, 1, 1)
+ >>> tds = [pd.Timedelta(hours=int(expovariate(lambd=.003))) for x in range(200)]
+ >>> times = [start + td for td in tds]
+ >>> series = pd.Series(times)
+ >>> ph.distribution_stats(series)
+ Statistic Value
+ count 200
+ min 2000-01-01 00:00:00
+ 1% 2000-01-01 01:59:24
+ 5% 2000-01-01 09:00:00
+ 25% 2000-01-04 08:00:00
+ 50% 2000-01-08 04:30:00
+ 75% 2000-01-16 21:00:00
+ 95% 2000-02-08 01:36:00
+ 99% 2000-02-22 10:20:24
+ max 2000-04-01 17:00:00
+ mean 2000-01-12 14:24:18
+ standard deviation 12 days 16:47:15.284423042
+ median 2000-01-08 04:30:00
"""
stats = dist_stats_dict(series)
return pd.DataFrame.from_dict(stats, orient="index", columns=["Statistic Value"])
From 24a1e9cdeef74046e2545c83932c2469f1d47685 Mon Sep 17 00:00:00 2001
From: ray310 <64942339+ray310@users.noreply.github.com>
Date: Wed, 17 Jul 2024 22:55:18 -0500
Subject: [PATCH 10/13] Add 'category_gaps' function to times.py.
gh-20
---
CHANGELOG.md | 2 +-
src/pandahelper/__init__.py | 9 ++-
src/pandahelper/times.py | 70 +++++++++++++++++++++++
tests/test_times.py | 110 ++++++++++++++++++++++++++++++++++++
4 files changed, 189 insertions(+), 2 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d373552..c2df74c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@
## 0.1.1 - Unreleased
### Added
- SeriesProfile now reports gaps in pd.Series with type `datetime64` or for Series with `DatetimeIndex`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20)
-- `times.py` module has been added with public functions `time_diffs`, `time_diffs_index`, `id_gaps`, `id_gaps_index`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20)
+- `times.py` module has been added with public functions `time_diffs`, `time_diffs_index`, `id_gaps`, `id_gaps_index`, `category_gaps`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20)
- [`freq_most_least` default parameter for SeriesProfile has been changed to `(10, 5)`.](https://github.com/ray310/Panda-Helper/commit/9ea7a4108996422eaa433e3b86ed20dbbb3c0bdb)
____
diff --git a/src/pandahelper/__init__.py b/src/pandahelper/__init__.py
index 58894b2..0a3cbd1 100644
--- a/src/pandahelper/__init__.py
+++ b/src/pandahelper/__init__.py
@@ -4,7 +4,13 @@
from pandahelper.profiles import DataFrameProfile, SeriesProfile
from pandahelper.stats import distribution_stats, frequency_table
-from pandahelper.times import time_diffs, time_diffs_index, id_gaps, id_gaps_index
+from pandahelper.times import (
+ time_diffs,
+ time_diffs_index,
+ id_gaps,
+ id_gaps_index,
+ category_gaps,
+)
__version__ = "0.1.1"
__all__ = [
@@ -16,4 +22,5 @@
"time_diffs_index",
"id_gaps",
"id_gaps_index",
+ "category_gaps",
]
diff --git a/src/pandahelper/times.py b/src/pandahelper/times.py
index 1a99154..d7bbbb2 100644
--- a/src/pandahelper/times.py
+++ b/src/pandahelper/times.py
@@ -1,5 +1,6 @@
"""Panda-Helper time-series functions."""
+from warnings import warn
from typing import Union # TODO: Remove when deprecating Python 3.9
import pandas as pd
import pandas.api.types as pat
@@ -161,3 +162,72 @@ def id_gaps_index(
"""
diffs = time_diffs_index(df)
return diffs[diffs > threshold].sort_values(ascending=False).to_frame()
+
+
+def category_gaps(
+ series: pd.Series, threshold: pd.Timedelta, max_cat: int = 50
+) -> Union[pd.DataFrame, None]:
+ """Calculate sum of gaps for each category in time-indexed Series.
+
+ Gaps are time differences in excess of expected time increment (threshold). Gap per
+ category is relative to the minimum and maximum times in the Series.
+ Intended for use with categorical-like Series.
+
+ Args:
+ series (pd.Series): Categorical-like Series.
+ threshold (pd.Timedelta): Threshold for the time difference to be considered
+ a gap. For hourly data, threshold should be pd.Timedelta(hours=1).
+ max_cat (int): Maximum number of categories (unique values) before issuing
+ warning and returning `None`.
+
+ Returns:
+ One-column Pandas DataFrame with the cumulative gap per category, sorted
+ in descending order. Will return None if number of categories exceeds `max_cat`.
+
+ Warns:
+ UserWarning: If the number of categories (unique values) in the series
+ exceeds `max_cat`.
+
+ Examples:
+ >>> import pandahelper as ph
+ >>> import pandas as pd
+ >>>
+ >>> start = pd.Timestamp(year=1999, month=1, day=1)
+ >>> a = pd.Series(["A"] * 30, index=pd.date_range(start, periods=30, freq="D"))
+ >>> b = pd.Series(["B"] * 15, index=pd.date_range(start, periods=15, freq="2D"))
+ >>> c = pd.Series(["C"] * 10, index=pd.date_range(start, periods=10, freq="D"))
+ >>> ph.category_gaps(pd.concat([a, b, c]), threshold=pd.Timedelta(days=1))
+ Cumulative Gap
+ C 20 days
+ B 15 days
+ A 0 days
+ """
+ if not isinstance(series, pd.Series) or not isinstance(
+ series.index, pd.DatetimeIndex
+ ):
+ raise TypeError(
+ f"Series should be {pd.Series} with index of type {pd.DatetimeIndex}"
+ )
+ if not isinstance(threshold, pd.Timedelta):
+ raise TypeError(f"Threshold should be {pd.Timedelta}")
+ gaps = {}
+ time_range = series.index.max() - series.index.min()
+ categories = series.unique()
+ if len(categories) > max_cat:
+ msg = (
+ f"Number of categories is greater than {max_cat}. To proceed "
+ f"increase 'max_cat' and run function again."
+ )
+ warn(msg, stacklevel=2)
+ return None
+ for cat in categories:
+ cat_slice = series.loc[series == cat]
+ if pd.isnull(cat): # treat nulls as distinct category
+ nulls = series.apply(lambda x: x is cat) # pylint: disable=W0640
+ cat_slice = series[nulls]
+ cat_range = cat_slice.index.max() - cat_slice.index.min()
+ diffs = time_diffs_index(cat_slice)
+ gap = (diffs[diffs > threshold] - threshold).sum()
+ gaps[cat] = time_range - cat_range + gap
+ df = pd.Series(gaps.values(), index=gaps.keys(), name="Cumulative Gap")
+ return df.sort_values(ascending=False).to_frame()
diff --git a/tests/test_times.py b/tests/test_times.py
index 83dfba2..2197c09 100644
--- a/tests/test_times.py
+++ b/tests/test_times.py
@@ -1,8 +1,10 @@
"""Tests for functions in times.py."""
+import numpy as np
import pandas as pd
import pytest
import pandahelper.times as pht
+from .utils import make_category_data
def test_time_diffs(cat_df):
@@ -81,3 +83,111 @@ def test_id_gaps_no_gaps(ts_timeindex):
"""id_gap_index returns empty Dataframe when threshold exceeds diffs."""
result = pht.id_gaps(ts_timeindex, pd.Timedelta(hours=4, microseconds=1))
assert len(result) == 0
+
+
+def test_category_gaps_frequency():
+ """Gaps are calculated correctly for categories of varying frequency in Series."""
+ start = pd.Timestamp(year=1999, month=1, day=1)
+ duration = pd.Timedelta(days=365)
+ end = start + duration
+ delay = pd.Timedelta(days=180)
+ c1 = make_category_data("Springfield", start, end, freq="h")
+ c2 = make_category_data("Quahog", start + delay, end, freq="h")
+ c3 = make_category_data("Park South", start, end, freq="2h")
+ c4 = make_category_data("East Midtown", start, end, freq="4h")
+ c5 = make_category_data("San Diego", start, end, freq="W")
+ c6 = make_category_data("South Philadelphia", start, end, freq="MS")
+ df = pd.concat([c1, c2, c3, c4, c5, c6])
+ gaps = {
+ "South Philadelphia": duration - pd.Timedelta(hours=12),
+ "San Diego": duration - pd.Timedelta(hours=52),
+ "East Midtown": duration - duration / 4,
+ "Park South": duration / 2,
+ "Quahog": delay,
+ "Springfield": pd.Timedelta(hours=0),
+ }
+ expected = pd.DataFrame(
+ gaps.values(), columns=["Cumulative Gap"], index=list(gaps.keys())
+ )
+ result = pht.category_gaps(df["category"], pd.Timedelta(hours=1))
+ pd.testing.assert_frame_equal(expected, result, check_index_type=True)
+
+
+def test_category_gaps_no_gaps():
+ """Series with no gaps should show 0 gaps."""
+ start = pd.Timestamp(year=1999, month=1, day=1)
+ end = start + pd.Timedelta(hours=1)
+ c1 = make_category_data("Springfield", start, end, freq="h")
+ c2 = make_category_data("Park South", start, end, freq="2h")
+ df = pd.concat([c1, c2])
+ gaps = {
+ "Springfield": pd.Timedelta(hours=0),
+ "Park South": pd.Timedelta(hours=0),
+ }
+ expected = pd.DataFrame(
+ gaps.values(), columns=["Cumulative Gap"], index=list(gaps.keys())
+ )
+ result = pht.category_gaps(df["category"], pd.Timedelta(hours=1))
+ pd.testing.assert_frame_equal(expected, result, check_index_type=True)
+
+
+def test_category_gaps_nulls():
+ """Nulls should be treated as separate categories with correctly calculated gaps."""
+ start = pd.Timestamp(year=1999, month=1, day=1)
+ end = start + pd.Timedelta(hours=25) # to get 24 hour range with freq='2h'
+ df = make_category_data("Quahog", start, end, freq="2h")
+ df.iloc[:2, 3] = None
+ df.iloc[2:4, 3] = pd.NA
+ df.iloc[4:6, 3] = np.nan
+ df.iloc[6:8, 3] = pd.NaT
+ gaps = {
+ None: pd.Timedelta(hours=23),
+ pd.NA: pd.Timedelta(hours=23),
+ np.nan: pd.Timedelta(hours=23),
+ pd.NaT: pd.Timedelta(hours=23),
+ "Quahog": pd.Timedelta(hours=20),
+ }
+ expected = pd.DataFrame(
+ gaps.values(), columns=["Cumulative Gap"], index=list(gaps.keys())
+ )
+ result = pht.category_gaps(df["category"], pd.Timedelta(hours=1))
+ pd.testing.assert_frame_equal(expected, result, check_index_type=True)
+
+
+def test_category_gaps_not_series_exception():
+ """Non-series input raises Exception."""
+ df = pd.DataFrame({"A": list(range(5))})
+ with pytest.raises(TypeError) as exc:
+ pht.category_gaps(df, pd.Timedelta(hours=1))
+ assert str(pd.Series) in str(exc.value)
+
+
+def test_category_gaps_wrong_series_exception():
+ """Non-time indexed series raises Exception."""
+ series = pd.Series({"A": list(range(5))})
+ with pytest.raises(TypeError) as exc:
+ pht.category_gaps(series, pd.Timedelta(hours=1))
+ assert str(pd.DatetimeIndex) in str(exc.value)
+
+
+def test_category_gaps_timedelta_wrong_type_exception():
+ """Wrong input type for threshold raises exception."""
+ start = pd.Timestamp(year=1999, month=1, day=1)
+ end = start + pd.Timedelta(days=365)
+ df = make_category_data("Springfield", start, end, freq="h")
+ with pytest.raises(TypeError) as exc:
+ pht.category_gaps(df["category"], start)
+ assert str(pd.Timedelta) in str(exc.value)
+
+
+def test_category_gaps_warning():
+ """Series with more categories than max_cat raises warning and returns None."""
+ start = pd.Timestamp(year=1999, month=1, day=1)
+ end = start + pd.Timedelta(hours=1)
+ c1 = make_category_data("Springfield", start, end, freq="h")
+ c2 = make_category_data("Park South", start, end, freq="2h")
+ df = pd.concat([c1, c2])
+ with pytest.warns(UserWarning):
+ assert (
+ pht.category_gaps(df["category"], pd.Timedelta(hours=1), max_cat=1) is None
+ )
From f3b7a23fae2bcf0d132b229f89341030e3d26f90 Mon Sep 17 00:00:00 2001
From: ray310 <64942339+ray310@users.noreply.github.com>
Date: Thu, 18 Jul 2024 01:23:50 -0500
Subject: [PATCH 11/13] DataFrameProfile now includes time_diffs if DataFrame
is time-indexed.
Also adjusted formatting of __repr__ and _repr_html_.
---
src/pandahelper/profiles.py | 41 ++++++++++----
tests/conftest.py | 57 ++++++++++++--------
tests/test_data/test_df_time_profile.txt | 36 +++++++++++++
tests/test_data/test_series_time_profile.txt | 20 +++++++
tests/test_profiles.py | 46 ++++++++++++----
tests/test_times.py | 45 +++++-----------
6 files changed, 172 insertions(+), 73 deletions(-)
create mode 100644 tests/test_data/test_df_time_profile.txt
create mode 100644 tests/test_data/test_series_time_profile.txt
diff --git a/src/pandahelper/profiles.py b/src/pandahelper/profiles.py
index 3aea3fa..e5bbd11 100644
--- a/src/pandahelper/profiles.py
+++ b/src/pandahelper/profiles.py
@@ -22,6 +22,7 @@ class DataFrameProfile:
num_duplicates (int): Number of duplicated rows.
nulls_per_row (pd.Series): Count of null values per row.
null_stats (list): Distribution statistics on nulls per row.
+ time_diffs (pd.Series): Time diffs (gaps) if DataFrame has a DateTimeIndex.
"""
def __init__(self, df: pd.DataFrame, *, name: str = "", fmt: str = "simple"):
@@ -44,6 +45,7 @@ def __init__(self, df: pd.DataFrame, *, name: str = "", fmt: str = "simple"):
self.memory_usage = df.memory_usage(index=True, deep=True) / 1000000 # MB
self.num_duplicates = sum(df.duplicated(keep="first"))
self.nulls_per_row = df.isna().sum(axis=1)
+ self.time_diffs = self.__calc_time_diffs(df)
self.null_stats = self.__null_stats()
self._format = fmt
@@ -54,6 +56,13 @@ def __null_stats(self, delete_key="count"):
del stats[delete_key]
return new_stats | stats
+ @staticmethod
+ def __calc_time_diffs(df: pd.DataFrame) -> pd.Series or None:
+ """Calculate time diffs if DataFrame is time-indexed."""
+ if pat.is_datetime64_any_dtype(df.index):
+ return pht.time_diffs_index(df)
+ return None
+
def __create_tables(self, table_fmt: str):
"""Create DataFrameProfile summary tables.
@@ -92,7 +101,15 @@ def __create_tables(self, table_fmt: str):
headers=["Summary of Nulls Per Row", ""],
tablefmt=table_fmt,
)
- return [df_table, dtype_usage_table, null_table]
+ tables = [df_table, dtype_usage_table, null_table]
+ if self.time_diffs is not None:
+ time_diffs_table = tabulate(
+ phs.frequency_table(self.time_diffs),
+ headers=["Time Gaps (Diffs)", "Count", "% of total"],
+ tablefmt=table_fmt,
+ )
+ tables.append(time_diffs_table)
+ return tables
def __repr__(self):
"""Printable version of profile."""
@@ -104,7 +121,8 @@ def _repr_html_(self):
tables = [_format_html_table(t) for t in self.__create_tables("html")]
tables[1] = _decimal_align_col(tables[1], 2) # type/memory usage table
tables[2] = _decimal_align_col(tables[2], 1) # stats table
- return tables[0] + "<br>" + tables[1] + "<br>" + tables[2]
+ output = "".join([table + "<br>" for table in tables])
+ return output[:-4] # remove last <br>
def save(self, path: str):
"""Save profile to provided path.
@@ -159,7 +177,7 @@ def __init__(
TypeError: If input is not a Pandas Series.
"""
if not isinstance(series, pd.Series):
- raise TypeError(f"{series}, is not pd.DataFrame")
+ raise TypeError(f"{series}, is not pd.Series")
if freq_most_least[0] < 0 or freq_most_least[1] < 0:
raise ValueError("Tuple values must be >= 0!")
self._format = fmt
@@ -173,7 +191,7 @@ def __init__(
self.stats = self.__calc_stats(series)
self.time_diffs = self.__calc_time_diffs(series, time_index)
- def __calc_stats(self, series):
+ def __calc_stats(self, series: pd.Series):
"""Calculate distribution stats if allowed dtype, else return None."""
if pat.is_object_dtype(self.dtype) or isinstance(
self.dtype, pd.CategoricalDtype
@@ -182,7 +200,7 @@ def __calc_stats(self, series):
return phs.dist_stats_dict(series)
@staticmethod
- def __calc_time_diffs(series, use_time_index: bool) -> pd.Series or None:
+ def __calc_time_diffs(series: pd.Series, use_time_index: bool) -> pd.Series or None:
"""Calculate time diffs for time-indexed series or datetime64 series."""
if use_time_index and pat.is_datetime64_any_dtype(series.index):
return pht.time_diffs_index(series)
@@ -210,7 +228,7 @@ def __create_tables(self, table_fmt: str) -> list[str]:
freq_table = tabulate(
freq_info, headers=["Value", "Count", "% of total"], tablefmt=table_fmt
)
- stats_table = ""
+ tables = [series_table, freq_table]
if self.stats is not None:
stats = self.stats
# tabulate casts complex numbers to real numbers, dropping imaginary part
@@ -221,14 +239,15 @@ def __create_tables(self, table_fmt: str) -> list[str]:
headers=["Statistic", "Value"],
tablefmt=table_fmt,
)
- time_diffs_table = ""
+ tables.append(stats_table)
if self.time_diffs is not None:
time_diffs_table = tabulate(
phs.frequency_table(self.time_diffs),
headers=["Time Gaps (Diffs)", "Count", "% of total"],
tablefmt=table_fmt,
)
- return [series_table, freq_table, stats_table, time_diffs_table]
+ tables.append(time_diffs_table)
+ return tables
def __repr__(self):
"""Printable version of profile."""
@@ -238,8 +257,10 @@ def __repr__(self):
def _repr_html_(self):
"""HTML representation of profile."""
tables = [_format_html_table(t) for t in self.__create_tables("html")]
- tables[2] = _decimal_align_col(tables[2], 1)
- return tables[0] + "<br>" + tables[1] + "<br>" + tables[2] + "<br>" + tables[3]
+ if self.stats is not None:
+ tables[2] = _decimal_align_col(tables[2], 1)
+ output = "".join([table + "<br>" for table in tables])
+ return output[:-4] # remove last <br>
def save(self, path):
"""Save profile to provided path.
diff --git a/tests/conftest.py b/tests/conftest.py
index a7bb3d8..c25ac19 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -16,27 +16,6 @@
NUM_SERIES = "NUMBER OF PERSONS INJURED"
-@pytest.fixture
-def cat_df(scope="package"): # pylint: disable=W0613
- """Return test pd.DataFrame with DatetimeIndex."""
- start = pd.Timestamp(year=1999, month=1, day=1)
- end = start + pd.Timedelta(hours=10)
- df = make_category_data("Springfield", start, end, freq="h")
- df = df.sample(frac=1, random_state=2) # index is out of order
- return df
-
-
-@pytest.fixture
-def ts_timeindex(scope="package"): # pylint: disable=W0613
- """Return pd.Series of type datetime64 with DatetimeIndex."""
- start = pd.Timestamp(year=1999, month=1, day=1)
- end = start + pd.Timedelta(hours=40)
- time_series = pd.Series(pd.date_range(start, end, freq="4h", inclusive="left"))
- index_end = start + pd.Timedelta(hours=10)
- time_series.index = pd.date_range(start, index_end, freq="h", inclusive="left")
- return time_series
-
-
@pytest.fixture
def test_df(scope="package"): # pylint: disable=W0613
"""Return test pd.DataFrame from sample of NYC collisions dataset."""
@@ -75,3 +54,39 @@ def non_series_invalid(scope="package"): # pylint: disable=W0613
np.array([1, 2, 3]),
]
return invalid_types
+
+
+@pytest.fixture
+def simple_df(scope="package"): # pylint: disable=W0613
+ """Return test pd.DataFrame with DatetimeIndex."""
+ start = pd.Timestamp(year=1999, month=1, day=1)
+ end = start + pd.Timedelta(hours=10)
+ df = make_category_data("Springfield", start, end, freq="h")
+ df = df.sample(frac=1, random_state=2) # index is out of order
+ return df
+
+
+@pytest.fixture
+def ts_timeindex(scope="package"): # pylint: disable=W0613
+ """Return pd.Series of type datetime64 with DatetimeIndex."""
+ start = pd.Timestamp(year=1999, month=1, day=1)
+ end = start + pd.Timedelta(hours=40)
+ time_series = pd.Series(pd.date_range(start, end, freq="4h", inclusive="left"))
+ index_end = start + pd.Timedelta(hours=10)
+ time_series.index = pd.date_range(start, index_end, freq="h", inclusive="left")
+ return time_series
+
+
+@pytest.fixture
+def cat_df(scope="package"): # pylint: disable=W0613
+ """Return pd.DataFrame with DatetimeIndex."""
+ start = pd.Timestamp(year=1999, month=1, day=1)
+ end = start + pd.Timedelta(days=365)
+ delay = pd.Timedelta(days=180)
+ c1 = make_category_data("Springfield", start, end, freq="h")
+ c2 = make_category_data("Quahog", start + delay, end, freq="h")
+ c3 = make_category_data("Park South", start, end, freq="2h")
+ c4 = make_category_data("East Midtown", start, end, freq="4h")
+ c5 = make_category_data("San Diego", start, end, freq="W")
+ c6 = make_category_data("South Philadelphia", start, end, freq="MS")
+ return pd.concat([c1, c2, c3, c4, c5, c6])
diff --git a/tests/test_data/test_df_time_profile.txt b/tests/test_data/test_df_time_profile.txt
new file mode 100644
index 0000000..2978417
--- /dev/null
+++ b/tests/test_data/test_df_time_profile.txt
@@ -0,0 +1,36 @@
+DataFrame-Level Info
+---------------------- ----------
+DF Shape (19834, 4)
+Duplicated Rows 0
+Memory Usage (MB) 2.633
+
+Series Name Data Type Memory Usage (MB)
+------------- -------------- -------------------
+Index datetime64[ns] 0.158672
+A int64 0.158672
+B object 0.9917
+C float64 0.158672
+category object 1.16563
+
+Summary of Nulls Per Row
+-------------------------- --
+Number of Columns 4
+min 0
+1% 0
+5% 0
+25% 0
+50% 0
+75% 0
+95% 0
+99% 0
+max 0
+mean 0
+standard deviation 0
+median 0
+median absolute deviation 0
+skew 0
+
+Time Gaps (Diffs) Count % of total
+------------------- ------- ------------
+0 days 00:00:00 11074 55.84%
+0 days 01:00:00 8759 44.16%
diff --git a/tests/test_data/test_series_time_profile.txt b/tests/test_data/test_series_time_profile.txt
new file mode 100644
index 0000000..977b7fc
--- /dev/null
+++ b/tests/test_data/test_series_time_profile.txt
@@ -0,0 +1,20 @@
+category Info
+--------------- ------
+Data Type object
+Count 19834
+Unique Values 6
+Null Values 0
+
+Value Count % of total
+------------------ ------- ------------
+Springfield 8760 44.17%
+Quahog 4440 22.39%
+Park South 4380 22.08%
+East Midtown 2190 11.04%
+San Diego 52 0.26%
+South Philadelphia 12 0.06%
+
+Time Gaps (Diffs) Count % of total
+------------------- ------- ------------
+0 days 00:00:00 11074 55.84%
+0 days 01:00:00 8759 44.16%
diff --git a/tests/test_profiles.py b/tests/test_profiles.py
index 713b544..487ca90 100644
--- a/tests/test_profiles.py
+++ b/tests/test_profiles.py
@@ -36,6 +36,18 @@ def test_dataframe_profile_valid_312(test_df):
assert filecmp.cmp(compare_file, test_file, shallow=False)
+@pytest.mark.skipif(
+ not ((3, 12) <= sys.version_info < (3, 13)), reason="Runs on Python 3.12"
+)
+def test_dataframe_time_profile_valid_312(cat_df):
+ """Time-indexed DataFrame profile should match test profile (Python 3.12)."""
+ compare_file = os.path.join(TEST_DATA_DIR, "test_df_time_profile.txt")
+ with tempfile.TemporaryDirectory() as tmp:
+ test_file = os.path.join(tmp, "temp.txt")
+ php.DataFrameProfile(cat_df).save(test_file)
+ assert filecmp.cmp(compare_file, test_file, shallow=False)
+
+
def test_dataframe_profile_invalid(non_series_invalid, num_series, cat_like_series):
"""DataFrame profile should not accept invalid data types."""
invalid_types = [*non_series_invalid, num_series, cat_like_series]
@@ -44,17 +56,18 @@ def test_dataframe_profile_invalid(non_series_invalid, num_series, cat_like_seri
php.DataFrameProfile(invalid)
-def test_dataframe_profile_html(test_df):
+def test_dataframe_profile_html(cat_df):
"""Test html representation of DataFrameProfile."""
- profile = php.DataFrameProfile(test_df)
+ profile = php.DataFrameProfile(cat_df)
# fmt: off
soup = bs4.BeautifulSoup(profile._repr_html_(), "html.parser") # pylint: disable=W0212
# fmt: on
tables = soup.find_all("table")
- assert len(tables) == 3 # null_table
+ assert len(tables) == 4
assert len(tables[2].find_all("tr")) == 16 # 15 dist stats + head row
first_td = tables[2].find("td")
assert first_td["style"] == "font-family: monospace, monospace; text-align: left;"
+ assert len(tables[3].find_all("tr")) == 3 # 2 deltas + head row
def test_series_profile_text_valid_numerical_format(num_series):
@@ -77,6 +90,16 @@ def test_series_profile_text_valid_object_format(cat_like_series):
assert filecmp.cmp(compare_file, test_file, shallow=False)
+def test_series_profile_text_valid_time_format(cat_df):
+ """Text version of SeriesProfile for time data matches test profile."""
+ comparison_profile = "test_series_time_profile.txt"
+ compare_file = os.path.join(TEST_DATA_DIR, comparison_profile)
+ with tempfile.TemporaryDirectory() as tmp:
+ test_file = os.path.join(tmp, "temp.txt")
+ php.SeriesProfile(cat_df["category"], time_index=True).save(test_file)
+ assert filecmp.cmp(compare_file, test_file, shallow=False)
+
+
def test_series_profile_series_dtypes():
"""pd.Series should create SeriesProfile for allowed data types."""
start = datetime(year=1999, month=1, day=1)
@@ -168,18 +191,19 @@ def test_series_profile_invalid(non_series_invalid, test_df):
php.SeriesProfile(invalid)
-def test_series_profile_html(num_series):
+def test_series_profile_html(cat_df):
"""Test html representation of SeriesProfile."""
- profile = php.SeriesProfile(num_series)
+ profile = php.SeriesProfile(cat_df["C"], time_index=True)
# fmt: off
soup = bs4.BeautifulSoup(profile._repr_html_(), "html.parser") # pylint: disable=W0212
# fmt: on
tables = soup.find_all("table")
- assert len(tables) == 3 # null_table
- assert len(tables[1].find_all("tr")) == 6 # freq table
+ assert len(tables) == 4
+ assert len(tables[1].find_all("tr")) == 16 # freq table
assert len(tables[2].find_all("tr")) == 16 # 15 dist stats + head row
first_td = tables[2].find("td")
assert first_td["style"] == "font-family: monospace, monospace; text-align: left;"
+ assert len(tables[3].find_all("tr")) == 3 # 2 deltas + head row
def test_series_profile_frequency_table(test_df):
@@ -201,18 +225,18 @@ def test_series_profile_frequency_table(test_df):
assert len(freq_table.find_all("tr")) == v + 1 # +1 for header
-def test_series_profile_time_index_true(cat_df):
+def test_series_profile_time_index_true(simple_df):
"""time_index=True calculates time diffs for Series with DateTimeIndex."""
- series = cat_df["category"]
+ series = simple_df["category"]
profile = php.SeriesProfile(series, time_index=True)
assert pat.is_datetime64_any_dtype(series.index)
assert profile.time_diffs.iloc[0] is pd.NaT
assert all(profile.time_diffs[1:] == pd.Timedelta(hours=1))
-def test_series_profile_time_index_false(cat_df):
+def test_series_profile_time_index_false(simple_df):
"""time_index=False does not calculate time diffs for Series with DateTimeIndex."""
- series = cat_df["category"]
+ series = simple_df["category"]
profile = php.SeriesProfile(series, time_index=False)
assert pat.is_datetime64_any_dtype(series.index)
assert profile.time_diffs is None
diff --git a/tests/test_times.py b/tests/test_times.py
index 2197c09..24321e2 100644
--- a/tests/test_times.py
+++ b/tests/test_times.py
@@ -7,9 +7,9 @@
from .utils import make_category_data
-def test_time_diffs(cat_df):
+def test_time_diffs(simple_df):
"""time_diffs should work on shuffled pd.Series or Index of timestamps."""
- valid = [cat_df.index, pd.Series(cat_df.index)]
+ valid = [simple_df.index, pd.Series(simple_df.index)]
for v in valid:
result = pht.time_diffs(v)
assert result.iloc[0] is pd.NaT
@@ -24,14 +24,14 @@ def test_time_diffs_exception():
pht.time_diffs(tipo)
-def test_time_diffs_index(cat_df):
+def test_time_diffs_index(simple_df):
"""time_diffs_index should work on shuffled pd.Series or pd.DataFrame."""
# test DF
- df_result = pht.time_diffs_index(cat_df)
+ df_result = pht.time_diffs_index(simple_df)
assert df_result.iloc[0] is pd.NaT
assert all(df_result[1:] == pd.Timedelta(hours=1))
# test Series
- series_result = pht.time_diffs_index(cat_df["B"])
+ series_result = pht.time_diffs_index(simple_df["B"])
assert series_result.iloc[0] is pd.NaT
assert all(series_result[1:] == pd.Timedelta(hours=1))
@@ -85,19 +85,10 @@ def test_id_gaps_no_gaps(ts_timeindex):
assert len(result) == 0
-def test_category_gaps_frequency():
+def test_category_gaps_frequency(cat_df):
"""Gaps are calculated correctly for categories of varying frequency in Series."""
- start = pd.Timestamp(year=1999, month=1, day=1)
duration = pd.Timedelta(days=365)
- end = start + duration
delay = pd.Timedelta(days=180)
- c1 = make_category_data("Springfield", start, end, freq="h")
- c2 = make_category_data("Quahog", start + delay, end, freq="h")
- c3 = make_category_data("Park South", start, end, freq="2h")
- c4 = make_category_data("East Midtown", start, end, freq="4h")
- c5 = make_category_data("San Diego", start, end, freq="W")
- c6 = make_category_data("South Philadelphia", start, end, freq="MS")
- df = pd.concat([c1, c2, c3, c4, c5, c6])
gaps = {
"South Philadelphia": duration - pd.Timedelta(hours=12),
"San Diego": duration - pd.Timedelta(hours=52),
@@ -109,7 +100,7 @@ def test_category_gaps_frequency():
expected = pd.DataFrame(
gaps.values(), columns=["Cumulative Gap"], index=list(gaps.keys())
)
- result = pht.category_gaps(df["category"], pd.Timedelta(hours=1))
+ result = pht.category_gaps(cat_df["category"], pd.Timedelta(hours=1))
pd.testing.assert_frame_equal(expected, result, check_index_type=True)
@@ -154,11 +145,10 @@ def test_category_gaps_nulls():
pd.testing.assert_frame_equal(expected, result, check_index_type=True)
-def test_category_gaps_not_series_exception():
+def test_category_gaps_not_series_exception(cat_df):
"""Non-series input raises Exception."""
- df = pd.DataFrame({"A": list(range(5))})
with pytest.raises(TypeError) as exc:
- pht.category_gaps(df, pd.Timedelta(hours=1))
+ pht.category_gaps(cat_df, pd.Timedelta(hours=1))
assert str(pd.Series) in str(exc.value)
@@ -170,24 +160,17 @@ def test_category_gaps_wrong_series_exception():
assert str(pd.DatetimeIndex) in str(exc.value)
-def test_category_gaps_timedelta_wrong_type_exception():
+def test_category_gaps_timedelta_wrong_type_exception(cat_df):
"""Wrong input type for threshold raises exception."""
- start = pd.Timestamp(year=1999, month=1, day=1)
- end = start + pd.Timedelta(days=365)
- df = make_category_data("Springfield", start, end, freq="h")
with pytest.raises(TypeError) as exc:
- pht.category_gaps(df["category"], start)
+ pht.category_gaps(cat_df["category"], pd.Timestamp(year=1999, month=1, day=1))
assert str(pd.Timedelta) in str(exc.value)
-def test_category_gaps_warning():
+def test_category_gaps_warning(cat_df):
"""Series with more categories than max_cat raises warning and returns None."""
- start = pd.Timestamp(year=1999, month=1, day=1)
- end = start + pd.Timedelta(hours=1)
- c1 = make_category_data("Springfield", start, end, freq="h")
- c2 = make_category_data("Park South", start, end, freq="2h")
- df = pd.concat([c1, c2])
with pytest.warns(UserWarning):
assert (
- pht.category_gaps(df["category"], pd.Timedelta(hours=1), max_cat=1) is None
+ pht.category_gaps(cat_df["category"], pd.Timedelta(hours=1), max_cat=5)
+ is None
)
From ebcae875c7a55e5f374320cdae066ebebb903449 Mon Sep 17 00:00:00 2001
From: ray310 <64942339+ray310@users.noreply.github.com>
Date: Sun, 21 Jul 2024 21:16:07 -0500
Subject: [PATCH 12/13] Minor change to profile format.
---
src/pandahelper/profiles.py | 4 ++--
tests/test_data/test_df_time_profile.txt | 8 ++++----
tests/test_data/test_series_time_profile.txt | 8 ++++----
3 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/src/pandahelper/profiles.py b/src/pandahelper/profiles.py
index e5bbd11..6a5b1d0 100644
--- a/src/pandahelper/profiles.py
+++ b/src/pandahelper/profiles.py
@@ -105,7 +105,7 @@ def __create_tables(self, table_fmt: str):
if self.time_diffs is not None:
time_diffs_table = tabulate(
phs.frequency_table(self.time_diffs),
- headers=["Time Gaps (Diffs)", "Count", "% of total"],
+ headers=["Time Diffs", "Count", "% of total"],
tablefmt=table_fmt,
)
tables.append(time_diffs_table)
@@ -243,7 +243,7 @@ def __create_tables(self, table_fmt: str) -> list[str]:
if self.time_diffs is not None:
time_diffs_table = tabulate(
phs.frequency_table(self.time_diffs),
- headers=["Time Gaps (Diffs)", "Count", "% of total"],
+ headers=["Time Diffs", "Count", "% of total"],
tablefmt=table_fmt,
)
tables.append(time_diffs_table)
diff --git a/tests/test_data/test_df_time_profile.txt b/tests/test_data/test_df_time_profile.txt
index 2978417..f4a1784 100644
--- a/tests/test_data/test_df_time_profile.txt
+++ b/tests/test_data/test_df_time_profile.txt
@@ -30,7 +30,7 @@ median 0
median absolute deviation 0
skew 0
-Time Gaps (Diffs) Count % of total
-------------------- ------- ------------
-0 days 00:00:00 11074 55.84%
-0 days 01:00:00 8759 44.16%
+Time Diffs Count % of total
+--------------- ------- ------------
+0 days 00:00:00 11074 55.84%
+0 days 01:00:00 8759 44.16%
diff --git a/tests/test_data/test_series_time_profile.txt b/tests/test_data/test_series_time_profile.txt
index 977b7fc..413b170 100644
--- a/tests/test_data/test_series_time_profile.txt
+++ b/tests/test_data/test_series_time_profile.txt
@@ -14,7 +14,7 @@ East Midtown 2190 11.04%
San Diego 52 0.26%
South Philadelphia 12 0.06%
-Time Gaps (Diffs) Count % of total
-------------------- ------- ------------
-0 days 00:00:00 11074 55.84%
-0 days 01:00:00 8759 44.16%
+Time Diffs Count % of total
+--------------- ------- ------------
+0 days 00:00:00 11074 55.84%
+0 days 01:00:00 8759 44.16%
From 2e8fe740dc8b9dc4602c15a23d841291c4f995ca Mon Sep 17 00:00:00 2001
From: ray310 <64942339+ray310@users.noreply.github.com>
Date: Sun, 21 Jul 2024 21:48:45 -0500
Subject: [PATCH 13/13] Updating documentation.
-Add tutorial to project site.
-Fix README
---
README.md | 300 ++++++++++++++++++++++++++++++++++++++++++++-
docs/index.md | 4 +-
docs/tutorial.md | 297 ++++++++++++++++++++++++++++++++++++++++++++
docs/user_guide.md | 5 -
mkdocs.yml | 10 +-
5 files changed, 607 insertions(+), 9 deletions(-)
create mode 100644 docs/tutorial.md
delete mode 100644 docs/user_guide.md
diff --git a/README.md b/README.md
index 127ccf3..0406284 100644
--- a/README.md
+++ b/README.md
@@ -11,4 +11,302 @@ Assess data quality and usefulness with minimal effort.
Quickly perform initial data exploration, _so you can move on to more in-depth analysis_.
-Please see [project website](https://ray310.github.io/Panda-Helper/).
+Please see the [project website](https://ray310.github.io/Panda-Helper/) for more information.
+
+## Installing Panda-Helper
+Panda-Helper can be installed with: `pip install panda-helper`.
+
+## Using Panda Helper
+For our Panda-Helper tutorial, we are going to use a dataset that counts how many
+ bicycles have passed through bike counting sensors at various locations in New York
+ City over time. We are going to merge the dataset with some additional metadata for
+ the sensors. The datasets can be downloaded from:
+
+- Bicycle Counts: [https://data.cityofnewyork.us/Transportation/Bicycle-Counts/uczf-rk3c/about_data](https://data.cityofnewyork.us/Transportation/Bicycle-Counts/uczf-rk3c/about_data)
+- Metadata: [https://data.cityofnewyork.us/Transportation/Bicycle-Counters/smn3-rzf9/about_data](https://data.cityofnewyork.us/Transportation/Bicycle-Counters/smn3-rzf9/about_data)
+
+### Loading Data
+```Python
+import pandas as pd
+
+metadata = pd.read_csv("data/Bicycle_Counters.csv")
+bike_counts = pd.read_csv(
+ "data/Bicycle_Counts.csv",
+ index_col="date",
+ parse_dates=["date"],
+ date_format="%m/%d/%Y %I:%M:%S %p",
+)
+bike_counts = bike_counts.join(metadata.set_index("id"), on="id", how="left")
+```
+
+### DataFrame Profile
+The `DataFrameProfile` is used to get a quick overview of the contents of a Pandas
+ DataFrame. It is an object that can be later referenced or saved if desired.
+In a single view it provides:
+
+- DataFrame shape.
+- Memory usage.
+- The number of duplicated rows (if any).
+- The datatypes of the individual Series.
+- Statistics on nulls per row to provide a view on data completeness.
+- Time Differences (Diffs or Gaps) if it is a time-indexed DataFrame.
+ - In the below example we see that most observations occur at the same time as
+ another observation or 15 minutes after the previous observation. There are a few
+ gaps where more than 15 minutes has passed since the last observation.
+
+
+```Python
+import pandahelper as ph
+
+ph.DataFrameProfile(bike_counts)
+```
+```
+DataFrame-Level Info
+---------------------- -------------
+DF Shape (5589249, 12)
+Duplicated Rows 0
+Memory Usage (MB) 1,926.950
+
+Series Name Data Type Memory Usage (MB)
+------------- -------------- -------------------
+Index datetime64[ns] 44.714
+countid int64 44.714
+id int64 44.714
+counts int64 44.714
+status int64 44.714
+name object 438.682
+domain object 368.89
+latitude float64 44.714
+longitude float64 44.714
+interval int64 44.714
+timezone object 419.194
+sens int64 44.714
+counter object 297.758
+
+Summary of Nulls Per Row
+-------------------------- ---------
+Number of Columns 12
+min 0
+1% 0
+5% 0
+25% 0
+50% 0
+75% 0
+95% 1
+99% 1
+max 1
+mean 0.240237
+standard deviation 0.427228
+median 0
+median absolute deviation 0
+skew 1.21604
+
+Time Diffs Count % of total
+--------------- ------- ------------
+0 days 00:00:00 5176050 92.61%
+0 days 00:15:00 413183 7.39%
+0 days 01:15:00 12 0.00%
+0 days 02:15:00 1 0.00%
+0 days 00:30:00 1 0.00%
+0 days 06:15:00 1 0.00%
+```
+
+### Series Profile (Numeric)
+The `SeriesProfile` is used to get a quick overview of the contents of a Pandas
+ Series. It is an object that can be later referenced or saved if desired.
+In a single view it provides:
+
+- Series data type (dtype).
+- The number of non-null values.
+- The number of unique values.
+- The number of null values.
+- The counts of some of the most common and least common values in the series, which
+ can be configured with the optional `freq_most_least` parameter.
+- Distribution statistics for the Series based on the data type.
+
+_Counts are the number of bike crossings at a bike sensor in a window of time_
+```Python
+ph.SeriesProfile(bike_counts["counts"])
+```
+
+```
+counts Info
+------------- -------
+Data Type int64
+Count 5589249
+Unique Values 897
+Null Values 0
+
+ Value Count % of total
+------- ------- ------------
+ 0 860809 15.40%
+ 1 373805 6.69%
+ 2 279622 5.00%
+ 3 217329 3.89%
+ 4 177636 3.18%
+ 5 150857 2.70%
+ 6 131232 2.35%
+ 7 117491 2.10%
+ 8 106717 1.91%
+ 9 98373 1.76%
+ 824 1 0.00%
+ 1092 1 0.00%
+ 925 1 0.00%
+ 894 1 0.00%
+ 1081 1 0.00%
+
+Statistic Value
+------------------------- --------------
+count 5.58925e+06
+min 0
+1% 0
+5% 0
+25% 2
+50% 13
+75% 37
+95% 93
+99% 164
+max 1133
+mean 26.4127
+standard deviation 39.3405
+median 13
+median absolute deviation 13
+skew 5.17677
+```
+
+### Series Profile (Object)
+A `SeriesProfile` for an `object` Series will provide similar information as a numeric
+ Series but without distribution statistics. Here we use the optional `freq_most_least`
+ parameter to show a longer frequency table.
+
+_Name is the designation of the bike sensor station_
+```Python
+ph.SeriesProfile(bike_counts["name"], freq_most_least=(20, 20))
+```
+```
+name Info
+------------- -------
+Data Type object
+Count 5589249
+Unique Values 34
+Null Values 0
+
+Value Count % of total
+----------------------------------------------------------- ------- ------------
+Manhattan Bridge Bike Comprehensive 381148 6.82%
+Manhattan Bridge Display Bike Counter 381148 6.82%
+Manhattan Bridge Ped Path 368665 6.60%
+Ed Koch Queensboro Bridge Shared Path 368504 6.59%
+Williamsburg Bridge Bike Path 368433 6.59%
+Brooklyn Bridge Bike Path 366111 6.55%
+Comprehensive Brooklyn Bridge Counter 365948 6.55%
+Staten Island Ferry 287203 5.14%
+Prospect Park West 266080 4.76%
+Kent Ave btw North 8th St and North 9th St 264522 4.73%
+Pulaski Bridge 243868 4.36%
+1st Avenue - 26th St N - Interference testing 218169 3.90%
+Manhattan Bridge 2012 to 2019 Bike Counter 202785 3.63%
+8th Ave at 50th St. 195920 3.51%
+Manhattan Bridge 2013 to 2018 Bike Counter 165505 2.96%
+Columbus Ave at 86th St. 162481 2.91%
+Amsterdam Ave at 86th St. 162369 2.91%
+2nd Avenue - 26th St S 136388 2.44%
+Brooklyn Bridge Bicycle Path (Roadway) 95955 1.72%
+Kent Ave btw South 6th St. and Broadway 78478 1.40%
+111th St at 50th Ave 72567 1.30%
+Fountain Ave 63146 1.13%
+Willis Ave 62148 1.11%
+Willis Ave Bikes 62148 1.11%
+Willis Ave Peds 62148 1.11%
+Manhattan Bridge 2012 Test Bike Counter 36179 0.65%
+Manhattan Bridge Interference Calibration 2019 Bike Counter 27675 0.50%
+Ocean Pkwy at Avenue J 27260 0.49%
+Pelham Pkwy 21452 0.38%
+Broadway at 50th St 20544 0.37%
+High Bridge 16276 0.29%
+Emmons Ave 16267 0.29%
+Forsyth Plaza 14998 0.27%
+Concrete Plant Park 6761 0.12%
+```
+
+### Time Series Functionality
+#### Calculate the cumulative gaps in time series data by category
+In the above example we saw a notable difference in the number of observations per
+ bike counter station. We can use `category_gaps` to check for gaps in
+ time-indexed, categorical-like data. We use the `threshold` parameter to define the
+ maximum expected increment in the time-indexed data. Some of the bike stations report
+ data every 15 minutes and some report data every hour so we can use a threshold of one
+ hour.
+
+```Python
+ph.category_gaps(bike_counts["name"], threshold=pd.Timedelta(hours=1))
+```
+```
+ Cumulative Gap
+Concrete Plant Park 4234 days 13:45:00
+Forsyth Plaza 4148 days 16:15:00
+Emmons Ave 4135 days 12:30:00
+High Bridge 4135 days 10:15:00
+Broadway at 50th St 4090 days 10:30:00
+Pelham Pkwy 4081 days 12:15:00
+Ocean Pkwy at Avenue J 4021 days 00:15:00
+Manhattan Bridge Interference Calibration 2019 ... 4016 days 15:00:00
+Manhattan Bridge 2012 Test Bike Counter 3928 days 01:30:00
+Willis Ave Peds 3657 days 12:45:00
+Willis Ave Bikes 3657 days 12:45:00
+Willis Ave 3657 days 12:45:00
+Fountain Ave 3647 days 01:45:00
+111th St at 50th Ave 3548 days 21:45:00
+Kent Ave btw South 6th St. and Broadway 3487 days 06:30:00
+Brooklyn Bridge Bicycle Path (Roadway) 3305 days 06:45:00
+2nd Avenue - 26th St S 2884 days 02:30:00
+Amsterdam Ave at 86th St. 2613 days 09:30:00
+Columbus Ave at 86th St. 2612 days 06:00:00
+Manhattan Bridge 2013 to 2018 Bike Counter 2580 days 19:15:00
+8th Ave at 50th St. 2263 days 19:00:00
+Manhattan Bridge 2012 to 2019 Bike Counter 2192 days 07:30:00
+1st Avenue - 26th St N - Interference testing 2032 days 00:00:00
+Pulaski Bridge 1764 days 08:45:00
+Kent Ave btw North 8th St and North 9th St 1549 days 04:30:00
+Prospect Park West 1533 days 00:30:00
+Staten Island Ferry 1312 days 22:15:00
+Comprehensive Brooklyn Bridge Counter 492 days 13:45:00
+Brooklyn Bridge Bike Path 490 days 21:45:00
+Williamsburg Bridge Bike Path 466 days 15:00:00
+Ed Koch Queensboro Bridge Shared Path 465 days 22:45:00
+Manhattan Bridge Ped Path 464 days 07:15:00
+Manhattan Bridge Bike Comprehensive 333 days 14:45:00
+Manhattan Bridge Display Bike Counter 333 days 14:45:00
+```
+#### Identify when gaps occur in time series data
+It looks like the 'Manhattan Bridge Bike Comprehensive' category has the smallest
+ amount of missing time. We can use `id_gaps_index` to identify when the gaps occur.
+ We see that the largest gap for this bike sensor is ~328 days long in 2013.
+
+```Python
+mbc = bike_counts["name"][bike_counts["name"] == "Manhattan Bridge Bike Comprehensive"]
+ph.id_gaps_index(mbc, threshold=pd.Timedelta(hours=1))
+```
+```
+ diffs
+date
+2013-12-03 00:00:00 328 days 00:15:00
+2023-09-27 02:15:00 2 days 02:30:00
+2024-01-21 02:15:00 1 days 02:30:00
+2023-07-03 02:15:00 1 days 02:30:00
+2023-07-01 02:15:00 1 days 02:30:00
+2013-12-03 11:00:00 0 days 06:15:00
+2012-10-12 15:00:00 0 days 02:15:00
+2021-03-14 03:00:00 0 days 01:15:00
+2023-03-12 03:00:00 0 days 01:15:00
+2022-03-13 03:00:00 0 days 01:15:00
+2019-03-10 03:00:00 0 days 01:15:00
+2020-03-08 03:00:00 0 days 01:15:00
+2018-03-11 03:00:00 0 days 01:15:00
+2017-03-12 03:00:00 0 days 01:15:00
+2016-03-13 03:00:00 0 days 01:15:00
+2015-03-08 03:00:00 0 days 01:15:00
+2014-11-04 05:00:00 0 days 01:15:00
+2014-03-09 03:00:00 0 days 01:15:00
+2024-03-10 03:00:00 0 days 01:15:00
+```
diff --git a/docs/index.md b/docs/index.md
index 9ef480f..603c223 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -26,11 +26,11 @@ that allows you to assess data quality and usefulness with minimal effort.
Detailed description of the Panda-Helper API
-- [:material-television-guide:{ .lg .middle } __User Guide__](user_guide.md)
+- [:material-television-guide:{ .lg .middle } __Tutorial__](tutorial.md)
---
- How to use Panda-Helper with examples
+ Panda-Helper Tutorial
- [:simple-github:{ .lg .middle } __Source Code__](https://github.com/ray310/Panda-Helper)
diff --git a/docs/tutorial.md b/docs/tutorial.md
new file mode 100644
index 0000000..4fe02df
--- /dev/null
+++ b/docs/tutorial.md
@@ -0,0 +1,297 @@
+---
+description: Panda Helper Tutorial
+---
+# Panda Helper Tutorial
+For our Panda-Helper tutorial, we are going to use a dataset that counts how many
+ bicycles have passed through bike counting sensors at various locations in New York
+ City over time. We are going to merge the dataset with some additional metadata for
+ the sensors. The datasets can be downloaded from:
+
+- Bicycle Counts: [https://data.cityofnewyork.us/Transportation/Bicycle-Counts/uczf-rk3c/about_data](https://data.cityofnewyork.us/Transportation/Bicycle-Counts/uczf-rk3c/about_data)
+- Metadata: [https://data.cityofnewyork.us/Transportation/Bicycle-Counters/smn3-rzf9/about_data](https://data.cityofnewyork.us/Transportation/Bicycle-Counters/smn3-rzf9/about_data)
+
+## Loading Data
+```Python
+import pandas as pd
+
+metadata = pd.read_csv("data/Bicycle_Counters.csv")
+bike_counts = pd.read_csv(
+ "data/Bicycle_Counts.csv",
+ index_col="date",
+ parse_dates=["date"],
+ date_format="%m/%d/%Y %I:%M:%S %p",
+)
+bike_counts = bike_counts.join(metadata.set_index("id"), on="id", how="left")
+```
+
+## DataFrame Profile
+The `DataFrameProfile` is used to get a quick overview of the contents of a Pandas
+ DataFrame. It is an object that can be later referenced or saved if desired.
+In a single view it provides:
+
+- DataFrame shape.
+- Memory usage.
+- The number of duplicated rows (if any).
+- The datatypes of the individual Series.
+- Statistics on nulls per row to provide a view on data completeness.
+- Time Differences (Diffs or Gaps) if it is a time-indexed DataFrame.
+ - In the below example we see that most observations occur at the same time as
+ another observation or 15 minutes after the previous observation. There are a few
+ gaps where more than 15 minutes has passed since the last observation.
+
+
+```Python
+import pandahelper as ph
+
+ph.DataFrameProfile(bike_counts)
+```
+```
+DataFrame-Level Info
+---------------------- -------------
+DF Shape (5589249, 12)
+Duplicated Rows 0
+Memory Usage (MB) 1,926.950
+
+Series Name Data Type Memory Usage (MB)
+------------- -------------- -------------------
+Index datetime64[ns] 44.714
+countid int64 44.714
+id int64 44.714
+counts int64 44.714
+status int64 44.714
+name object 438.682
+domain object 368.89
+latitude float64 44.714
+longitude float64 44.714
+interval int64 44.714
+timezone object 419.194
+sens int64 44.714
+counter object 297.758
+
+Summary of Nulls Per Row
+-------------------------- ---------
+Number of Columns 12
+min 0
+1% 0
+5% 0
+25% 0
+50% 0
+75% 0
+95% 1
+99% 1
+max 1
+mean 0.240237
+standard deviation 0.427228
+median 0
+median absolute deviation 0
+skew 1.21604
+
+Time Diffs Count % of total
+--------------- ------- ------------
+0 days 00:00:00 5176050 92.61%
+0 days 00:15:00 413183 7.39%
+0 days 01:15:00 12 0.00%
+0 days 02:15:00 1 0.00%
+0 days 00:30:00 1 0.00%
+0 days 06:15:00 1 0.00%
+```
+
+## Series Profile (Numeric)
+The `SeriesProfile` is used to get a quick overview of the contents of a Pandas
+ Series. It is an object that can be later referenced or saved if desired.
+In a single view it provides:
+
+- Series data type (dtype).
+- The number of non-null values.
+- The number of unique values.
+- The number of null values.
+- The counts of some of the most common and least common values in the series, which
+ can be configured with the optional `freq_most_least` parameter.
+- Distribution statistics for the Series based on the data type.
+
+_Counts are the number of bike crossings at a bike sensor in a window of time._
+```Python
+ph.SeriesProfile(bike_counts["counts"])
+```
+
+```
+counts Info
+------------- -------
+Data Type int64
+Count 5589249
+Unique Values 897
+Null Values 0
+
+ Value Count % of total
+------- ------- ------------
+ 0 860809 15.40%
+ 1 373805 6.69%
+ 2 279622 5.00%
+ 3 217329 3.89%
+ 4 177636 3.18%
+ 5 150857 2.70%
+ 6 131232 2.35%
+ 7 117491 2.10%
+ 8 106717 1.91%
+ 9 98373 1.76%
+ 824 1 0.00%
+ 1092 1 0.00%
+ 925 1 0.00%
+ 894 1 0.00%
+ 1081 1 0.00%
+
+Statistic Value
+------------------------- --------------
+count 5.58925e+06
+min 0
+1% 0
+5% 0
+25% 2
+50% 13
+75% 37
+95% 93
+99% 164
+max 1133
+mean 26.4127
+standard deviation 39.3405
+median 13
+median absolute deviation 13
+skew 5.17677
+```
+
+## Series Profile (Object)
+A `SeriesProfile` for an `object` Series will provide similar information as a numeric
+ Series but without distribution statistics. Here we use the optional `freq_most_least`
+ parameter to show a longer frequency table.
+
+_Name is the designation of the bike sensor station._
+```Python
+ph.SeriesProfile(bike_counts["name"], freq_most_least=(20, 20))
+```
+```
+name Info
+------------- -------
+Data Type object
+Count 5589249
+Unique Values 34
+Null Values 0
+
+Value Count % of total
+----------------------------------------------------------- ------- ------------
+Manhattan Bridge Bike Comprehensive 381148 6.82%
+Manhattan Bridge Display Bike Counter 381148 6.82%
+Manhattan Bridge Ped Path 368665 6.60%
+Ed Koch Queensboro Bridge Shared Path 368504 6.59%
+Williamsburg Bridge Bike Path 368433 6.59%
+Brooklyn Bridge Bike Path 366111 6.55%
+Comprehensive Brooklyn Bridge Counter 365948 6.55%
+Staten Island Ferry 287203 5.14%
+Prospect Park West 266080 4.76%
+Kent Ave btw North 8th St and North 9th St 264522 4.73%
+Pulaski Bridge 243868 4.36%
+1st Avenue - 26th St N - Interference testing 218169 3.90%
+Manhattan Bridge 2012 to 2019 Bike Counter 202785 3.63%
+8th Ave at 50th St. 195920 3.51%
+Manhattan Bridge 2013 to 2018 Bike Counter 165505 2.96%
+Columbus Ave at 86th St. 162481 2.91%
+Amsterdam Ave at 86th St. 162369 2.91%
+2nd Avenue - 26th St S 136388 2.44%
+Brooklyn Bridge Bicycle Path (Roadway) 95955 1.72%
+Kent Ave btw South 6th St. and Broadway 78478 1.40%
+111th St at 50th Ave 72567 1.30%
+Fountain Ave 63146 1.13%
+Willis Ave 62148 1.11%
+Willis Ave Bikes 62148 1.11%
+Willis Ave Peds 62148 1.11%
+Manhattan Bridge 2012 Test Bike Counter 36179 0.65%
+Manhattan Bridge Interference Calibration 2019 Bike Counter 27675 0.50%
+Ocean Pkwy at Avenue J 27260 0.49%
+Pelham Pkwy 21452 0.38%
+Broadway at 50th St 20544 0.37%
+High Bridge 16276 0.29%
+Emmons Ave 16267 0.29%
+Forsyth Plaza 14998 0.27%
+Concrete Plant Park 6761 0.12%
+```
+
+## Time Series Functionality
+### Calculate the cumulative gaps in time series data by category
+In the above example we saw a notable difference in the number of observations per
+ bike counter station. We can use `category_gaps` to check for gaps in
+ time-indexed, categorical-like data. We use the `threshold` parameter to define the
+ maximum expected increment in the time-indexed data. Some of the bike stations report
+ data every 15 minutes and some report data every hour so we can use a threshold of one
+ hour.
+
+```Python
+ph.category_gaps(bike_counts["name"], threshold=pd.Timedelta(hours=1))
+```
+```
+ Cumulative Gap
+Concrete Plant Park 4234 days 13:45:00
+Forsyth Plaza 4148 days 16:15:00
+Emmons Ave 4135 days 12:30:00
+High Bridge 4135 days 10:15:00
+Broadway at 50th St 4090 days 10:30:00
+Pelham Pkwy 4081 days 12:15:00
+Ocean Pkwy at Avenue J 4021 days 00:15:00
+Manhattan Bridge Interference Calibration 2019 ... 4016 days 15:00:00
+Manhattan Bridge 2012 Test Bike Counter 3928 days 01:30:00
+Willis Ave Peds 3657 days 12:45:00
+Willis Ave Bikes 3657 days 12:45:00
+Willis Ave 3657 days 12:45:00
+Fountain Ave 3647 days 01:45:00
+111th St at 50th Ave 3548 days 21:45:00
+Kent Ave btw South 6th St. and Broadway 3487 days 06:30:00
+Brooklyn Bridge Bicycle Path (Roadway) 3305 days 06:45:00
+2nd Avenue - 26th St S 2884 days 02:30:00
+Amsterdam Ave at 86th St. 2613 days 09:30:00
+Columbus Ave at 86th St. 2612 days 06:00:00
+Manhattan Bridge 2013 to 2018 Bike Counter 2580 days 19:15:00
+8th Ave at 50th St. 2263 days 19:00:00
+Manhattan Bridge 2012 to 2019 Bike Counter 2192 days 07:30:00
+1st Avenue - 26th St N - Interference testing 2032 days 00:00:00
+Pulaski Bridge 1764 days 08:45:00
+Kent Ave btw North 8th St and North 9th St 1549 days 04:30:00
+Prospect Park West 1533 days 00:30:00
+Staten Island Ferry 1312 days 22:15:00
+Comprehensive Brooklyn Bridge Counter 492 days 13:45:00
+Brooklyn Bridge Bike Path 490 days 21:45:00
+Williamsburg Bridge Bike Path 466 days 15:00:00
+Ed Koch Queensboro Bridge Shared Path 465 days 22:45:00
+Manhattan Bridge Ped Path 464 days 07:15:00
+Manhattan Bridge Bike Comprehensive 333 days 14:45:00
+Manhattan Bridge Display Bike Counter 333 days 14:45:00
+```
+### Identify when gaps occur in time series data
+It looks like the 'Manhattan Bridge Bike Comprehensive' category has the smallest
+ amount of missing time. We can use `id_gaps_index` to identify when the gaps occur.
+ We see that the largest gap for this bike sensor is ~328 days long in 2013.
+
+```Python
+mbc = bike_counts["name"][bike_counts["name"] == "Manhattan Bridge Bike Comprehensive"]
+ph.id_gaps_index(mbc, threshold=pd.Timedelta(hours=1))
+```
+```
+ diffs
+date
+2013-12-03 00:00:00 328 days 00:15:00
+2023-09-27 02:15:00 2 days 02:30:00
+2024-01-21 02:15:00 1 days 02:30:00
+2023-07-03 02:15:00 1 days 02:30:00
+2023-07-01 02:15:00 1 days 02:30:00
+2013-12-03 11:00:00 0 days 06:15:00
+2012-10-12 15:00:00 0 days 02:15:00
+2021-03-14 03:00:00 0 days 01:15:00
+2023-03-12 03:00:00 0 days 01:15:00
+2022-03-13 03:00:00 0 days 01:15:00
+2019-03-10 03:00:00 0 days 01:15:00
+2020-03-08 03:00:00 0 days 01:15:00
+2018-03-11 03:00:00 0 days 01:15:00
+2017-03-12 03:00:00 0 days 01:15:00
+2016-03-13 03:00:00 0 days 01:15:00
+2015-03-08 03:00:00 0 days 01:15:00
+2014-11-04 05:00:00 0 days 01:15:00
+2014-03-09 03:00:00 0 days 01:15:00
+2024-03-10 03:00:00 0 days 01:15:00
+```
diff --git a/docs/user_guide.md b/docs/user_guide.md
deleted file mode 100644
index 3ea7ebd..0000000
--- a/docs/user_guide.md
+++ /dev/null
@@ -1,5 +0,0 @@
----
-description: User Guide. How to use Panda-Helper with examples.
----
-
-Coming soon...
diff --git a/mkdocs.yml b/mkdocs.yml
index df63eb4..4971039 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -9,7 +9,7 @@ copyright: >
nav:
- Home: index.md
- Installation: install.md
- - User Guide: user_guide.md
+ - Tutorial: tutorial.md
- API Reference: api.md
- Issue Tracker: https://github.com/ray310/Panda-Helper/issues
extra_css:
@@ -31,6 +31,7 @@ theme:
- navigation.instant.progress
- toc.integrate
- navigation.footer
+ - content.code.copy
palette:
# Palette toggle for light mode
- media: "(prefers-color-scheme: light)"
@@ -93,6 +94,13 @@ markdown_extensions:
- pymdownx.emoji:
emoji_index: !!python/name:material.extensions.emoji.twemoji
emoji_generator: !!python/name:material.extensions.emoji.to_svg
+ - pymdownx.highlight:
+ anchor_linenums: true
+ line_spans: __span
+ pygments_lang_class: true
+ - pymdownx.inlinehilite
+ - pymdownx.snippets
+ - pymdownx.superfences
plugins:
- search
- mkdocstrings: