From 49e019dec569070d57f5cac84056af4125897e94 Mon Sep 17 00:00:00 2001 From: ray310 <64942339+ray310@users.noreply.github.com> Date: Sun, 14 Jul 2024 12:38:15 -0500 Subject: [PATCH 01/13] Starting new version. --- pyproject.toml | 4 ++-- src/pandahelper/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f403170..e6d4f68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "panda-helper" -version = "0.1.0" +version = "0.1.1" dependencies = [ "beautifulsoup4>=4.12.2", "numpy>=1.26.0", @@ -44,7 +44,7 @@ test = ["pytest>=7.4", "pylint>=3.0"] extend-include = ["*.ipynb"] [tool.ruff.lint] -select = ["D", "F", "B"] # pydocstyle, pyflakes, flake8-bugbear, isort +select = ["D", "F", "B"] # pydocstyle, pyflakes, flake8-bugbear [tool.ruff.lint.pydocstyle] convention = "google" diff --git a/src/pandahelper/__init__.py b/src/pandahelper/__init__.py index 8b3e9b1..c0600a2 100644 --- a/src/pandahelper/__init__.py +++ b/src/pandahelper/__init__.py @@ -5,5 +5,5 @@ from pandahelper.profiles import DataFrameProfile, SeriesProfile from pandahelper.stats import distribution_stats, frequency_table -__version__ = "0.1.0" +__version__ = "0.1.1" __all__ = ["frequency_table", "distribution_stats", "DataFrameProfile", "SeriesProfile"] From dbb18aaf38bc48163cafc7bd01b206dfd453a1b5 Mon Sep 17 00:00:00 2001 From: ray310 <64942339+ray310@users.noreply.github.com> Date: Sun, 14 Jul 2024 14:43:56 -0500 Subject: [PATCH 02/13] Change 'count' to 'number of columns' in nulls per row DataFrameProfile table. 
--- .gitignore | 6 ++-- src/pandahelper/profiles.py | 9 +++++- tests/test_data/test_df_profile_name.txt | 32 +++++++++---------- tests/test_data/test_df_profile_name_311.txt | 2 +- tests/test_data/test_df_profile_no_name.txt | 32 +++++++++---------- .../test_data/test_df_profile_no_name_311.txt | 2 +- 6 files changed, 45 insertions(+), 38 deletions(-) diff --git a/.gitignore b/.gitignore index b950a07..e73bcbb 100644 --- a/.gitignore +++ b/.gitignore @@ -9,14 +9,14 @@ conda_environment_dev_* # folders -.coverage .idea -data -notes dist htmlcov +notebooks +notes site __pycache__ # files +.coverage .DS_Store diff --git a/src/pandahelper/profiles.py b/src/pandahelper/profiles.py index 1940cd1..40337dc 100644 --- a/src/pandahelper/profiles.py +++ b/src/pandahelper/profiles.py @@ -43,9 +43,16 @@ def __init__(self, df: pd.DataFrame, *, name: str = "", fmt: str = "simple"): self.memory_usage = df.memory_usage(index=True, deep=True) / 1000000 # MB self.num_duplicates = sum(df.duplicated(keep="first")) self.nulls_per_row = df.isna().sum(axis=1) - self.null_stats = phs.dist_stats_dict(self.nulls_per_row) + self.null_stats = self.__null_stats() self._format = fmt + def __null_stats(self, delete_key="count"): + """Prepare distribution statistics for the number of nulls per row.""" + stats = phs.dist_stats_dict(self.nulls_per_row) + new_stats = {"Number of Columns": self.shape[1]} + del stats[delete_key] + return new_stats | stats + def __create_tables(self, table_fmt: str): """Create DataFrameProfile summary tables. 
diff --git a/tests/test_data/test_df_profile_name.txt b/tests/test_data/test_df_profile_name.txt index b71f123..d643fe4 100644 --- a/tests/test_data/test_df_profile_name.txt +++ b/tests/test_data/test_df_profile_name.txt @@ -40,19 +40,19 @@ VEHICLE TYPE CODE 5 object 0.006452 FLAG bool 0.0002 Summary of Nulls Per Row --------------------------- --------- -count 200 -min 3 -1% 3.99 -5% 6 -25% 7 -50% 8 -75% 10 -95% 12 -99% 14.01 -max 15 -mean 8.71 -standard deviation 2.04863 -median 8 -median absolute deviation 1 -skew 0.36218 +-------------------------- -------- +Number of Columns 30 +min 3 +1% 3.99 +5% 6 +25% 7 +50% 8 +75% 10 +95% 12 +99% 14.01 +max 15 +mean 8.71 +standard deviation 2.04863 +median 8 +median absolute deviation 1 +skew 0.36218 diff --git a/tests/test_data/test_df_profile_name_311.txt b/tests/test_data/test_df_profile_name_311.txt index 917e60e..16c9c05 100644 --- a/tests/test_data/test_df_profile_name_311.txt +++ b/tests/test_data/test_df_profile_name_311.txt @@ -41,7 +41,7 @@ FLAG bool 0.0002 Summary of Nulls Per Row -------------------------- --------- -count 200 +Number of Columns 30 min 3 1% 3.99 5% 6 diff --git a/tests/test_data/test_df_profile_no_name.txt b/tests/test_data/test_df_profile_no_name.txt index b15610c..0f160d4 100644 --- a/tests/test_data/test_df_profile_no_name.txt +++ b/tests/test_data/test_df_profile_no_name.txt @@ -39,19 +39,19 @@ VEHICLE TYPE CODE 5 object 0.006452 FLAG bool 0.0002 Summary of Nulls Per Row --------------------------- --------- -count 200 -min 3 -1% 3.99 -5% 6 -25% 7 -50% 8 -75% 10 -95% 12 -99% 14.01 -max 15 -mean 8.71 -standard deviation 2.04863 -median 8 -median absolute deviation 1 -skew 0.36218 +-------------------------- -------- +Number of Columns 30 +min 3 +1% 3.99 +5% 6 +25% 7 +50% 8 +75% 10 +95% 12 +99% 14.01 +max 15 +mean 8.71 +standard deviation 2.04863 +median 8 +median absolute deviation 1 +skew 0.36218 diff --git a/tests/test_data/test_df_profile_no_name_311.txt 
b/tests/test_data/test_df_profile_no_name_311.txt index 601a43d..150dca7 100644 --- a/tests/test_data/test_df_profile_no_name_311.txt +++ b/tests/test_data/test_df_profile_no_name_311.txt @@ -40,7 +40,7 @@ FLAG bool 0.0002 Summary of Nulls Per Row -------------------------- --------- -count 200 +Number of Columns 30 min 3 1% 3.99 5% 6 From 9b1121e6dc8b32390bfbe4f9750af4319d50412a Mon Sep 17 00:00:00 2001 From: ray310 <64942339+ray310@users.noreply.github.com> Date: Sun, 14 Jul 2024 15:07:32 -0500 Subject: [PATCH 03/13] Remove flaky test. --- tests/test_data/test_df_profile_name_311.txt | 58 ------------------- .../test_data/test_df_profile_no_name_311.txt | 57 ------------------ tests/test_profiles.py | 19 ------ 3 files changed, 134 deletions(-) delete mode 100644 tests/test_data/test_df_profile_name_311.txt delete mode 100644 tests/test_data/test_df_profile_no_name_311.txt diff --git a/tests/test_data/test_df_profile_name_311.txt b/tests/test_data/test_df_profile_name_311.txt deleted file mode 100644 index 16c9c05..0000000 --- a/tests/test_data/test_df_profile_name_311.txt +++ /dev/null @@ -1,58 +0,0 @@ -DataFrame-Level Info ----------------------- --------- -DF Name test_name -DF Shape (200, 30) -Duplicated Rows 0 -Memory Usage (MB) 0.200 - -Series Name Data Type Memory Usage (MB) ------------------------------ ----------- ------------------- -Index int64 0.000132 -CRASH DATE object 0.012785 -CRASH TIME object 0.01235 -BOROUGH object 0.010519 -ZIP CODE float64 0.0016 -LATITUDE float64 0.0016 -LONGITUDE float64 0.0016 -LOCATION object 0.014763 -ON STREET NAME object 0.015064 -CROSS STREET NAME object 0.01001 -OFF STREET NAME object 0.00952 -NUMBER OF PERSONS INJURED int64 0.0016 -NUMBER OF PERSONS KILLED int64 0.0016 -NUMBER OF PEDESTRIANS INJURED int64 0.0016 -NUMBER OF PEDESTRIANS KILLED int64 0.0016 -NUMBER OF CYCLIST INJURED int64 0.0016 -NUMBER OF CYCLIST KILLED int64 0.0016 -NUMBER OF MOTORIST INJURED int64 0.0016 -NUMBER OF MOTORIST KILLED int64 0.0016 
-CONTRIBUTING FACTOR VEHICLE 1 object 0.015643 -CONTRIBUTING FACTOR VEHICLE 2 object 0.012727 -CONTRIBUTING FACTOR VEHICLE 3 object 0.007012 -CONTRIBUTING FACTOR VEHICLE 4 object 0.006652 -CONTRIBUTING FACTOR VEHICLE 5 object 0.006436 -COLLISION_ID int64 0.0016 -VEHICLE TYPE CODE 1 object 0.014306 -VEHICLE TYPE CODE 2 object 0.012294 -VEHICLE TYPE CODE 3 object 0.00715 -VEHICLE TYPE CODE 4 object 0.00673 -VEHICLE TYPE CODE 5 object 0.00646 -FLAG bool 0.0002 - -Summary of Nulls Per Row --------------------------- --------- -Number of Columns 30 -min 3 -1% 3.99 -5% 6 -25% 7 -50% 8 -75% 10 -95% 12 -99% 14.01 -max 15 -mean 8.71 -standard deviation 2.04863 -median 8 -median absolute deviation 1 -skew 0.36218 diff --git a/tests/test_data/test_df_profile_no_name_311.txt b/tests/test_data/test_df_profile_no_name_311.txt deleted file mode 100644 index 150dca7..0000000 --- a/tests/test_data/test_df_profile_no_name_311.txt +++ /dev/null @@ -1,57 +0,0 @@ -DataFrame-Level Info ----------------------- --------- -DF Shape (200, 30) -Duplicated Rows 0 -Memory Usage (MB) 0.200 - -Series Name Data Type Memory Usage (MB) ------------------------------ ----------- ------------------- -Index int64 0.000132 -CRASH DATE object 0.012785 -CRASH TIME object 0.01235 -BOROUGH object 0.010519 -ZIP CODE float64 0.0016 -LATITUDE float64 0.0016 -LONGITUDE float64 0.0016 -LOCATION object 0.014763 -ON STREET NAME object 0.015064 -CROSS STREET NAME object 0.01001 -OFF STREET NAME object 0.00952 -NUMBER OF PERSONS INJURED int64 0.0016 -NUMBER OF PERSONS KILLED int64 0.0016 -NUMBER OF PEDESTRIANS INJURED int64 0.0016 -NUMBER OF PEDESTRIANS KILLED int64 0.0016 -NUMBER OF CYCLIST INJURED int64 0.0016 -NUMBER OF CYCLIST KILLED int64 0.0016 -NUMBER OF MOTORIST INJURED int64 0.0016 -NUMBER OF MOTORIST KILLED int64 0.0016 -CONTRIBUTING FACTOR VEHICLE 1 object 0.015643 -CONTRIBUTING FACTOR VEHICLE 2 object 0.012727 -CONTRIBUTING FACTOR VEHICLE 3 object 0.007012 -CONTRIBUTING FACTOR VEHICLE 4 object 0.006652 
-CONTRIBUTING FACTOR VEHICLE 5 object 0.006436 -COLLISION_ID int64 0.0016 -VEHICLE TYPE CODE 1 object 0.014306 -VEHICLE TYPE CODE 2 object 0.012294 -VEHICLE TYPE CODE 3 object 0.00715 -VEHICLE TYPE CODE 4 object 0.00673 -VEHICLE TYPE CODE 5 object 0.00646 -FLAG bool 0.0002 - -Summary of Nulls Per Row --------------------------- --------- -Number of Columns 30 -min 3 -1% 3.99 -5% 6 -25% 7 -50% 8 -75% 10 -95% 12 -99% 14.01 -max 15 -mean 8.71 -standard deviation 2.04863 -median 8 -median absolute deviation 1 -skew 0.36218 diff --git a/tests/test_profiles.py b/tests/test_profiles.py index cde7cf1..e27ecbd 100644 --- a/tests/test_profiles.py +++ b/tests/test_profiles.py @@ -35,25 +35,6 @@ def test_dataframe_profile_valid_312(test_df): assert filecmp.cmp(compare_file, test_file, shallow=False) -@pytest.mark.skipif( - not ((3, 11) <= sys.version_info < (3, 12)), reason="Runs on Python 3.11" -) -def test_dataframe_profile_valid_311(test_df): - """Generated DataFrame profile should match test profile (Python 3.11).""" - compare_profile_name = "test_df_profile_name_311.txt" - compare_profile_no_name = "test_df_profile_no_name_311.txt" - compare_files = [ - os.path.join(TEST_DATA_DIR, compare_profile_name), - os.path.join(TEST_DATA_DIR, compare_profile_no_name), - ] - names = ["test_name", ""] - with tempfile.TemporaryDirectory() as tmp: - for name, compare_file in zip(names, compare_files): - test_file = os.path.join(tmp, "temp.txt") - php.DataFrameProfile(test_df, name=name).save(test_file) - assert filecmp.cmp(compare_file, test_file, shallow=False) - - def test_dataframe_profile_invalid(non_series_invalid, num_series, cat_like_series): """DataFrame profile should not accept invalid data types.""" invalid_types = [*non_series_invalid, num_series, cat_like_series] From f30f8e7ebb4e38818685c7b058c21c20223464b4 Mon Sep 17 00:00:00 2001 From: ray310 <64942339+ray310@users.noreply.github.com> Date: Mon, 15 Jul 2024 11:14:28 -0500 Subject: [PATCH 04/13] Lower default value for 
SeriesProfile frequency table. Also minor code formatting changes. --- src/pandahelper/profiles.py | 42 ++++++++++++++++++------------------- src/pandahelper/stats.py | 4 ++-- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/pandahelper/profiles.py b/src/pandahelper/profiles.py index 40337dc..a09f1fa 100644 --- a/src/pandahelper/profiles.py +++ b/src/pandahelper/profiles.py @@ -137,7 +137,7 @@ def __init__( series: pd.Series, *, fmt: str = "simple", - freq_most_least: tuple = (20, 5), + freq_most_least: tuple = (10, 5), ): """Initialize SeriesProfile. @@ -233,26 +233,6 @@ def save(self, path): fh.write(str(self)) -def _format_html_table(table: str, align: str = "left", font: str = "monospace") -> str: - """Add additional formatting to HTML table prepared by tabulate.""" - soup = bs4.BeautifulSoup(table, "html.parser") - for row in soup.find_all("tr"): - tags = row.find_all(["th", "td"]) # row in thead will have 'th' - for tag in tags: - tag["style"] = f"font-family: {font}, monospace; text-align: {align};" - return str(soup) - - -def _decimal_align_col(table: str, col: int): - """Create decimal-aligned numbers in column of HTML table.""" - soup = bs4.BeautifulSoup(table, "html.parser") - for row in soup.find_all("tr"): - tags = row.find_all("td") - if tags: - tags[col].string = tags[col].string.replace(" ", "\u2007") # figure space - return str(soup) - - def _abbreviate_df(df, first=20, last=5): """Return a shortened DataFrame or Series. 
@@ -282,3 +262,23 @@ def _abbreviate_df(df, first=20, last=5): else: abbrev = pd.concat([df.iloc[:first], df.iloc[(len(df) - last) : len(df)]]) return abbrev + + +def _format_html_table(table: str, align: str = "left", font: str = "monospace") -> str: + """Add additional formatting to HTML table prepared by tabulate.""" + soup = bs4.BeautifulSoup(table, "html.parser") + for row in soup.find_all("tr"): + tags = row.find_all(["th", "td"]) # row in thead will have 'th' + for tag in tags: + tag["style"] = f"font-family: {font}, monospace; text-align: {align};" + return str(soup) + + +def _decimal_align_col(table: str, col: int): + """Create decimal-aligned numbers in column of HTML table.""" + soup = bs4.BeautifulSoup(table, "html.parser") + for row in soup.find_all("tr"): + tags = row.find_all("td") + if tags: + tags[col].string = tags[col].string.replace(" ", "\u2007") # figure space + return str(soup) diff --git a/src/pandahelper/stats.py b/src/pandahelper/stats.py index 25ae251..9f8ee63 100644 --- a/src/pandahelper/stats.py +++ b/src/pandahelper/stats.py @@ -31,7 +31,7 @@ def frequency_table(series: pd.Series) -> pd.DataFrame: return output.sort_values(by="Count", ascending=False) -def _abbreviate_string(s, limit=60): +def _abbreviate_string(s, limit=60) -> str: """Return first x characters of a string. Args: @@ -157,7 +157,7 @@ def _add_quantiles(series: pd.Series, d: dict): d["99%"] = series.quantile(0.99) -def _order_stats(stats: dict): +def _order_stats(stats: dict) -> dict: """Sort stats dictionary by order provided in all_stats. Helper function used in distribution_stats. From 9ea7a4108996422eaa433e3b86ed20dbbb3c0bdb Mon Sep 17 00:00:00 2001 From: ray310 <64942339+ray310@users.noreply.github.com> Date: Tue, 16 Jul 2024 00:39:07 -0500 Subject: [PATCH 05/13] SeriesProfile now reports gaps for timeseries data. Also added time series functions to calculate time gaps. 
gh-20 --- docs/api.md | 4 ++ src/pandahelper/__init__.py | 10 ++++- src/pandahelper/profiles.py | 76 +++++++++++++++++++++++-------------- src/pandahelper/times.py | 50 ++++++++++++++++++++++++ tests/conftest.py | 13 ++++++- tests/test_profiles.py | 60 +++++++++++++++++++++++++++++ tests/test_times.py | 44 +++++++++++++++++++++ tests/utils.py | 16 ++++++++ 8 files changed, 242 insertions(+), 31 deletions(-) create mode 100644 src/pandahelper/times.py create mode 100644 tests/test_times.py create mode 100644 tests/utils.py diff --git a/docs/api.md b/docs/api.md index b8e6d06..63313ad 100644 --- a/docs/api.md +++ b/docs/api.md @@ -8,3 +8,7 @@ description: Panda-Helper API Reference. Detailed description of the Panda-Helpe
::: pandahelper.stats + +
+ +::: pandahelper.times diff --git a/src/pandahelper/__init__.py b/src/pandahelper/__init__.py index c0600a2..b0a5288 100644 --- a/src/pandahelper/__init__.py +++ b/src/pandahelper/__init__.py @@ -4,6 +4,14 @@ from pandahelper.profiles import DataFrameProfile, SeriesProfile from pandahelper.stats import distribution_stats, frequency_table +from pandahelper.times import time_diffs, time_diffs_index __version__ = "0.1.1" -__all__ = ["frequency_table", "distribution_stats", "DataFrameProfile", "SeriesProfile"] +__all__ = [ + "frequency_table", + "distribution_stats", + "DataFrameProfile", + "SeriesProfile", + "time_diffs", + "time_diffs_index", +] diff --git a/src/pandahelper/profiles.py b/src/pandahelper/profiles.py index a09f1fa..3aea3fa 100644 --- a/src/pandahelper/profiles.py +++ b/src/pandahelper/profiles.py @@ -5,6 +5,7 @@ import pandas.api.types as pat from tabulate import tabulate import pandahelper.stats as phs +import pandahelper.times as pht class DataFrameProfile: @@ -61,7 +62,6 @@ def __create_tables(self, table_fmt: str): Returns: list(str): List of Tabulate tables. - """ df_info = [ ("DF Shape", self.shape), @@ -129,7 +129,10 @@ class SeriesProfile: num_unique (int): Number of unique values. num_nulls (int): Number of null values. frequency (pd.DataFrame): Frequency table with counts and percentage. - stats (list): Distribution statistics for Series. + stats (dict): Distribution statistics for Series. + time_diffs (pd.Series): Time diffs (gaps) if series is of type `datetime64`. + Alternately, can be time diffs in a Series with a DateTimeIndex if the + `time_index` parameter was set to `True` when creating Series Profile. """ def __init__( @@ -138,48 +141,57 @@ def __init__( *, fmt: str = "simple", freq_most_least: tuple = (10, 5), + time_index: bool = False, ): """Initialize SeriesProfile. Args: - series (pd.Series): DataFrame to profile. - fmt (str: optional): Printed table format. See - https://github.com/astanin/python-tabulate for options. 
+ series (pd.Series): Pandas Series to profile. + fmt (str: optional): Printed table format. See: + for options. freq_most_least (tuple: optional): Tuple (x, y) of the x most common and y least common values to display in frequency table. + time_index (bool: optional): Whether to use the index for calculating time + diffs for a `datetime64`-related Pandas Series. Not relevant for + non-time related Series. Raises: - TypeError: If input is not a pd.Series. + TypeError: If input is not a Pandas Series. """ if not isinstance(series, pd.Series): raise TypeError(f"{series}, is not pd.DataFrame") if freq_most_least[0] < 0 or freq_most_least[1] < 0: raise ValueError("Tuple values must be >= 0!") + self._format = fmt + self._freq_table = freq_most_least self.name = series.name self.dtype = series.dtype self.count = series.count() # counts non-null values self.num_unique = series.nunique() self.num_nulls = series.size - self.count # NAs, nans, NaT, but not "" self.frequency = phs.frequency_table(series) - self.stats = None - if not ( - pat.is_object_dtype(self.dtype) - or isinstance(self.dtype, pd.CategoricalDtype) - ): - self.stats = phs.dist_stats_dict(series) - self._format = fmt - self._freq_table = freq_most_least + self.stats = self.__calc_stats(series) + self.time_diffs = self.__calc_time_diffs(series, time_index) - def __create_tables(self, table_fmt: str): - """Create SeriesProfile summary tables. - - Args: - table_fmt (str): Tabulate table format name. - - Returns: - list(str): List of Tabulate tables. 
- - """ + def __calc_stats(self, series): + """Calculate distribution stats if allowed dtype, else return None.""" + if pat.is_object_dtype(self.dtype) or isinstance( + self.dtype, pd.CategoricalDtype + ): + return None + return phs.dist_stats_dict(series) + + @staticmethod + def __calc_time_diffs(series, use_time_index: bool) -> pd.Series or None: + """Calculate time diffs for time-indexed series or datetime64 series.""" + if use_time_index and pat.is_datetime64_any_dtype(series.index): + return pht.time_diffs_index(series) + if (not use_time_index) and pat.is_datetime64_any_dtype(series): + return pht.time_diffs(series) + return None + + def __create_tables(self, table_fmt: str) -> list[str]: + """Create and return SeriesProfile summary tables.""" series_info = [ ("Data Type", self.dtype), ("Count", self.count), @@ -201,16 +213,22 @@ def __create_tables(self, table_fmt: str): stats_table = "" if self.stats is not None: stats = self.stats - if pat.is_complex_dtype( - self.dtype - ): # tabulate converts complex numbers to real numbers + # tabulate casts complex numbers to real numbers, dropping imaginary part + if pat.is_complex_dtype(self.dtype): stats = {k: str(v) for k, v in self.stats.items()} stats_table = tabulate( list(stats.items()), headers=["Statistic", "Value"], tablefmt=table_fmt, ) - return [series_table, freq_table, stats_table] + time_diffs_table = "" + if self.time_diffs is not None: + time_diffs_table = tabulate( + phs.frequency_table(self.time_diffs), + headers=["Time Gaps (Diffs)", "Count", "% of total"], + tablefmt=table_fmt, + ) + return [series_table, freq_table, stats_table, time_diffs_table] def __repr__(self): """Printable version of profile.""" @@ -221,7 +239,7 @@ def _repr_html_(self): """HTML representation of profile.""" tables = [_format_html_table(t) for t in self.__create_tables("html")] tables[2] = _decimal_align_col(tables[2], 1) - return tables[0] + "
" + tables[1] + "
" + tables[2] + return tables[0] + "
" + tables[1] + "
" + tables[2] + "
" + tables[3] def save(self, path): """Save profile to provided path. diff --git a/src/pandahelper/times.py b/src/pandahelper/times.py new file mode 100644 index 0000000..c1aabd1 --- /dev/null +++ b/src/pandahelper/times.py @@ -0,0 +1,50 @@ +"""Panda-Helper time-series functions.""" + +import pandas as pd +import pandas.api.types as pat + + +def time_diffs(series: pd.Series | pd.DatetimeIndex) -> pd.Series(pd.Timedelta): + """Calculate time diffs (gaps) for Pandas Series or Index of timestamps. + + Sorts input by time before calculating diffs. + + Args: + series (pd.Series or pd.DatetimeIndex): Pandas Series or DatetimeIndex + to calculate time diffs on. + + Returns: + Series of diffs (gaps) indexed by the time the diff was calculated. + + Raises: + TypeError: If input is not Series of type datetime64 or DatetimeIndex. + """ + if not pat.is_datetime64_any_dtype(series.dtype): + raise TypeError("Should be Series of datetime64 dtype.") + series = series.sort_values() + diffs = pd.Series(series.diff(), name="diffs") + diffs.index = series + return diffs + + +def time_diffs_index(df: pd.DataFrame | pd.Series) -> pd.Series(pd.Timedelta): + """Calculate time diffs (gaps) for time-indexed Pandas Series or Dataframe. + + Sorts input by time before calculating diffs. + + Args: + df (pd.Series or pd.DataFrame): Pandas Series or DataFrame with DateTimeIndex + to calculate time diffs on. + + Returns: + Series of diffs (gaps) indexed by the time the diff was calculated. + + Raises: + TypeError: If input does not have a DatetimeIndex. 
+ """ + if isinstance(df.index, pd.DatetimeIndex): + df = df.sort_index() + diffs = pd.Series(df.index.diff(), name="diffs") + diffs.index = df.index + return diffs + raise TypeError(f"Index should be of type {pd.DatetimeIndex}") diff --git a/tests/conftest.py b/tests/conftest.py index e04f82f..f19a424 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,13 +2,14 @@ Note that fixtures with a package-scope are run once and then available as cached value. - """ +from datetime import datetime import os import numpy as np import pandas as pd import pytest +from .utils import make_category_data TEST_DATA_DIR = "tests/test_data" TEST_DATA_FILE = "sample_collisions.csv" @@ -16,6 +17,16 @@ NUM_SERIES = "NUMBER OF PERSONS INJURED" +@pytest.fixture +def cat_df(scope="package"): # pylint: disable=W0613 + """Return test pd.DataFrame.""" + start = datetime(year=1999, month=1, day=1, hour=0, minute=0) + end = start + pd.Timedelta(hours=10) + df = make_category_data("Springfield", start, end, freq="h") + df = df.sample(frac=1, random_state=2) # index is out of order + return df + + @pytest.fixture def test_df(scope="package"): # pylint: disable=W0613 """Return test pd.DataFrame.""" diff --git a/tests/test_profiles.py b/tests/test_profiles.py index e27ecbd..a00b8a1 100644 --- a/tests/test_profiles.py +++ b/tests/test_profiles.py @@ -9,6 +9,7 @@ import bs4 import numpy as np import pandas as pd +import pandas.api.types as pat import pytest import pandahelper.profiles as php @@ -200,6 +201,65 @@ def test_series_profile_frequency_table(test_df): assert len(freq_table.find_all("tr")) == v + 1 # +1 for header +def test_series_profile_time_index_true(cat_df): + """time_index=True calculates time diffs for Series with DateTimeIndex.""" + series = cat_df["category"] + profile = php.SeriesProfile(series, time_index=True) + assert pat.is_datetime64_any_dtype(series.index) + assert profile.time_diffs.iloc[0] is pd.NaT + assert all(profile.time_diffs[1:] == pd.Timedelta(hours=1)) + + 
+def test_series_profile_time_index_false(cat_df): + """time_index=False does not calculate time diffs for Series with DateTimeIndex.""" + series = cat_df["category"] + profile = php.SeriesProfile(series, time_index=False) + assert pat.is_datetime64_any_dtype(series.index) + assert profile.time_diffs is None + + +@pytest.fixture +def ts_timeindex(scope="module"): # pylint: disable=W0613 + """Return pd.Series of type datetime64 with DatetimeIndex.""" + start = datetime(year=1999, month=1, day=1, hour=0, minute=0) + end = start + pd.Timedelta(hours=40) + time_series = pd.Series(pd.date_range(start, end, freq="4h", inclusive="left")) + index_end = start + pd.Timedelta(hours=10) + time_series.index = pd.date_range(start, index_end, freq="h", inclusive="left") + return time_series + + +def test_series_profile_ts_range_index_true(ts_timeindex): # pylint: disable=W0621 + """time_index=True does not calculate time diffs for Series with RangeIndex.""" + series = ts_timeindex + series.index = range(len(ts_timeindex)) + profile = php.SeriesProfile(series, time_index=True) + assert not pat.is_datetime64_any_dtype(series.index) + assert profile.time_diffs is None + + +def test_series_profile_both_time_index_false(ts_timeindex): # pylint: disable=W0621 + """SeriesProfile should have time diffs from series, (not index). + + Given for Series(datetime64) with TimeIndex and time_index=False. + """ + profile = php.SeriesProfile(ts_timeindex, time_index=False) + assert pat.is_datetime64_any_dtype(ts_timeindex.index) + assert profile.time_diffs.iloc[0] is pd.NaT + assert all(profile.time_diffs[1:] == pd.Timedelta(hours=4)) + + +def test_series_profile_both_time_index_true(ts_timeindex): # pylint: disable=W0621 + """SeriesProfile should have time diffs from index, (not series). + + Given for Series(datetime64) with TimeIndex and time_index=True. 
+ """ + profile = php.SeriesProfile(ts_timeindex, time_index=True) + assert pat.is_datetime64_any_dtype(ts_timeindex.index) + assert profile.time_diffs.iloc[0] is pd.NaT + assert all(profile.time_diffs[1:] == pd.Timedelta(hours=1)) + + def test_series_profile_frequency_table_invalid(test_df): """Invalid frequency table most_least tuples should raise ValueError.""" invalid_tuples = [(0, -1), (-1, 0), (-1, -1)] diff --git a/tests/test_times.py b/tests/test_times.py new file mode 100644 index 0000000..17665ab --- /dev/null +++ b/tests/test_times.py @@ -0,0 +1,44 @@ +"""Tests for functions in times.py.""" + +import pandas as pd +import pytest +import pandahelper.times as pht + + +def test_time_diffs_index(cat_df): + """time_diffs_index should work on shuffled pd.Series or pd.DataFrame.""" + # test DF + df_result = pht.time_diffs_index(cat_df) + assert df_result.iloc[0] is pd.NaT + assert all(df_result[1:] == pd.Timedelta(hours=1)) + # test Series + series_result = pht.time_diffs_index(cat_df["B"]) + assert series_result.iloc[0] is pd.NaT + assert all(series_result[1:] == pd.Timedelta(hours=1)) + + +def test_time_diffs_index_exception(): + """pd.DataFrame and pd.Series without time index raise exception.""" + data = {"A": list(range(5))} + dtypes = [pd.DataFrame(data), pd.Series(data)] + for tipo in dtypes: + with pytest.raises(TypeError) as exc: + pht.time_diffs_index(tipo) + assert str(pd.DatetimeIndex) in str(exc) + + +def test_time_diffs(cat_df): + """time_diffs should work on shuffled pd.Series or Index of timestamps.""" + valid = [cat_df.index, pd.Series(cat_df.index)] + for v in valid: + result = pht.time_diffs(v) + assert result.iloc[0] is pd.NaT + assert all(result[1:] == pd.Timedelta(hours=1)) + + +def test_time_diffs_exception(): + """Non-datetime64 pd.Series raises exception.""" + invalid = [pd.Series(list(range(5))), pd.Series([pd.Timedelta(hours=1)] * 2)] + for tipo in invalid: + with pytest.raises(TypeError): + pht.time_diffs(tipo) diff --git 
a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..52d462c --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,16 @@ +"""Test-related utility functions.""" + +import pandas as pd + + +def make_category_data(cat_name, start, end, freq): + """Return pd.DataFrame of arbitrary data for specified 'category'.""" + rng = pd.date_range(start, end, freq=freq, inclusive="left") + data = { + "A": list(range(1, len(rng) + 1, 1)), + "B": [chr(ord("A") + (x % 26)) for x in range(0, len(rng), 1)], + "C": [float((-1) ** (x % 2) * x) for x in range(0, len(rng), 1)], + } + df = pd.DataFrame(data, index=rng) + df["category"] = cat_name + return df From 4c172d7cb614cada8939ed93bd36b59911b95997 Mon Sep 17 00:00:00 2001 From: ray310 <64942339+ray310@users.noreply.github.com> Date: Tue, 16 Jul 2024 01:01:57 -0500 Subject: [PATCH 06/13] Fix type hint for Python 3.9. gh-20 --- src/pandahelper/times.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pandahelper/times.py b/src/pandahelper/times.py index c1aabd1..98461a1 100644 --- a/src/pandahelper/times.py +++ b/src/pandahelper/times.py @@ -1,10 +1,11 @@ """Panda-Helper time-series functions.""" +from typing import Union # TODO: Remove when deprecating Python 3.9 import pandas as pd import pandas.api.types as pat -def time_diffs(series: pd.Series | pd.DatetimeIndex) -> pd.Series(pd.Timedelta): +def time_diffs(series: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Timedelta): """Calculate time diffs (gaps) for Pandas Series or Index of timestamps. Sorts input by time before calculating diffs. @@ -27,7 +28,7 @@ def time_diffs(series: pd.Series | pd.DatetimeIndex) -> pd.Series(pd.Timedelta): return diffs -def time_diffs_index(df: pd.DataFrame | pd.Series) -> pd.Series(pd.Timedelta): +def time_diffs_index(df: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Timedelta): """Calculate time diffs (gaps) for time-indexed Pandas Series or Dataframe. Sorts input by time before calculating diffs. 
From cc506d17b0f8da34f5df002d195a755f31b13921 Mon Sep 17 00:00:00 2001 From: ray310 <64942339+ray310@users.noreply.github.com> Date: Tue, 16 Jul 2024 01:19:41 -0500 Subject: [PATCH 07/13] Update CHANGELOG. --- CHANGELOG.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 271b43b..fce8c94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,14 @@ # Changelog ## Unreleased +- Add functionality to perform some common data cleaning tasks. +- Add `geo.py` module and functionality to set 'close' lat-long coordinates to same value. ## 0.1.1 - Unreleased ### Added -- functionality to detect time series gaps +- SeriesProfile now reports gaps in pd.Series with type `datetime64` or for Series with `DatetimeIndex`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20) +- `times.py` module has been added with public functions `time_diffs` and `time_diffs_index`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20) +- [`freq_most_least` default parameter for SeriesProfile has been changed to `(10, 5)`.](https://github.com/ray310/Panda-Helper/commit/9ea7a4108996422eaa433e3b86ed20dbbb3c0bdb) ____ ## 0.1.0 - 2024-07-14 From 7001a4fa2deb1061bae643e2e3b6feae23fe6f8a Mon Sep 17 00:00:00 2001 From: ray310 <64942339+ray310@users.noreply.github.com> Date: Tue, 16 Jul 2024 23:17:04 -0500 Subject: [PATCH 08/13] Add id_gaps and id_gaps_index functions to times.py. 
gh-20 --- CHANGELOG.md | 2 +- mkdocs.yml | 1 + src/pandahelper/__init__.py | 4 +- src/pandahelper/times.py | 122 ++++++++++++++++++++++++++++++++++-- tests/conftest.py | 18 ++++-- tests/test_profiles.py | 11 ---- tests/test_times.py | 65 +++++++++++++++---- 7 files changed, 188 insertions(+), 35 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fce8c94..d373552 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ ## 0.1.1 - Unreleased ### Added - SeriesProfile now reports gaps in pd.Series with type `datetime64` or for Series with `DatetimeIndex`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20) -- `times.py` module has been added with public functions `time_diffs` and `time_diffs_index`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20) +- `times.py` module has been added with public functions `time_diffs`, `time_diffs_index`, `id_gaps`, `id_gaps_index`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20) - [`freq_most_least` default parameter for SeriesProfile has been changed to `(10, 5)`.](https://github.com/ray310/Panda-Helper/commit/9ea7a4108996422eaa433e3b86ed20dbbb3c0bdb) ____ diff --git a/mkdocs.yml b/mkdocs.yml index 778b6d1..df63eb4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -101,6 +101,7 @@ plugins: python: paths: [src] options: + members_order: alphabetical docstring_style: google docstring_section_style: list docstring_options: diff --git a/src/pandahelper/__init__.py b/src/pandahelper/__init__.py index b0a5288..58894b2 100644 --- a/src/pandahelper/__init__.py +++ b/src/pandahelper/__init__.py @@ -4,7 +4,7 @@ from pandahelper.profiles import DataFrameProfile, SeriesProfile from pandahelper.stats import distribution_stats, frequency_table -from pandahelper.times import time_diffs, time_diffs_index +from pandahelper.times import time_diffs, time_diffs_index, id_gaps, id_gaps_index __version__ = "0.1.1" __all__ = [ @@ -14,4 +14,6 @@ "SeriesProfile", "time_diffs", "time_diffs_index", + "id_gaps", + 
"id_gaps_index", ] diff --git a/src/pandahelper/times.py b/src/pandahelper/times.py index 98461a1..1a99154 100644 --- a/src/pandahelper/times.py +++ b/src/pandahelper/times.py @@ -6,7 +6,7 @@ def time_diffs(series: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Timedelta): - """Calculate time diffs (gaps) for Pandas Series or Index of timestamps. + """Calculate time difference between subsequent observations. Sorts input by time before calculating diffs. @@ -19,19 +19,39 @@ def time_diffs(series: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Timed Raises: TypeError: If input is not Series of type datetime64 or DatetimeIndex. + + Examples: + Calculate time differences between observations on Series of timestamps after + it has been randomized: + + >>> import pandahelper as ph + >>> import pandas as pd + >>> + >>> start = pd.Timestamp(year=1999, month=1, day=1) + >>> rng = pd.date_range(start, periods=10, freq="D").delete([3, 4, 5, 8]) + >>> series = pd.Series(rng).sample(frac=1, random_state=3) # randomize order + + >>> ph.time_diffs(series) + 1999-01-01 NaT + 1999-01-02 1 days + 1999-01-03 1 days + 1999-01-07 4 days + 1999-01-08 1 days + 1999-01-10 2 days + Name: diffs, dtype: timedelta64[ns] """ if not pat.is_datetime64_any_dtype(series.dtype): - raise TypeError("Should be Series of datetime64 dtype.") + raise TypeError("Should be of datetime64 dtype.") series = series.sort_values() diffs = pd.Series(series.diff(), name="diffs") diffs.index = series return diffs -def time_diffs_index(df: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Timedelta): - """Calculate time diffs (gaps) for time-indexed Pandas Series or Dataframe. +def time_diffs_index(df: Union[pd.Series, pd.DataFrame]) -> pd.Series(pd.Timedelta): + """Calculate time difference between subsequent time-indexed observations. - Sorts input by time before calculating diffs. + Sorts input by time index before calculating diffs. 
Args: df (pd.Series or pd.DataFrame): Pandas Series or DataFrame with DateTimeIndex @@ -42,6 +62,27 @@ def time_diffs_index(df: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Tim Raises: TypeError: If input does not have a DatetimeIndex. + + Examples: + Calculate time differences between observations on time-indexed DataFrame after + it has been randomized: + + >>> import pandahelper as ph + >>> import pandas as pd + >>> + >>> start = pd.Timestamp(year=1999, month=1, day=1) + >>> rng = pd.date_range(start, periods=10, freq="D").delete([3, 4, 5, 8]) + >>> # index by time then randomize order + >>> df = pd.DataFrame(range(len(rng)), index=rng).sample(frac=1, random_state=3) + + >>> ph.time_diffs_index(df) + 1999-01-01 NaT + 1999-01-02 1 days + 1999-01-03 1 days + 1999-01-07 4 days + 1999-01-08 1 days + 1999-01-10 2 days + Name: diffs, dtype: timedelta64[ns] """ if isinstance(df.index, pd.DatetimeIndex): df = df.sort_index() @@ -49,3 +90,74 @@ def time_diffs_index(df: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Tim diffs.index = df.index return diffs raise TypeError(f"Index should be of type {pd.DatetimeIndex}") + + +def id_gaps( + series: Union[pd.Series, pd.DatetimeIndex], threshold: pd.Timedelta +) -> pd.DataFrame: + """Identify time gaps above `threshold` in datetime64 Series or DatetimeIndex. + + Sorts input by time before calculating gaps. + + Args: + series (pd.Series or pd.DatetimeIndex): `datetime64` Series or DatetimeIndex. + threshold (pd.Timedelta): Threshold to identify gaps + (and not expected time differences). + + Returns: + One-column Pandas DataFrame of gaps indexed by when gap was calculated. 
+ + Examples: + Identify time gaps on Series of timestamps with a 2 and 4 hour + gap after it has been randomized: + + >>> import pandahelper as ph + >>> import pandas as pd + >>> + >>> start = pd.Timestamp(year=1999, month=1, day=1) + >>> rng = pd.date_range(start, periods=24, freq="1h").delete([3, 4, 8, 9, 10]) + >>> series = pd.Series(rng).sample(frac=1, random_state=3) # randomize order + + >>> ph.id_gaps(series, pd.Timedelta(hours=1)) + diffs + 1999-01-01 11:00:00 0 days 04:00:00 + 1999-01-01 04:00:00 0 days 02:00:00 + """ + diffs = time_diffs(series) + return diffs[diffs > threshold].sort_values(ascending=False).to_frame() + + +def id_gaps_index( + df: Union[pd.Series, pd.DataFrame], threshold: pd.Timedelta +) -> pd.DataFrame: + """Identify time gaps above `threshold` in time-indexed Series or DataFrame. + + Sorts input by time index before calculating diffs. + + Args: + df (pd.Series or pd.DataFrame): Time-indexed Series or DataFrame. + threshold (pd.Timedelta): Threshold to identify gaps + (and not expected time differences). + + Returns: + One-column Pandas DataFrame of gaps indexed by when gap was calculated. + + Examples: + Identify time gaps on an hourly, time-indexed Series with a 2 and 4 hour + gap after it has been randomized: + + >>> import pandahelper as ph + >>> import pandas as pd + >>> + >>> start = pd.Timestamp(year=1999, month=1, day=1) + >>> rng = pd.date_range(start, periods=24, freq="1h").delete([3, 8, 9, 10]) + >>> # index by time then randomize order + >>> df = pd.DataFrame(range(len(rng)), index=rng).sample(frac=1, random_state=3) + + >>> ph.id_gaps_index(df, pd.Timedelta(hours=1)) + diffs + 1999-01-01 11:00:00 0 days 04:00:00 + 1999-01-01 04:00:00 0 days 02:00:00 + """ + diffs = time_diffs_index(df) + return diffs[diffs > threshold].sort_values(ascending=False).to_frame() diff --git a/tests/conftest.py b/tests/conftest.py index f19a424..a7bb3d8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,7 +4,6 @@ cached value. 
""" -from datetime import datetime import os import numpy as np import pandas as pd @@ -19,17 +18,28 @@ @pytest.fixture def cat_df(scope="package"): # pylint: disable=W0613 - """Return test pd.DataFrame.""" - start = datetime(year=1999, month=1, day=1, hour=0, minute=0) + """Return test pd.DataFrame with DatetimeIndex.""" + start = pd.Timestamp(year=1999, month=1, day=1) end = start + pd.Timedelta(hours=10) df = make_category_data("Springfield", start, end, freq="h") df = df.sample(frac=1, random_state=2) # index is out of order return df +@pytest.fixture +def ts_timeindex(scope="package"): # pylint: disable=W0613 + """Return pd.Series of type datetime64 with DatetimeIndex.""" + start = pd.Timestamp(year=1999, month=1, day=1) + end = start + pd.Timedelta(hours=40) + time_series = pd.Series(pd.date_range(start, end, freq="4h", inclusive="left")) + index_end = start + pd.Timedelta(hours=10) + time_series.index = pd.date_range(start, index_end, freq="h", inclusive="left") + return time_series + + @pytest.fixture def test_df(scope="package"): # pylint: disable=W0613 - """Return test pd.DataFrame.""" + """Return test pd.DataFrame from sample of NYC collisions dataset.""" return pd.read_csv(os.path.join(TEST_DATA_DIR, TEST_DATA_FILE)) diff --git a/tests/test_profiles.py b/tests/test_profiles.py index a00b8a1..713b544 100644 --- a/tests/test_profiles.py +++ b/tests/test_profiles.py @@ -218,17 +218,6 @@ def test_series_profile_time_index_false(cat_df): assert profile.time_diffs is None -@pytest.fixture -def ts_timeindex(scope="module"): # pylint: disable=W0613 - """Return pd.Series of type datetime64 with DatetimeIndex.""" - start = datetime(year=1999, month=1, day=1, hour=0, minute=0) - end = start + pd.Timedelta(hours=40) - time_series = pd.Series(pd.date_range(start, end, freq="4h", inclusive="left")) - index_end = start + pd.Timedelta(hours=10) - time_series.index = pd.date_range(start, index_end, freq="h", inclusive="left") - return time_series - - def 
test_series_profile_ts_range_index_true(ts_timeindex): # pylint: disable=W0621 """time_index=True does not calculate time diffs for Series with RangeIndex.""" series = ts_timeindex diff --git a/tests/test_times.py b/tests/test_times.py index 17665ab..83dfba2 100644 --- a/tests/test_times.py +++ b/tests/test_times.py @@ -5,6 +5,23 @@ import pandahelper.times as pht +def test_time_diffs(cat_df): + """time_diffs should work on shuffled pd.Series or Index of timestamps.""" + valid = [cat_df.index, pd.Series(cat_df.index)] + for v in valid: + result = pht.time_diffs(v) + assert result.iloc[0] is pd.NaT + assert all(result[1:] == pd.Timedelta(hours=1)) + + +def test_time_diffs_exception(): + """Non-datetime64 pd.Series raises exception.""" + invalid = [pd.Series(list(range(5))), pd.Series([pd.Timedelta(hours=1)] * 2)] + for tipo in invalid: + with pytest.raises(TypeError): + pht.time_diffs(tipo) + + def test_time_diffs_index(cat_df): """time_diffs_index should work on shuffled pd.Series or pd.DataFrame.""" # test DF @@ -27,18 +44,40 @@ def test_time_diffs_index_exception(): assert str(pd.DatetimeIndex) in str(exc) -def test_time_diffs(cat_df): - """time_diffs should work on shuffled pd.Series or Index of timestamps.""" - valid = [cat_df.index, pd.Series(cat_df.index)] - for v in valid: - result = pht.time_diffs(v) - assert result.iloc[0] is pd.NaT - assert all(result[1:] == pd.Timedelta(hours=1)) +def test_id_gaps_index(ts_timeindex): + """id_gap_index returns expected gap from time-Series with DatetimeIndex.""" + result = pht.id_gaps_index( + ts_timeindex, pd.Timedelta(minutes=59, microseconds=999999) + ) + expected = pd.DataFrame( + [pd.Timedelta(hours=1)] * 9, + index=pd.date_range(pd.Timestamp(1999, 1, 1, 1), periods=9, freq="h"), + columns=["diffs"], + ) + pd.testing.assert_frame_equal(expected, result, check_index_type=True) -def test_time_diffs_exception(): - """Non-datetime64 pd.Series raises exception.""" - invalid = [pd.Series(list(range(5))), 
pd.Series([pd.Timedelta(hours=1)] * 2)] - for tipo in invalid: - with pytest.raises(TypeError): - pht.time_diffs(tipo) +def test_id_gaps_index_no_gaps(ts_timeindex): + """id_gap_index returns empty Dataframe when threshold exceeds diffs.""" + result = pht.id_gaps_index(ts_timeindex, pd.Timedelta(minutes=60, microseconds=1)) + assert len(result) == 0 + + +def test_id_gaps_(ts_timeindex): + """id_gap returns expected gap from time-Series with DatetimeIndex.""" + result = pht.id_gaps( + ts_timeindex, pd.Timedelta(hours=3, minutes=59, microseconds=999999) + ) + expected = pd.DataFrame( + [pd.Timedelta(hours=4)] * 9, + index=pd.date_range(pd.Timestamp(1999, 1, 1, 4), periods=9, freq="4h"), + columns=["diffs"], + ) + expected.index.freq = None # diffs won't have freq set + pd.testing.assert_frame_equal(expected, result, check_index_type=True) + + +def test_id_gaps_no_gaps(ts_timeindex): + """id_gap_index returns empty Dataframe when threshold exceeds diffs.""" + result = pht.id_gaps(ts_timeindex, pd.Timedelta(hours=4, microseconds=1)) + assert len(result) == 0 From 90f236e78e93e07beebe5a484839c1724af6ed8d Mon Sep 17 00:00:00 2001 From: ray310 <64942339+ray310@users.noreply.github.com> Date: Wed, 17 Jul 2024 01:39:43 -0500 Subject: [PATCH 09/13] Add examples to stats documentation. gh-2 --- src/pandahelper/stats.py | 61 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/src/pandahelper/stats.py b/src/pandahelper/stats.py index 9f8ee63..9de9925 100644 --- a/src/pandahelper/stats.py +++ b/src/pandahelper/stats.py @@ -18,6 +18,20 @@ def frequency_table(series: pd.Series) -> pd.DataFrame: Raises: TypeError: If input is not a Pandas Series. 
+ + Examples: + >>> import random + >>> import pandahelper as ph + >>> + >>> random.seed(314) + >>> cities = ["Springfield", "Quahog", "Philadelphia", "Shelbyville"] + >>> series = pd.Series(random.choices(cities, k = 200)) + >>> ph.frequency_table(series) + Count % of Total + Springfield 66 33.00% + Quahog 51 25.50% + Philadelphia 44 22.00% + Shelbyville 39 19.50% """ if not isinstance(series, pd.Series): raise TypeError(f"{series}, is not pd.Series") @@ -70,6 +84,53 @@ def distribution_stats(series: pd.Series) -> pd.DataFrame: Raises: TypeError: If input is not a numeric-like pd.Series. + + Examples: + Distribution stats for Pandas Series of type `float64`: + >>> from random import seed, gauss, expovariate + >>> import pandahelper as ph + >>> import pandas as pd + >>> + >>> seed(314) + >>> series = pd.Series([gauss(mu=30, sigma=20) for x in range(200)]) + >>> ph.distribution_stats(series) + Statistic Value + count 200.000000 + min -23.643007 + 1% -11.918955 + 5% 2.833604 + 25% 17.553793 + 50% 31.420759 + 75% 42.074998 + 95% 60.305435 + 99% 72.028633 + max 81.547828 + mean 30.580535 + standard deviation 18.277706 + median 31.420759 + median absolute deviation 12.216607 + skew -0.020083 + + Distribution stats for Pandas Series of type `datetime64`: + >>> start = pd.Timestamp(2000, 1, 1) + >>> tds = [pd.Timedelta(hours=int(expovariate(lambd=.003))) for x in range(200)] + >>> times = [start + td for td in tds] + >>> series = pd.Series(times) + >>> ph.distribution_stats(series) + Statistic Value + count 200 + min 2000-01-01 00:00:00 + 1% 2000-01-01 01:59:24 + 5% 2000-01-01 09:00:00 + 25% 2000-01-04 08:00:00 + 50% 2000-01-08 04:30:00 + 75% 2000-01-16 21:00:00 + 95% 2000-02-08 01:36:00 + 99% 2000-02-22 10:20:24 + max 2000-04-01 17:00:00 + mean 2000-01-12 14:24:18 + standard deviation 12 days 16:47:15.284423042 + median 2000-01-08 04:30:00 """ stats = dist_stats_dict(series) return pd.DataFrame.from_dict(stats, orient="index", columns=["Statistic Value"]) From 
24a1e9cdeef74046e2545c83932c2469f1d47685 Mon Sep 17 00:00:00 2001 From: ray310 <64942339+ray310@users.noreply.github.com> Date: Wed, 17 Jul 2024 22:55:18 -0500 Subject: [PATCH 10/13] Add 'category_gaps' function to times.py. gh-20 --- CHANGELOG.md | 2 +- src/pandahelper/__init__.py | 9 ++- src/pandahelper/times.py | 70 +++++++++++++++++++++++ tests/test_times.py | 110 ++++++++++++++++++++++++++++++++++++ 4 files changed, 189 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d373552..c2df74c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ ## 0.1.1 - Unreleased ### Added - SeriesProfile now reports gaps in pd.Series with type `datetime64` or for Series with `DatetimeIndex`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20) -- `times.py` module has been added with public functions `time_diffs`, `time_diffs_index`, `id_gaps`, `id_gaps_index`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20) +- `times.py` module has been added with public functions `time_diffs`, `time_diffs_index`, `id_gaps`, `id_gaps_index`, `category_gaps`. 
[gh-20](https://github.com/ray310/Panda-Helper/issues/20) - [`freq_most_least` default parameter for SeriesProfile has been changed to `(10, 5)`.](https://github.com/ray310/Panda-Helper/commit/9ea7a4108996422eaa433e3b86ed20dbbb3c0bdb) ____ diff --git a/src/pandahelper/__init__.py b/src/pandahelper/__init__.py index 58894b2..0a3cbd1 100644 --- a/src/pandahelper/__init__.py +++ b/src/pandahelper/__init__.py @@ -4,7 +4,13 @@ from pandahelper.profiles import DataFrameProfile, SeriesProfile from pandahelper.stats import distribution_stats, frequency_table -from pandahelper.times import time_diffs, time_diffs_index, id_gaps, id_gaps_index +from pandahelper.times import ( + time_diffs, + time_diffs_index, + id_gaps, + id_gaps_index, + category_gaps, +) __version__ = "0.1.1" __all__ = [ @@ -16,4 +22,5 @@ "time_diffs_index", "id_gaps", "id_gaps_index", + "category_gaps", ] diff --git a/src/pandahelper/times.py b/src/pandahelper/times.py index 1a99154..d7bbbb2 100644 --- a/src/pandahelper/times.py +++ b/src/pandahelper/times.py @@ -1,5 +1,6 @@ """Panda-Helper time-series functions.""" +from warnings import warn from typing import Union # TODO: Remove when deprecating Python 3.9 import pandas as pd import pandas.api.types as pat @@ -161,3 +162,72 @@ def id_gaps_index( """ diffs = time_diffs_index(df) return diffs[diffs > threshold].sort_values(ascending=False).to_frame() + + +def category_gaps( + series: pd.Series, threshold: pd.Timedelta, max_cat: int = 50 +) -> [pd.DataFrame, None]: + """Calculate sum of gaps for each category in time-indexed Series. + + Gaps are time differences in excess of expected time increment (threshold). Gap per + category is relative to the minimum and maximum times in the Series. + Intended for use with categorical-like Series. + + Args: + series (pd.Series): Categorical-like Series. + threshold (pd.Timedelta): Threshold for the time difference to be considered + a gap. For hourly data, threshold should be pd.Timedelta(hours=1). 
+        max_cat (int): Maximum number of categories (unique values) before issuing
+            warning and returning `None`.
+
+    Returns:
+        Key-value pairs with category name and associated gap. Will return None if
+        number of categories exceeds `max_cat`.
+
+    Warns:
+        UserWarning: If the number of categories (unique values) in the series
+            exceeds `max_cat`.
+
+    Examples:
+        >>> import pandahelper as ph
+        >>> import pandas as pd
+        >>>
+        >>> start = pd.Timestamp(year=1999, month=1, day=1)
+        >>> a = pd.Series(["A"] * 30, index=pd.date_range(start, periods=30, freq="D"))
+        >>> b = pd.Series(["B"] * 15, index=pd.date_range(start, periods=15, freq="2D"))
+        >>> c = pd.Series(["C"] * 10, index=pd.date_range(start, periods=10, freq="D"))
+        >>> ph.category_gaps(pd.concat([a, b, c]), threshold=pd.Timedelta(days=1))
+          Cumulative Gap
+        C        20 days
+        B        15 days
+        A         0 days
+    """
+    if not isinstance(series, pd.Series) or not isinstance(
+        series.index, pd.DatetimeIndex
+    ):
+        raise TypeError(
+            f"Series should be {pd.Series} with index of type {pd.DatetimeIndex}"
+        )
+    if not isinstance(threshold, pd.Timedelta):
+        raise TypeError(f"Increment should be {pd.Timedelta}")
+    gaps = {}
+    time_range = series.index.max() - series.index.min()
+    categories = series.unique()
+    if len(categories) > max_cat:
+        msg = (
+            f"Number of categories is greater than {max_cat}. To proceed "
+            f"increase 'max_cat' and run function again."
+ ) + warn(msg, stacklevel=2) + return None + for cat in categories: + cat_slice = series.loc[series == cat] + if pd.isnull(cat): # treat nulls as distinct category + nulls = series.apply(lambda x: x is cat) # pylint: disable=W0640 + cat_slice = series[nulls] + cat_range = cat_slice.index.max() - cat_slice.index.min() + diffs = time_diffs_index(cat_slice) + gap = (diffs[diffs > threshold] - threshold).sum() + gaps[cat] = time_range - cat_range + gap + df = pd.Series(gaps.values(), index=gaps.keys(), name="Cumulative Gap") + return df.sort_values(ascending=False).to_frame() diff --git a/tests/test_times.py b/tests/test_times.py index 83dfba2..2197c09 100644 --- a/tests/test_times.py +++ b/tests/test_times.py @@ -1,8 +1,10 @@ """Tests for functions in times.py.""" +import numpy as np import pandas as pd import pytest import pandahelper.times as pht +from .utils import make_category_data def test_time_diffs(cat_df): @@ -81,3 +83,111 @@ def test_id_gaps_no_gaps(ts_timeindex): """id_gap_index returns empty Dataframe when threshold exceeds diffs.""" result = pht.id_gaps(ts_timeindex, pd.Timedelta(hours=4, microseconds=1)) assert len(result) == 0 + + +def test_category_gaps_frequency(): + """Gaps are calculated correctly for categories of varying frequency in Series.""" + start = pd.Timestamp(year=1999, month=1, day=1) + duration = pd.Timedelta(days=365) + end = start + duration + delay = pd.Timedelta(days=180) + c1 = make_category_data("Springfield", start, end, freq="h") + c2 = make_category_data("Quahog", start + delay, end, freq="h") + c3 = make_category_data("Park South", start, end, freq="2h") + c4 = make_category_data("East Midtown", start, end, freq="4h") + c5 = make_category_data("San Diego", start, end, freq="W") + c6 = make_category_data("South Philadelphia", start, end, freq="MS") + df = pd.concat([c1, c2, c3, c4, c5, c6]) + gaps = { + "South Philadelphia": duration - pd.Timedelta(hours=12), + "San Diego": duration - pd.Timedelta(hours=52), + "East Midtown": 
duration - duration / 4, + "Park South": duration / 2, + "Quahog": delay, + "Springfield": pd.Timedelta(hours=0), + } + expected = pd.DataFrame( + gaps.values(), columns=["Cumulative Gap"], index=list(gaps.keys()) + ) + result = pht.category_gaps(df["category"], pd.Timedelta(hours=1)) + pd.testing.assert_frame_equal(expected, result, check_index_type=True) + + +def test_category_gaps_no_gaps(): + """Series with no gaps should show 0 gaps.""" + start = pd.Timestamp(year=1999, month=1, day=1) + end = start + pd.Timedelta(hours=1) + c1 = make_category_data("Springfield", start, end, freq="h") + c2 = make_category_data("Park South", start, end, freq="2h") + df = pd.concat([c1, c2]) + gaps = { + "Springfield": pd.Timedelta(hours=0), + "Park South": pd.Timedelta(hours=0), + } + expected = pd.DataFrame( + gaps.values(), columns=["Cumulative Gap"], index=list(gaps.keys()) + ) + result = pht.category_gaps(df["category"], pd.Timedelta(hours=1)) + pd.testing.assert_frame_equal(expected, result, check_index_type=True) + + +def test_category_gaps_nulls(): + """Nulls should be treated as separate categories with correctly calculated gaps.""" + start = pd.Timestamp(year=1999, month=1, day=1) + end = start + pd.Timedelta(hours=25) # to get 24 hour range with freq='2h' + df = make_category_data("Quahog", start, end, freq="2h") + df.iloc[:2, 3] = None + df.iloc[2:4, 3] = pd.NA + df.iloc[4:6, 3] = np.nan + df.iloc[6:8, 3] = pd.NaT + gaps = { + None: pd.Timedelta(hours=23), + pd.NA: pd.Timedelta(hours=23), + np.nan: pd.Timedelta(hours=23), + pd.NaT: pd.Timedelta(hours=23), + "Quahog": pd.Timedelta(hours=20), + } + expected = pd.DataFrame( + gaps.values(), columns=["Cumulative Gap"], index=list(gaps.keys()) + ) + result = pht.category_gaps(df["category"], pd.Timedelta(hours=1)) + pd.testing.assert_frame_equal(expected, result, check_index_type=True) + + +def test_category_gaps_not_series_exception(): + """Non-series input raises Exception.""" + df = pd.DataFrame({"A": list(range(5))}) 
+ with pytest.raises(TypeError) as exc: + pht.category_gaps(df, pd.Timedelta(hours=1)) + assert str(pd.Series) in str(exc.value) + + +def test_category_gaps_wrong_series_exception(): + """Non-time indexed series raises Exception.""" + series = pd.Series({"A": list(range(5))}) + with pytest.raises(TypeError) as exc: + pht.category_gaps(series, pd.Timedelta(hours=1)) + assert str(pd.DatetimeIndex) in str(exc.value) + + +def test_category_gaps_timedelta_wrong_type_exception(): + """Wrong input type for threshold raises exception.""" + start = pd.Timestamp(year=1999, month=1, day=1) + end = start + pd.Timedelta(days=365) + df = make_category_data("Springfield", start, end, freq="h") + with pytest.raises(TypeError) as exc: + pht.category_gaps(df["category"], start) + assert str(pd.Timedelta) in str(exc.value) + + +def test_category_gaps_warning(): + """Series with more categories than max_cat raises warning and returns None.""" + start = pd.Timestamp(year=1999, month=1, day=1) + end = start + pd.Timedelta(hours=1) + c1 = make_category_data("Springfield", start, end, freq="h") + c2 = make_category_data("Park South", start, end, freq="2h") + df = pd.concat([c1, c2]) + with pytest.warns(UserWarning): + assert ( + pht.category_gaps(df["category"], pd.Timedelta(hours=1), max_cat=1) is None + ) From f3b7a23fae2bcf0d132b229f89341030e3d26f90 Mon Sep 17 00:00:00 2001 From: ray310 <64942339+ray310@users.noreply.github.com> Date: Thu, 18 Jul 2024 01:23:50 -0500 Subject: [PATCH 11/13] DataFrameProfile now includes time_diffs if DataFrame is time-indexed. Also adjusted formatting of __repr__ and _repr_html_. 
--- src/pandahelper/profiles.py | 41 ++++++++++---- tests/conftest.py | 57 ++++++++++++-------- tests/test_data/test_df_time_profile.txt | 36 +++++++++++++ tests/test_data/test_series_time_profile.txt | 20 +++++++ tests/test_profiles.py | 46 ++++++++++++---- tests/test_times.py | 45 +++++----------- 6 files changed, 172 insertions(+), 73 deletions(-) create mode 100644 tests/test_data/test_df_time_profile.txt create mode 100644 tests/test_data/test_series_time_profile.txt diff --git a/src/pandahelper/profiles.py b/src/pandahelper/profiles.py index 3aea3fa..e5bbd11 100644 --- a/src/pandahelper/profiles.py +++ b/src/pandahelper/profiles.py @@ -22,6 +22,7 @@ class DataFrameProfile: num_duplicates (int): Number of duplicated rows. nulls_per_row (pd.Series): Count of null values per row. null_stats (list): Distribution statistics on nulls per row. + time_diffs (pd.Series): Time diffs (gaps) if DataFrame has a DateTimeIndex. """ def __init__(self, df: pd.DataFrame, *, name: str = "", fmt: str = "simple"): @@ -44,6 +45,7 @@ def __init__(self, df: pd.DataFrame, *, name: str = "", fmt: str = "simple"): self.memory_usage = df.memory_usage(index=True, deep=True) / 1000000 # MB self.num_duplicates = sum(df.duplicated(keep="first")) self.nulls_per_row = df.isna().sum(axis=1) + self.time_diffs = self.__calc_time_diffs(df) self.null_stats = self.__null_stats() self._format = fmt @@ -54,6 +56,13 @@ def __null_stats(self, delete_key="count"): del stats[delete_key] return new_stats | stats + @staticmethod + def __calc_time_diffs(df: pd.DataFrame) -> pd.Series or None: + """Calculate time diffs if DataFrame is time-indexed.""" + if pat.is_datetime64_any_dtype(df.index): + return pht.time_diffs_index(df) + return None + def __create_tables(self, table_fmt: str): """Create DataFrameProfile summary tables. 
@@ -92,7 +101,15 @@ def __create_tables(self, table_fmt: str): headers=["Summary of Nulls Per Row", ""], tablefmt=table_fmt, ) - return [df_table, dtype_usage_table, null_table] + tables = [df_table, dtype_usage_table, null_table] + if self.time_diffs is not None: + time_diffs_table = tabulate( + phs.frequency_table(self.time_diffs), + headers=["Time Gaps (Diffs)", "Count", "% of total"], + tablefmt=table_fmt, + ) + tables.append(time_diffs_table) + return tables def __repr__(self): """Printable version of profile.""" @@ -104,7 +121,8 @@ def _repr_html_(self): tables = [_format_html_table(t) for t in self.__create_tables("html")] tables[1] = _decimal_align_col(tables[1], 2) # type/memory usage table tables[2] = _decimal_align_col(tables[2], 1) # stats table - return tables[0] + "
" + tables[1] + "
" + tables[2] + output = "".join([table + "
" for table in tables]) + return output[:-4] # remove last
def save(self, path: str): """Save profile to provided path. @@ -159,7 +177,7 @@ def __init__( TypeError: If input is not a Pandas Series. """ if not isinstance(series, pd.Series): - raise TypeError(f"{series}, is not pd.DataFrame") + raise TypeError(f"{series}, is not pd.Series") if freq_most_least[0] < 0 or freq_most_least[1] < 0: raise ValueError("Tuple values must be >= 0!") self._format = fmt @@ -173,7 +191,7 @@ def __init__( self.stats = self.__calc_stats(series) self.time_diffs = self.__calc_time_diffs(series, time_index) - def __calc_stats(self, series): + def __calc_stats(self, series: pd.Series): """Calculate distribution stats if allowed dtype, else return None.""" if pat.is_object_dtype(self.dtype) or isinstance( self.dtype, pd.CategoricalDtype @@ -182,7 +200,7 @@ def __calc_stats(self, series): return phs.dist_stats_dict(series) @staticmethod - def __calc_time_diffs(series, use_time_index: bool) -> pd.Series or None: + def __calc_time_diffs(series: pd.Series, use_time_index: bool) -> pd.Series or None: """Calculate time diffs for time-indexed series or datetime64 series.""" if use_time_index and pat.is_datetime64_any_dtype(series.index): return pht.time_diffs_index(series) @@ -210,7 +228,7 @@ def __create_tables(self, table_fmt: str) -> list[str]: freq_table = tabulate( freq_info, headers=["Value", "Count", "% of total"], tablefmt=table_fmt ) - stats_table = "" + tables = [series_table, freq_table] if self.stats is not None: stats = self.stats # tabulate casts complex numbers to real numbers, dropping imaginary part @@ -221,14 +239,15 @@ def __create_tables(self, table_fmt: str) -> list[str]: headers=["Statistic", "Value"], tablefmt=table_fmt, ) - time_diffs_table = "" + tables.append(stats_table) if self.time_diffs is not None: time_diffs_table = tabulate( phs.frequency_table(self.time_diffs), headers=["Time Gaps (Diffs)", "Count", "% of total"], tablefmt=table_fmt, ) - return [series_table, freq_table, stats_table, time_diffs_table] + 
tables.append(time_diffs_table) + return tables def __repr__(self): """Printable version of profile.""" @@ -238,8 +257,10 @@ def __repr__(self): def _repr_html_(self): """HTML representation of profile.""" tables = [_format_html_table(t) for t in self.__create_tables("html")] - tables[2] = _decimal_align_col(tables[2], 1) - return tables[0] + "
" + tables[1] + "
" + tables[2] + "
" + tables[3] + if self.stats is not None: + tables[2] = _decimal_align_col(tables[2], 1) + output = "".join([table + "
" for table in tables]) + return output[:-4] # remove last
def save(self, path): """Save profile to provided path. diff --git a/tests/conftest.py b/tests/conftest.py index a7bb3d8..c25ac19 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,27 +16,6 @@ NUM_SERIES = "NUMBER OF PERSONS INJURED" -@pytest.fixture -def cat_df(scope="package"): # pylint: disable=W0613 - """Return test pd.DataFrame with DatetimeIndex.""" - start = pd.Timestamp(year=1999, month=1, day=1) - end = start + pd.Timedelta(hours=10) - df = make_category_data("Springfield", start, end, freq="h") - df = df.sample(frac=1, random_state=2) # index is out of order - return df - - -@pytest.fixture -def ts_timeindex(scope="package"): # pylint: disable=W0613 - """Return pd.Series of type datetime64 with DatetimeIndex.""" - start = pd.Timestamp(year=1999, month=1, day=1) - end = start + pd.Timedelta(hours=40) - time_series = pd.Series(pd.date_range(start, end, freq="4h", inclusive="left")) - index_end = start + pd.Timedelta(hours=10) - time_series.index = pd.date_range(start, index_end, freq="h", inclusive="left") - return time_series - - @pytest.fixture def test_df(scope="package"): # pylint: disable=W0613 """Return test pd.DataFrame from sample of NYC collisions dataset.""" @@ -75,3 +54,39 @@ def non_series_invalid(scope="package"): # pylint: disable=W0613 np.array([1, 2, 3]), ] return invalid_types + + +@pytest.fixture +def simple_df(scope="package"): # pylint: disable=W0613 + """Return test pd.DataFrame with DatetimeIndex.""" + start = pd.Timestamp(year=1999, month=1, day=1) + end = start + pd.Timedelta(hours=10) + df = make_category_data("Springfield", start, end, freq="h") + df = df.sample(frac=1, random_state=2) # index is out of order + return df + + +@pytest.fixture +def ts_timeindex(scope="package"): # pylint: disable=W0613 + """Return pd.Series of type datetime64 with DatetimeIndex.""" + start = pd.Timestamp(year=1999, month=1, day=1) + end = start + pd.Timedelta(hours=40) + time_series = pd.Series(pd.date_range(start, end, freq="4h", 
inclusive="left")) + index_end = start + pd.Timedelta(hours=10) + time_series.index = pd.date_range(start, index_end, freq="h", inclusive="left") + return time_series + + +@pytest.fixture +def cat_df(scope="package"): # pylint: disable=W0613 + """Return pd.DataFrame with DatetimeIndex.""" + start = pd.Timestamp(year=1999, month=1, day=1) + end = start + pd.Timedelta(days=365) + delay = pd.Timedelta(days=180) + c1 = make_category_data("Springfield", start, end, freq="h") + c2 = make_category_data("Quahog", start + delay, end, freq="h") + c3 = make_category_data("Park South", start, end, freq="2h") + c4 = make_category_data("East Midtown", start, end, freq="4h") + c5 = make_category_data("San Diego", start, end, freq="W") + c6 = make_category_data("South Philadelphia", start, end, freq="MS") + return pd.concat([c1, c2, c3, c4, c5, c6]) diff --git a/tests/test_data/test_df_time_profile.txt b/tests/test_data/test_df_time_profile.txt new file mode 100644 index 0000000..2978417 --- /dev/null +++ b/tests/test_data/test_df_time_profile.txt @@ -0,0 +1,36 @@ +DataFrame-Level Info +---------------------- ---------- +DF Shape (19834, 4) +Duplicated Rows 0 +Memory Usage (MB) 2.633 + +Series Name Data Type Memory Usage (MB) +------------- -------------- ------------------- +Index datetime64[ns] 0.158672 +A int64 0.158672 +B object 0.9917 +C float64 0.158672 +category object 1.16563 + +Summary of Nulls Per Row +-------------------------- -- +Number of Columns 4 +min 0 +1% 0 +5% 0 +25% 0 +50% 0 +75% 0 +95% 0 +99% 0 +max 0 +mean 0 +standard deviation 0 +median 0 +median absolute deviation 0 +skew 0 + +Time Gaps (Diffs) Count % of total +------------------- ------- ------------ +0 days 00:00:00 11074 55.84% +0 days 01:00:00 8759 44.16% diff --git a/tests/test_data/test_series_time_profile.txt b/tests/test_data/test_series_time_profile.txt new file mode 100644 index 0000000..977b7fc --- /dev/null +++ b/tests/test_data/test_series_time_profile.txt @@ -0,0 +1,20 @@ +category Info 
+--------------- ------ +Data Type object +Count 19834 +Unique Values 6 +Null Values 0 + +Value Count % of total +------------------ ------- ------------ +Springfield 8760 44.17% +Quahog 4440 22.39% +Park South 4380 22.08% +East Midtown 2190 11.04% +San Diego 52 0.26% +South Philadelphia 12 0.06% + +Time Gaps (Diffs) Count % of total +------------------- ------- ------------ +0 days 00:00:00 11074 55.84% +0 days 01:00:00 8759 44.16% diff --git a/tests/test_profiles.py b/tests/test_profiles.py index 713b544..487ca90 100644 --- a/tests/test_profiles.py +++ b/tests/test_profiles.py @@ -36,6 +36,18 @@ def test_dataframe_profile_valid_312(test_df): assert filecmp.cmp(compare_file, test_file, shallow=False) +@pytest.mark.skipif( + not ((3, 12) <= sys.version_info < (3, 13)), reason="Runs on Python 3.12" +) +def test_dataframe_time_profile_valid_312(cat_df): + """Time-indexed DataFrame profile should match test profile (Python 3.12).""" + compare_file = os.path.join(TEST_DATA_DIR, "test_df_time_profile.txt") + with tempfile.TemporaryDirectory() as tmp: + test_file = os.path.join(tmp, "temp.txt") + php.DataFrameProfile(cat_df).save(test_file) + assert filecmp.cmp(compare_file, test_file, shallow=False) + + def test_dataframe_profile_invalid(non_series_invalid, num_series, cat_like_series): """DataFrame profile should not accept invalid data types.""" invalid_types = [*non_series_invalid, num_series, cat_like_series] @@ -44,17 +56,18 @@ def test_dataframe_profile_invalid(non_series_invalid, num_series, cat_like_seri php.DataFrameProfile(invalid) -def test_dataframe_profile_html(test_df): +def test_dataframe_profile_html(cat_df): """Test html representation of DataFrameProfile.""" - profile = php.DataFrameProfile(test_df) + profile = php.DataFrameProfile(cat_df) # fmt: off soup = bs4.BeautifulSoup(profile._repr_html_(), "html.parser") # pylint: disable=W0212 # fmt: on tables = soup.find_all("table") - assert len(tables) == 3 # null_table + assert len(tables) == 4 assert 
len(tables[2].find_all("tr")) == 16 # 15 dist stats + head row first_td = tables[2].find("td") assert first_td["style"] == "font-family: monospace, monospace; text-align: left;" + assert len(tables[3].find_all("tr")) == 3 # 2 deltas + head row def test_series_profile_text_valid_numerical_format(num_series): @@ -77,6 +90,16 @@ def test_series_profile_text_valid_object_format(cat_like_series): assert filecmp.cmp(compare_file, test_file, shallow=False) +def test_series_profile_text_valid_time_format(cat_df): + """Text version of SeriesProfile for time data matches test profile.""" + comparison_profile = "test_series_time_profile.txt" + compare_file = os.path.join(TEST_DATA_DIR, comparison_profile) + with tempfile.TemporaryDirectory() as tmp: + test_file = os.path.join(tmp, "temp.txt") + php.SeriesProfile(cat_df["category"], time_index=True).save(test_file) + assert filecmp.cmp(compare_file, test_file, shallow=False) + + def test_series_profile_series_dtypes(): """pd.Series should create SeriesProfile for allowed data types.""" start = datetime(year=1999, month=1, day=1) @@ -168,18 +191,19 @@ def test_series_profile_invalid(non_series_invalid, test_df): php.SeriesProfile(invalid) -def test_series_profile_html(num_series): +def test_series_profile_html(cat_df): """Test html representation of SeriesProfile.""" - profile = php.SeriesProfile(num_series) + profile = php.SeriesProfile(cat_df["C"], time_index=True) # fmt: off soup = bs4.BeautifulSoup(profile._repr_html_(), "html.parser") # pylint: disable=W0212 # fmt: on tables = soup.find_all("table") - assert len(tables) == 3 # null_table - assert len(tables[1].find_all("tr")) == 6 # freq table + assert len(tables) == 4 + assert len(tables[1].find_all("tr")) == 16 # freq table assert len(tables[2].find_all("tr")) == 16 # 15 dist stats + head row first_td = tables[2].find("td") assert first_td["style"] == "font-family: monospace, monospace; text-align: left;" + assert len(tables[3].find_all("tr")) == 3 # 2 deltas + head row 
def test_series_profile_frequency_table(test_df): @@ -201,18 +225,18 @@ def test_series_profile_frequency_table(test_df): assert len(freq_table.find_all("tr")) == v + 1 # +1 for header -def test_series_profile_time_index_true(cat_df): +def test_series_profile_time_index_true(simple_df): """time_index=True calculates time diffs for Series with DateTimeIndex.""" - series = cat_df["category"] + series = simple_df["category"] profile = php.SeriesProfile(series, time_index=True) assert pat.is_datetime64_any_dtype(series.index) assert profile.time_diffs.iloc[0] is pd.NaT assert all(profile.time_diffs[1:] == pd.Timedelta(hours=1)) -def test_series_profile_time_index_false(cat_df): +def test_series_profile_time_index_false(simple_df): """time_index=False does not calculate time diffs for Series with DateTimeIndex.""" - series = cat_df["category"] + series = simple_df["category"] profile = php.SeriesProfile(series, time_index=False) assert pat.is_datetime64_any_dtype(series.index) assert profile.time_diffs is None diff --git a/tests/test_times.py b/tests/test_times.py index 2197c09..24321e2 100644 --- a/tests/test_times.py +++ b/tests/test_times.py @@ -7,9 +7,9 @@ from .utils import make_category_data -def test_time_diffs(cat_df): +def test_time_diffs(simple_df): """time_diffs should work on shuffled pd.Series or Index of timestamps.""" - valid = [cat_df.index, pd.Series(cat_df.index)] + valid = [simple_df.index, pd.Series(simple_df.index)] for v in valid: result = pht.time_diffs(v) assert result.iloc[0] is pd.NaT @@ -24,14 +24,14 @@ def test_time_diffs_exception(): pht.time_diffs(tipo) -def test_time_diffs_index(cat_df): +def test_time_diffs_index(simple_df): """time_diffs_index should work on shuffled pd.Series or pd.DataFrame.""" # test DF - df_result = pht.time_diffs_index(cat_df) + df_result = pht.time_diffs_index(simple_df) assert df_result.iloc[0] is pd.NaT assert all(df_result[1:] == pd.Timedelta(hours=1)) # test Series - series_result = 
pht.time_diffs_index(cat_df["B"]) + series_result = pht.time_diffs_index(simple_df["B"]) assert series_result.iloc[0] is pd.NaT assert all(series_result[1:] == pd.Timedelta(hours=1)) @@ -85,19 +85,10 @@ def test_id_gaps_no_gaps(ts_timeindex): assert len(result) == 0 -def test_category_gaps_frequency(): +def test_category_gaps_frequency(cat_df): """Gaps are calculated correctly for categories of varying frequency in Series.""" - start = pd.Timestamp(year=1999, month=1, day=1) duration = pd.Timedelta(days=365) - end = start + duration delay = pd.Timedelta(days=180) - c1 = make_category_data("Springfield", start, end, freq="h") - c2 = make_category_data("Quahog", start + delay, end, freq="h") - c3 = make_category_data("Park South", start, end, freq="2h") - c4 = make_category_data("East Midtown", start, end, freq="4h") - c5 = make_category_data("San Diego", start, end, freq="W") - c6 = make_category_data("South Philadelphia", start, end, freq="MS") - df = pd.concat([c1, c2, c3, c4, c5, c6]) gaps = { "South Philadelphia": duration - pd.Timedelta(hours=12), "San Diego": duration - pd.Timedelta(hours=52), @@ -109,7 +100,7 @@ def test_category_gaps_frequency(): expected = pd.DataFrame( gaps.values(), columns=["Cumulative Gap"], index=list(gaps.keys()) ) - result = pht.category_gaps(df["category"], pd.Timedelta(hours=1)) + result = pht.category_gaps(cat_df["category"], pd.Timedelta(hours=1)) pd.testing.assert_frame_equal(expected, result, check_index_type=True) @@ -154,11 +145,10 @@ def test_category_gaps_nulls(): pd.testing.assert_frame_equal(expected, result, check_index_type=True) -def test_category_gaps_not_series_exception(): +def test_category_gaps_not_series_exception(cat_df): """Non-series input raises Exception.""" - df = pd.DataFrame({"A": list(range(5))}) with pytest.raises(TypeError) as exc: - pht.category_gaps(df, pd.Timedelta(hours=1)) + pht.category_gaps(cat_df, pd.Timedelta(hours=1)) assert str(pd.Series) in str(exc.value) @@ -170,24 +160,17 @@ def 
test_category_gaps_wrong_series_exception(): assert str(pd.DatetimeIndex) in str(exc.value) -def test_category_gaps_timedelta_wrong_type_exception(): +def test_category_gaps_timedelta_wrong_type_exception(cat_df): """Wrong input type for threshold raises exception.""" - start = pd.Timestamp(year=1999, month=1, day=1) - end = start + pd.Timedelta(days=365) - df = make_category_data("Springfield", start, end, freq="h") with pytest.raises(TypeError) as exc: - pht.category_gaps(df["category"], start) + pht.category_gaps(cat_df["category"], pd.Timestamp(year=1999, month=1, day=1)) assert str(pd.Timedelta) in str(exc.value) -def test_category_gaps_warning(): +def test_category_gaps_warning(cat_df): """Series with more categories than max_cat raises warning and returns None.""" - start = pd.Timestamp(year=1999, month=1, day=1) - end = start + pd.Timedelta(hours=1) - c1 = make_category_data("Springfield", start, end, freq="h") - c2 = make_category_data("Park South", start, end, freq="2h") - df = pd.concat([c1, c2]) with pytest.warns(UserWarning): assert ( - pht.category_gaps(df["category"], pd.Timedelta(hours=1), max_cat=1) is None + pht.category_gaps(cat_df["category"], pd.Timedelta(hours=1), max_cat=5) + is None ) From ebcae875c7a55e5f374320cdae066ebebb903449 Mon Sep 17 00:00:00 2001 From: ray310 <64942339+ray310@users.noreply.github.com> Date: Sun, 21 Jul 2024 21:16:07 -0500 Subject: [PATCH 12/13] Minor change to profile format. 
--- src/pandahelper/profiles.py | 4 ++-- tests/test_data/test_df_time_profile.txt | 8 ++++---- tests/test_data/test_series_time_profile.txt | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/pandahelper/profiles.py b/src/pandahelper/profiles.py index e5bbd11..6a5b1d0 100644 --- a/src/pandahelper/profiles.py +++ b/src/pandahelper/profiles.py @@ -105,7 +105,7 @@ def __create_tables(self, table_fmt: str): if self.time_diffs is not None: time_diffs_table = tabulate( phs.frequency_table(self.time_diffs), - headers=["Time Gaps (Diffs)", "Count", "% of total"], + headers=["Time Diffs", "Count", "% of total"], tablefmt=table_fmt, ) tables.append(time_diffs_table) @@ -243,7 +243,7 @@ def __create_tables(self, table_fmt: str) -> list[str]: if self.time_diffs is not None: time_diffs_table = tabulate( phs.frequency_table(self.time_diffs), - headers=["Time Gaps (Diffs)", "Count", "% of total"], + headers=["Time Diffs", "Count", "% of total"], tablefmt=table_fmt, ) tables.append(time_diffs_table) diff --git a/tests/test_data/test_df_time_profile.txt b/tests/test_data/test_df_time_profile.txt index 2978417..f4a1784 100644 --- a/tests/test_data/test_df_time_profile.txt +++ b/tests/test_data/test_df_time_profile.txt @@ -30,7 +30,7 @@ median 0 median absolute deviation 0 skew 0 -Time Gaps (Diffs) Count % of total -------------------- ------- ------------ -0 days 00:00:00 11074 55.84% -0 days 01:00:00 8759 44.16% +Time Diffs Count % of total +--------------- ------- ------------ +0 days 00:00:00 11074 55.84% +0 days 01:00:00 8759 44.16% diff --git a/tests/test_data/test_series_time_profile.txt b/tests/test_data/test_series_time_profile.txt index 977b7fc..413b170 100644 --- a/tests/test_data/test_series_time_profile.txt +++ b/tests/test_data/test_series_time_profile.txt @@ -14,7 +14,7 @@ East Midtown 2190 11.04% San Diego 52 0.26% South Philadelphia 12 0.06% -Time Gaps (Diffs) Count % of total -------------------- ------- ------------ -0 days 00:00:00 
11074 55.84% -0 days 01:00:00 8759 44.16% +Time Diffs Count % of total +--------------- ------- ------------ +0 days 00:00:00 11074 55.84% +0 days 01:00:00 8759 44.16% From 2e8fe740dc8b9dc4602c15a23d841291c4f995ca Mon Sep 17 00:00:00 2001 From: ray310 <64942339+ray310@users.noreply.github.com> Date: Sun, 21 Jul 2024 21:48:45 -0500 Subject: [PATCH 13/13] Updating documentation. -Add tutorial to project site. -Fix README --- README.md | 300 ++++++++++++++++++++++++++++++++++++++++++++- docs/index.md | 4 +- docs/tutorial.md | 297 ++++++++++++++++++++++++++++++++++++++++++++ docs/user_guide.md | 5 - mkdocs.yml | 10 +- 5 files changed, 607 insertions(+), 9 deletions(-) create mode 100644 docs/tutorial.md delete mode 100644 docs/user_guide.md diff --git a/README.md b/README.md index 127ccf3..0406284 100644 --- a/README.md +++ b/README.md @@ -11,4 +11,302 @@ Assess data quality and usefulness with minimal effort. Quickly perform initial data exploration, _so you can move on to more in-depth analysis_. -Please see [project website](https://ray310.github.io/Panda-Helper/). +Please see the [project website](https://ray310.github.io/Panda-Helper/) for more information. + +## Installing Panda-Helper +Panda-Helper can be installed with: `pip install panda-helper`. + +## Using Panda Helper +For our Panda-Helper tutorial, we are going to use a dataset that counts how many + bicycles have passed through bike counting sensors at various locations in New York + City over time. We are going to merge the dataset with some additional metadata for + the sensors. 
The datasets can be downloaded from: + +- Bicycle Counts: [https://data.cityofnewyork.us/Transportation/Bicycle-Counts/uczf-rk3c/about_data](https://data.cityofnewyork.us/Transportation/Bicycle-Counts/uczf-rk3c/about_data) +- Metadata: [https://data.cityofnewyork.us/Transportation/Bicycle-Counters/smn3-rzf9/about_data](https://data.cityofnewyork.us/Transportation/Bicycle-Counters/smn3-rzf9/about_data) + +### Loading Data +```Python +import pandas as pd + +metadata = pd.read_csv("data/Bicycle_Counters.csv") +bike_counts = pd.read_csv( + "data/Bicycle_Counts.csv", + index_col="date", + parse_dates=["date"], + date_format="%m/%d/%Y %I:%M:%S %p", +) +bike_counts = bike_counts.join(metadata.set_index("id"), on="id", how="left") +``` + +### DataFrame Profile +The `DataFrameProfile` is used to get a quick overview of the contents of a Pandas + DataFrame. It is an object that can be later referenced or saved if desired. +In a single view it provides: + +- DataFrame shape. +- Memory usage. +- The number of duplicated rows (if any). +- The datatypes of the individual Series. +- Statistics nulls per row to provide a view on data completeness. +- Time Differences (Diffs or Gaps) if it is a time-indexed DataFrame. + - In the below example we see that most observations occur at the same time as + another observation or 15 minutes after the previous observation. There are a few + gaps where more than 15 minutes has passed since the last observation. 
+ + +```Python +import pandahelper as ph + +ph.DataFrameProfile(bike_counts) +``` +``` +DataFrame-Level Info +---------------------- ------------- +DF Shape (5589249, 12) +Duplicated Rows 0 +Memory Usage (MB) 1,926.950 + +Series Name Data Type Memory Usage (MB) +------------- -------------- ------------------- +Index datetime64[ns] 44.714 +countid int64 44.714 +id int64 44.714 +counts int64 44.714 +status int64 44.714 +name object 438.682 +domain object 368.89 +latitude float64 44.714 +longitude float64 44.714 +interval int64 44.714 +timezone object 419.194 +sens int64 44.714 +counter object 297.758 + +Summary of Nulls Per Row +-------------------------- --------- +Number of Columns 12 +min 0 +1% 0 +5% 0 +25% 0 +50% 0 +75% 0 +95% 1 +99% 1 +max 1 +mean 0.240237 +standard deviation 0.427228 +median 0 +median absolute deviation 0 +skew 1.21604 + +Time Diffs Count % of total +--------------- ------- ------------ +0 days 00:00:00 5176050 92.61% +0 days 00:15:00 413183 7.39% +0 days 01:15:00 12 0.00% +0 days 02:15:00 1 0.00% +0 days 00:30:00 1 0.00% +0 days 06:15:00 1 0.00% +``` + +### Series Profile (Numeric) +The `SeriesProfile` is used to get a quick overview of the contents of a Pandas + Series. It is an object that can be later referenced or saved if desired. +In a single view it provides: + +- Series data type (dtype). +- The number of non-null values. +- The number of unique values. +- The number of null values. +- The counts of some of the most common and least common values in the series which + can be configured with the optional `freq_most_least` flag +- Distribution statistics for the Series based on the data type. 
+ +_Counts are the number of bike crossings at a bike sensor in a window of time_ +```Python +ph.SeriesProfile(bike_counts["counts"]) +``` + +``` +counts Info +------------- ------- +Data Type int64 +Count 5589249 +Unique Values 897 +Null Values 0 + + Value Count % of total +------- ------- ------------ + 0 860809 15.40% + 1 373805 6.69% + 2 279622 5.00% + 3 217329 3.89% + 4 177636 3.18% + 5 150857 2.70% + 6 131232 2.35% + 7 117491 2.10% + 8 106717 1.91% + 9 98373 1.76% + 824 1 0.00% + 1092 1 0.00% + 925 1 0.00% + 894 1 0.00% + 1081 1 0.00% + +Statistic Value +------------------------- -------------- +count 5.58925e+06 +min 0 +1% 0 +5% 0 +25% 2 +50% 13 +75% 37 +95% 93 +99% 164 +max 1133 +mean 26.4127 +standard deviation 39.3405 +median 13 +median absolute deviation 13 +skew 5.17677 +``` + +### Series Profile (Object) +A `SeriesProfile` for an `object` Series will provide similar information as a numeric + Series but without distribution statistics. Here we use the optional `freq_most_least` + parameter to show a longer frequency table. 
+ +_Name is the designation of the bike sensor station_ +```Python +ph.SeriesProfile(bike_counts["name"], freq_most_least=(20, 20)) +``` +``` +name Info +------------- ------- +Data Type object +Count 5589249 +Unique Values 34 +Null Values 0 + +Value Count % of total +----------------------------------------------------------- ------- ------------ +Manhattan Bridge Bike Comprehensive 381148 6.82% +Manhattan Bridge Display Bike Counter 381148 6.82% +Manhattan Bridge Ped Path 368665 6.60% +Ed Koch Queensboro Bridge Shared Path 368504 6.59% +Williamsburg Bridge Bike Path 368433 6.59% +Brooklyn Bridge Bike Path 366111 6.55% +Comprehensive Brooklyn Bridge Counter 365948 6.55% +Staten Island Ferry 287203 5.14% +Prospect Park West 266080 4.76% +Kent Ave btw North 8th St and North 9th St 264522 4.73% +Pulaski Bridge 243868 4.36% +1st Avenue - 26th St N - Interference testing 218169 3.90% +Manhattan Bridge 2012 to 2019 Bike Counter 202785 3.63% +8th Ave at 50th St. 195920 3.51% +Manhattan Bridge 2013 to 2018 Bike Counter 165505 2.96% +Columbus Ave at 86th St. 162481 2.91% +Amsterdam Ave at 86th St. 162369 2.91% +2nd Avenue - 26th St S 136388 2.44% +Brooklyn Bridge Bicycle Path (Roadway) 95955 1.72% +Kent Ave btw South 6th St. and Broadway 78478 1.40% +111th St at 50th Ave 72567 1.30% +Fountain Ave 63146 1.13% +Willis Ave 62148 1.11% +Willis Ave Bikes 62148 1.11% +Willis Ave Peds 62148 1.11% +Manhattan Bridge 2012 Test Bike Counter 36179 0.65% +Manhattan Bridge Interference Calibration 2019 Bike Counter 27675 0.50% +Ocean Pkwy at Avenue J 27260 0.49% +Pelham Pkwy 21452 0.38% +Broadway at 50th St 20544 0.37% +High Bridge 16276 0.29% +Emmons Ave 16267 0.29% +Forsyth Plaza 14998 0.27% +Concrete Plant Park 6761 0.12% +``` + +### Time Series Functionality +#### Calculate the cumulative gaps in time series data by category +In the above example we saw a notable difference in the number of observations per + bike counter station. 
We can use `category_gaps` to check for gaps in + time-indexed, categorical-like data. We use the `threshold` parameter to define the + maximum expected increment in the time-indexed data. Some of the bike stations report + data every 15 minutes and some report data every hour so we can use a threshold of one + hour. + +```Python +ph.category_gaps(bike_counts["name"], threshold=pd.Timedelta(hours=1)) +``` +``` + Cumulative Gap +Concrete Plant Park 4234 days 13:45:00 +Forsyth Plaza 4148 days 16:15:00 +Emmons Ave 4135 days 12:30:00 +High Bridge 4135 days 10:15:00 +Broadway at 50th St 4090 days 10:30:00 +Pelham Pkwy 4081 days 12:15:00 +Ocean Pkwy at Avenue J 4021 days 00:15:00 +Manhattan Bridge Interference Calibration 2019 ... 4016 days 15:00:00 +Manhattan Bridge 2012 Test Bike Counter 3928 days 01:30:00 +Willis Ave Peds 3657 days 12:45:00 +Willis Ave Bikes 3657 days 12:45:00 +Willis Ave 3657 days 12:45:00 +Fountain Ave 3647 days 01:45:00 +111th St at 50th Ave 3548 days 21:45:00 +Kent Ave btw South 6th St. and Broadway 3487 days 06:30:00 +Brooklyn Bridge Bicycle Path (Roadway) 3305 days 06:45:00 +2nd Avenue - 26th St S 2884 days 02:30:00 +Amsterdam Ave at 86th St. 2613 days 09:30:00 +Columbus Ave at 86th St. 2612 days 06:00:00 +Manhattan Bridge 2013 to 2018 Bike Counter 2580 days 19:15:00 +8th Ave at 50th St. 
2263 days 19:00:00 +Manhattan Bridge 2012 to 2019 Bike Counter 2192 days 07:30:00 +1st Avenue - 26th St N - Interference testing 2032 days 00:00:00 +Pulaski Bridge 1764 days 08:45:00 +Kent Ave btw North 8th St and North 9th St 1549 days 04:30:00 +Prospect Park West 1533 days 00:30:00 +Staten Island Ferry 1312 days 22:15:00 +Comprehensive Brooklyn Bridge Counter 492 days 13:45:00 +Brooklyn Bridge Bike Path 490 days 21:45:00 +Williamsburg Bridge Bike Path 466 days 15:00:00 +Ed Koch Queensboro Bridge Shared Path 465 days 22:45:00 +Manhattan Bridge Ped Path 464 days 07:15:00 +Manhattan Bridge Bike Comprehensive 333 days 14:45:00 +Manhattan Bridge Display Bike Counter 333 days 14:45:00 +``` +#### Identify when gaps occur in time series data +It looks like the 'Manhattan Bridge Bike Comprehensive' category has the smallest + amount of missing time. We can use `id_gaps_index` to identify when the gaps occur. + We see that the largest gap for this bike sensor is ~328 days long in 2013. + +```Python +mbc = bike_counts["name"][bike_counts["name"] == "Manhattan Bridge Bike Comprehensive"] +ph.id_gaps_index(mbc, threshold=pd.Timedelta(hours=1)) +``` +``` + diffs +date +2013-12-03 00:00:00 328 days 00:15:00 +2023-09-27 02:15:00 2 days 02:30:00 +2024-01-21 02:15:00 1 days 02:30:00 +2023-07-03 02:15:00 1 days 02:30:00 +2023-07-01 02:15:00 1 days 02:30:00 +2013-12-03 11:00:00 0 days 06:15:00 +2012-10-12 15:00:00 0 days 02:15:00 +2021-03-14 03:00:00 0 days 01:15:00 +2023-03-12 03:00:00 0 days 01:15:00 +2022-03-13 03:00:00 0 days 01:15:00 +2019-03-10 03:00:00 0 days 01:15:00 +2020-03-08 03:00:00 0 days 01:15:00 +2018-03-11 03:00:00 0 days 01:15:00 +2017-03-12 03:00:00 0 days 01:15:00 +2016-03-13 03:00:00 0 days 01:15:00 +2015-03-08 03:00:00 0 days 01:15:00 +2014-11-04 05:00:00 0 days 01:15:00 +2014-03-09 03:00:00 0 days 01:15:00 +2024-03-10 03:00:00 0 days 01:15:00 +``` diff --git a/docs/index.md b/docs/index.md index 9ef480f..603c223 100644 --- a/docs/index.md +++ b/docs/index.md 
@@ -26,11 +26,11 @@ that allows you to assess data quality and usefulness with minimal effort.
 
     Detailed description of the Panda-Helper API
 
-- [:material-television-guide:{ .lg .middle } __User Guide__](user_guide.md)
+- [:material-television-guide:{ .lg .middle } __Tutorial__](tutorial.md)
 
     ---
 
-    How to use Panda-Helper with examples
+    Panda-Helper Tutorial
 
- [:simple-github:{ .lg .middle } __Source Code__](https://github.com/ray310/Panda-Helper)
diff --git a/docs/tutorial.md b/docs/tutorial.md
new file mode 100644
index 0000000..4fe02df
--- /dev/null
+++ b/docs/tutorial.md
@@ -0,0 +1,297 @@
+---
+description: Panda-Helper Tutorial
+---
+# Panda-Helper Tutorial
+For our Panda-Helper tutorial, we are going to use a dataset that counts how many
+ bicycles have passed through bike counting sensors at various locations in New York
+ City over time. We are going to merge the dataset with some additional metadata for
+ the sensors. The datasets can be downloaded from:
+
+- Bicycle Counts: [https://data.cityofnewyork.us/Transportation/Bicycle-Counts/uczf-rk3c/about_data](https://data.cityofnewyork.us/Transportation/Bicycle-Counts/uczf-rk3c/about_data)
+- Metadata: [https://data.cityofnewyork.us/Transportation/Bicycle-Counters/smn3-rzf9/about_data](https://data.cityofnewyork.us/Transportation/Bicycle-Counters/smn3-rzf9/about_data)
+
+## Loading Data
+```Python
+import pandas as pd
+
+metadata = pd.read_csv("data/Bicycle_Counters.csv")
+bike_counts = pd.read_csv(
+    "data/Bicycle_Counts.csv",
+    index_col="date",
+    parse_dates=["date"],
+    date_format="%m/%d/%Y %I:%M:%S %p",
+)
+bike_counts = bike_counts.join(metadata.set_index("id"), on="id", how="left")
+```
+
+## DataFrame Profile
+The `DataFrameProfile` is used to get a quick overview of the contents of a Pandas
+ DataFrame. It is an object that can be later referenced or saved if desired.
+In a single view it provides:
+
+- DataFrame shape.
+- Memory usage.
+- The number of duplicated rows (if any).
+- The datatypes of the individual Series.
+- Statistics on nulls per row to provide a view on data completeness.
+- Time Differences (Diffs or Gaps) if it is a time-indexed DataFrame.
+    - In the below example we see that most observations occur at the same time as
+      another observation or 15 minutes after the previous observation. There are a few
+      gaps where more than 15 minutes has passed since the last observation.
+
+
+```Python
+import pandahelper as ph
+
+ph.DataFrameProfile(bike_counts)
+```
+```
+DataFrame-Level Info
+---------------------- -------------
+DF Shape (5589249, 12)
+Duplicated Rows 0
+Memory Usage (MB) 1,926.950
+
+Series Name Data Type Memory Usage (MB)
+------------- -------------- -------------------
+Index datetime64[ns] 44.714
+countid int64 44.714
+id int64 44.714
+counts int64 44.714
+status int64 44.714
+name object 438.682
+domain object 368.89
+latitude float64 44.714
+longitude float64 44.714
+interval int64 44.714
+timezone object 419.194
+sens int64 44.714
+counter object 297.758
+
+Summary of Nulls Per Row
+-------------------------- ---------
+Number of Columns 12
+min 0
+1% 0
+5% 0
+25% 0
+50% 0
+75% 0
+95% 1
+99% 1
+max 1
+mean 0.240237
+standard deviation 0.427228
+median 0
+median absolute deviation 0
+skew 1.21604
+
+Time Diffs Count % of total
+--------------- ------- ------------
+0 days 00:00:00 5176050 92.61%
+0 days 00:15:00 413183 7.39%
+0 days 01:15:00 12 0.00%
+0 days 02:15:00 1 0.00%
+0 days 00:30:00 1 0.00%
+0 days 06:15:00 1 0.00%
+```
+
+## Series Profile (Numeric)
+The `SeriesProfile` is used to get a quick overview of the contents of a Pandas
+ Series. It is an object that can be later referenced or saved if desired.
+In a single view it provides:
+
+- Series data type (dtype).
+- The number of non-null values.
+- The number of unique values.
+- The number of null values.
+- The counts of some of the most common and least common values in the Series, which
+ can be configured with the optional `freq_most_least` parameter.
+- Distribution statistics for the Series based on the data type.
+
+_Counts are the number of bike crossings at a bike sensor in a window of time._
+```Python
+ph.SeriesProfile(bike_counts["counts"])
+```
+
+```
+counts Info
+------------- -------
+Data Type int64
+Count 5589249
+Unique Values 897
+Null Values 0
+
+ Value Count % of total
+------- ------- ------------
+ 0 860809 15.40%
+ 1 373805 6.69%
+ 2 279622 5.00%
+ 3 217329 3.89%
+ 4 177636 3.18%
+ 5 150857 2.70%
+ 6 131232 2.35%
+ 7 117491 2.10%
+ 8 106717 1.91%
+ 9 98373 1.76%
+ 824 1 0.00%
+ 1092 1 0.00%
+ 925 1 0.00%
+ 894 1 0.00%
+ 1081 1 0.00%
+
+Statistic Value
+------------------------- --------------
+count 5.58925e+06
+min 0
+1% 0
+5% 0
+25% 2
+50% 13
+75% 37
+95% 93
+99% 164
+max 1133
+mean 26.4127
+standard deviation 39.3405
+median 13
+median absolute deviation 13
+skew 5.17677
+```
+
+## Series Profile (Object)
+A `SeriesProfile` for an `object` Series will provide similar information to a numeric
+ Series but without distribution statistics. Here we use the optional `freq_most_least`
+ parameter to show a longer frequency table.
+ +_Name is the designation of the bike sensor station._ +```Python +ph.SeriesProfile(bike_counts["name"], freq_most_least=(20, 20)) +``` +``` +name Info +------------- ------- +Data Type object +Count 5589249 +Unique Values 34 +Null Values 0 + +Value Count % of total +----------------------------------------------------------- ------- ------------ +Manhattan Bridge Bike Comprehensive 381148 6.82% +Manhattan Bridge Display Bike Counter 381148 6.82% +Manhattan Bridge Ped Path 368665 6.60% +Ed Koch Queensboro Bridge Shared Path 368504 6.59% +Williamsburg Bridge Bike Path 368433 6.59% +Brooklyn Bridge Bike Path 366111 6.55% +Comprehensive Brooklyn Bridge Counter 365948 6.55% +Staten Island Ferry 287203 5.14% +Prospect Park West 266080 4.76% +Kent Ave btw North 8th St and North 9th St 264522 4.73% +Pulaski Bridge 243868 4.36% +1st Avenue - 26th St N - Interference testing 218169 3.90% +Manhattan Bridge 2012 to 2019 Bike Counter 202785 3.63% +8th Ave at 50th St. 195920 3.51% +Manhattan Bridge 2013 to 2018 Bike Counter 165505 2.96% +Columbus Ave at 86th St. 162481 2.91% +Amsterdam Ave at 86th St. 162369 2.91% +2nd Avenue - 26th St S 136388 2.44% +Brooklyn Bridge Bicycle Path (Roadway) 95955 1.72% +Kent Ave btw South 6th St. and Broadway 78478 1.40% +111th St at 50th Ave 72567 1.30% +Fountain Ave 63146 1.13% +Willis Ave 62148 1.11% +Willis Ave Bikes 62148 1.11% +Willis Ave Peds 62148 1.11% +Manhattan Bridge 2012 Test Bike Counter 36179 0.65% +Manhattan Bridge Interference Calibration 2019 Bike Counter 27675 0.50% +Ocean Pkwy at Avenue J 27260 0.49% +Pelham Pkwy 21452 0.38% +Broadway at 50th St 20544 0.37% +High Bridge 16276 0.29% +Emmons Ave 16267 0.29% +Forsyth Plaza 14998 0.27% +Concrete Plant Park 6761 0.12% +``` + +## Time Series Functionality +### Calculate the cumulative gaps in time series data by category +In the above example we saw a notable difference in the number of observations per + bike counter station. 
We can use `category_gaps` to check for gaps in + time-indexed, categorical-like data. We use the `threshold` parameter to define the + maximum expected increment in the time-indexed data. Some of the bike stations report + data every 15 minutes and some report data every hour so we can use a threshold of one + hour. + +```Python +ph.category_gaps(bike_counts["name"], threshold=pd.Timedelta(hours=1)) +``` +``` + Cumulative Gap +Concrete Plant Park 4234 days 13:45:00 +Forsyth Plaza 4148 days 16:15:00 +Emmons Ave 4135 days 12:30:00 +High Bridge 4135 days 10:15:00 +Broadway at 50th St 4090 days 10:30:00 +Pelham Pkwy 4081 days 12:15:00 +Ocean Pkwy at Avenue J 4021 days 00:15:00 +Manhattan Bridge Interference Calibration 2019 ... 4016 days 15:00:00 +Manhattan Bridge 2012 Test Bike Counter 3928 days 01:30:00 +Willis Ave Peds 3657 days 12:45:00 +Willis Ave Bikes 3657 days 12:45:00 +Willis Ave 3657 days 12:45:00 +Fountain Ave 3647 days 01:45:00 +111th St at 50th Ave 3548 days 21:45:00 +Kent Ave btw South 6th St. and Broadway 3487 days 06:30:00 +Brooklyn Bridge Bicycle Path (Roadway) 3305 days 06:45:00 +2nd Avenue - 26th St S 2884 days 02:30:00 +Amsterdam Ave at 86th St. 2613 days 09:30:00 +Columbus Ave at 86th St. 2612 days 06:00:00 +Manhattan Bridge 2013 to 2018 Bike Counter 2580 days 19:15:00 +8th Ave at 50th St. 
2263 days 19:00:00 +Manhattan Bridge 2012 to 2019 Bike Counter 2192 days 07:30:00 +1st Avenue - 26th St N - Interference testing 2032 days 00:00:00 +Pulaski Bridge 1764 days 08:45:00 +Kent Ave btw North 8th St and North 9th St 1549 days 04:30:00 +Prospect Park West 1533 days 00:30:00 +Staten Island Ferry 1312 days 22:15:00 +Comprehensive Brooklyn Bridge Counter 492 days 13:45:00 +Brooklyn Bridge Bike Path 490 days 21:45:00 +Williamsburg Bridge Bike Path 466 days 15:00:00 +Ed Koch Queensboro Bridge Shared Path 465 days 22:45:00 +Manhattan Bridge Ped Path 464 days 07:15:00 +Manhattan Bridge Bike Comprehensive 333 days 14:45:00 +Manhattan Bridge Display Bike Counter 333 days 14:45:00 +``` +### Identify when gaps occur in time series data +It looks like the 'Manhattan Bridge Bike Comprehensive' category has the smallest + amount of missing time. We can use `id_gaps_index` to identify when the gaps occur. + We see that the largest gap for this bike sensor is ~328 days long in 2013. + +```Python +mbc = bike_counts["name"][bike_counts["name"] == "Manhattan Bridge Bike Comprehensive"] +ph.id_gaps_index(mbc, threshold=pd.Timedelta(hours=1)) +``` +``` + diffs +date +2013-12-03 00:00:00 328 days 00:15:00 +2023-09-27 02:15:00 2 days 02:30:00 +2024-01-21 02:15:00 1 days 02:30:00 +2023-07-03 02:15:00 1 days 02:30:00 +2023-07-01 02:15:00 1 days 02:30:00 +2013-12-03 11:00:00 0 days 06:15:00 +2012-10-12 15:00:00 0 days 02:15:00 +2021-03-14 03:00:00 0 days 01:15:00 +2023-03-12 03:00:00 0 days 01:15:00 +2022-03-13 03:00:00 0 days 01:15:00 +2019-03-10 03:00:00 0 days 01:15:00 +2020-03-08 03:00:00 0 days 01:15:00 +2018-03-11 03:00:00 0 days 01:15:00 +2017-03-12 03:00:00 0 days 01:15:00 +2016-03-13 03:00:00 0 days 01:15:00 +2015-03-08 03:00:00 0 days 01:15:00 +2014-11-04 05:00:00 0 days 01:15:00 +2014-03-09 03:00:00 0 days 01:15:00 +2024-03-10 03:00:00 0 days 01:15:00 +``` diff --git a/docs/user_guide.md b/docs/user_guide.md deleted file mode 100644 index 3ea7ebd..0000000 --- 
a/docs/user_guide.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -description: User Guide. How to use Panda-Helper with examples. ---- - -Coming soon... diff --git a/mkdocs.yml b/mkdocs.yml index df63eb4..4971039 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -9,7 +9,7 @@ copyright: > nav: - Home: index.md - Installation: install.md - - User Guide: user_guide.md + - Tutorial: tutorial.md - API Reference: api.md - Issue Tracker: https://github.com/ray310/Panda-Helper/issues extra_css: @@ -31,6 +31,7 @@ theme: - navigation.instant.progress - toc.integrate - navigation.footer + - content.code.copy palette: # Palette toggle for light mode - media: "(prefers-color-scheme: light)" @@ -93,6 +94,13 @@ markdown_extensions: - pymdownx.emoji: emoji_index: !!python/name:material.extensions.emoji.twemoji emoji_generator: !!python/name:material.extensions.emoji.to_svg + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.superfences plugins: - search - mkdocstrings: