Skip to content

Commit

Permalink
Merge pull request #36 from awslabs/0.1.1
Browse files Browse the repository at this point in the history
0.1.1
  • Loading branch information
momonga-ml authored Nov 17, 2023
2 parents 2f1172b + 0a025db commit ef6d882
Show file tree
Hide file tree
Showing 11 changed files with 481 additions and 243 deletions.
1 change: 0 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ lint-notebooks:
@echo "Linting notebooks..."
@nbqa black notebooks/*.ipynb
@nbqa isort notebooks/*.ipynb
nbstripout notebooks/*.ipynb


test:
Expand Down
24 changes: 10 additions & 14 deletions denseclus/DenseClus.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ def __init__(
verbose: bool = False,
umap_params=None,
hdbscan_params=None,
**kwargs,
):
if umap_combine_method not in [
"intersection",
Expand Down Expand Up @@ -168,26 +169,19 @@ def __init__(
"metric": "euclidean",
}

# self.umap_params = dict()
# if umap_params is None:
# self.umap_params = default_umap_params
# else:
# for key, new_params in umap_params.items():
# self.umap_params[key] = default_umap_params | new_params
if umap_params is None:
self.umap_params = default_umap_params
else:
if umap_params:
for key in umap_params:
if key in default_umap_params:
default_umap_params[key].update(umap_params[key]) # type: ignore # noqa
else:
raise ValueError(f"Invalid key '{key}' in umap_params")
self.umap_params = default_umap_params
else:
self.umap_params = default_umap_params

if hdbscan_params is None:
self.hdbscan_params = default_hdbscan_params
if hdbscan_params:
self.hdbscan_params = hdbscan_params
else:
default_hdbscan_params.update(hdbscan_params)
self.hdbscan_params = default_hdbscan_params

if verbose:
Expand All @@ -210,6 +204,8 @@ def noop(*args, **kargs):
else:
logger.info("No random seed passed, running UMAP in Numba, parallel")

self.kwargs = kwargs

def __repr__(self):
return str(self.__dict__)

Expand All @@ -231,10 +227,10 @@ def fit(self, df: pd.DataFrame) -> None:
raise TypeError("Requires DataFrame as input")

logger.info("Extracting categorical features")
self.categorical_ = extract_categorical(df)
self.categorical_ = extract_categorical(df, **self.kwargs)

logger.info("Extracting numerical features")
self.numerical_ = extract_numerical(df)
self.numerical_ = extract_numerical(df, **self.kwargs)

logger.info("Fitting categorical UMAP")
self._fit_categorical()
Expand Down
2 changes: 1 addition & 1 deletion denseclus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from .DenseClus import DenseClus
from .utils import extract_categorical, extract_numerical

__version__ = "0.1.0"
__version__ = "0.1.1"

if __name__ == "__main__":
print(type(DenseClus), type(extract_categorical), type(extract_numerical))
127 changes: 121 additions & 6 deletions denseclus/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,49 +4,164 @@
Utility functions for making fits to UMAP
"""
import warnings
from typing import Callable, Optional

import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.feature_extraction import FeatureHasher
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer, PowerTransformer, StandardScaler


def extract_categorical(
    df: pd.DataFrame,
    cardinality_threshold: int = 25,
    strategy: str = "constant",
    fill_value="Missing",
    **kwargs,
) -> pd.DataFrame:
    """Extracts categorical features into an encoded, all-numeric dataframe

    Columns are imputed first. They are then one-hot encoded (first level
    dropped) or, when any column has more distinct values than
    ``cardinality_threshold``, hashed into ``cardinality_threshold`` features
    per column.

    Parameters:
        df (pd.DataFrame): DataFrame with numerical and categorical features
        cardinality_threshold (int): Largest per-column cardinality that is
            still one-hot encoded; above it every column is hashed. Default: 25
        strategy (str): Imputation strategy passed to impute_categorical.
            Default: 'constant'
        fill_value: Value used when strategy is 'constant'. Default: 'Missing'
        **kwargs : Additional arguments passed to impute_categorical.
            ``impute_strategy`` is dropped because it is a parameter of
            extract_numerical, not of the categorical imputation.

    Returns:
        pd.DataFrame: encoded DataFrame of categorical features, indexed like ``df``

    Raises:
        TypeError: if ``df`` is not a DataFrame
        ValueError: if ``df`` is empty or has no non-float/int columns
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input should be a pandas DataFrame")
    # DataFrame.empty is True when either axis has length zero, so this also
    # rejects a frame without columns.
    if df.empty:
        raise ValueError("Input DataFrame should not be empty.")

    categorical = df.select_dtypes(exclude=["float", "int"])
    if categorical.shape[1] == 0:
        raise ValueError("No Categories found, check that objects are in dataframe")

    # Callers may forward one kwargs dict to both extractors; keep the
    # numerical-only option away from the categorical imputer.
    impute_kwargs = {key: val for key, val in kwargs.items() if key != "impute_strategy"}
    categorical = impute_categorical(
        categorical,
        strategy=strategy,
        fill_value=fill_value,
        **impute_kwargs,
    )

    max_cardinality = max(categorical.nunique())
    if max_cardinality <= cardinality_threshold:
        return pd.get_dummies(categorical, drop_first=True)

    print(f"Max of {max_cardinality} is greater than threshold {cardinality_threshold}")
    print("Hashing categorical features")
    hasher = FeatureHasher(n_features=cardinality_threshold, input_type="string")

    hashed_parts = []
    for col in categorical.columns:
        # input_type="string" needs one iterable of strings per row; str()
        # keeps non-string categoricals (bool, datetime, ...) from failing.
        tokens = [[str(value)] for value in categorical[col]]
        hashed = hasher.transform(tokens).toarray()
        hashed_parts.append(
            pd.DataFrame(
                hashed,
                index=categorical.index,  # keep rows aligned with the input
                columns=[f"{col}_hash_{i}" for i in range(hashed.shape[1])],
            )
        )

    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(hashed_parts, axis=1)


def impute_categorical(
    categorical: pd.DataFrame,
    strategy: str = "constant",
    fill_value: str = "Missing",
    custom_strategy: Optional[Callable[[pd.Series], object]] = None,
    **kwargs,
) -> pd.DataFrame:
    """Imputes missing values in categorical features.

    Parameters:
        categorical (pd.DataFrame): DataFrame with categorical features
        strategy (str, optional): The imputation strategy, 'constant' or
            'most_frequent'. Default is 'constant'.
        fill_value (str, optional): The value to use for imputation when
            strategy is 'constant'. Default is 'Missing'.
        custom_strategy (callable, optional): A custom function to compute the
            imputation value. Takes a column (Series) and returns the fill
            value. When given it is used instead of SimpleImputer.
        **kwargs: Extra keyword arguments for SimpleImputer (ignored when
            custom_strategy is given).

    Returns:
        pd.DataFrame: DataFrame with imputed categorical features. The input
        itself is returned when it has no missing values; otherwise a new
        frame is returned and the input is left unmodified.

    Raises:
        ValueError: if ``strategy`` is not 'constant' or 'most_frequent'

    Example:
        To impute missing values with the second most frequent value:

            def second_most_frequent(s):
                counts = s.value_counts()
                return counts.index[1] if len(counts) > 1 else counts.index[0]

            impute_categorical(df, custom_strategy=second_most_frequent)
    """
    if strategy not in ["constant", "most_frequent"]:
        raise ValueError(f"Invalid strategy for categorical: {strategy}")

    if categorical.isnull().sum().sum() == 0:
        return categorical

    # Work on a copy: assigning back into the caller's frame (or using a
    # chained ``fillna(inplace=True)``) either mutates the input or, under
    # pandas copy-on-write, silently does nothing.
    imputed = categorical.copy()

    if custom_strategy is not None:
        for col in imputed.columns:
            imputed[col] = imputed[col].fillna(custom_strategy(imputed[col]))
        return imputed

    # The imputer is re-fitted per column, so one instance is enough.
    imputer = SimpleImputer(strategy=strategy, fill_value=fill_value, **kwargs)
    for col in imputed.columns:
        imputed[col] = imputer.fit_transform(imputed[[col]])[:, 0]

    return imputed


def extract_numerical(df: pd.DataFrame, impute_strategy: str = "median", **kwargs) -> pd.DataFrame:
    """Extracts numerical features into normalized numeric only dataframe

    Parameters:
        df (pd.DataFrame): DataFrame with numerical and categorical features
        impute_strategy (str): The imputation strategy to use if null values
            are found. Default is 'median'
        **kwargs: Additional arguments passed to impute_numerical. The keys
            ``strategy``, ``cardinality_threshold`` and ``custom_strategy``
            are dropped: they are options of the categorical extraction, and
            ``strategy`` would otherwise collide with ``impute_strategy``.

    Returns:
        pd.DataFrame: normalized numerical DataFrame of numerical features

    Raises:
        TypeError: if ``df`` is not a DataFrame
        ValueError: if ``df`` is empty or has no float/int columns
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input should be a pandas DataFrame")
    # DataFrame.empty is True when either axis has length zero, so this also
    # rejects a frame without columns.
    if df.empty:
        raise ValueError("Input DataFrame should not be empty.")

    numerical = df.select_dtypes(include=["float", "int"])
    if numerical.shape[1] == 0:
        raise ValueError("No numerics found, check that numerics are in dataframe")

    # Callers may forward one kwargs dict to both extractors; strip the
    # categorical-only options before they reach the numerical imputer.
    categorical_only = ("strategy", "cardinality_threshold", "custom_strategy")
    impute_kwargs = {key: val for key, val in kwargs.items() if key not in categorical_only}

    numerical = impute_numerical(numerical, strategy=impute_strategy, **impute_kwargs)

    return transform_numerics(numerical)


def impute_numerical(numerical: pd.DataFrame, strategy: str = "median", **kwargs) -> pd.DataFrame:
    """Imputes numerical features with the given strategy

    Parameters:
        numerical (pd.DataFrame): DataFrame with numerical features
        strategy (str): The imputation strategy, 'median' or 'mean'.
            Default is 'median'
        **kwargs: Extra keyword arguments for SimpleImputer

    Returns:
        pd.DataFrame: DataFrame with imputed numerical features, keeping the
        input's columns and index. The input itself is returned when it has
        no missing values.

    Raises:
        ValueError: if ``strategy`` is invalid, or if a column has no observed
            values and ``keep_empty_features`` was not requested
    """
    if strategy not in ["median", "mean"]:
        raise ValueError(f"Invalid strategy for numerical: {strategy}")

    null_mask = numerical.isnull()
    if null_mask.sum().sum() == 0:
        return numerical

    # SimpleImputer drops columns that are entirely missing (unless
    # keep_empty_features is set), which would otherwise surface as an opaque
    # shape-mismatch error when the frame is rebuilt below.
    all_missing = null_mask.all()
    empty_columns = [str(name) for name in all_missing[all_missing].index]
    if empty_columns and not kwargs.get("keep_empty_features", False):
        raise ValueError(f"Cannot impute columns with no observed values: {empty_columns}")

    imputer = SimpleImputer(strategy=strategy, **kwargs)
    numerical_imputed = pd.DataFrame(
        imputer.fit_transform(numerical),
        columns=numerical.columns,
        index=numerical.index,  # keep rows aligned with the source frame
    )

    return numerical_imputed


def transform_numerics(numerical: pd.DataFrame) -> pd.DataFrame:
"""Power transforms numerical DataFrame
Expand Down
51 changes: 31 additions & 20 deletions notebooks/01_DenseClusExampleNB.ipynb

Large diffs are not rendered by default.

Loading

0 comments on commit ef6d882

Please sign in to comment.