merge

cavalab · Mar 28, 2024 · c42872b · c42872b
2 parents f02616a + 3f187f9
commit c42872b
Show file tree

Hide file tree

Showing 5 changed files with 124 additions and 41 deletions.
diff --git a/fomo/algorithm.py b/fomo/algorithm.py
@@ -52,6 +52,10 @@
 from pymoo.algorithms.base.genetic import GeneticAlgorithm
 
 def get_parent(pop):
+
+    if not hasattr(get_parent_WeightedCoinFlip, "_called"):
+        print("Default flex")
+        get_parent_WeightedCoinFlip._called = True
 
     fng = pop.get("fng")
     fn = pop.get("fn")
@@ -84,6 +88,10 @@ def get_parent(pop):
 
 def get_parent_noCoinFlip(pop):
 
+    if not hasattr(get_parent_WeightedCoinFlip, "_called"):
+        print("Flex with no coin flip")
+        get_parent_WeightedCoinFlip._called = True
+
     fng = pop.get("fng")
     fng = np.tile(fng, 2)
     fn = pop.get("fn")
@@ -116,6 +124,55 @@ def get_parent_noCoinFlip(pop):
     return random.choice(S)
 
 
+def get_parent_WeightedCoinFlip(pop):
+    # Pick one parent index from `pop`: each round draws a random group g, scores the remaining individuals on either a fairness or an accuracy criterion (coin flip against `weight`), and keeps only those close to the best score.
+    if not hasattr(get_parent_WeightedCoinFlip, "_called"):
+        print("Flex with weighted coin flip")
+        get_parent_WeightedCoinFlip._called = True  # function attribute: the banner above is printed on the first call only
+    # Arrays read from the population; all of them are filtered along axis 0 in step with S inside the loop.
+    samples_fnr = pop.get("samples_fnr")
+    fng = pop.get("fng")
+    fn = pop.get("fn")
+    gp_lens = pop.get('gp_lens')
+    G = np.arange(fng.shape[1])  # candidate group indices = column indices of fng
+    S = np.arange(len(pop))  # indices of the individuals still in contention
+    loss = []
+    weight = random.random()  # drawn once per call; compared with a fresh draw in every round
+    # Stop when no candidate groups remain or at most one individual is left.
+    while (len(G) > 0 and len(S) > 1):
+        # NOTE(review): g is removed from G only in the fairness branch, so exhausting G depends on that branch being taken.
+        g = random.choice(G)
+        loss = []
+        # Coin flip: a fresh draw above `weight` selects the fairness criterion, otherwise the accuracy criterion.
+        if (random.random() > weight):
+            # look at fairness: the loss is column g of fng
+            loss = fng[:, g]
+            G = G[np.where(G != g)]
+        else:
+            # look at accuracy: score every individual on a random subset of samples_fnr columns
+            num_rows, num_cols = np.shape(samples_fnr)
+            indices = np.random.choice(num_cols, size = int(gp_lens[0, g]), replace = False)  # subset size is gp_lens[0, g]; NOTE(review): only row 0 is read - presumably group sizes are identical for every individual, confirm
+            fnr_sum = np.sum(samples_fnr[:, indices], axis=1)
+            pos_count = np.sum(samples_fnr[:, indices].astype(bool), axis=1)  # per individual: number of non-zero entries in the sampled columns
+            for i in range (len(pos_count)):
+                if pos_count[i]:
+                    loss.append(fnr_sum[i]/pos_count[i])  # sum of the sampled entries divided by the non-zero count
+                else:
+                    loss.append(0)  # no non-zero entry in the sampled columns
+        # Keep the individuals whose loss is within epsilon of the minimum loss,
+        # where epsilon is the median absolute deviation of this round's losses.
+        L = min(loss) 
+        epsilon = np.median(np.abs(loss - np.median(loss)))
+        survivors = np.where(loss <= L + epsilon)
+        S = S[survivors]
+        fng = fng[survivors] 
+        fn = fn[survivors]
+        samples_fnr = samples_fnr[survivors]
+        gp_lens = gp_lens[survivors]
+    # Reshape the survivors to a column so the random pick below yields a length-1 integer array.
+    S = S[:, None].astype(int, copy=False)     
+    return random.choice(S)
+
 class FLEX(Selection):
 
     def __init__(self,
@@ -138,7 +195,7 @@ def _do(self, _, pop, n_select, n_parents=1, flag = 0, **kwargs):
 
         for i in range(n_select * n_parents): 
             #get pop_size parents
-            p = get_parent_noCoinFlip(pop)
+            p = get_parent(pop)
             parents.append(p)
 
         return np.reshape(parents, (n_select, n_parents))

diff --git a/fomo/estimator.py b/fomo/estimator.py
@@ -34,15 +34,15 @@
 import math
 import uuid 
 import numpy as np
+import pandas as pd
 from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
-from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
+from sklearn.utils.validation import check_is_fitted
 from sklearn.utils.multiclass import unique_labels
-from sklearn.metrics import make_scorer, roc_auc_score, r2_score, mean_squared_error
+from sklearn.metrics import make_scorer, roc_auc_score, mean_squared_error
 from sklearn.linear_model import LogisticRegression, SGDRegressor
 from sklearn.base import clone
 from sklearn.pipeline import Pipeline
 import multiprocessing
-from multiprocessing.pool import ThreadPool
 import dill
 
 # pymoo
@@ -339,11 +339,7 @@ def plot(self):
         check_is_fitted(self, 'is_fitted_')
         I = self.I_
         F = self._get_signed_F()
-        axis_labels = (
-            [ am._score_func.__name__ for am in self.accuracy_metrics_ ] 
-            + [ fn.__name__ for fn in self.fairness_metrics_ ]
-        )
-        axis_labels = [al.replace('_',' ') for al in axis_labels]
+        axis_labels = self._get_objective_names()
         plot = (
             Scatter()
             .add(F, alpha=0.2, label='Candidate models')
@@ -367,16 +363,21 @@ def _get_signed_F(self, F=None):
                 F[:,i] = F[:,i]*m._sign
         return F
 
+    def _get_objective_names(self):
+        """Return display labels for the objectives: accuracy-metric names first, then fairness-metric names, with underscores replaced by spaces."""
+        labels = (
+            [ m._score_func.__name__ for m in self.accuracy_metrics_ ]  # accuracy entries are named through their _score_func attribute
+            + [ fn.__name__ for fn in self.fairness_metrics_ ]  # fairness entries are named through their own __name__
+        )
+        labels = [l.replace('_',' ') for l in labels]
+        return labels
+
     def get_pareto_points(self):
         """Return a Pandas dataframe of the Pareto archive points"""
         F = self._get_signed_F() 
         I = self.I_
-        archive = pd.DataFrame(
-            F,
-            columns=self.accuracy_metrics_ + self.fairness_metrics_
-        )
-        chosen = [f==F[I] for f in F]
-        archive['chosen'] = chosen
+        archive = pd.DataFrame(F, columns=self._get_objective_names())  # one column per objective, labelled by _get_objective_names()
+        archive['chosen'] = [all(f==F[I]) for f in F]  # True for every row whose values all equal row F[I], with I = self.I_
         return archive
 
 class FomoClassifier(FomoEstimator, ClassifierMixin, BaseEstimator):
@@ -520,7 +521,7 @@ def _init_metrics(self):
         self.accuracy_metrics_ = self.accuracy_metrics
         self.fairness_metrics_ = self.fairness_metrics
         if self.accuracy_metrics is None:
-            self.accuracy_metrics_ = [make_scorer(roc_auc_score, greater_is_better=False)]
+            self.accuracy_metrics_ = [make_scorer(roc_auc_score, greater_is_better=False, needs_proba=True)]
         if self.fairness_metrics is None:
             self.fairness_metrics_ = [metrics.multicalibration_loss]
 

diff --git a/fomo/metrics.py b/fomo/metrics.py
@@ -271,6 +271,11 @@ def subgroup_loss(y_true, y_pred, X_protected, metric, grouping = 'intersectiona
                 mask = X_protected[col] == val
                 indices = X_protected[mask].index
                 categories[category_key] = indices
+    # print('#intersectional groups: ', len(categories))
+    # singles = 0
+    # gp_lens = [len(lst) for lst in categories.values()]
+    # singles = gp_lens.count(1)
+    # avg_len = sum(gp_lens) / len(gp_lens) if gp_lens else 0
 
     if isinstance(metric,str):
         loss_fn = FPR if metric=='FPR' else FNR
@@ -357,12 +362,14 @@ def subgroup_MSE_scorer(estimator, X, y_true, **kwargs):
     return subgroup_scorer( estimator, X, y_true, mean_squared_error, **kwargs)
 
 
-def loss(estimator, X, y_true, metric, flag = 1, **kwargs):
+def flex_loss(estimator, X, y_true, metric, **kwargs):
     """
         returns 
         ----------
         fn: overall loss of all samples
         fng: loss over group for every group in the training data
+        samples_fnr: False negative rate of every sample in the training data
+        gp_lens: length of each protected group
         
         Parameters
         ----------
@@ -381,7 +388,9 @@ def loss(estimator, X, y_true, metric, flag = 1, **kwargs):
     groups = kwargs['groups']
     X_protected = X[groups]
     categories = {}
-    group_losses = []
+    fng = []
+    samples_fnr = []
+    gp_lens = []
 
     y_pred = estimator.predict_proba(X)[:,1]
     y_pred = pd.Series(y_pred, index=X_protected.index)
@@ -393,30 +402,36 @@ def loss(estimator, X, y_true, metric, flag = 1, **kwargs):
     else:
         raise ValueError(f'metric={metric} must be "FPR", "FNR", or a callable')
 
-
-    if (flag == 1): #marginal grouping
-        categories = {}
-        for col in X_protected.columns:
-            unique_values = X_protected[col].unique()
-            for val in unique_values:
-                category_key = f'{col}_{val}'
-                mask = X_protected[col] == val
-                indices = X_protected[mask].index
-                categories[category_key] = indices
-    else: #intersectional grouping (flag is not 0 for now according to paper)
-        categories = X_protected.groupby(groups).groups
+    categories = {}
+    for col in X_protected.columns:
+        unique_values = X_protected[col].unique()
+        for val in unique_values:
+            category_key = f'{col}_{val}'
+            mask = X_protected[col] == val
+            indices = X_protected[mask].index
+            categories[category_key] = indices
 
     for c, idx in categories.items():
 
         category_loss = loss_fn(
             y_true.loc[idx].values, 
             y_pred.loc[idx].values
         )
-        group_losses.append(category_loss)
+        fng.append(category_loss)
+        gp_lens.append(len(y_true.loc[idx].values))
+
+    # print('#marginal groups: ', len(categories))
+    # singles = 0
+    # singles = gp_lens.count(1)
+    # avg_len = sum(gp_lens) / len(gp_lens) if gp_lens else 0
+
+    #Calculate FNR of each sample
+    for idx in y_true.index:
+        fnr = loss_fn(y_true[idx], y_pred[idx])
+        samples_fnr.append(fnr)
 
     fn = loss_fn(y_true, y_pred)    
-    fng = group_losses
-    return fn, fng
+    return fn, fng, samples_fnr, gp_lens
 
 
 def mce(estimator, X, y_true, num_bins=10):

diff --git a/fomo/problem.py b/fomo/problem.py
@@ -39,6 +39,7 @@
 import inspect
 import fomo.metrics as metrics
 from .surrogate_models import MLP, Linear, InterLinear
+from fomo.algorithm import Lexicase, Lexicase_NSGA2
 
 class BasicProblem(ElementwiseProblem):
     """ The evaluation function for each candidate sample weights. """
@@ -94,9 +95,14 @@ def _evaluate(self, sample_weight, out, *args, **kwargs):
             j += 1
 
         out['F'] = np.asarray(f)
-        fn, fng = metrics.loss(est, X, y, 'FNR', **self.metric_kwargs)
-        out['fn'] = fn #FNR of all samples
-        out['fng'] = fng #FNR of each group
+
+        if isinstance(self.fomo_estimator.algorithm, (Lexicase, Lexicase_NSGA2)):
+            fn, fng, samples_fnr, gp_lens = metrics.flex_loss(est, X, y, 'FNR', **self.metric_kwargs)
+            out['fn'] = fn #FNR of all samples to be used in Flex
+            out['fng'] = fng #FNR of every group to be used in Flex
+            out['samples_fnr'] = samples_fnr #FNR of each sample to be used in Flex with weighted coin flip
+            out['gp_lens'] = gp_lens #Length of each protected group to be used in Flex with weighted coin flip
+
 
 class SurrogateProblem(ElementwiseProblem):
     """ The evaluation function for each candidate weights. 
@@ -175,9 +181,13 @@ def _evaluate(self, x, out, *args, **kwargs):
             j += 1
 
         out['F'] = np.asarray(f)
-        fn, fng = metrics.loss(est, X, y, 'FNR', **self.metric_kwargs)
-        out['fn'] = fn #FNR of all samples
-        out['fng'] = fng #FNR of every group
+
+        if isinstance(self.fomo_estimator.algorithm, (Lexicase, Lexicase_NSGA2)):
+            fn, fng, samples_fnr, gp_lens = metrics.flex_loss(est, X, y, 'FNR', **self.metric_kwargs)
+            out['fn'] = fn #FNR of all samples to be used in Flex
+            out['fng'] = fng #FNR of every group to be used in Flex
+            out['samples_fnr'] = samples_fnr #FNR of each sample to be used in Flex with weighted coin flip
+            out['gp_lens'] = gp_lens #Length of each protected group to be used in Flex with weighted coin flip
 
 class MLPProblem(SurrogateProblem):
     """ The evaluation function for each candidate weights. 
@@ -198,4 +208,4 @@ class InterLinearProblem(SurrogateProblem):
 
     """
     def _get_surrogate(self):
-        return InterLinear(self.X_protected)
+        return InterLinear(self.X_protected)
diff --git a/fomo/surrogate_models.py b/fomo/surrogate_models.py
@@ -189,7 +189,7 @@ def _one_hot_encode(self, X):
             return self.ohc.transform(X)
         else:
             binary_columns = [col for col in X.columns if X[col].isin([0, 1]).all()]
-            categorical_features = [c for c in X.columns if (X[c].nunique() < 8 and c not in binary_columns)] #Do not one-hot-encode binary columns and columns with more than 8 categories. 
+            categorical_features = [c for c in X.columns if (X[c].nunique() <= 8 and c not in binary_columns)] #Do not one-hot-encode binary columns and columns with more than 8 categories. 
             self.ohc = ColumnTransformer(
                 [
                     (