Skip to content

Commit

Permalink
updated function to handle several filling strategies
Browse files Browse the repository at this point in the history
  • Loading branch information
vicpaton committed Sep 11, 2024
1 parent 1b89c8d commit 9df6016
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 10 deletions.
24 changes: 17 additions & 7 deletions networkcommons/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,14 +229,17 @@ def targetlayer_formatter(df, n_elements=25, act_col='stat'):
return dict_df


def handle_missing_values(df, threshold=0.1, fill=True):
def handle_missing_values(df, threshold=0.1, fill=np.mean):
"""
Handles missing values in a DataFrame by filling them with the mean of the row or dropping the rows.
Handles missing values in a DataFrame by filling them with a specified function or value, or dropping the rows.
Parameters:
- df (pandas.DataFrame): The DataFrame containing the data.
- threshold (float): The threshold for the share (0<n<1) of missing values in a row. Rows with a share
of missing values greater than or equal to the threshold will be dropped.
- fill (callable, int, float, or None): If callable, the function is applied to each row to fill missing values.
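If an integer or float, it is used as a constant to fill missing values. Booleans also match this case
(bool is a subclass of int), so the former fill=True / fill=False flags are treated as constants rather
than as on/off switches; pass a callable or None instead.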
If None, no filling is done.
Returns:
- df (pandas.DataFrame): The DataFrame with missing values handled.
Expand All @@ -246,7 +249,7 @@ def handle_missing_values(df, threshold=0.1, fill=True):
Example:
>>> df = pd.DataFrame({'A': [1, 2, np.nan], 'B': [3, 2, np.nan], 'C': [np.nan, 7, 8]})
>>> handle_missing_values(df, 0.5)
>>> handle_missing_values(df, 0.5, fill=np.mean)
Number of genes filled using function mean: 1
Number of genes removed: 1
"""
Expand All @@ -270,10 +273,17 @@ def handle_missing_values(df, threshold=0.1, fill=True):

filled_count = (df[to_fill].isna().sum(axis=1) > 0).sum()

# Replace NAs with the mean of the row for rows to fill
if fill:
df.loc[to_fill] = df.loc[to_fill].apply(lambda row: row.fillna(row.mean()), axis=1)
print(f"Number of genes filled: {filled_count}")
# Replace NAs based on the fill argument
if callable(fill):
# If fill is a function (like np.mean, np.median), apply it row-wise
df.loc[to_fill] = df.loc[to_fill].apply(lambda row: row.fillna(fill(row)), axis=1)
print(f"Number of genes filled using function {fill.__name__}: {filled_count}")
elif isinstance(fill, (int, float)):
# If fill is a constant (like 0), use it directly
df.loc[to_fill] = df.loc[to_fill].fillna(fill)
print(f"Number of genes filled with value {fill}: {filled_count}")
elif fill is not None:
raise ValueError("fill parameter must be a callable, a numeric value, or None")

# Drop rows with NA percentage greater than or equal to threshold
df = df[~to_drop]
Expand Down
6 changes: 3 additions & 3 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,21 +173,21 @@ def test_subset_df_with_nodes():

def test_handle_missing_values_fill():
df = pd.DataFrame({'A': [1, 2, np.nan], 'B': [3, 2, np.nan], 'C': [np.nan, 7, 8]})
result = utils.handle_missing_values(df, 0.5, fill=True)
result = utils.handle_missing_values(df, 0.5, fill=np.mean)
expected = pd.DataFrame({'index': [0, 1], 'A': [1.0, 2.0], 'B': [3.0, 2.0], 'C': [2.0, 7.0]}).astype({'index': 'int64'})
pd.testing.assert_frame_equal(result, expected)


def test_handle_missing_values_fill_and_drop():
df = pd.DataFrame({'A': [1, np.nan, np.nan], 'B': [np.nan, 2, np.nan], 'C': [np.nan, 7, np.nan]})
result = utils.handle_missing_values(df, 0.5, fill=True)
result = utils.handle_missing_values(df, 0.5, fill=np.mean)
expected = pd.DataFrame({'index': [1], 'A': [4.5], 'B': [2.0], 'C': [7.0]}).astype({'index': 'int64'})
pd.testing.assert_frame_equal(result, expected)


def test_handle_missing_values_drop():
df = pd.DataFrame({'A': [1, np.nan, np.nan], 'B': [np.nan, np.nan, np.nan], 'C': [np.nan, np.nan, 8]})
result = utils.handle_missing_values(df, 0.1, fill=False)
result = utils.handle_missing_values(df, 0.1, fill=None)
expected = pd.DataFrame({'index': [], 'A': [], 'B': [], 'C': []}).astype({'index': 'int64'})
pd.testing.assert_frame_equal(result, expected)

Expand Down

0 comments on commit 9df6016

Please sign in to comment.