-
Notifications
You must be signed in to change notification settings - Fork 76
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #29 from cmap/math
Math
- Loading branch information
Showing
6 changed files
with
430 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
''' | ||
agg_wt_avg.py | ||
Aggregate a matrix of replicate profiles into a single signature using | ||
a weighted average based on the correlation between replicates. That is, if | ||
one replicate is less correlated with the other replicates, its values will | ||
not be weighted as highly in the aggregated signature. | ||
Equivalent to the 'modz' method in mortar. | ||
''' | ||
|
||
import numpy as np | ||
|
||
rounding_precision = 4 | ||
|
||
|
||
def get_upper_triangle(correlation_matrix): | ||
''' Extract upper triangle from a square matrix. Negative values are | ||
set to 0. | ||
Args: | ||
correlation_matrix (pandas df): Correlations between all replicates | ||
Returns: | ||
upper_tri_df (pandas df): Upper triangle extracted from | ||
correlation_matrix; rid is the row index, cid is the column index, | ||
corr is the extracted correlation value | ||
''' | ||
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool)) | ||
|
||
# convert matrix into long form description | ||
upper_tri_df = upper_triangle.stack().reset_index(level=1) | ||
upper_tri_df.columns = ['rid', 'corr'] | ||
|
||
# Index at this point is cid, it now becomes a column | ||
upper_tri_df.reset_index(level=0, inplace=True) | ||
|
||
# Get rid of negative values | ||
upper_tri_df['corr'] = upper_tri_df['corr'].clip(lower=0) | ||
|
||
return upper_tri_df.round(rounding_precision) | ||
|
||
|
||
def calculate_weights(correlation_matrix, min_wt): | ||
''' Calculate a weight for each profile based on its correlation to other | ||
replicates. Negative correlations are clipped to 0, and weights are clipped | ||
to be min_wt at the least. | ||
Args: | ||
correlation_matrix (pandas df): Correlations between all replicates | ||
min_wt (float): Minimum raw weight when calculating weighted average | ||
Returns: | ||
raw weights (pandas series): Mean correlation to other replicates | ||
weights (pandas series): raw_weights normalized such that they add to 1 | ||
''' | ||
# fill diagonal of correlation_matrix with np.nan | ||
np.fill_diagonal(correlation_matrix.values, np.nan) | ||
|
||
# remove negative values | ||
correlation_matrix = correlation_matrix.clip(lower=0) | ||
|
||
# get average correlation for each profile (will ignore NaN) | ||
raw_weights = correlation_matrix.mean(axis=1) | ||
|
||
# threshold weights | ||
raw_weights = raw_weights.clip(lower=min_wt) | ||
|
||
# normalize raw_weights so that they add to 1 | ||
weights = raw_weights / sum(raw_weights) | ||
|
||
return raw_weights.round(rounding_precision), weights.round(rounding_precision) | ||
|
||
|
||
def agg_wt_avg(mat, min_wt = 0.01, corr_metric='spearman'): | ||
''' Aggregate a set of replicate profiles into a single signature using | ||
a weighted average. | ||
Args: | ||
mat (pandas df): a matrix of replicate profiles, where the columns are | ||
samples and the rows are features; columns correspond to the | ||
replicates of a single perturbagen | ||
min_wt (float): Minimum raw weight when calculating weighted average | ||
corr_metric (string): Spearman or Pearson; the correlation method | ||
Returns: | ||
out_sig (pandas series): weighted average values | ||
upper_tri_df (pandas df): the correlations between each profile that went into the signature | ||
raw weights (pandas series): weights before normalization | ||
weights (pandas series): weights after normalization | ||
''' | ||
assert mat.shape[1] > 0, "mat is empty! mat: {}".format(mat) | ||
|
||
if mat.shape[1] == 1: | ||
|
||
out_sig = mat | ||
upper_tri_df = None | ||
raw_weights = None | ||
weights = None | ||
|
||
else: | ||
|
||
assert corr_metric in ["spearman", "pearson"] | ||
|
||
# Make correlation matrix column wise | ||
corr_mat = mat.corr(method=corr_metric) | ||
|
||
# Save the values in the upper triangle | ||
upper_tri_df = get_upper_triangle(corr_mat) | ||
|
||
# Calculate weight per replicate | ||
raw_weights, weights = calculate_weights(corr_mat, min_wt) | ||
|
||
# Apply weights to values | ||
weighted_values = mat * weights | ||
out_sig = weighted_values.sum(axis=1) | ||
|
||
return out_sig, upper_tri_df, raw_weights, weights |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
''' | ||
robust_zscore.py | ||
Robustly z-scores a pandas df along the rows (i.e. the z-score is made relative | ||
to a row). A robust z-score means that median is used instead of mean and | ||
median absolute deviation (MAD) instead of standard deviation in the | ||
standard z-score calculation: | ||
z = (x - u) / s | ||
x: input value | ||
u: median | ||
s: MAD | ||
Optionally, the median and MAD can be computed from a control df, instead of the | ||
input df. This functionality is useful for "vehicle-control"; that is, if | ||
the control df consists only of negative control samples, the median and MAD | ||
can be computed using just those samples but applied to the input df. | ||
''' | ||
|
||
rounding_precision = 4 | ||
|
||
|
||
def robust_zscore(mat, ctrl_mat=None, min_mad=0.1): | ||
''' Robustly z-score a pandas df along the rows. | ||
Args: | ||
mat (pandas df): Matrix of data that z-scoring will be applied to | ||
ctrl_mat (pandas df): Optional matrix from which to compute medians and MADs | ||
(e.g. vehicle control) | ||
min_mad (float): Minimum MAD to threshold to; tiny MAD values will cause | ||
z-scores to blow up | ||
Returns: | ||
zscore_df (pandas_df): z-scored data | ||
''' | ||
|
||
# If optional df exists, calc medians and mads from it | ||
if ctrl_mat is not None: | ||
medians = ctrl_mat.median(axis=1) | ||
median_devs = abs(ctrl_mat.subtract(medians, axis=0)) | ||
|
||
# Else just use plate medians | ||
else: | ||
medians = mat.median(axis=1) | ||
median_devs = abs(mat.subtract(medians, axis=0)) | ||
|
||
sub = mat.subtract(medians, axis='index') | ||
mads = median_devs.median(axis=1) | ||
|
||
# Threshold mads | ||
mads = mads.clip(lower=min_mad) | ||
|
||
# Must multiply values by 1.4826 to make MAD comparable to SD | ||
# (https://en.wikipedia.org/wiki/Median_absolute_deviation) | ||
zscore_df = sub.divide(mads * 1.4826, axis='index') | ||
|
||
return zscore_df.round(rounding_precision) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import unittest | ||
import logging | ||
import pandas as pd | ||
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger | ||
import cmapPy.math.agg_wt_avg as agg_wt_avg | ||
|
||
logger = logging.getLogger(setup_logger.LOGGER_NAME) | ||
|
||
test_mat = pd.DataFrame({'A':[1,2,3], 'B': [2,8,6], 'C': [6,8,9]}) | ||
test_mat_corr = test_mat.corr() | ||
|
||
|
||
class TestAggWtAvg(unittest.TestCase): | ||
def test_calculate_weights(self): | ||
# happy path | ||
raw_weights, weights = agg_wt_avg.calculate_weights(test_mat_corr, min_wt=0.1) | ||
self.assertTrue(len(weights == 3)) | ||
self.assertTrue(raw_weights.tolist() == [0.8183, 0.7202, 0.8838]) | ||
self.assertTrue(weights.tolist() == [0.3378, 0.2973, 0.3649]) | ||
|
||
# test that min_wt works | ||
raw_weights2, weights2 = agg_wt_avg.calculate_weights(test_mat_corr, min_wt=0.85) | ||
self.assertEqual(raw_weights2[1], 0.85) | ||
|
||
def test_get_upper_triangle(self): | ||
# happy path | ||
upper_tri_df = agg_wt_avg.get_upper_triangle(test_mat_corr) | ||
self.assertTrue(upper_tri_df['corr'].tolist() == [0.6547, 0.982, 0.7857]) | ||
self.assertTrue(upper_tri_df['rid'].tolist() == ['B', 'C', 'C']) | ||
self.assertTrue(upper_tri_df['index'].tolist() == ['A', 'A', 'B']) | ||
|
||
def test_agg_wt_avg(self): | ||
# use spearman | ||
out_sig, upper_tri_df, raw_weights, weights = agg_wt_avg.agg_wt_avg(test_mat) | ||
self.assertTrue(out_sig.tolist() == [3.125, 5.75, 6.0]) | ||
self.assertAlmostEqual(upper_tri_df.loc[upper_tri_df.index[0], "corr"], 0.5) | ||
self.assertAlmostEqual(raw_weights[0], 0.75) | ||
self.assertAlmostEqual(weights[0], 0.375) | ||
|
||
# test on a single signature | ||
out_sig2, _, _, _ = agg_wt_avg.agg_wt_avg(test_mat[["C"]]) | ||
pd.util.testing.assert_frame_equal(out_sig2, test_mat[["C"]]) | ||
|
||
# should break if empty input | ||
with self.assertRaises(AssertionError) as e: | ||
agg_wt_avg.agg_wt_avg(test_mat[[]]) | ||
self.assertIn("mat is empty!", str(e.exception)) | ||
|
||
if __name__ == "__main__": | ||
setup_logger.setup(verbose=True) | ||
unittest.main() | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import unittest | ||
import logging | ||
import pandas as pd | ||
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger | ||
import cmapPy.math.robust_zscore as robust_zscore | ||
|
||
logger = logging.getLogger(setup_logger.LOGGER_NAME) | ||
|
||
test_mat = pd.DataFrame({'A':[4,2,3], 'B': [2,8,6], 'C': [6,5,9], 'D': [5,2,1]}) | ||
test_ctl_mat = pd.DataFrame({'E':[8,8,6], 'F': [7,6,6]}) | ||
test_ctl_mat2 = pd.DataFrame({'E':[8,8,6], 'F': [8,6,6]}) | ||
|
||
|
||
class TestRobustZscore(unittest.TestCase): | ||
def test_zscore_pc(self): | ||
pc_zscores = robust_zscore.robust_zscore(test_mat) | ||
self.assertTrue(pc_zscores.shape == (3, 4)) | ||
|
||
pd.util.testing.assert_frame_equal(pc_zscores, pd.DataFrame( | ||
{'A': [-0.3372, -0.6745, -0.4047], | ||
'B': [-1.6862, 2.0235, 0.4047], | ||
'C': [1.0117, 0.6745, 1.2141], | ||
'D': [0.3372, -0.6745, -0.9443]})) | ||
|
||
def test_zscore_vc(self): | ||
vc_zscores = robust_zscore.robust_zscore(test_mat, ctrl_mat=test_ctl_mat) | ||
self.assertTrue(vc_zscores.shape == (3, 4)) | ||
pd.util.testing.assert_frame_equal(vc_zscores, pd.DataFrame( | ||
{'A': [-4.7214, -3.3725, -20.2347], | ||
'B': [-7.4194, 0.6745, 0.0], | ||
'C': [-2.0235, -1.349, 20.2347], | ||
'D': [-3.3725, -3.3725, -33.7245]})) | ||
|
||
# check that min_mad works | ||
vc_zscores2 = robust_zscore.robust_zscore(test_mat, ctrl_mat=test_ctl_mat2) | ||
self.assertEqual(vc_zscores2.iloc[0, 0], -26.9796) | ||
self.assertEqual(vc_zscores2.iloc[1, 1], 0.6745) | ||
|
||
if __name__ == "__main__": | ||
setup_logger.setup(verbose=True) | ||
unittest.main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
''' | ||
diff_gctoo.py | ||
Converts a matrix of values (e.g. gene expression, viability, etc.) into a | ||
matrix of differential values. Values can be made differential relative to all | ||
samples in the dataset ("plate-control") or relative to just negative control | ||
samples ("vehicle-control"). The method of computing the differential can be | ||
either a robust z-score ("robust_z") or simply median normalization | ||
("median_norm"). | ||
''' | ||
import cmapPy.math.robust_zscore as robust_zscore | ||
import cmapPy.pandasGEXpress.GCToo as GCToo | ||
|
||
possible_diff_methods = ["robust_z", "median_norm"] | ||
|
||
|
||
def diff_gctoo(gctoo, plate_control=True, group_field='pert_type', group_val='ctl_vehicle', | ||
diff_method="robust_z", upper_diff_thresh=10, lower_diff_thresh=-10): | ||
''' Converts a matrix of values (e.g. gene expression, viability, etc.) | ||
into a matrix of differential values. | ||
Args: | ||
df (pandas df): data to make diff_gctoo | ||
plate_control (bool): True means calculate diff_gctoo using plate control. | ||
False means vehicle control. | ||
group_field (string): Metadata field in which to find group_val | ||
group_val (string): Value in group_field that indicates use in vehicle control | ||
diff_method (string): Method of computing differential data; currently only | ||
support either "robust_z" or "median_norm" | ||
upper_diff_thresh (float): Maximum value for diff data | ||
lower_diff_thresh (float): Minimum value for diff data | ||
Returns: | ||
out_gctoo (GCToo object): GCToo with differential data values | ||
''' | ||
assert diff_method in possible_diff_methods, ( | ||
"possible_diff_methods: {}, diff_method: {}".format( | ||
possible_diff_methods, diff_method)) | ||
|
||
# Compute median and MAD using all samples in the dataset | ||
if plate_control: | ||
|
||
# Compute differential data | ||
if diff_method == "robust_z": | ||
diff_data = robust_zscore.robust_zscore(gctoo.data_df) | ||
|
||
elif diff_method == "median_norm": | ||
medians = gctoo.data_df.median(axis=1) | ||
diff_data = gctoo.data_df.subtract(medians, axis='index') | ||
|
||
# Compute median and MAD from negative controls, rather than all samples | ||
else: | ||
|
||
assert group_field in gctoo.col_metadata_df.columns.values, ( | ||
"group_field {} not present in column metadata. " + | ||
"gctoo.col_metadata_df.columns.values: {}").format( | ||
group_field, gctoo.col_metadata_df.columns.values) | ||
|
||
assert sum(gctoo.col_metadata_df[group_field] == group_val) > 0, ( | ||
"group_val {} not present in the {} column.").format( | ||
group_val, group_field) | ||
|
||
# Find negative control samples | ||
neg_ctl_samples = gctoo.col_metadata_df.index[gctoo.col_metadata_df[group_field] == group_val] | ||
neg_ctl_df = gctoo.data_df[neg_ctl_samples] | ||
|
||
# Compute differential data | ||
if diff_method == "robust_z": | ||
diff_data = robust_zscore.robust_zscore(gctoo.data_df, neg_ctl_df) | ||
|
||
elif diff_method == "median_norm": | ||
medians = gctoo.data_df.median(axis=1) | ||
diff_data = gctoo.data_df.subtract(medians, axis='index') | ||
|
||
# Threshold differential data before returning | ||
diff_data = diff_data.clip(lower=lower_diff_thresh, upper=upper_diff_thresh) | ||
|
||
# Construct output GCToo object | ||
out_gctoo = GCToo.GCToo(data_df=diff_data, | ||
row_metadata_df=gctoo.row_metadata_df, | ||
col_metadata_df=gctoo.col_metadata_df) | ||
|
||
return out_gctoo | ||
|
Oops, something went wrong.