p_values.py

from itertools import starmap
from warnings import warn

import numpy as np
from pandas import Series, DataFrame
from skgof import ad_test
from scipy.stats import genpareto


def ecdf_p_value(test_statistics: Series, null_distribution: DataFrame, ecdf_pseudocount=1):
    """Estimate p-values from the ECDF of the permutation null distribution.

    Note: uses the biased p-value estimator, as in https://www.ncbi.nlm.nih.gov/pubmed/21044043
    """
    # indicator matrix: is the permutation value >= the observed statistic?
    does_null_exceed_test = null_distribution.ge(test_statistics, axis='rows')
    N = len(null_distribution.columns)
    return (ecdf_pseudocount + does_null_exceed_test.sum(axis=1)) / (N + 1)
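
# Worked example (hypothetical numbers) of the biased estimator above: with
# N = 4 permutations, the default ecdf_pseudocount of 1, and an observation
# whose statistic exceeds every null value, the exceedance count is 0 and the
# estimate is (1 + 0) / (4 + 1) = 0.2, whereas the plain M / N formula would
# return an impossible p-value of 0.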


def gpd_p_value(test_statistics: Series, null_distribution: DataFrame, n=250, m=10, decrease_n_by=10, ecdf_pseudocount=1):
    r"""Partially-vectorized calculation of p-values using the Empirical Cumulative Distribution Function (ECDF)
    with a Generalized Pareto Distribution (GPD) correction for extreme values,
    using the algorithm proposed by Knijnenburg et al. (2009), see:
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2687965/

    Maximum likelihood estimation is used to fit the GPD.

    Args:
        test_statistics: vector of values of the test statistic for the observations
        null_distribution: null distribution obtained from permutation testing,
            containing iterations in columns and observations in rows
        n: initial number of permutation exceedances ($N_{\text{exc}}$) used to determine the exceedance threshold ($t$),
            which selects the data used for fitting the GPD
        m: the threshold for the exceedance count $M$ below which the GPD is used instead of the ECDF
        decrease_n_by: how much n should be decreased in each iteration when the goodness-of-fit test rejects the GPD fit
        ecdf_pseudocount: small value added to the exceedance count to prevent p-values equal to zero

    Details:
        a formula using (N+1) in the denominator of the biased (ECDF-based) p-value estimator
        is used (as in https://www.ncbi.nlm.nih.gov/pubmed/21044043) rather than N as
        in Knijnenburg et al. (2009).
    """
    # all tested observations have to be in the null
    assert not (set(test_statistics.index) - set(null_distribution.index))

    if (
        len(null_distribution.index) != len(test_statistics.index)
        or
        any(null_distribution.index != test_statistics.index)
    ):
        # reorder and limit the null so that it matches the tested observations
        null_distribution = null_distribution.loc[test_statistics.index]

    # TODO:
    # assert not any(null_distribution.isnull())

    # number of permutations
    N = len(null_distribution.columns)
    assert n < N

    # indicator matrix: is the permutation value >= the observed statistic?
    does_null_exceed_test = null_distribution.ge(test_statistics, axis='rows')

    # count of exceedances per observation
    M = does_null_exceed_test.sum(axis=1)
    if len(M) == 0:
        print('No observations for p-value estimation!')
        return DataFrame(columns=['p_value', 'is_gpd', 'gof', 'ecdf_p'])
    if min(M) >= m:
        print(f'Got enough permutations: no observation has fewer than {m} exceedances, falling back to ECDF')
        return DataFrame({
            'p_value': ecdf_p_value(test_statistics, null_distribution, ecdf_pseudocount),
            'is_gpd': False,
            'gof': np.nan,
            'ecdf_p': np.nan
        })
    def _p(test_i, null_i, M_i, d_i):
        gpd_fit = None
        gpd_fit_p_value = None
        n_i = n
        # TODO: no need to sort as many as N numbers; a partial sort of the
        # top n+1 values would suffice, but this requires some tests
        # (both performance and unit):
        # null_i_partitioned = np.partition(null_i, N - n_i - 1)
        # null_i_top_sorted = np.sort(null_i_partitioned[N - n_i - 1:])
        null_i = np.sort(null_i)  # np.sort (not sorted) keeps the ndarray, needed for the vectorized subtraction below
        t = None
        if all(np.isnan(null_i)):
            return np.nan, False, np.nan, np.nan
        # compute the ECDF-based, biased estimate of the p-value
        raw_ecdf_estimate = (ecdf_pseudocount + d_i.sum()) / (N + 1)
        if M_i < m:
            # fit the GPD, reducing $n$ until the fit is acceptable
            while n_i > 0:
                # t is the midpoint between the n-th and (n+1)-th largest null values;
                # with 0-based indexing these sit at -n_i and -n_i-1 of the ascending sort
                t = (null_i[-n_i] + null_i[-n_i-1]) / 2
                y_top_n = null_i[-n_i:]
                exceedances = y_top_n - t
                assert all(y_top_n >= t)
                assert len(exceedances) == n_i
                # fix the GPD location at zero: exceedances are non-negative by construction
                fit = genpareto.fit(exceedances, floc=0)
                fitted = genpareto(*fit)
                gpd_fit = fitted
                gpd_fit_p_value = ad_test(exceedances, fitted).pvalue
                if gpd_fit_p_value > 0.05:
                    # the Anderson-Darling test does not reject the GPD; accept the fit
                    break
                else:
                    n_i -= decrease_n_by
        if gpd_fit is not None and gpd_fit_p_value > 0.05:
            # GPD tail approximation: p = (n / N) * (1 - F_GPD(x - t)), via the survival function
            return n_i / N * gpd_fit.sf(test_i - t), True, gpd_fit_p_value, raw_ecdf_estimate
        else:
            if gpd_fit is not None:
                # TODO: get the index and highlight which observation could not be fitted!
                warn('A good GPD fit could not be reached, using the ECDF estimate instead')
            return raw_ecdf_estimate, False, np.nan, raw_ecdf_estimate
    p_values = DataFrame(starmap(
        _p,
        zip(
            test_statistics.values, null_distribution.values,
            M.values, does_null_exceed_test.values
        )
    ), columns=['p_value', 'is_gpd', 'gof', 'ecdf_p'], index=test_statistics.index)
    return p_values
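

# A minimal usage sketch (hypothetical data, not part of the original module):
# builds a synthetic null distribution and checks that an extreme observation
# is routed through the GPD tail correction. The observation names, sizes and
# seed below are made up for illustration only.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    observations = ['gene_a', 'gene_b', 'gene_c']
    # null distribution: 3 observations (rows) x 1000 permutation iterations (columns)
    null = DataFrame(rng.normal(size=(3, 1000)), index=observations)
    # 'gene_a' lies far in the right tail; the other two are unremarkable
    statistics = Series([4.5, 0.1, 1.0], index=observations)
    result = gpd_p_value(statistics, null, n=250, m=10)
    print(result)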