-
Notifications
You must be signed in to change notification settings - Fork 0
/
plot.py
90 lines (80 loc) · 3.39 KB
/
plot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
def plot_qq(p_values, file_path):
output_dir = os.path.splitext(file_path)[0]
sorted_pvals = np.sort(p_values)
expected = -np.log10(np.arange(1, len(sorted_pvals) + 1) / (len(sorted_pvals) + 1))
observed = -np.log10(sorted_pvals)
plt.figure(figsize=(6, 6))
plt.plot(expected, observed, 'o', markersize=2)
plt.plot([0, max(expected)], [0, max(expected)], 'r--')
plt.xlabel('Expected -log10(p)')
plt.ylabel('Observed -log10(p)')
plt.title('QQ Plot')
plt.savefig(os.path.join(output_dir, 'qq_plot.png'))
plt.close()
def plot_manhattan(df, file_path):
output_dir = os.path.splitext(file_path)[0]
df['-log10(P)'] = -np.log10(df['P'])
df = df.dropna(subset=['CHR', 'BP', '-log10(P)'])
df = df[np.isfinite(df['-log10(P)'])]
df['CHR'] = df['CHR'].astype(int)
chr_dict = {chrom: index for index, chrom in enumerate(sorted(df['CHR'].unique()))}
df['chr_index'] = df['CHR'].map(chr_dict)
df['normalized_bp'] = df.groupby('CHR')['BP'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
fig, ax = plt.subplots(figsize=(18, 6))
colors = ['b', 'r']
x_labels = []
x_labels_pos = []
for num, (name, group) in enumerate(df.groupby('CHR')):
ax.scatter(group['chr_index'] + group['normalized_bp'], group['-log10(P)'], color=colors[num % len(colors)], s=10)
x_labels.append(name)
x_labels_pos.append(chr_dict[name] + 0.5)
ax.set_xticks(x_labels_pos)
ax.set_xticklabels(x_labels, fontsize=8)
ax.set_xlabel('Chromosome')
ax.set_ylabel('-log10(p-value)')
ax.set_title('Manhattan Plot')
plt.savefig(os.path.join(output_dir, 'manhattan_plot.png'))
plt.close()
def plot_pvalue_histogram(p_values, file_path):
output_dir = os.path.splitext(file_path)[0]
plt.figure(figsize=(8, 6))
plt.hist(p_values, bins=50, edgecolor='k', alpha=0.7)
plt.xlabel('P-value')
plt.ylabel('Frequency')
plt.title('P-value Distribution')
plt.savefig(os.path.join(output_dir, 'pvalue_histogram.png'))
plt.close()
def plot_effect_size_distribution(effect_sizes, file_path):
output_dir = os.path.splitext(file_path)[0]
plt.figure(figsize=(8, 6))
plt.hist(effect_sizes, bins=50, edgecolor='k', alpha=0.7)
plt.xlabel('Effect Size (BETA)')
plt.ylabel('Frequency')
plt.title('Effect Size Distribution')
plt.savefig(os.path.join(output_dir, 'effect_size_distribution.png'))
plt.close()
def plot_volcano(df, file_path):
output_dir = os.path.splitext(file_path)[0]
df['-log10(P)'] = -np.log10(df['P'])
plt.figure(figsize=(10, 8))
plt.scatter(df['BETA'], df['-log10(P)'], alpha=0.5)
plt.xlabel('Effect Size (BETA)')
plt.ylabel('-log10(p-value)')
plt.title('Volcano Plot')
plt.savefig(os.path.join(output_dir, 'volcano_plot.png'))
plt.close()
def assess_population_stratification(df, file_path, n_components=2):
output_dir = os.path.splitext(file_path)[0]
pca = PCA(n_components=n_components)
pca_result = pca.fit_transform(df[['NCAS', 'NCON']].dropna())
plt.figure(figsize=(8, 6))
plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.5)
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('Population Stratification Assessment')
plt.savefig(os.path.join(output_dir, 'population_stratification.png'))
plt.close()