-
Notifications
You must be signed in to change notification settings - Fork 0
/
reduce-dims-pca.py
48 lines (31 loc) · 1.29 KB
/
reduce-dims-pca.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import keras as k
import pandas as p
import numpy as n
from dfply import *
from plotnine import *
from sklearn.decomposition import PCA
@dfpipe
def dropna_in_column(df, column_name):
    """Drop the rows of ``df`` whose ``column_name`` value is missing.

    Decorated with ``@dfpipe`` so it can be used as a step in a dfply
    ``>>`` pipeline, where the data frame is supplied by the pipe.

    Args:
        df: Data frame to filter.
        column_name: Name of the single column checked for missing values.

    Returns:
        The result of ``df.dropna`` restricted to that column.
    """
    checked_columns = [column_name]
    cleaned = df.dropna(subset=checked_columns)
    return cleaned
# Load the clinical outcomes table, discard rows that have no group value,
# then cast the group column to int.
data = (
    p.read_csv("source_data/clinical_outcomes.csv")
    >> dropna_in_column("group")
    >> mutate(group=X.group.astype(int))
)

# Rows whose redcap_event_name is "baseline".
initial = data >> mask(X.redcap_event_name == "baseline")
def col_types(df):
    """Return ``(column_name, dtype)`` pairs for every column of ``df``.

    The dtype is taken from ``df.dtypes`` so each pair reports the type of
    the column's values. Taking ``type(df[c])`` instead would yield the
    pandas container class for every column, regardless of its contents.

    Args:
        df: Data frame to inspect.

    Returns:
        A list of ``(name, dtype)`` tuples in column order; empty when the
        frame has no columns.
    """
    return list(df.dtypes.items())
@dfpipe
def numeric_columns(df):
    """Keep only the columns of ``df`` that have one of the listed dtypes.

    Only the 16/32/64-bit signed integer and float dtypes named below are
    selected; any other dtype is left out. Usable as a dfply pipeline step
    via ``@dfpipe``.

    Args:
        df: Data frame to filter by column dtype.

    Returns:
        The result of ``df.select_dtypes`` for those dtypes.
    """
    wanted_dtypes = [
        'int16', 'int32', 'int64',
        'float16', 'float32', 'float64',
    ]
    return df.select_dtypes(include=wanted_dtypes)
@dfpipe
def drop_na_columns(df):
    """Drop every column of ``df`` that contains at least one missing value.

    Usable as a dfply pipeline step via ``@dfpipe``.

    Args:
        df: Data frame to filter.

    Returns:
        The result of ``df.dropna`` applied column-wise with ``how='any'``.
    """
    complete_columns_only = df.dropna(axis='columns', how='any')
    return complete_columns_only
# PCA model keeping 13 components.
pca = PCA(n_components=13)

# Rows whose redcap_event_name is "baseline".
baseline = data >> mask(X.redcap_event_name == "baseline")

# PCA input: the numeric baseline columns with no missing values, with the
# id and group columns removed.
# NOTE(review): the columns are passed to PCA as-is (no scaling step is
# applied here) — confirm that is intended.
numerical_data = (
    baseline
    >> numeric_columns()
    >> drop_na_columns()
    >> drop(X.id, X.group)
)
def pcs_names(n):
    """Build the labels ``'PC1'`` through ``'PC<n>'``.

    Args:
        n: Number of labels to generate.

    Returns:
        A list of ``n`` strings, numbered from 1; empty when ``n`` is 0.
    """
    labels = []
    for component in range(n):
        labels.append(f'PC{component+1}')
    return labels
# Fit the PCA on the numeric baseline data and collect the projected
# coordinates in a frame with one column per component (PC1, PC2, ...).
transformed = p.DataFrame(
    pca.fit_transform(numerical_data),
    columns=pcs_names(pca.n_components),
)

# Attach the first two components to the baseline rows by position.
# `transformed` is built with a fresh default index, while `baseline` keeps
# the row labels it had before filtering, so raw arrays are passed to avoid
# index alignment. `assign` returns a new frame instead of writing into the
# mask-filtered one, which avoids pandas' SettingWithCopyWarning.
baseline = baseline.assign(
    PC1=transformed['PC1'].to_numpy(),
    PC2=transformed['PC2'].to_numpy(),
)

# Scatter plot of the first two components, colored by the pain_avg column.
plt = ggplot(baseline, aes('PC1', 'PC2', color='pain_avg')) + geom_point()
plt.save(filename="figures/baseline-basic-pca.png")