forked from joyceyiyiwang/Portability_Questions
-
Notifications
You must be signed in to change notification settings - Fork 0
/
09i_post_heterozygosity_calc.R
66 lines (52 loc) · 2.19 KB
/
09i_post_heterozygosity_calc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
library(dplyr)
library(tidyr)
library(readr)
# Group by magnitude
pheno <- c("Height","Platelet","MCV","MCH","BMI","RBC","Monocyte",
"Lymphocyte","WBC","Eosinophil", "LDL", "Weight", "Triglycerides", "Cystatin_C",
"Body_Fat_Perc")
# Get the SNPs used in PGS calculations
snp_lst <- list()
for(trait in pheno){
snp <- read_tsv(paste0("data/pgs/", trait, "_threshold_1.txt"),
col_names = F)
colnames(snp) = "ID"
beta <- read_tsv(paste0("data/gwas_results/", trait, "_combined.glm.linear"))
snp <- snp %>% left_join(beta, by = "ID")
snp$beta_sq <- (snp$BETA)^2
snp <- snp[, c(1, 14)]
snp_lst[[trait]] <- snp
}
# Stratify SNPs by into 3 equally sized bins
het <- c()
dir <- "data/heterozygosity/"
for(i in 1:500){
print(i)
temp <- read_tsv(paste0(dir, "heterozygosity_bin_", i, ".frqx"))
for(trait in pheno){
# Only look at SNPs used in PGS calculations
temp2 <- temp %>% select(SNP, `C(HOM A1)`, `C(HET)`, `C(HOM A2)`)
colnames(temp2)[1] <- "ID"
temp2 <- snp_lst[[trait]] %>% left_join(temp2, by = "ID")
temp2 <- temp2 %>% na.omit()
temp2$beta_sq_size <- ntile(temp2$beta_sq, 3)
# Calculate the heterozygosity of each SNP
temp2$het <- temp2$`C(HET)` / ( temp2$`C(HOM A1)` + temp2$`C(HET)` + temp2$`C(HOM A2)`)
# Take the mean of each stratum
het_temp <- temp2 %>%
dplyr::group_by(beta_sq_size) %>%
dplyr::summarize(mean_het = mean(het))
het <- c(het, het_temp$mean_het)
}
}
heterozygosity <- cbind.data.frame(phenotype = rep(rep(pheno, each = 3), 500),
weighted_pc_groups = rep(1:500, each = 15 * 3),
size = rep(1:3, 500 * 15),
mean_het = het)
heterozygosity <- heterozygosity %>%
arrange(phenotype, weighted_pc_groups, size)
# Read in the file containing median PC distance of each bin
median_pc <- read_tsv('data/pgs_pred/group_pgs_df.tsv')
median_pc <- median_pc %>% distinct(weighted_pc_groups, median_pc, group_close_to_gwas)
heterozygosity <- heterozygosity %>% left_join(median_pc, by = "weighted_pc_groups")
heterozygosity %>% write_tsv("data/heterozygosity/mean_heterozygosity.tsv")