Skip to content

Commit

Permalink
first time commit
Browse files Browse the repository at this point in the history
  • Loading branch information
jmiao24 committed Aug 16, 2024
1 parent d215fa0 commit cd012f4
Show file tree
Hide file tree
Showing 13 changed files with 1,259 additions and 1 deletion.
25 changes: 24 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,24 @@
# POP-GWAS_analysis
# POP-GWAS analysis

This repository contains source code to reproduce analyses in "Valid inference for machine learning-assisted genome-wide association studies".

The official software package is in [POP-TOOLS](https://github.com/qlu-lab/POP-TOOLS) GitHub repo.

## File structures
### Simulations
* `./simulation/Fun.R`: Functions for simulation and simple implementation of POP-GWAS in R
* `./simulation/qt.R`: simulation for quantitative phenotype
* `./simulation/bt.R`: simulation for binary phenotype
* `./simulation/imputation_r_FPR.R`: type-I error simulation for varying imputation correlation
* `./simulation/imputation_r_power.R`: power simulation for varying imputation correlation
* `./simulation/vary_ratio.R`: simulation for varying sample size of unlabeled data

### UK Biobank data analysis
* `./real_data/1_softimpute.R`: phenotype imputation using softimpute
* `./real_data/2_regenie.sh`: run GWAS using regenie
* `./real_data/3_popGWAS.sh`: apply POP-GWAS to summary statistics
* `./real_data/4.1_post-GWAS.sh`: estimating heritability and genetic correlation using LD score regression
* `./real_data/4.2_coloc.R`: colocalization analysis

## Contact
Feel free to reach out to Jiacheng at jmiao24@wisc.edu for questions.
120 changes: 120 additions & 0 deletions real_data/1_softimpute.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
rm(list = ls())
library(data.table)
library(openxlsx)
library(janitor)
library(rpart)
library(softImpute)
library(caret)
library(optparse)

option_list = list(
make_option("--t", action="store", type='character')
)

opt = parse_args(OptionParser(option_list=option_list))

t <- opt$t

# Y
pheno <- fread("./data/train/DXA/dxa_bone_size_mineral_density.txt.gz")
covar <- fread("./Resource/Phenotype/EUR_covar.txt.gz")
pheno.indep <- pheno[pheno$IID %in% covar$IID, ]
covar <- covar[match(pheno.indep$IID, covar$IID),]

# Z
pred <- fread(paste0("./data/train/DXA_BMD/",t,"_eur.Z.txt"))
colnames(pred)[1] <- "IID"
pred <- pred[match(pheno.indep$IID, pred$IID), ]

pred_tmp <- cbind(pred, as.data.frame(pheno.indep)[, t, drop = F])
colnames(pred_tmp)[ncol(pred_tmp)] <- "y"
pred_tmp.1 <- pred_tmp[,"IID",drop=F]
for (i in 2:ncol(pred_tmp)) {
c <- covar
c$est <- as.data.frame(pred_tmp)[, i]
model <- lm(est ~ Sex+Year+SexYear+Chip+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10+PC11+PC12+PC13+PC14+PC15+PC16+PC17+PC18+PC19+PC20, data = c)
res <- rep(NA, nrow(pred_tmp))
res[as.integer(names(residuals(model)))] <- residuals(model)
pred_tmp.1 <- cbind(pred_tmp.1, res)
}
colnames(pred_tmp.1)[2:ncol(pred_tmp.1)] <- paste0(colnames(pred_tmp)[2:ncol(pred_tmp)], ".res")
pred_tmp.2 <- pred_tmp.1[as.vector(!is.na(pred_tmp.1[, ncol(pred_tmp.1), with = F])), ]
pred_tmp.2 <- clean_names(pred_tmp.2)
dat <- pred_tmp.2


# Cross-fitting
cv_folds <- createFolds(dat$y_res, k = 10) # Assuming you're doing 10-fold CV
test_all <- c()
r_vec <- c()

set.seed(1234)
for (i in 1:10) {
print(i)
# i <- 1
# Split the data into training and test based on the folds
train <- as.matrix(dat[-cv_folds[[i]], -1])
test_tmp <- dat[cv_folds[[i]],]
test <- as.matrix(test_tmp[, -1])
test[, "y_res"] <- NA
incomplete <- as.matrix(rbind(train, test))

## softImpute
lambda.len <- 100
maxrank <- T
fixed.maxrank <- ncol(incomplete)-1
verbose <- F
lam0 <- lambda0(incomplete)
lamseq <- exp(seq(from=log(lam0+.2),to=log(.001),length=lambda.len))
ranks <- as.integer( lamseq )
rank.max <- ifelse(maxrank, 2, fixed.maxrank )
warm <- NULL
for(j in seq(along=lamseq)){
if( verbose ) cat( j, ' ' )
out <- softImpute(x=incomplete, lambda=lamseq[j], rank=rank.max, warm=warm, maxit=1000)
complete <- complete(incomplete, out)
warm <- out
if( maxrank ){
ranks[j] <- sum(round(out$d,4)>0)
rank.max <- min( ranks[j]+2, fixed.maxrank ) ### grows by at most 2, bounded by P/2
}
if( verbose ) cat( '\n' )
}
test <- cbind(test_tmp, complete[(nrow(train)+1):nrow(complete), "y_res"])
colnames(test)[ncol(test)] <- "y_hat"
r_vec <- c(r_vec, cor(test$y_res, test$y_hat, use = "pairwise.complete.obs"))
test_all <- rbind(test_all, test)
}

r_vec

# Prediction on all data:
pred_tmp.3 <- clean_names(as.data.frame(pred_tmp.1)[, -ncol(pred_tmp.1)])
pred <- as.matrix(clean_names(as.data.frame(pred_tmp.1)[, -1]))
lambda.len <- 100
maxrank <- T
fixed.maxrank <- ncol(pred)-1
verbose <- F
lam0 <- lambda0(pred)
lamseq <- exp(seq(from=log(lam0+.2),to=log(.001),length=lambda.len))
ranks <- as.integer( lamseq )
rank.max <- ifelse(maxrank, 2, fixed.maxrank )
warm <- NULL
for(j in seq(along=lamseq)){
if( verbose ) cat( j, ' ' )
out <- softImpute(x=pred, lambda=lamseq[j], rank=rank.max, warm=warm, maxit=1000)
pred_complete <- complete(pred, out)
warm <- out
if( maxrank ){
ranks[j] <- sum(round(out$d,4)>0)
rank.max <- min( ranks[j]+2, fixed.maxrank ) ### grows by at most 2, bounded by P/2
}
if( verbose ) cat( '\n' )
}
pred_all <- cbind(pred_tmp.3[, 1], as.data.frame(pred_complete))
colnames(pred_all)[ncol(pred_all)] <- "y_hat"
colnames(pred_all)[1] <- "iid"
pred_all_out <- pred_all[!(pred_all$iid %in% test_all$iid), ]

fwrite(pred_all_out, paste0("./data/train/DXA_BMD/",t,".unlab_eur.pop.txt"), sep = "\t", quote = F, row.names = F, col.names = T)
fwrite(test_all, paste0("./data/train/DXA_BMD/",t,".lab_eur.pop.txt"), sep = "\t", quote = F, row.names = F, col.names = T)
107 changes: 107 additions & 0 deletions real_data/2_regenie.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/bin/bash

## see more from: https://rgcgithub.github.io/regenie/recommendations/
## regenie documentations: https://rgcgithub.github.io/regenie/options/

## Replace the following variables with your variable of interest
# trait=Arms
# pheno=lab.y_hat

## Step1, stage1
mkdir -p ./data/gwas_regenie/DXA_BMD/regenie/step0

./plink \
--bfile ./UKB/genotype/ukb_EUR_pedigree_finetuned \
--keep ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
--autosome \
--snps-only just-acgt \
--geno 0.01 \
--hwe 0.000001 \
--maf 0.01 \
--allow-no-sex \
--write-snplist \
--out ./data/gwas_regenie/DXA_BMD/regenie/step0/${trait}_${pheno}.qc.pass

mkdir -p ./data/gwas_regenie/DXA_BMD/regenie/step1

./regenie/v3.2.1/regenie \
--step 1 \
--gz \
--threads 8 \
--bed ./UKB/genotype/ukb_EUR_pedigree_finetuned \
--extract ./data/gwas_regenie/DXA_BMD/regenie/step0/${trait}_${pheno}.qc.pass.snplist \
--keep ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
--phenoFile ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
--phenoColList ${pheno} \
--covarFile ./data/covar/DXA_BMD/${trait}.${pheno}_eur.txt \
--covarColList Year,Sex,SexYear,Chip,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20 \
--catCovarList Sex,Chip \
--maxCatLevels 100 \
--bsize 1000 \
--split-l0 ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_fit_bin_parallel,100 \
--out ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_fit_bin_l0

## Step1, stage 2
./regenie/v3.2.1/regenie \
--step 1 \
--gz \
--threads 8 \
--bed ./UKB/genotype/ukb_EUR_pedigree_finetuned \
--extract ./data/gwas_regenie/DXA_BMD/regenie/step0/${trait}_${pheno}.qc.pass.snplist \
--keep ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
--phenoFile ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
--phenoColList ${pheno} \
--covarFile ./data/covar/DXA_BMD/${trait}.${pheno}_eur.txt \
--covarColList Year,Sex,SexYear,Chip,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20 \
--catCovarList Sex,Chip \
--maxCatLevels 100 \
--bsize 1000 \
--run-l0 ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_fit_bin_parallel.master,${job} \
--out ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_fit_bin_l0_${job}

## Step1, stage 3
./regenie/v3.2.1/regenie \
--step 1 \
--gz \
--threads 32 \
--bed ./UKB/genotype/ukb_EUR_pedigree_finetuned \
--extract ./data/gwas_regenie/DXA_BMD/regenie/step0/${trait}_${pheno}.qc.pass.snplist \
--keep ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
--phenoFile ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
--phenoColList ${pheno} \
--covarFile ./data/covar/DXA_BMD/${trait}.${pheno}_eur.txt \
--covarColList Year,Sex,SexYear,Chip,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20 \
--catCovarList Sex,Chip \
--maxCatLevels 100 \
--bsize 1000 \
--run-l1 ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_fit_bin_parallel.master \
--out ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}

# Stage 2:
./plink \
--bfile ./genotype/chunks/chr${chr}.EUR.idupdated_snpid_${chunk}_50 \
--keep ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
--geno 0.01 \
--hwe 0.000001 \
--maf 0.01 \
--write-snplist \
--out ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_qc_pass_chr${chr}_chunk${chunk}

mkdir -p ./data/gwas_regenie/DXA_BMD/regenie/step2

./regenie/v3.2.1/regenie \
--step 2 \
--gz \
--threads 5 \
--bsize 400 \
--pred ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_pred.list \
--bed ./genotype/chunks/chr${chr}.EUR.idupdated_snpid_${chunk}_50 \
--extract ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_qc_pass_chr${chr}_chunk${chunk}.snplist \
--keep ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
--phenoFile ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
--phenoColList ${pheno} \
--covarFile ./data/covar/DXA_BMD/${trait}.${pheno}_eur.txt \
--covarColList Year,Sex,SexYear,Chip,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20 \
--catCovarList Sex,Chip \
--maxCatLevels 100 \
--out ./data/gwas_regenie/DXA_BMD/regenie/step2/gwas_linear_${trait}_${pheno}_chr${chr}_snpid_${chunk}_50
12 changes: 12 additions & 0 deletions real_data/3_popGWAS.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash

# Prepare the Regenie input into the POP-TOOLS format
cd POP-TOOLS

trait=Head_BMD

python3 ./POP-GWAS.py \
--gwas-yhat-unlab ./data/${trait}_yhat_unlab.txt.gz \
--gwas-y-lab ./data/${trait}_y_lab.txt.gz \
--gwas-yhat-lab ./data/${trait}_yhat_lab.txt.gz \
--out ./result/${trait}
107 changes: 107 additions & 0 deletions real_data/4.1_post-GWAS.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/bin/bash

# Clumping
./plink/plink_1.9_linux_x86_64/plink \
--bfile ./UKB_LD/genotype/ukb_10k \
--clump ./result/${trait}_POP-GWAS.txt \
--clump-field P \
--clump-p1 1.4e-8 \
--clump-p2 1.4e-8 \
--clump-r2 0.01 \
--clump-kb 5000 \
--out ./results/clumping/${trait}_popgwas

# Heritability & Genetic correlation
## LDSC munge
traits=(
Arms
Femur_neck
Femur_shaft
Femur_total
Femur_troch
Femur_wards
Head
L1-L4
Legs
Pelvis
Ribs
Spine
Total
Trunk
)

OutDir="./data/munged_gwas"

for trait in "${traits[@]}"; do
filepath="./DXA/cleaned/${trait}_pop.txt.gz"
output="${OutDir}/munged_${trait}_pop"

./bin/python ./Software/ldsc/ldsc/munge_sumstats.py\
--merge-alleles ./Software/ldsc/Inputs/w_hm3.snplist\
--sumstats $filepath\
--ignore "Z"\
--out $output &
done


# Cross-site Genetic correlation
# Put the following code in R:
library(data.table)
library(dplyr)
library(glue)

pop_gwas_path <- list.files("../data/munged_gwas", pattern = "pop.sumstats.gz", full.names = TRUE)

gwas_list_collapsed <- paste(pop_gwas_path, collapse = ",")

for(i in seq_along(pop_gwas_path)){
index_gwas <- pop_gwas_path[i]
index_gwas_trait <- sub(".*/munged_(.*?)_pop.\\.sumstats\\.gz", "\\1", index_gwas)
cmd <- paste0(
"./bin/python ./Software/ldsc/ldsc/ldsc.py",
" --rg ", index_gwas, ",", gwas_list_collapsed,
" --ref-ld-chr ./Software/ldsc/Inputs/EUR_1KGphase1/eur_w_ld_chr/",
" --w-ld-chr ./Software/ldsc/Inputs/EUR_1KGphase1/eur_w_ld_chr/",
" --out ../data/gen_cor/", index_gwas_trait, "_vs_all.pop &")

system(cmd)
}

traits=(
Arms
Femur_neck
Femur_shaft
Femur_total
Femur_troch
Femur_wards
Head
L1-L4
Legs
Pelvis
Ribs
Spine
Total
Trunk
)
# make directory for each trait

for trait in "${traits[@]}"; do

mkdir $PWD/data/gen_cor_vs_other_traits/${trait}

done

# create .R file for each trait to run ldsc gencor
for trait in "${traits[@]}"; do
# Create a unique .R file for each trait
echo "source(\"./Tools/GeneticCorrelation/GenCorrLDSC.R\")" > "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
echo "" >> "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
echo "ss <- c(" >> "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
echo " \"./data/munged_gwas/munged_${trait}_pop.sumstats.gz\"" >> "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
echo ")" >> "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
echo "ss_name <- c(\"${trait}\")" >> "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
echo "working_folder <- \"temp_ldsc\"" >> "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
echo "multiple_ldsc_gencorr_func(working_folder, ss, ss_name)" >> "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
done

# SMR
Loading

0 comments on commit cd012f4

Please sign in to comment.