first time commit

jmiao24 · Aug 16, 2024 · cd012f4 · cd012f4
1 parent d215fa0
commit cd012f4
Show file tree

Hide file tree

Showing 13 changed files with 1,259 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -1 +1,24 @@
-# POP-GWAS_analysis
+# POP-GWAS analysis
+
+This repository contains source code to reproduce analyses in "Valid inference for machine learning-assisted genome-wide association studies". 
+
+The official software package is in [POP-TOOLS](https://github.com/qlu-lab/POP-TOOLS) GitHub repo.
+
+## File structures
+### Simulations
+* `./simulation/Fun.R`: Functions for simulation and simple implementation of POP-GWAS in R
+* `./simulation/qt.R`: simulation for quantitative phenotype
+* `./simulation/bt.R`: simulation for binary phenotype
+* `./simulation/imputation_r_FPR.R`: type-I error simulation for varying imputation correlation 
+* `./simulation/imputation_r_power.R`: power simulation for varying imputation correlation
+* `./simulation/vary_ratio.R`: simulation for varying sample size of unlabeled data
+
+### UK Biobank data analysis
+* `./real_data/1_softimpute.R`: phenotype imputation using softimpute
+* `./real_data/2_regenie.sh`: run GWAS using regenie
+* `./real_data/3_popGWAS.sh`: apply POP-GWAS to summary statistics
+* `./real_data/4.1_post-GWAS.sh`: estimating heritability and genetic correlation using LD score regression
+* `./real_data/4.2_coloc.R`: colocalization analysis
+
+## Contact
+Feel free to reach out to Jiacheng at jmiao24@wisc.edu for questions.
diff --git a/real_data/1_softimpute.R b/real_data/1_softimpute.R
@@ -0,0 +1,120 @@
+rm(list = ls())
+library(data.table)
+library(openxlsx)
+library(janitor)
+library(rpart)
+library(softImpute)
+library(caret)
+library(optparse)
+
+option_list = list(
+    make_option("--t", action="store", type='character')
+)
+
+opt = parse_args(OptionParser(option_list=option_list))
+
+t <- opt$t
+
+# Y
+pheno <- fread("./data/train/DXA/dxa_bone_size_mineral_density.txt.gz")
+covar <- fread("./Resource/Phenotype/EUR_covar.txt.gz")
+pheno.indep <- pheno[pheno$IID %in% covar$IID, ]
+covar <- covar[match(pheno.indep$IID, covar$IID),]
+
+# Z
+pred <- fread(paste0("./data/train/DXA_BMD/",t,"_eur.Z.txt"))
+colnames(pred)[1] <- "IID"
+pred <- pred[match(pheno.indep$IID, pred$IID), ]
+
+pred_tmp <- cbind(pred, as.data.frame(pheno.indep)[, t, drop = F])
+colnames(pred_tmp)[ncol(pred_tmp)] <- "y"
+pred_tmp.1 <- pred_tmp[,"IID",drop=F]
+for (i in 2:ncol(pred_tmp)) {
+    c <- covar
+    c$est <- as.data.frame(pred_tmp)[, i]
+    model <- lm(est ~ Sex+Year+SexYear+Chip+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10+PC11+PC12+PC13+PC14+PC15+PC16+PC17+PC18+PC19+PC20, data = c)
+    res <- rep(NA, nrow(pred_tmp))
+    res[as.integer(names(residuals(model)))] <- residuals(model)
+    pred_tmp.1 <- cbind(pred_tmp.1, res)
+}
+colnames(pred_tmp.1)[2:ncol(pred_tmp.1)] <- paste0(colnames(pred_tmp)[2:ncol(pred_tmp)], ".res")
+pred_tmp.2 <- pred_tmp.1[as.vector(!is.na(pred_tmp.1[, ncol(pred_tmp.1), with = F])), ]
+pred_tmp.2 <- clean_names(pred_tmp.2)
+dat <- pred_tmp.2
+
+
+# Cross-fitting
+cv_folds <- createFolds(dat$y_res, k = 10)  # Assuming you're doing 10-fold CV
+test_all <- c()
+r_vec <- c()
+
+set.seed(1234)
+for (i in 1:10) {
+    print(i)
+    # i <- 1
+    # Split the data into training and test based on the folds
+    train <- as.matrix(dat[-cv_folds[[i]], -1])
+    test_tmp <- dat[cv_folds[[i]],]
+    test <- as.matrix(test_tmp[, -1])
+    test[, "y_res"] <- NA
+    incomplete <- as.matrix(rbind(train, test))
+
+    ## softImpute
+    lambda.len <- 100
+    maxrank <- T
+    fixed.maxrank <- ncol(incomplete)-1
+    verbose <- F
+    lam0 <- lambda0(incomplete)
+    lamseq <- exp(seq(from=log(lam0+.2),to=log(.001),length=lambda.len))
+    ranks <- as.integer( lamseq )
+    rank.max <- ifelse(maxrank, 2, fixed.maxrank )
+    warm <- NULL
+    for(j in seq(along=lamseq)){
+        if( verbose ) cat( j, ' ' )
+        out <- softImpute(x=incomplete, lambda=lamseq[j], rank=rank.max, warm=warm, maxit=1000)
+        complete <- complete(incomplete, out)
+        warm <- out
+        if( maxrank ){
+            ranks[j]  <- sum(round(out$d,4)>0)
+            rank.max  <- min( ranks[j]+2, fixed.maxrank ) ### grows by at most 2, bounded by P/2
+        }
+        if( verbose ) cat( '\n' )
+    }
+    test <- cbind(test_tmp, complete[(nrow(train)+1):nrow(complete), "y_res"])
+    colnames(test)[ncol(test)] <- "y_hat"
+    r_vec <- c(r_vec, cor(test$y_res, test$y_hat, use = "pairwise.complete.obs"))
+    test_all <- rbind(test_all, test)
+}
+
+r_vec
+
+# Prediction on all data:
+pred_tmp.3 <- clean_names(as.data.frame(pred_tmp.1)[, -ncol(pred_tmp.1)])
+pred <- as.matrix(clean_names(as.data.frame(pred_tmp.1)[, -1]))
+lambda.len <- 100
+maxrank <- T
+fixed.maxrank <- ncol(pred)-1
+verbose <- F
+lam0 <- lambda0(pred)
+lamseq <- exp(seq(from=log(lam0+.2),to=log(.001),length=lambda.len))
+ranks <- as.integer( lamseq )
+rank.max <- ifelse(maxrank, 2, fixed.maxrank )
+warm <- NULL
+for(j in seq(along=lamseq)){
+    if( verbose ) cat( j, ' ' )
+    out <- softImpute(x=pred, lambda=lamseq[j], rank=rank.max, warm=warm, maxit=1000)
+    pred_complete <- complete(pred, out)
+    warm <- out
+    if( maxrank ){
+        ranks[j]  <- sum(round(out$d,4)>0)
+        rank.max  <- min( ranks[j]+2, fixed.maxrank ) ### grows by at most 2, bounded by P/2
+    }
+    if( verbose ) cat( '\n' )
+}
+pred_all <- cbind(pred_tmp.3[, 1], as.data.frame(pred_complete))
+colnames(pred_all)[ncol(pred_all)] <- "y_hat"
+colnames(pred_all)[1] <- "iid"
+pred_all_out <- pred_all[!(pred_all$iid %in% test_all$iid), ]
+
+fwrite(pred_all_out, paste0("./data/train/DXA_BMD/",t,".unlab_eur.pop.txt"), sep = "\t", quote = F, row.names = F, col.names = T)
+fwrite(test_all, paste0("./data/train/DXA_BMD/",t,".lab_eur.pop.txt"), sep = "\t", quote = F, row.names = F, col.names = T)
diff --git a/real_data/2_regenie.sh b/real_data/2_regenie.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+
+## see more from: https://rgcgithub.github.io/regenie/recommendations/
+## regenie documentations: https://rgcgithub.github.io/regenie/options/
+
+## Replace the following variables with your variable of interest
+# trait=Arms
+# pheno=lab.y_hat
+
+## Step1, stage1
+mkdir -p ./data/gwas_regenie/DXA_BMD/regenie/step0
+
+./plink \
+--bfile ./UKB/genotype/ukb_EUR_pedigree_finetuned \
+--keep ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
+--autosome \
+--snps-only just-acgt \
+--geno 0.01 \
+--hwe 0.000001 \
+--maf 0.01 \
+--allow-no-sex \
+--write-snplist \
+--out ./data/gwas_regenie/DXA_BMD/regenie/step0/${trait}_${pheno}.qc.pass
+
+mkdir -p ./data/gwas_regenie/DXA_BMD/regenie/step1
+
+./regenie/v3.2.1/regenie \
+--step 1 \
+--gz \
+--threads 8 \
+--bed ./UKB/genotype/ukb_EUR_pedigree_finetuned \
+--extract ./data/gwas_regenie/DXA_BMD/regenie/step0/${trait}_${pheno}.qc.pass.snplist \
+--keep ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
+--phenoFile ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
+--phenoColList ${pheno} \
+--covarFile ./data/covar/DXA_BMD/${trait}.${pheno}_eur.txt \
+--covarColList Year,Sex,SexYear,Chip,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20 \
+--catCovarList Sex,Chip \
+--maxCatLevels 100 \
+--bsize 1000 \
+--split-l0 ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_fit_bin_parallel,100 \
+--out ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_fit_bin_l0
+
+## Step1, stage 2
+./regenie/v3.2.1/regenie \
+--step 1 \
+--gz \
+--threads 8 \
+--bed ./UKB/genotype/ukb_EUR_pedigree_finetuned \
+--extract ./data/gwas_regenie/DXA_BMD/regenie/step0/${trait}_${pheno}.qc.pass.snplist \
+--keep ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
+--phenoFile ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
+--phenoColList ${pheno} \
+--covarFile ./data/covar/DXA_BMD/${trait}.${pheno}_eur.txt \
+--covarColList Year,Sex,SexYear,Chip,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20 \
+--catCovarList Sex,Chip \
+--maxCatLevels 100 \
+--bsize 1000 \
+--run-l0 ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_fit_bin_parallel.master,${job} \
+--out ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_fit_bin_l0_${job}
+
+## Step1, stage 3
+./regenie/v3.2.1/regenie \
+--step 1 \
+--gz \
+--threads 32 \
+--bed ./UKB/genotype/ukb_EUR_pedigree_finetuned \
+--extract ./data/gwas_regenie/DXA_BMD/regenie/step0/${trait}_${pheno}.qc.pass.snplist \
+--keep ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
+--phenoFile ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
+--phenoColList ${pheno} \
+--covarFile ./data/covar/DXA_BMD/${trait}.${pheno}_eur.txt \
+--covarColList Year,Sex,SexYear,Chip,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20 \
+--catCovarList Sex,Chip \
+--maxCatLevels 100 \
+--bsize 1000 \
+--run-l1 ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_fit_bin_parallel.master \
+--out ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}
+
+# Stage 2:
+./plink \
+--bfile ./genotype/chunks/chr${chr}.EUR.idupdated_snpid_${chunk}_50 \
+--keep ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
+--geno 0.01 \
+--hwe 0.000001 \
+--maf 0.01 \
+--write-snplist \
+--out ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_qc_pass_chr${chr}_chunk${chunk}
+
+mkdir -p ./data/gwas_regenie/DXA_BMD/regenie/step2
+
+./regenie/v3.2.1/regenie \
+--step 2 \
+--gz \
+--threads 5 \
+--bsize 400 \
+--pred ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_pred.list \
+--bed ./genotype/chunks/chr${chr}.EUR.idupdated_snpid_${chunk}_50 \
+--extract ./data/gwas_regenie/DXA_BMD/regenie/step1/${trait}_${pheno}_qc_pass_chr${chr}_chunk${chunk}.snplist \
+--keep ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
+--phenoFile ./data/pheno/DXA_BMD/${trait}.${pheno}_eur.txt \
+--phenoColList ${pheno} \
+--covarFile ./data/covar/DXA_BMD/${trait}.${pheno}_eur.txt \
+--covarColList Year,Sex,SexYear,Chip,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20 \
+--catCovarList Sex,Chip \
+--maxCatLevels 100 \
+--out ./data/gwas_regenie/DXA_BMD/regenie/step2/gwas_linear_${trait}_${pheno}_chr${chr}_snpid_${chunk}_50
diff --git a/real_data/3_popGWAS.sh b/real_data/3_popGWAS.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Prepare the Regenie input into the POP-TOOLS format
+cd POP-TOOLS
+
+trait=Head_BMD
+
+python3 ./POP-GWAS.py \
+--gwas-yhat-unlab ./data/${trait}_yhat_unlab.txt.gz \
+--gwas-y-lab ./data/${trait}_y_lab.txt.gz \
+--gwas-yhat-lab ./data/${trait}_yhat_lab.txt.gz \
+--out ./result/${trait}
diff --git a/real_data/4.1_post-GWAS.sh b/real_data/4.1_post-GWAS.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+
+# Clumping
+./plink/plink_1.9_linux_x86_64/plink \
+--bfile ./UKB_LD/genotype/ukb_10k \
+--clump ./result/${trait}_POP-GWAS.txt \
+--clump-field P \
+--clump-p1 1.4e-8 \
+--clump-p2 1.4e-8 \
+--clump-r2 0.01 \
+--clump-kb 5000 \
+--out ./results/clumping/${trait}_popgwas
+
+# Heritability & Genetic correlation
+## LDSC munge
+traits=(
+    Arms
+    Femur_neck
+    Femur_shaft
+    Femur_total
+    Femur_troch
+    Femur_wards
+    Head
+    L1-L4
+    Legs
+    Pelvis
+    Ribs
+    Spine
+    Total
+    Trunk
+)
+
+OutDir="./data/munged_gwas"
+
+for trait in "${traits[@]}"; do
+    filepath="./DXA/cleaned/${trait}_pop.txt.gz"
+    output="${OutDir}/munged_${trait}_pop"
+
+    ./bin/python ./Software/ldsc/ldsc/munge_sumstats.py\
+        --merge-alleles ./Software/ldsc/Inputs/w_hm3.snplist\
+        --sumstats $filepath\
+        --ignore "Z"\
+        --out $output &
+done
+
+
+# Cross-site Genetic correlation
+# Put the following code in R:
+library(data.table)
+library(dplyr)
+library(glue)
+
+pop_gwas_path <- list.files("../data/munged_gwas", pattern = "pop.sumstats.gz", full.names = TRUE)
+
+gwas_list_collapsed <- paste(pop_gwas_path, collapse = ",")
+
+for(i in seq_along(pop_gwas_path)){
+    index_gwas <- pop_gwas_path[i]
+    index_gwas_trait <- sub(".*/munged_(.*?)_pop.\\.sumstats\\.gz", "\\1", index_gwas)
+    cmd <- paste0(
+        "./bin/python ./Software/ldsc/ldsc/ldsc.py",
+        " --rg ", index_gwas, ",", gwas_list_collapsed,
+        " --ref-ld-chr ./Software/ldsc/Inputs/EUR_1KGphase1/eur_w_ld_chr/",
+        " --w-ld-chr ./Software/ldsc/Inputs/EUR_1KGphase1/eur_w_ld_chr/",
+        " --out ../data/gen_cor/", index_gwas_trait, "_vs_all.pop &")
+
+    system(cmd)
+}
+
+traits=(
+    Arms
+    Femur_neck
+    Femur_shaft
+    Femur_total
+    Femur_troch
+    Femur_wards
+    Head
+    L1-L4
+    Legs
+    Pelvis
+    Ribs
+    Spine
+    Total
+    Trunk
+)
+# make directory for each trait
+
+for trait in "${traits[@]}"; do
+
+    mkdir $PWD/data/gen_cor_vs_other_traits/${trait}
+
+done
+
+# create .R file for each trait to run ldsc gencor
+for trait in "${traits[@]}"; do
+    # Create a unique .R file for each trait
+    echo "source(\"./Tools/GeneticCorrelation/GenCorrLDSC.R\")" > "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
+    echo "" >> "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
+    echo "ss <- c(" >> "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
+    echo "    \"./data/munged_gwas/munged_${trait}_pop.sumstats.gz\"" >> "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
+    echo ")" >> "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
+    echo "ss_name <- c(\"${trait}\")" >> "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
+    echo "working_folder <- \"temp_ldsc\"" >> "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
+    echo "multiple_ldsc_gencorr_func(working_folder, ss, ss_name)" >> "$PWD/data/gen_cor_vs_other_traits/${trait}/ldsc_gencor.R"
+done
+
+# SMR