-
Notifications
You must be signed in to change notification settings - Fork 3
/
run-analysis.sh
122 lines (102 loc) · 5.18 KB
/
run-analysis.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/bin/bash
# 11/11/2021 VERSION
# This file is maintained by Seungsoo Kim (kisudsoe@gmail.com).
WORK_DIR="/data"
BASE_BED=$WORK_DIR"/gwas_biomart_32712.bed"
GWAS_IN=$WORK_DIR"/gwas_biomart_5892_uq.tsv"
ANN_PATH="/data/db_gwas"
### RUN FUNCTIONS ###
# DO NOT CHANGE BELLOW THIS CODE.
ROAD_FILE=$ANN_PATH"/roadmap_meta.tsv"
REG_DIR=$ANN_PATH"/regulome"
ENCODE_FILE=$ANN_PATH"/wgEncodeRegTfbsClusteredWithCellsV3.bed"
TXT_LOG=$WORK_DIR"/txt_logs"
SUB_DIR=$WORK_DIR"/db_intersect"
GNM_TSV=$WORK_DIR"/db_intersect/genome_tsv"
CAT_TSV=$WORK_DIR"/db_intersect/catlas_tsv"
DHS_TSV=$WORK_DIR"/db_intersect/dhslist_tsv"
EPI_TSV=$WORK_DIR"/db_intersect/epimap_18_status"
ENCD_TSV=$WORK_DIR"/db_intersect/encode_tsv"
CAT_BED=$WORK_DIR"/db_intersect/catlas_bed"
DHS_BED=$WORK_DIR"/db_intersect/dhslist_bed"
EPI_BED=$WORK_DIR"/db_intersect/epimap_bed"
GTEX_BED=$WORK_DIR"/db_intersect/gtex_bed"
ENCD_BED=$WORK_DIR"/db_intersect/encode_bed"
export WORK_DIR BASE_BED SH_FILE GWAS_IN ANN_PATH
export RAOD_FILE REG_DIR ENCODE_FILE TXT_LOG SUB_DIR GNM_TSV CAT_TSV DHS_TSV EPI_TSV ENCD_TSV CAT_BED DHS_BED EPI_BED GTEX_BED ENCD_BED
printf "\n1. Overlap the epigenomic track annotations\n"
mkdir $TXT_LOG
mkdir -p $GNM_TSV
printf "\n1/6 Run closestBed for Gencode TSS:"
sortBed -i $ANN_PATH"/gencode_v39_hg19_gff3-tss.bed.gz" | closestBed -d -a $BASE_BED -b stdin > $GNM_TSV"/nearest_gencode_tss.tsv" && printf " done.\n"
printf "2/6 Run intersectBed for ENCODE TFBS converting BED:"
mkdir $WORK_DIR"/summary_bed"
sortBed -i $ANN_PATH"/wgEncodeRegTfbsClusteredWithCellsV3.bed.gz" | intersectBed -wa -wb -a $BASE_BED -b stdin > $GNM_TSV"/encode_tfbs.tsv" && printf " done.\n"
printf "3/6 Run intersectBed for db_gwas/custom_bed:"
Rscript src/utils.r --bash_custom \
--base $BASE_BED --bedtools intersect --ann $ANN_PATH"/custom_bed" --out $GNM_TSV \
> $TXT_LOG"/log1.3-custom.txt"
cat $SUB_DIR"/genome_tsv.sh" | parallel --bar #&& printf " done.\n"
printf "4/6 Run intersectBed for db_gwas/catlas_bed:"
mkdir $CAT_TSV
Rscript src/utils.r --bash_custom \
--base $BASE_BED --bedtools intersect --ann $ANN_PATH"/catlas_bed" --out $CAT_TSV \
> $TXT_LOG"/log1.4-catlas.txt"
cat $SUB_DIR"/catlas_tsv.sh" | parallel --bar #&& printf " done.\n"
find $CAT_TSV -size 0 -print -delete > $TXT_LOG"/log1.4-catlas-removed.txt"
printf "5/6 Run intersectBed for db_gwas/dhslist_bed:"
mkdir $DHS_TSV
Rscript src/utils.r --bash_custom \
--base $BASE_BED --bedtools intersect --ann $ANN_PATH"/dhslist_bed" --out $DHS_TSV \
> $TXT_LOG"/log1.5-dhslist.txt"
cat $SUB_DIR"/dhslist_tsv.sh" | parallel --bar #&& printf " done.\n"
find $DHS_TSV -size 0 -print -delete > $TXT_LOG"/log1.5-dhslist-removed.txt"
printf "6/6 Run intersectBed for db_gwas/epimap_18_bed:"
mkdir $EPI_TSV
Rscript src/utils.r --bash_custom \
--base $BASE_BED --bedtools intersect --ann $ANN_PATH"/epimap_18_bed" --out $EPI_TSV \
> $TXT_LOG"/log1.6-epimap.txt"
cat $SUB_DIR"/epimap_18_status.sh" | parallel --bar #&& printf " done.\n"
find $EPI_TSV -size 0 -print -delete > $TXT_LOG"/log1.6-epimap-removed.txt"
printf "\n2. Annotations for ENCODE, RegulomeDB, lncRNASNP, and GTEx\n\n"
mkdir $EPI_BED
mkdir $WORK_DIR/summary_tsv
cat tsv-analysis.sh | parallel --bar
printf "\n3. Convert BED for ENCODE, RegulomeDB, and lncRNASNP annotations\n\n"
cat bed-analysis.sh | parallel --bar
printf "\n4. GWAS summary\n"
printf "1/4 Roadmap 18 Enh_Prom & Encode Tfbs to BED"
EP18ENH=`find $WORK_DIR"/summary_bed" -name "epimap_18-enh_prom_*.bed"`
ENCODE=`find $WORK_DIR"/summary_bed" -name "encode_tfbs_*.bed"`
Rscript src/db_summ.r --uniset \
--base $EP18ENH,$ENCODE --out $WORK_DIR"/summary_tsv/enh_prom_tfbs" --out_bed $WORK_DIR"/summary_bed/enh_prom_tfbs" --intersect TRUE > $TXT_LOG"/log4.1-enhancer_tfbs.txt"
printf " - done\n"
printf "2/4 Overlap-1 to BED"
ENH_TFBS=`find $WORK_DIR"/summary_bed" -name "enh_prom_tfbs_*.bed"`
REGULOME=`find $WORK_DIR"/summary_bed" -name "snp_regulome2b_*.bed"`
Rscript src/db_summ.r --uniset \
--base $ENH_TFBS,$REGULOME --out $WORK_DIR"/summary_tsv/overlap-1" --out_bed $WORK_DIR"/summary_bed/overlap-1" --intersect TRUE > $TXT_LOG"/log4.2-overlap-1.txt"
printf " - done\n"
printf "3/4 Overlap-2 to BED"
GTEX=`find $WORK_DIR"/summary_bed" -name "gtex_eqtl_*.bed"`
Rscript src/db_summ.r --uniset \
--base $ENH_TFBS,$GTEX --out $WORK_DIR"/summary_tsv/overlap-2" --out_bed $WORK_DIR"/summary_bed/overlap-2" --intersect TRUE > $TXT_LOG"/log4.3-overlap-2.txt"
printf " - done\n"
printf "4/4 Summarize GWAS candidates, Nearest genes, and GTEx eQTLs"
ANN_GTEX=`find $WORK_DIR"/summary_tsv" -name 'gtex_signif_*.tsv'`
Rscript src/db_summ.r --summ \
--base $WORK_DIR"/summary_bed",$GTEX_BED \
--out $WORK_DIR"/summary_tsv/summ" \
--sub_dir FALSE \
--ann_gwas $GWAS_IN \
--ann_encd $GNM_TSV"/encode_tfbs.tsv" \
--ann_near $GNM_TSV"/nearest_gencode_tss.tsv" \
--ann_gtex $ANN_GTEX \
> $TXT_LOG"/log4.4-gwas_summary.txt"
printf "done\n"
### END FUNCTIONS ###
#SH_FILE=$WORK_DIR"/dist_data.sh"
#printf " Writing "$SH_FILE"\n"
#Rscript src/utils.r --bash --base $BASE_BED --ann $ANN_PATH --out $WORK_DIR
#printf "1-1. Run bedtools commands for genome_dist, Roadmap_dist_15/18/25, and custom_dist. This step is taking a while. "
#time cat $SH_FILE | parallel