-
Notifications
You must be signed in to change notification settings - Fork 0
/
bionanotator.pl
200 lines (158 loc) · 8.34 KB
/
bionanotator.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#/usr/bin/perl -w
use strict;
use Getopt::Long qw(GetOptions);
# defaults for file parameters
my$hallmark_mapping_file="hallmark_genes.tsv"; # unusual mapping file, not one gene per line
my$infile="in.vcf";
my$logfile="logfile.log";
my$sample="testsample";
# true/false params
my$delete_empty_cols=1;
my$delete_in_between_files=0;
my$case="g";# germline, with mother/father information= "g", somatic, without father/mother information="s"
my$include="";# columns to extra add to the output , list of numbers splitted by a ,
GetOptions('in|i=s'=>\$infile,'sample|s=s'=>\$sample,'hallmark_file|h=s'=>\$hallmark_mapping_file,'delete_empty_cols|d=i'=>\$delete_empty_cols,'logfile|log=s'=>\$logfile,'delete_in_between_files|del=i'=>\$delete_in_between_files,'case|c=s'=>\$case,'include|i_c=s'=>\$include)|| warn "Using default parameters:\nin=in.vcf\ndelete_empty_cols=1\nhallmark_file=hallmark_genes.tsv!\nlogfile=logfile.log\ndelete_in_between_files=1\n";
chomp($infile,$hallmark_mapping_file,$sample,$delete_empty_cols,$delete_in_between_files,$logfile,$case,$include);
open(ER,'>>',$logfile)||die "$!\n logfile not found. use --log path/to/your_logfile.log\n";
# hallmark mapping hash filling
print ER "reading hallmark mapping file $hallmark_mapping_file...\n";
open(MA,$hallmark_mapping_file) || die "$!\n hallmark mapping file not found\n";
my@allemappings= <MA>;
my%mapping_hash=(); # mapping hash: gene name is key, hallmark mechanism is value
foreach my $mapline (@allemappings){
chomp $mapline;
my@mappingline_parts=split(/\s+/,$mapline);
my$hallmarktype_full=shift @mappingline_parts;# getting hallmark description properly with some regex cleaning by shifting first array index
$hallmarktype_full=~s/HALLMARK//;# cleanup of the hallmark name
$hallmarktype_full=~s/\s+//;
$hallmarktype_full=~s/_//;
shift @mappingline_parts; # remove web adress
# rest of the line is all hallmark genes, need to be cleaned up before saving
foreach my $hallmg (@mappingline_parts){
$hallmg=~s/\s+//;
if($hallmg=~/[A-Z]/){ # checking for empty lines and gene names
$mapping_hash{"$hallmg"}="$hallmarktype_full";
}
}
}
my@allehallmarkg=keys %mapping_hash;# to quickly check if we have a hallmark at all connected to gene in question
my@all_hm_genes= values %mapping_hash;
close MA;
# check for infile
my$infilecheck=`ls -1f $infile`;
if($infilecheck=~/$infile/gix){
print ER "found infile. creating rundir...\n";
}
else{
die "could not find infile\ntry --in yourfile.csv\n";
}
# prepare the annovar run
mkdir "run_$sample";
print ER "created run dir run_$sample\n";
my$prep_run=`perl prepare_bionano_output.pl --in $infile --vcf run_$sample/$sample.vcf --tsv run_$sample/$sample._raw_cols.tsv --c $case --include $include`;
print "prepared input: $prep_run\n";
chdir "run_$sample";
print ER "now in run_$sample/.\n";
# find infile
# create rundir
# go to rundir
# execute annovar - maybe place and DB place as param?
my$annovar_ex=`perl ../../table_annovar.pl $sample.vcf ../../humanhg38/ -buildver hg38 -out $sample.annotated -polish -protocol refGene,cytoBand,dgvMerged,gwasCatalog,wgEncodeRegDnaseClustered,genomicSuperDups,wgRna -operation gx,r,r,r,r,r,r -nastring . >$logfile`;
print ER "executed annovar: $annovar_ex\n";
# delete all non-txt files in run dir
if($delete_in_between_files){
my$del_between_files=`find . -type f -name '*.hg38*' -print0 | xargs -0 rm --`; # here only deleting some annovar in-between files
print ER "deleted all in-between files per users request: $del_between_files\n";
}
else{
print ER "set to not delete annovar in-between files.\n";
}
my$outfile_found=`ls -1f *.txt`;
chomp $outfile_found;
print ER "found annovar outfile: $outfile_found\n";
# the final outfile will be : run_$sample/$sample_annotated.tsv
open(FIN,">$sample.annotated.tsv") || die "$!\ncould not write the final outfile\n";
open(OU,$outfile_found)|| die "$!\ncould not open annovar outfile in run_$sample\n";
my@all_out_lines=<OU>;
my$line_count=0;
my@file_lines=();
my$full_header_line=0;
my@lines_with_afs=();# full line with AF values
# two possible header lines: with and without allele freqs
my$new_header_line="Chr Start End Ref Alt Func.refGene\tCancer.hallmark cytoBand dgvMerged gwasCatalog wgEncodeRegDnaseClustered genomicSuperDups\n";
my$w_af_header="Chr Start End Ref Alt Func.refGene\tCancer.hallmark GeneDetail.refGene cytoBand dgvMerged gwasCatalog wgEncodeRegDnaseClustered genomicSuperDups wgRna\tAF AF_raw AF_male AF_female AF_afr AF_ami AF_amr AF_asj AF_eas AF_fin AF_nfe AF_oth AF_sas\n";
my$hallm="";
my$allele_freqs=""; #- attach here all allelle frequencies of the whole file- if there is more than . in there, leave it in the file
foreach my $outline(@all_out_lines){
$line_count=$line_count + 1;
chomp $outline;
if($line_count == 1 ){# header
# save header, edit as needed
}
else{
my@all_line_parts=split(/\t/,$outline);
# find cancer hallmarks in gene names
my@genes_all=split(/\;/,$all_line_parts[6]);# find each gene
$hallm="";
foreach my $single_gene(@genes_all){
# check hash for fitting hits
$single_gene=~s/\"//g;
if(grep(/$single_gene?/,@allehallmarkg)){
if($mapping_hash{$single_gene}=~/[A-Z]/){
my$hallm_neu=$mapping_hash{$single_gene};# one hallmark max per SV for now
$hallm="$hallm"."+"."$hallm_neu".";"."$single_gene";# collecting all hallmarks
}
}
}
# cleanup hallmarks
$hallm=~s/\./\;/g;
if(!($hallm=~/[A-z]/)){
$hallm="none";
}
$allele_freqs="$allele_freqs"."$all_line_parts[16]";
# all other file lines
# split by column
# cleanup the alleles freqs from . and whitespace?
if($allele_freqs=~/[0-9]/g){
# leave the allele frequencies in the output file
# change the header line
$full_header_line=1;
}
my$new_line="$all_line_parts[0]\t$all_line_parts[1]\t$all_line_parts[2]\t$all_line_parts[3]\t$all_line_parts[4]\t$all_line_parts[5]\t$hallm\t$all_line_parts[10]\t$all_line_parts[11]\t$all_line_parts[12]\t$all_line_parts[13]\t$all_line_parts[14]\n";
# cleanup
$new_line=~s/\t\s+/\t/g;
#print FIN $new_line;
push(@file_lines,$new_line);
my$new_line_with_AF="$all_line_parts[0]\t$all_line_parts[1]\t$all_line_parts[2]\t$all_line_parts[3]\t$all_line_parts[4]\t$all_line_parts[5]\t$all_line_parts[6]\t$hallm\t$all_line_parts[7]\t$all_line_parts[10]\t$all_line_parts[11]\t$all_line_parts[12]\t$all_line_parts[13]\t$all_line_parts[14]\t$all_line_parts[15]\t$all_line_parts[16]\t$all_line_parts[17]\t$all_line_parts[18]\t$all_line_parts[19]\t$all_line_parts[20]\t$all_line_parts[21]\t$all_line_parts[22]\t$all_line_parts[23]\t$all_line_parts[24]\t$all_line_parts[25]\t$all_line_parts[26]\t$all_line_parts[27]\t$all_line_parts[28]\n";
$new_line_with_AF=~s/\t\s+/\t/g;
push(@lines_with_afs,$new_line_with_AF);
}
}
# now print the final file- first decide on the header
if($full_header_line){
print FIN $w_af_header;
print ER "found allele frequencies, thus including them ...\n";
foreach my $line_w_af(@lines_with_afs){
print FIN $line_w_af;
}
}
elsif($full_header_line==0){
print FIN $new_header_line;
print ER "found NO allele frequencies, thus excluding them ...\n";
foreach my $raw_line (@file_lines){
print FIN $raw_line;
}
}
close FIN;
# cleaning up the annotated file to be a 3-column .bed- so it only will be created once annovar finished
my$catting_part1=`cat $sample.annotated.tsv | cut -f -3 | sed 's/^/chr/' | tail -n +2 >bedfile.$sample.annotated.bed`;
print ER "done preparing annotated file :$catting_part1\n";
# now use bedtools to count overlapping dgv entries
my$bedtools_out=`bedtools intersect -a bedfile.$sample.annotated.bed -b ../../humanhg38/dgv_merged_bedfile.bed -f 0.5 -F 0.5 -c >dgv_hitcounts.$sample.annotated.tsv`;
#get the column of the hitcounts into a final outfile + append header name
my$all_hits=`cat dgv_hitcounts.$sample.annotated.tsv | cut -f 4 | sed -e '1i n_hits_dgv_filtered ' >hits_$sample.annotated.bed`;
# then re-introduce these counts into a new file with count_overlap_30 in header
my$complete_file=`paste $sample._raw_cols.tsv $sample.annotated.tsv hits_$sample.annotated.bed >$sample.annotated.finished.tsv`;
print ER "errors bedtools execution:$bedtools_out\nerrors cutting the bedtools outfile:$all_hits \nerrors pasting the complete outfile:$complete_file\n";
print "done creating $sample.annotated.tsv. and $sample.annotated.finished.tsv\n";
1;