-
Notifications
You must be signed in to change notification settings - Fork 1
/
alignment_distance.pl
executable file
·138 lines (122 loc) · 4.51 KB
/
alignment_distance.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/perl -w
use strict;
use File::Basename;
use Getopt::Long;
my $qual_cutoff = 0;
my $minread = 35;
my $direction = 1;
my $range = 100;
my ($output, $side, $drop);
GetOptions ("quality=s" => \$qual_cutoff,
"minread=i" => \$minread,
"range=i" => \$range,
"drop" => \$drop,
"output=i" => \$output);
&help unless scalar @ARGV == 1;
&help unless $ARGV[0] =~ /\.bam$/;
if ($output) {
my $outname = basename($ARGV[0]);
$outname =~ s/\.bam//;
open WRITE5, "| samtools view -S -b - >$outname.1st_dist$output.bam" or die "could not write file $! \n";
open WRITE3, "| samtools view -S -b - >$outname.2nd_dist$output.bam" or die "could not write file $! \n";
}
open BAM, "samtools view -hX -F p $ARGV[0] |" or die "could not open file $!\n";
my (%dist_hist, %gap_base, %gap_before, %gap_after);
my ($counter, $interval) = (0, 0);
my $prev_chr = "";
my $prev_end_plus = 0;
my $prev_end_minus = 0;
my ($prev_line_plus, $prev_line_minus);
while (my $line = <BAM>) {
$counter++; $interval++;
if ($line =~ /^\@/) {
if ($output) {
print WRITE5 $line;
print WRITE3 $line;
}
next;
}
if ($interval == 100_000) {
print STDERR "\r$counter";
$interval = 0;
}
chomp $line;
my @fields = split /\t/, $line;
my ($header, $flag, $chr, $start, $mapqual, $cigar, $junk1, $junk2, $junk3, $sequence) = split /\t/, $line;
my $length = length($sequence);
unless ($drop) {
next unless $chr =~ /^(\d{1,2}|X)$/;
}
next if $length < $minread;
next if $mapqual < $qual_cutoff;
my $end = aln_end($start, $cigar, $length);
if ($flag =~ /r/) {
if ($chr eq $prev_chr && $prev_end_minus > 0 && $prev_end_minus < $start) {
my $distance = $start - $prev_end_minus;
$dist_hist{$distance}++;
if ($output && $distance == $output) {
}
}
$prev_end_minus = $end;
$prev_line_minus = $line;
} else {
if ($chr eq $prev_chr && $prev_end_plus > 0 && $prev_end_plus < $start) {
my $distance = $start - $prev_end_plus;
$dist_hist{$distance}++;
if ($output && $distance == $output) {
print WRITE3 "$line\n";
print WRITE5 "$prev_line_plus\n" if $prev_line_plus;
}
}
$prev_end_plus = $end;
$prev_line_plus = $line;
}
$prev_chr = $chr;
}
foreach my $dist (1 .. $range) {
print "$dist\t", $dist_hist{$dist} || 0, "\n";
}
sub aln_end {
my ($start, $cigar, $length) = @_;
my $end;
if ($cigar =~ /[ID]/) {
my ($insertions, $deletions, $matches) = (0,0,0);
my $cigar_backup = $cigar;
while ($cigar =~ s/(\d+)([MID])//) {
my ($num, $type) = ($1, $2);
$insertions += $num if $type eq "I";
$deletions += $num if $type eq "D";
$matches += $num if $type eq "M";
}
$end = $start + $matches + $deletions - 1;
} else {
$end = $start + $length - 1;
}
return $end;
}
sub help {
print STDERR "
This script computes a histogram of the distance between the 3' end of each sequence and the closest 5' end of another sequence on the same strand. A 1-bp gap corresponds to a distance of 2. Input are sorted alignment files in BAM format produced by BWA. There is a strict requirement for single reads, paired reads are filtered out.
---------------> ---------------> ----------->
5 3 5 3 5 3
---------------------->
5 3
|<-------->| |<--------->|
[usage]
./alignment_distance.pl [-options] in.bam
[options]
-quality map quality cutoff [0]
-minread minimal length cutoff [35]
-range range of values to report
-output write out sequences that have another sequence in distance X from their 5' or 3' end respectively.
-drop drop filtering of sequences that are not mapped to the autosomes and chromosome X (recommended when using
a reference genome other than hg19)
[output]
infile.1st_distX.bam first (left) sequence whose 3' end is in distance X to the 5' end of the following sequence
infile.2nd_distX.bam second (right) sequences whose 5' end is in distance X to the 3' of the sequence to the left
screen alignment distance histogram
DEPENDENCIES
This perl script has been successfully used with perl 5 (version 22). It requires 2 perl modules to be installed (File::Basename and Getopt::Long) as well as samtools (successfully used with version 1.3.1).
";
exit;
}