-
Notifications
You must be signed in to change notification settings - Fork 0
/
remove_redundant_protein.pl
97 lines (80 loc) · 2.14 KB
/
remove_redundant_protein.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/perl
use strict ;
#use MJL_dataUtils ;
(@ARGV == 1) || die "usage: $0 fastaFile\n" ;
my $inFile = shift ;
my $outFile = $inFile ;
$outFile =~ s/\.\w+$// ;
$outFile .= 'NR0.fa' ;
my @seqs = readFasta ($inFile) ;
my $n = scalar @seqs ;
# restrict to non-redundant sequences:
my @seqsNR = nonRedundantSeqS (@seqs) ;
my $nNR = scalar @seqsNR ;
print "compacting $n sequences to $nNR non-redundant sequences...\n" ;
# write to file:
open (OUT, ">$outFile") || die ;
foreach my $seq (@seqsNR) {
print OUT ">$seq->[0] $seq->[2]\n$seq->[1]\n" ;
}
close OUT ;
sub nonRedundantSeqS (@) {
my @seqs = @_ ;
my $counter = my $percentage = 0 ;
for (my $i1=0; $i1<@seqs-1; $i1++) {
my $seq1 = substr $seqs[$i1][1], 1;
#my $seq1 = $seqs[$i1][1];
for (my $i2=$i1+1; $i2<@seqs; $i2++) {
#my $seq2 =$seqs[$i2][1];
my $seq2 = substr $seqs[$i2][1], 1;
if (index ($seq1, $seq2) >= 0) {
$seqs[$i2][2] = 'redundant' ;
} elsif (index ($seq2, $seq1) >= 0) {
$seqs[$i1][2] = 'redundant' ;
}
}
if (int($counter++ / scalar (@seqs) * 100) > $percentage) {
printf STDERR "\r%3d%%", $percentage ;
$percentage = int($counter++ / scalar (@seqs) * 100) ;
}
}
print STDERR "\n" ;
# remove redundant proteins and write non-redundant set to file:
return grep ($_->[2] ne 'redundant', @seqs) ;
}
sub readFasta {
my $file = shift ;
my $verbose = shift ;
unless (open (IN, $file)) {
warn "couldn't open Fasta file $file for reading" ;
return () ;
}
my $header = '' ;
while ($header = <IN>) {
last if $header =~ /^>/ ;
}
my $seq = '' ;
my @sequences = () ;
my $counter = 0 ;
while (1) {
my $line = <IN> ;
unless (($line =~ /^>/) || !$line) {
$seq .= $line ;
next ;
}
if (($line =~ /^>/) || eof IN) {
my ($name, $desc) = ($header =~ /^>(\S*)\s+(.*)/) ;
$seq =~ s/\s//g ;
#$seq = substr $seq, 1; ;
my @sequence = ($name, $seq, $desc) ;
push @sequences, \@sequence ;
$verbose && !($counter++ % 100) && printf STDERR "\r%6d seqs. read", $counter ;
last if eof IN ;
$header = $line ;
$seq = '' ;
}
}
close IN ;
$verbose && printf (STDERR "\r%6d seqs. read", $counter) ;
return @sequences ;
}