Skip to content

Commit

Permalink
Added check for missing hits in uniprot lists
Browse files Browse the repository at this point in the history
  • Loading branch information
Pombert-JF committed Jun 29, 2021
1 parent 96fc738 commit 7da0359
Showing 1 changed file with 32 additions and 5 deletions.
37 changes: 32 additions & 5 deletions Function_prediction/parse_annotators.pl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/perl
## Pombert Lab, IIT, 2020
my $name = 'parse_annotators.pl';
my $version = '1.7';
my $version = '1.7a';
my $updated = '2021-06-29';

use strict; use warnings; use Getopt::Long qw(GetOptions);
Expand All @@ -16,6 +16,7 @@
- BLASTP/DIAMOND searches against SwissProt/trEMBL databases
- BLASTP/DIAMOND searches against reference organism(s)
- KofamKOALA/GhostKOALA/BlastKOALA searches against KEGG
- dbCAN2 searches against CAZy
USAGE ${name} \\
-q BEOM2.proteins.queries \\
Expand Down Expand Up @@ -90,7 +91,7 @@
## parsing step
my %references;
unless (scalar(@rblist) == scalar(@rbblast)){
die "[E] the number of reference feature lists do not equal the number of reference blast files\n";
die "ERROR: the number of reference feature lists does not equal the number of reference blast files\n";
}
else {
for (my $i = 0; $i < scalar(@rblist); $i++){
Expand All @@ -106,7 +107,6 @@
## Using a double pass for memory optimization and reduce the size of the hash
my %sprot;
open SB, "<", "$spblast" or die "Can't open $spblast: $!\n";
my %sphits;
while (my $line = <SB>){
chomp $line;
my @cols = split("\t", $line);
Expand All @@ -126,6 +126,7 @@
close SP;

open SB, "<", "$spblast" or die "Can't open $spblast: $!\n";
my %sphits;
while (my $line = <SB>){
chomp $line;
my @cols = split("\t", $line);
Expand All @@ -136,17 +137,29 @@
elsif ( $sprot{$hit} =~ /uncharacterized/i ) { next; } ## Discarding uninformative BLAST/DIAMOND hits
elsif ( $sprot{$hit} =~ /hypothetical/i ) { next; } ## Discarding uninformative BLAST/DIAMOND hits
elsif ( $sprot{$hit} =~ /predicted protein/i ) { next; } ## Discarding uninformative BLAST/DIAMOND hits
else{
elsif ( $sprot{$hit} eq '1'){ next; } ## Checking if entry is missing from $spblast, if so move to next hit
else {
$sphits{$query}[0] = $sprot{$hit};
$sphits{$query}[1] = $evalue;
}
}
close SB;

## Checking for discrepancies
my $num = scalar (keys %sprot);
my $match = 0;
foreach (keys %sprot){
if ($sprot{$_} eq '1'){
if ($verbose) { print "$_ is missing from $splist\n"; }
$match++;
}
}
print "\nSwissProt hits = $num\n";
print "SwissProt hits missing from $splist = $match\n";

## Parsing TREMBL blast.6
## Using a double pass for memory optimization and reduce the size of the hash
my %trembl;
my %tbhits;
open TBB, "<", "$tbblast" or die "Can't open $tbblast: $!\n";
while(my $line = <TBB>){
chomp $line;
Expand All @@ -167,6 +180,7 @@
close TB;

open TBB, "<", "$tbblast" or die "Can't open $tbblast: $!\n";
my %tbhits;
while (my $line = <TBB>){
chomp $line;
my @cols = split("\t", $line);
Expand All @@ -177,13 +191,26 @@
elsif ( $trembl{$hit} =~ /uncharacterized/i ) { next; } ## Discarding uninformative BLAST/DIAMOND hits
elsif ( $trembl{$hit} =~ /hypothetical/i ) { next; } ## Discarding uninformative BLAST/DIAMOND hits
elsif ( $trembl{$hit} =~ /predicted protein/i ){ next; } ## Discarding uninformative BLAST/DIAMOND hits
elsif ( $trembl{$hit} eq '1'){ next; } ## Checking if entry is missing from $tblist, if so move to next hit
else {
$tbhits{$query}[0] = $trembl{$hit};
$tbhits{$query}[1] = $evalue;
}
}
close TBB;

## Checking for discrepancies
my $num2 = scalar (keys %trembl);
my $match2 = 0;
foreach (keys %trembl){
if ($trembl{$_} eq '1'){
if ($verbose) { print "$_ is missing from $tblist\n"; }
$match2++;
}
}
print "\nTrEMBL hits = $num2\n";
print "TrEMBL hits missing from $tblist = $match2\n\n";

my $time_taken = time - $tstart;
print "$time: Finished obtaining annotations for $splist and $tblist in $time_taken seconds.\n";

Expand Down

0 comments on commit 7da0359

Please sign in to comment.