Skip to content

Commit

Permalink
Added support for dbCAN2 searches
Browse files Browse the repository at this point in the history
  • Loading branch information
Pombert-JF committed Jun 29, 2021
1 parent 2f0fff5 commit 96fc738
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 61 deletions.
191 changes: 131 additions & 60 deletions Function_prediction/parse_annotators.pl
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/perl
## Pombert Lab, IIT, 2020
my $name = 'parse_annotators.pl';
my $version = '1.6a';
my $updated = '2021-06-28';
my $version = '1.7';
my $updated = '2021-06-29';

use strict; use warnings; use Getopt::Long qw(GetOptions);

Expand Down Expand Up @@ -49,6 +49,9 @@
-gk GhostKOALA output file
-bk BlastKOALA output file
## dbCAN2 CAZy searches: http://bcb.unl.edu/dbCAN2/
-ca dbCAN2 output file
-cl CAZy families list ## http://bcb.unl.edu/dbCAN2/download/Databases/CAZyDB.07302020.fam-activities.txt
OPTIONS
die "\n$usage\n" unless @ARGV;

Expand All @@ -63,6 +66,8 @@
my $kofamkoala;
my $ghostkoala;
my $blastkoala;
my $dbcan2;
my $cazy_list;
GetOptions(
'q=s' => \$queries,
'o=s' => \$output,
Expand All @@ -77,6 +82,8 @@
'ko=s' => \$kofamkoala,
'gk=s' => \$ghostkoala,
'bk=s' => \$blastkoala,
'ca=s' => \$dbcan2,
'cl=s' => \$cazy_list
);

## Connecting reference feature lists to its corresponding blast file, and storing it in a database to reference during
Expand All @@ -93,7 +100,7 @@

my $time = localtime();
my $tstart = time;
print "$time: Obtaining annotations for DIAMOND blastp hits in $spblast and $tbblast...\n";
print "\n$time: Obtaining annotations for DIAMOND blastp hits in $spblast and $tbblast...\n";

## Parsing SwissProt blast.6
## Using a double pass for memory optimization and reduce the size of the hash
Expand Down Expand Up @@ -263,76 +270,126 @@
$time = localtime();
print "$time: Finished parsing InterProScan5 $ipro in $time_taken seconds...\n";

## KofamKOALA results, if any
### KofamKOALA results, if any
my %kofam;
open KOFAM, "<", "$kofamkoala" or die "Can't open $kofamkoala: $!\n";
while (my $line = <KOFAM>){
chomp $line;
if ($line =~ /^#/) { next; }
elsif ($line =~ /^\* /){
my @columns = split (/\s+/, $line);
my $query = $columns[1];
my $ko = $columns[2];
my $evalue = $columns[5];
my $definition;
for my $num (6..$#columns){
$definition .= "$columns[$num] ";
}
$definition =~ s/ $//;
if ($kofamkoala){
open KOFAM, "<", "$kofamkoala" or die "Can't open $kofamkoala: $!\n";
while (my $line = <KOFAM>){
chomp $line;
if ($line =~ /^#/) { next; }
elsif ($line =~ /^\* /){
my @columns = split (/\s+/, $line);
my $query = $columns[1];
my $ko = $columns[2];
my $evalue = $columns[5];
my $definition;
for my $num (6..$#columns){
$definition .= "$columns[$num] ";
}
$definition =~ s/ $//;

## Adding to %kofam db
$kofam{$query}{'ko'} = $ko;
$kofam{$query}{'evalue'} = $evalue;
$kofam{$query}{'def'} = $definition;
## Adding to %kofam db
$kofam{$query}{'ko'} = $ko;
$kofam{$query}{'evalue'} = $evalue;
$kofam{$query}{'def'} = $definition;
}
}
close KOFAM;
}
close KOFAM;

## GhostKOALA results, if any
### GhostKOALA results, if any
my %ghost;
open GHOST, "<", "$ghostkoala" or die "Can't open $ghostkoala: $!\n";
while (my $line = <GHOST>){
chomp $line;
if ($line =~ /^#/) { next; }
else {
my @columns = split ("\t", $line);
my $query = $columns[0];
my $ko = $columns[1];
my $definition = $columns[2];
my $score = $columns[3];

## Adding to %ghost db
if ($ko){
$ghost{$query}{'ko'} = $ko;
$ghost{$query}{'score'} = $score;
$ghost{$query}{'def'} = $definition;
if ($ghostkoala){
open GHOST, "<", "$ghostkoala" or die "Can't open $ghostkoala: $!\n";
while (my $line = <GHOST>){
chomp $line;
if ($line =~ /^#/) { next; }
else {
my @columns = split ("\t", $line);
my $query = $columns[0];
my $ko = $columns[1];
my $definition = $columns[2];
my $score = $columns[3];

## Adding to %ghost db
if ($ko){
$ghost{$query}{'ko'} = $ko;
$ghost{$query}{'score'} = $score;
$ghost{$query}{'def'} = $definition;
}
}
}
close GHOST;
}
close GHOST;

## BlastKOALA results, if any
### BlastKOALA results, if any
my %blastko;
open BLASTKO, "<", "$blastkoala" or die "Can't open $blastkoala: $!\n";
while (my $line = <BLASTKO>){
chomp $line;
if ($line =~ /^#/) { next; }
else {
my @columns = split ("\t", $line);
my $query = $columns[0];
my $ko = $columns[1];
my $definition = $columns[2];
my $score = $columns[3];

## Adding to %blastko db
if ($ko){
$blastko{$query}{'ko'} = $ko;
$blastko{$query}{'score'} = $score;
$blastko{$query}{'def'} = $definition;
if ($blastkoala){
open BLASTKO, "<", "$blastkoala" or die "Can't open $blastkoala: $!\n";
while (my $line = <BLASTKO>){
chomp $line;
if ($line =~ /^#/) { next; }
else {
my @columns = split ("\t", $line);
my $query = $columns[0];
my $ko = $columns[1];
my $definition = $columns[2];
my $score = $columns[3];

## Adding to %blastko db
if ($ko){
$blastko{$query}{'ko'} = $ko;
$blastko{$query}{'score'} = $score;
$blastko{$query}{'def'} = $definition;
}
}
}
close BLASTKO;
}

### dbCAN2 results, if any
my %cazy_ids;
my %dbcan;
if ($dbcan2){
# Creating database of CAZymes IDs and their corresponding products
open CAZY, "<", "$cazy_list" or die "Can't open $cazy_list: $!\n";
while (my $line = <CAZY>){
chomp $line;
if ($line =~ /^#/){ next; }
else {
my ($id, $def) = $line =~ /^(\S+)\s+(.*)$/;
$def =~ s/;(\s+)?$//; ## discarding trailing semicolons
$def =~ s/\.$//; ## discarding trailing periods
my @deflines = split (";", $def);
for my $num (0..$#deflines){
my $definition = $deflines[$num];
$definition =~ s/^\s+//;
if ($definition eq '') { next; } ## discarding blank entries (e.g. ; ; )
else {
push (@{$cazy_ids{$id}}, $definition);
}
}
}
}
close CAZY;
# Working on dbCAN2 results
open DBCAN2, "<", "$dbcan2" or die "Can't open $dbcan2: $!\n";
while (my $line = <DBCAN2>){
chomp $line;
if ($line =~ /^HMM_Profile/){ next; }
else {
my @columns = split ("\t", $line);
my $model = $columns[0];
$model =~ s/.hmm$//;
$model =~ s/_.*$//; ## Discarding fluff not part of database ID (if present)
my $locus = $columns[2];
my $evalue = $columns[4];
$dbcan{$locus}{'model'} = $model;
$dbcan{$locus}{'evalue'} = $evalue;
}
}
close DBCAN2;
}
close BLASTKO;

### Reference organism, if any
my %refhits;
Expand Down Expand Up @@ -395,6 +452,7 @@
if ($kofamkoala){ print OUT "\tEvalue\tKofamKOALA"; }
if ($ghostkoala){ print OUT "\tEvalue\tGhostKOALA"; }
if ($blastkoala){ print OUT "\tEvalue\tBlastKOALA"; }
if ($dbcan2){ print OUT "\tEvalue\tdbCAN2"; }
if ($rblist[0]){
foreach my $ref (sort(keys(%references))){
$ref =~ s/\.list//;
Expand Down Expand Up @@ -444,6 +502,19 @@
else { print OUT "\tNA\tno match found"; }
}

## Printing dbCAN2 results, if any
if ($dbcan2){
if (exists $dbcan{$line}){
my $model = $dbcan{$line}{'model'};
my $size = scalar @{$cazy_ids{$model}};
my $leftovers = $size - 1;
my $note = '';
if ($leftovers > 0){ $note = " + $leftovers similar enzyme(s)"; }
print OUT "\t$dbcan{$line}{'evalue'}\t$dbcan{$line}{'model'}; $cazy_ids{$model}[0]"."$note";
}
else { print OUT "\tNA\tno match found"; }
}

## Printing reference(s) results, if any
if ($rblist[0]){
foreach my $ref (sort(keys(%references))){
Expand All @@ -461,5 +532,5 @@
$time = localtime();
print "$time: Done writing annotations in $time_taken seconds...\n";
$time = localtime();
print "$time: Task completed in $ttime seconds. Exiting...\n";
print "$time: Task completed in $ttime seconds. Exiting...\n\n";
exit();
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -592,7 +592,7 @@ parse_annotators.pl \
-rb $ANNOT/REFERENCES/*.diamond.6
```

If desired, results from KEGG searches can also be parsed accordingly by invoking the -ko, -gk and/or -bk command line options. Current options for [parse_annotators.pl](https://github.com/PombertLab/A2GB/blob/master/Function_prediction/parse_annotators.pl) are:
If desired, results from [KEGG](https://www.genome.jp/kegg/) and [dbCAN2](http://bcb.unl.edu/dbCAN2/) searches can also be parsed accordingly by invoking the corresponding command line options. Current options for [parse_annotators.pl](https://github.com/PombertLab/A2GB/blob/master/Function_prediction/parse_annotators.pl) are:
```
-q List of proteins queried against annotators
-o Output file
Expand All @@ -613,6 +613,10 @@ If desired, results from KEGG searches can also be parsed accordingly by invokin
-ko KofamKOALA output file
-gk GhostKOALA output file
-bk BlastKOALA output file
## dbCAN2 CAZy searches: http://bcb.unl.edu/dbCAN2/
-ca dbCAN2 output file
-cl CAZy families list ## http://bcb.unl.edu/dbCAN2/download/Databases/CAZyDB.07302020.fam-activities.txt
```

#### Curating the protein annotations
Expand Down Expand Up @@ -1127,3 +1131,5 @@ Kanehisa M, Sato Y, Morishima K. **BlastKOALA and GhostKOALA: KEGG tools for fun
Suzuki S, Kakuta M, Ishida T, Akiyama Y. **GHOSTX: An improved sequence homology search algorithm using a query suffix array and a database suffix array.** *PLoS One.* 2014;9(8):e103833 PMID: 25099887 PMCID: [PMC4123905](http://www.ncbi.nlm.nih.gov/pmc/articles/pmc4123905/) DOI: [10.1371/journal.pone.0103833](https://doi.org/10.1371/journal.pone.0103833)

Aramaki T, Blanc‐Mathieu R, Endo H, et al. **KofamKOALA: KEGG ortholog assignment based on profile HMM and adaptive score threshold.** *Bioinformatics.* 2020 Apr 1;36(7):2251-2252. PMID: 31742321 PMCID: [PMC7141845](http://www.ncbi.nlm.nih.gov/pmc/articles/pmc7141845/) DOI: [10.1093/bioinformatics/btz859](https://doi.org/10.1093/bioinformatics/btz859)

Zhang H, Yohe T, Huang L, Entwistle S, Wu P, Yang Z, Busk PK, Xu Y, Yin Y. **dbCAN2: a meta server for automated carbohydrate-active enzyme annotation.** *Nucleic Acids Res.* 2018 Jul 2;46(W1):W95-W101. PMID: 29771380 PMCID: [PMC6031026](http://www.ncbi.nlm.nih.gov/pmc/articles/pmc6031026/) DOI: [10.1093/nar/gky418](https://doi.org/10.1093/nar/gky418)

0 comments on commit 96fc738

Please sign in to comment.