Skip to content

Commit

Permalink
+ minimum contig size option
Browse files Browse the repository at this point in the history
  • Loading branch information
Pombert-JF committed May 8, 2024
1 parent 80bf21a commit 4b69b74
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 49 deletions.
3 changes: 3 additions & 0 deletions Examples/commands.conf
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
### Allowable number of gaps between gene pairs,e.g. --gaps 0, 1, 5 [Default = 0]
--gaps 0

### Minimum contig size (in bp) [Default: 1]; i.e. investigates all contigs
--minsize 1

### Specify minimap2 max divergence preset (--asm 5, 10 or 20) [Default: off]
# --asm 20

Expand Down
17 changes: 15 additions & 2 deletions Plots/nucleotide_biases.pl
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/perl
## Pombert Lab, 2022
my $name = 'nucleotide_biases.pl';
my $version = '0.7c';
my $updated = '2024-05-01';
my $version = '0.7d';
my $updated = '2024-05-08';

use strict;
use warnings;
Expand All @@ -25,6 +25,7 @@
-outdir output_directory \\
-winsize 1000 \\
-step 500 \\
-minsize 1 \\
-reference CCMP1205 \\
-gap 0 \\
-custom_preset chloropicon
Expand All @@ -34,6 +35,7 @@
-o (--outdir) Output directory [Default: ntBiases]
-w (--winsize) Sliding window size [Default: 10000]
-s (--step) Sliding window step [Default: 5000]
-m (--minsize) Minimum contig size (in bp) [Default: 1]
-n (--ncheck) Check for ambiguous/masked (Nn) nucleotides
-t (--tsv) Output tab-delimited files (e.g. for excel plotting)
Expand Down Expand Up @@ -63,6 +65,7 @@

my @fasta;
my $outdir = 'ntBiases';
my $minsize = 1;
my $winsize = 10000;
my $step = 5000;
my $reference;
Expand All @@ -88,6 +91,7 @@
GetOptions(
'f|fasta=s@{1,}' => \@fasta,
'o|outdir=s' => \$outdir,
'minsize=i' => \$minsize,
'w|winsize=i' => \$winsize,
's|step=i' => \$step,
'n|ncheck' => \$ncheck,
Expand Down Expand Up @@ -163,6 +167,7 @@

open FASTA, "<", $fasta or die "Can't open $fasta: $!\n";

## Database of sequences
while (my $line = <FASTA>){
chomp $line;
if ($line =~ /^>(\S+)/){
Expand All @@ -173,6 +178,14 @@
}
}

## Delete sequence if smaller than minsize
foreach my $seq (keys %{$sequences{$fileprefix}}){
my $seq_len = length($sequences{$fileprefix}{$seq});
if ($seq_len < $minsize){
delete $sequences{$fileprefix};
}
}

my $fasta_dir = $singledir.'/'.$fileprefix;
unless (-d $fasta_dir){
mkdir ($fasta_dir,0755) or die "Can't create $fasta_dir: $!\n";
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ Options for run_SYNY.pl are:
-o (--outdir) Output directory [Default = SYNY]
-e (--evalue) DIAMOND BLASTP evalue cutoff [Default = 1e-10]
-g (--gaps) Allowable number of gaps between gene pairs [Default = 0]
--minsize Minimum contig size (in bp) [Default: 1]
--asm Specify minimap2 max divergence preset (--asm 5, 10 or 20) [Default: off]
--resume Resume minimap2 computations (skip completed alignments)
--no_map Skip minimap2 pairwise genome alignments
Expand Down
95 changes: 72 additions & 23 deletions Utils/list_maker.pl
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# Pombert lab, 2020

my $name = 'list_maker.pl';
my $version = '0.5.4b';
my $updated = '2024-04-24';
my $version = '0.5.5';
my $updated = '2024-05-08';

use strict;
use warnings;
Expand All @@ -29,6 +29,7 @@
OPTIONS:
-i (--input) Input file(s) (GZIP supported; File type determined by file extension)
-o (--outdir) Output directory [Default: LIST_MAKER]
-m (--minsize) Keep only contigs larger or equal to specified size (in bp) [Default: 1]
-v (--verbose) Add verbosity
REGEX
die "$usage\n" unless @ARGV;
Expand All @@ -42,10 +43,12 @@

my @input_files;
my $outdir = 'LIST_MAKER';
my $minsize = 1;
my $verbose;
GetOptions(
'i|input=s@{1,}' => \@input_files,
'o|outdir=s' => \$outdir,
'm|minsize=i' => \$minsize,
'v|verbose' => \$verbose
);

Expand Down Expand Up @@ -89,7 +92,7 @@
}

if ($verbose){
print "Creating .list file for $file_name\n";
print "\nCreating .list file for $file_name\n";
}

my $annotation_list = "$list_dir/$file_prefix.list";
Expand All @@ -112,6 +115,13 @@

my $seq_flag;
my $contig;
my $contig_size;
my $contig_sum = 0;
my $contig_kept_counter = 0;
my $contig_kept_sum = 0;
my $contig_discarded_counter = 0;
my $contig_discarded_sum = 0;

my $gene;
my $CDS;
my $translate;
Expand All @@ -129,13 +139,31 @@

chomp $line;

if ($line =~ /^LOCUS\s+(\S+)/ ){
if ($line =~ /^LOCUS\s+(\S+).*?(\d+) bp/ ){

$contig = $1;
$contig_size = $2;
$contig_sum += $contig_size;

if ($verbose){
if ($contig_size >= $minsize){
$contig_kept_counter++;
$contig_kept_sum += $contig_size;
print ' '.$contig."\t".$contig_size."\t". ">= $minsize"."\t"."kept\n";
}
else {
$contig_discarded_counter++;
$contig_discarded_sum += $contig_size;
print ' '.$contig."\t".$contig_size."\t"."< $minsize"."\t"."discarded\n";
}
}

## if the contig is not named with a unique LOCUS
## add the file_prefix in front to prevent naming clashes
if ($contig =~ /chromosome|contig/i){
$contig = $file_prefix.'_'.$contig;
}

}

if ($line =~ /^ORIGIN/){
Expand Down Expand Up @@ -202,27 +230,31 @@

unless (exists $isoform{$locus}){ ## Keeping only the first isoform

$isoform{$locus} = '';
print OUT $locus."\t";
print OUT $contig."\t";
print OUT $start."\t";
print OUT $end."\t";
print OUT $strand."\t";
print OUT $gene_num."\t";
if ($contig_size >= $minsize){
$isoform{$locus} = '';
print OUT $locus."\t";
print OUT $contig."\t";
print OUT $start."\t";
print OUT $end."\t";
print OUT $strand."\t";
print OUT $gene_num."\t";

if (!defined $features{$locus}{'product'}){
$features{$locus}{'product'} = 'undefined product in accession';
}
print OUT $features{$locus}{'product'}."\n";

print PROT ">$locus \[$features{$locus}{'product'}\]\n"; #\t$contig\t$start\t$end\t$strand\n";
foreach my $line (unpack("(A60)*",$sequence)){
print PROT "$line\n";
}

if (!defined $features{$locus}{'product'}){
$features{$locus}{'product'} = 'undefined product in accession';
}
print OUT $features{$locus}{'product'}."\n";

print PROT ">$locus \[$features{$locus}{'product'}\]\n"; #\t$contig\t$start\t$end\t$strand\n";
foreach my $line (unpack("(A60)*",$sequence)){
print PROT "$line\n";
}

undef $translate;
undef $sequence;
$gene_num ++;

}

}
Expand Down Expand Up @@ -289,13 +321,26 @@

}

if ($verbose){
my $total_contigs = $contig_kept_counter + $contig_discarded_counter;
print "\n".'Total contigs: '."\t\t\t".$total_contigs."\t".$contig_sum." bp\n";
print "Contigs kept (>= $minsize): "."\t".$contig_kept_counter."\t".$contig_kept_sum." bp\n";
print "Contigs discarded (< $minsize): "."\t".$contig_discarded_counter."\t".$contig_discarded_sum." bp\n";
}

### Creating genome FASTA from GBFF
for my $sequence (sort(keys %genome)){
print GEN ">$sequence\n";
my @data = unpack ("(A60)*", $genome{$sequence});
while (my $tmp = shift@data){
print GEN "$tmp\n";

my $contig_len = length($genome{$sequence});

if ($contig_len >= $minsize){
print GEN ">$sequence\n";
my @data = unpack ("(A60)*", $genome{$sequence});
while (my $tmp = shift@data){
print GEN "$tmp\n";
}
}

}

}
Expand All @@ -307,4 +352,8 @@
}
exit;
}
}

if ($verbose){
print "\n";
}
53 changes: 29 additions & 24 deletions run_syny.pl
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# Pombert lab, 2022

my $name = 'run_syny.pl';
my $version = '0.6.7a';
my $updated = '2024-05-07';
my $version = '0.6.7b';
my $updated = '2024-05-08';

use strict;
use warnings;
Expand Down Expand Up @@ -47,6 +47,7 @@
-o (--outdir) Output directory [Default = SYNY]
-e (--evalue) DIAMOND BLASTP evalue cutoff [Default = 1e-10]
-g (--gaps) Allowable number of gaps between gene pairs [Default = 0]
--minsize Minimum contig size (in bp) [Default: 1]
--asm Specify minimap2 max divergence preset (--asm 5, 10 or 20) [Default: off]
--resume Resume minimap2 computations (skip completed alignments)
--no_map Skip minimap2 pairwise genome alignments
Expand Down Expand Up @@ -119,6 +120,7 @@
my $outdir = 'SYNY';
my $threads = 16;
my $max_pthreads = $threads;
my $minsize = 1;
my $nomap;
my $noclus;
my $resume;
Expand Down Expand Up @@ -187,6 +189,7 @@
'o|outdir=s' => \$outdir,
'e|evalue=s' => \$evalue,
'g|gaps=i{0,}' => \@gaps,
'minsize=i' => \$minsize,
'no_map' => \$nomap,
'no_clus' => \$noclus,
'resume' => \$resume,
Expand Down Expand Up @@ -446,32 +449,10 @@
print LOG "SYNY started on: ".$time."\n";
print LOG "COMMAND: $0 @commands\n\n";

###################################################################################################
## Run list_maker.pl
###################################################################################################

my @threads = initThreads($threads);

print ERROR "\n### list_maker.pl ###\n";
print "\n##### Extracting data from GenBank files\n";

my @annotations :shared = @annot_files;
my $annot_num = scalar(@annot_files);

for my $thread (@threads){
$thread = threads->create(\&list_maker);
}
for my $thread (@threads){
$thread ->join();
}

logs(\*LOG, 'Parsing data - list_maker.pl');

###################################################################################################
## Shared options flags
###################################################################################################

# Option flags
my $cluster_flag = '';
if ($clusters){
$cluster_flag = '--clusters';
Expand Down Expand Up @@ -517,6 +498,28 @@
$hm_vauto_flag = '--vauto';
}

###################################################################################################
## Run list_maker.pl
###################################################################################################

my @threads = initThreads($threads);

print ERROR "\n### list_maker.pl ###\n";
print "\n##### Extracting data from GenBank files\n";

my @annotations :shared = @annot_files;
my $annot_num = scalar(@annot_files);

for my $thread (@threads){
$thread = threads->create(\&list_maker);
}
for my $thread (@threads){
$thread ->join();
}

logs(\*LOG, 'Parsing data - list_maker.pl');


###################################################################################################
## Get PAF files with minimap2
###################################################################################################
Expand Down Expand Up @@ -1146,6 +1149,7 @@
$plot_path/nucleotide_biases.pl \\
--outdir $circos_data_dir \\
--fasta $genome_dir/*.fasta \\
--minsize $minsize \\
--winsize $winsize \\
--step $stepsize \\
--gap $gap \\
Expand Down Expand Up @@ -1323,6 +1327,7 @@ sub list_maker {
$util_path/list_maker.pl \\
--input $annotation \\
--outdir $outdir \\
--minsize $minsize \\
2>> $log_err
") == 0 or checksig();

Expand Down

0 comments on commit 4b69b74

Please sign in to comment.