Skip to content

Commit

Permalink
Merge pull request #114 from wtsi-npg/devel
Browse files Browse the repository at this point in the history
devel to master for release v10.28
  • Loading branch information
dozy authored Jun 16, 2017
2 parents a2aa5dc + f8b3b48 commit 1a298a3
Show file tree
Hide file tree
Showing 2 changed files with 130 additions and 35 deletions.
133 changes: 98 additions & 35 deletions bin/makeMissingFiles.pl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
use English qw(-no_match_vars);
use Carp;
use Getopt::Long;
use File::Spec;
use FileHandle;
use POSIX qw(ceil);

Expand All @@ -27,17 +28,20 @@ sub usage {
## no critic

print STDERR "\n";
print STDERR "makeMissingFiles.pl [-verbose] [-check] [-filter <filter>] [-cif <template>] [-bcl] [-noscl] <cif|dif|bcl|scl|stats>\n";
print STDERR "makeMissingFiles.pl [-verbose] [-check] [-filter <filter>] [-cif <template>] [-bcl] [-nocif] [-nodif] [-noscl] <cif|dif|bcl|scl|stats>\n";
print STDERR "\n";
print STDERR " -check just report missing files\n";
print STDERR "\n";
print STDERR " -olb only check for the files required for OLB i.e. cif files\n";
print STDERR " -hiseqx only check for the files created on a hiseqx i.e. bcl.gz but not cif, dif or stats\n";
print STDERR "\n";
print STDERR " -cif <template> if cif file is missing make one based on this template\n";
print STDERR "\n";
print STDERR " -filter <filter> if filter file does not exist make this one\n";
print STDERR " -bcl if bcl file is missing make one\n";
print STDERR "\n";
print STDERR " -nocif do not check for missing cif files\n";
print STDERR " -nodif do not check for missing dif files\n";
print STDERR " -noscl do not check for missing scl files\n";
print STDERR "\n";

Expand All @@ -47,19 +51,19 @@ sub usage {
sub process {

my $file = $ARGV[0];
die "Invalid file $file\n" unless my ($intensities) = ($file =~ /^(.+\/Data\/Intensities).+$/);
$file = File::Spec->rel2abs($file);

my ($lane, $cycle, $tile);
my ($intensities, $lane, $cycle, $tile);
if( $file =~ m/\.cif$/ ){
die "Invalid cif file $file\n" unless ($lane, $cycle, $tile) = ($file =~ m/\/L00(\d)\/(C\d+\.1)\/s_\1_(\d+)\.cif$/);
die "Invalid cif file $file\n" unless ($intensities, $lane, $cycle, $tile) = ($file =~ m/^(.+)\/L00(\d)\/(C\d+\.1)\/s_\2_(\d+)\.cif$/);
} elsif( $file =~ m/\.dif$/ ){
die "Invalid dif file $file\n" unless ($lane, $cycle, $tile) = ($file =~ m/\/L00(\d)\/(C\d+\.1)\/s_\1_(\d+)\.dif$/);
} elsif( $file =~ m/\.bcl$/ ){
die "Invalid bcl file $file\n" unless ($lane, $cycle, $tile) = ($file =~ m/BaseCalls\/L00(\d)\/(C\d+\.1)\/s_\1_(\d+)\.bcl$/);
die "Invalid dif file $file\n" unless ($intensities, $lane, $cycle, $tile) = ($file =~ m/^(.+)\/L00(\d)\/(C\d+\.1)\/s_\2_(\d+)\.dif$/);
} elsif( $file =~ m/\.bcl(.gz)?$/ ){
die "Invalid bcl file $file\n" unless ($intensities, $lane, $cycle, $tile) = ($file =~ m/^(.+)\/BaseCalls\/L00(\d)\/(C\d+\.1)\/s_\2_(\d+)\.bcl(.gz)?$/);
} elsif( $file =~ m/\.scl$/ ){
die "Invalid scl file $file\n" unless ($lane, $cycle, $tile) = ($file =~ m/BaseCalls\/L00(\d)\/(C\d+\.1)\/s_\1_(\d+)\.scl$/);
die "Invalid scl file $file\n" unless ($intensities, $lane, $cycle, $tile) = ($file =~ m/^(.+)\/BaseCalls\/L00(\d)\/(C\d+\.1)\/s_\2_(\d+)\.scl$/);
} elsif( $file =~ m/\.stats$/ ){
die "Invalid stats file $file\n" unless ($lane, $cycle, $tile) = ($file =~ m/BaseCalls\/L00(\d)\/(C\d+\.1)\/s_\1_(\d+)\.stats$/);
die "Invalid stats file $file\n" unless ($intensities, $lane, $cycle, $tile) = ($file =~ m/^(.+)\/BaseCalls\/L00(\d)\/(C\d+\.1)\/s_\2_(\d+)\.stats$/);
}else{
print {*STDERR} "\nUnexpected file $file\n" or croak 'print failed';
usage;
Expand All @@ -81,18 +85,24 @@ sub process {

$/ = undef;

# a cif file must exist unless the -nocif or the -cif option was specified
if (exists($opts->{'nocif'}) || exists($opts->{'cif'})) {
warn "Missing cif file $cif\n" unless ( $opts->{'nocif'} || -e $cif );
}else{
die "Missing cif file $cif\n" unless -e $cif;
# a cif file must exist unless the -nocif or the -cif or the hiseqx option was specified

unless (exists($opts->{'nocif'}) || exists($opts->{'cif'}) || exists($opts->{'hiseqx'})) {
if (exists($opts->{'check'})) {
warn "Missing cif file $cif\n" unless -e $cif;
}else{
die "Missing cif file $cif\n" unless -e $cif;
}
}

if (exists($opts->{'olb'})) {
exit if (exists($opts->{'check'}));
}else{
# hiseqx bcl files are gzipped
$bcl .= ".gz" if exists($opts->{'hiseqx'});

# a bcl file must exist unless the -bcl option was specified
if (exists($opts->{'bcl'})) {
if (exists($opts->{'check'}) || exists($opts->{'bcl'})) {
warn "Missing bcl file $bcl\n" unless -e $bcl;
}else{
die "Missing bcl file $bcl\n" unless -e $bcl;
Expand All @@ -102,21 +112,21 @@ sub process {
if (exists($opts->{'filter'})) {
warn "Missing filter file $filter\n" unless -e $filter;
}else{
# does an old filter file exist ? When ALL GAIIs and HiSeqs have been upgraded
# we can drop this check
unless ( -e $filter ){
my $old_filter = "$intensities/BaseCalls/s_${lane}_${ptile}.filter";
$filter = $old_filter if -e $old_filter;
}
die "Missing filter file $filter\n" unless -e $filter;
}

print "Missing dif file $dif\n" unless -e $dif;
print "Missing scl file $scl\n" unless ( $opts->{'noscl'} || -e $scl );
print "Missing stats file $stats\n" unless -e $stats;
# hiseqx we don't expect any other files
unless (exists($opts->{'hiseqx'})) {
print "Missing dif file $dif\n" unless ( $opts->{'nodif'} || -e $dif );
print "Missing scl file $scl\n" unless ( $opts->{'noscl'} || -e $scl );
print "Missing stats file $stats\n" unless -e $stats;
}

exit if (exists($opts->{'check'}));
}

exit if (exists($opts->{'hiseqx'}));

unless ( $opts->{'nocif'} || -e $cif ){
print "Writing zero intensity cif file $cif\n";

Expand Down Expand Up @@ -145,20 +155,23 @@ sub process {
my %format = ("1" => "c*", "2" => "s*", "4" => "l*");
$data .= pack($format{$dataType}, @data);

# pad intensity file to a multiple of 4096
my $s = length($data);
$s = 4096 * (int($s / 4096) + 1) if $s % 4096;
my $p = $s - length($data);
$data .= pack($format{1}, (0) x $p) if $p > 0;

$fh->open(">$cif") or croak "$cif:\n$ERRNO";
$fh->write($data);
$fh->close();
}

exit if (exists($opts->{'olb'}));

# if a filter file exists read it otherwise exists make one
# if a filter file exists read it otherwise make one
if ( -e $filter ){
print "Reading filter file $filter\n";
}else{
# When ALL GAIIs and HiSeqs have been upgraded we can remove the arg from the
# -filter option and modify makeFilterFile() to only suport new filter files
$filter = $opts->{'filter'} if (exists($opts->{'filter'}));
makeFilterFile($filter);
}

Expand Down Expand Up @@ -207,19 +220,63 @@ sub process {
map{ push(@bases, $_ & 3); push(@quals, $_ >> 2) } @data;
}
}else{
print "Writing zero bcl file $bcl\n";
if( $opts->{'rebasecall'} && -e $dif ) {
print "Re-basecalling intensity file $dif\n";
print "All qualities set to 2 except for bases called as N which must have a quality of 0\n";

$nclusters = $nclusters_filtered;
$data = pack("l", $nclusters);
my @data = (0) x $nclusters;
$data .= pack("C*", @data);
$fh->open("<$dif") or croak "$dif:\n$ERRNO";
$data = <$fh>;
$fh->close();

my (@header) = unpack("C13", $data);
my $magic = substr($data,0,4);
my $dataType = $header[4];
my $firstCycle = $header[5] | $header[6] << 8;
my $num_cycles = $header[7] | $header[8] << 8;
my $num_entries = $header[9] | $header[10] << 8 | $header[11] << 16 | $header[12] << 24;
$data = substr($data,13);
my %format = ("1" => "c*", "2" => "s*", "4" => "l*");
my (@dif_data) = unpack($format{$dataType}, $data);

unless ($num_entries == $nclusters_filtered) {
die "Inconsistent dif and filter files nclusters = $num_entries expected $nclusters_filtered\n";
}

$nclusters = $nclusters_filtered;
$data = pack("l", $nclusters);
my @bcl_data = ();
for (my $cluster=0; $cluster<$nclusters; $cluster++) {
my ($max, $base) = (-99999, 0);
for (my $channel=0; $channel<4; $channel++) {
if( $dif_data[$num_entries * $channel + $cluster] > $max ) {
$max = $dif_data[$num_entries * $channel + $cluster];
$base = $channel;
}
}
# where we have a basecall set the quality to 2 (B)
my $qual = ($max ? 2 : 0);
push(@bases, $base);
push(@quals, $qual);
push(@bcl_data, $base | ($qual << 2));
}
$data .= pack("C*", @bcl_data);
}else{
print "Writing zero bcl file $bcl\n";

$nclusters = $nclusters_filtered;
$data = pack("l", $nclusters);
@bases = (0) x $nclusters;
@quals = (0) x $nclusters;
my @data = (0) x $nclusters;
$data .= pack("C*", @data);
}

$fh->open(">$bcl") or croak "$bcl:\n$ERRNO";
$fh->write($data);
$fh->close();
}

unless ( -e $dif ){
unless ( $opts->{'nodif'} || -e $dif ){
print "Writing zero intensity dif file $dif\n";

# check the bcl file contains ALL N's <=> qualities ALL 0
Expand Down Expand Up @@ -252,6 +309,12 @@ sub process {
my %format = ("1" => "c*", "2" => "s*", "4" => "l*");
$data .= pack($format{$dataType}, @data);

# pad intensity file to a multiple of 4096
my $s = length($data);
$s = 4096 * (int($s / 4096) + 1) if $s % 4096;
my $p = $s - length($data);
$data .= pack($format{1}, (0) x $p) if $p > 0;

$fh->open(">$dif") or croak "$dif:\n$ERRNO";
$fh->write($data);
$fh->close();
Expand Down Expand Up @@ -387,7 +450,7 @@ sub process {
sub initialise {

my %opts;
my $rc = GetOptions(\%opts, 'verbose', 'check', 'olb', 'cif=s', 'filter=s', 'bcl', 'nocif', 'noscl', 'help');
my $rc = GetOptions(\%opts, 'verbose', 'check', 'olb', 'hiseqx', 'rebasecall', 'cif=s', 'filter=s', 'bcl', 'nocif', 'nodif', 'noscl', 'help');
if ( ! $rc) {
print {*STDERR} "\nerror in command line parameters\n" or croak 'print failed';
usage;
Expand Down
32 changes: 32 additions & 0 deletions src/calibration_pu/calibration_pu.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,19 @@
* CHECK_BASECALL
* this option checks the base call corresponds to channel with
* the maximum intensity
*
* PPPRCN
* we build an error profile based on a 4-base sequence context
* previous, ref, called and next (PRCN) this option will enable
* a 7-base context 4 previous, ref, called and next (PPPPRCN)
*/

#define QC_FAIL
#define PROPERLY_PAIRED
//#define CALDATA
//#define CHECK_BASECALL
//#define PPPPRCN


#ifdef HAVE_CONFIG_H
#include "pb_config.h"
Expand Down Expand Up @@ -128,8 +135,13 @@

#define LEN_SUBST 2
#define NUM_SUBST 16 // 4 ^ LEN_SUBST
#ifdef PPPPRCN
#define LEN_CNTXT 7
#define NUM_CNTXT 16384 // 4 ^ LEN_CNTXT
#else
#define LEN_CNTXT 4
#define NUM_CNTXT 256 // 4 ^ LEN_CNTXT
#endif

#define ST_HILO_PURITY 0.63
#define ST_HILO_QUALITY 29.5
Expand Down Expand Up @@ -685,7 +697,11 @@ static void outputErrorTable(Settings *s, int ntiles, SurvTable **sts)
/* read summary */
SurvTable *st = read_sts[read];
if( NULL == st) continue;
#ifdef PPPPRCN
int cntxt_off = 3;
#else
int cntxt_off = 0;
#endif
int len_cntxt = 3;
int num_cntxt = 64;
long *count = (long *)smalloc(num_cntxt * sizeof(long));
Expand Down Expand Up @@ -723,7 +739,11 @@ static void outputErrorTable(Settings *s, int ntiles, SurvTable **sts)
/* read summary */
SurvTable *st = read_sts[read];
if( NULL == st) continue;
#ifdef PPPPRCN
int cntxt_off = 3;
#else
int cntxt_off = 0;
#endif
int len_cntxt = 3;
int num_cntxt = 64;
long *count = (long *)smalloc(num_cntxt * sizeof(long));
Expand Down Expand Up @@ -763,7 +783,11 @@ static void outputErrorTable(Settings *s, int ntiles, SurvTable **sts)
/* read summary */
SurvTable *st = read_sts[read];
if( NULL == st) continue;
#ifdef PPPPRCN
int cntxt_off = 3;
#else
int cntxt_off = 0;
#endif
int len_cntxt = 4;
int num_cntxt = 256;
long *count = (long *)smalloc(num_cntxt * sizeof(long));
Expand Down Expand Up @@ -801,7 +825,11 @@ static void outputErrorTable(Settings *s, int ntiles, SurvTable **sts)
/* read summary */
SurvTable *st = read_sts[read];
if( NULL == st ) continue;
#ifdef PPPPRCN
int cntxt_off = 3;
#else
int cntxt_off = 0;
#endif
int len_cntxt = 4;
int num_cntxt = 256;
long *count = (long *)smalloc(num_cntxt * sizeof(long));
Expand Down Expand Up @@ -832,6 +860,7 @@ static void outputErrorTable(Settings *s, int ntiles, SurvTable **sts)
free(count);
}

#ifdef PPPPRCN
/* previous 4 base homopolymer HRC. */

fprintf(fp, "# Homopolymer effect high predictor. Use `grep ^HRCH | cut -f 2-` to extract this part\n");
Expand Down Expand Up @@ -979,6 +1008,7 @@ static void outputErrorTable(Settings *s, int ntiles, SurvTable **sts)
fprintf(fp, "\n");
free(count);
}
#endif

freeSurvTable(s, 0, read_sts, 1);
if( NULL != fp) fclose(fp);
Expand Down Expand Up @@ -1477,9 +1507,11 @@ static int updateSurvTable(Settings *s, SurvTable **sts, CifData *cif_data,
}

chr=0;
#ifdef PPPPRCN
cntxt[chr++]=(b > 3 ? read_ref[b-4] : 'N');
cntxt[chr++]=(b > 2 ? read_ref[b-3] : 'N');
cntxt[chr++]=(b > 1 ? read_ref[b-2] : 'N');
#endif
cntxt[chr++]=(b > 0 ? read_ref[b-1] : 'N');
cntxt[chr++]=read_ref[b];
cntxt[chr++]=read_seq[b];
Expand Down

0 comments on commit 1a298a3

Please sign in to comment.