combinatorial indexing

sdparekh · Apr 12, 2018 · 0dd7308 · 0dd7308
1 parent 7d3e71a
commit 0dd7308
Show file tree

Hide file tree

Showing 4 changed files with 172 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -11,6 +11,9 @@ You can read more about zUMIs in our [biorxiv preprint](https://www.biorxiv.org/
 You can glance through zUMIs in [zUMIs poster](https://github.com/sdparekh/zUMIs/blob/master/zUMIs_GI2017_poster.pdf)!
 
 ## Releases/Changelog
+12 Apr 2018: [zUMIs.0.0.6 released](https://github.com/sdparekh/zUMIs/releases/tag/zUMIs.0.0.6).
+Improved support for combinatorial indexing methods.
+
 30 Mar 2018: [zUMIs.0.0.5 released](https://github.com/sdparekh/zUMIs/releases/tag/zUMIs.0.0.5).
 Rewrote Hamming distance binning of UMIs and barcodes. In addition to faster running times, this removed the dependency on the stringdist package, which may have led to issues with parallel computing on some systems. Furthermore, removed a possible bug when resuming a run with the -w switch in combination with plate barcode usage.
 
@@ -39,14 +42,17 @@ zUMIs is compatible with these single-cell UMI protocols:
 - SORT-seq (Muraro et al., 2016)
 - DroNc-seq (Habib et al., 2017)
 - Seq-Well (Gierahn et al., 2017)
-- SPLiT-seq (Rosenberg et al., 2017)
-- STRT-2i (Hochgerner et al., 2017)
+- SPLiT-seq (Rosenberg et al., 2018)
+- sci-RNA-seq (Cao et al., 2017)
+- STRT-2i (Hochgerner et al., 2018)
 - Quartz-seq2 (Sasagawa et al., 2017)
 - 10x Genomics Chromium (Zheng et al., 2017)
 - Wafergen ICELL8 (Gao et al., 2017)
 - Illumina ddSEQ SureCell
 - inDrops (Zilionis et al., 2017; Klein et al., 2015)
 
+For combinatorial indexing protocols, be sure to [check our wiki page](https://github.com/sdparekh/zUMIs/wiki/Combinatorial-Indexing).
+
 ## Getting help
 
 Refer to [zUMIs Github wiki](https://github.com/sdparekh/zUMIs/wiki) for help.

diff --git a/cat3fq.pl b/cat3fq.pl
@@ -0,0 +1,79 @@
+#!/usr/bin/perl
+# LMU Munich. AG Enard
+# A script to concatenate three FASTQ reads into one read per record.
+# For every record the sequences and the quality strings of Read1, Read2 and
+# Read3 are joined in that order; the header line and the separator line are
+# taken from Read1. The output file is compressed with pigz afterwards.
+# Author: Swati Parekh
+# Contact: parekh@bio.lmu.de or ziegenhain@bio.lmu.de
+
+use strict;
+use warnings;
+
+if (@ARGV != 5)
+{
+print
+"\n#####################################################################################
+Usage: perl $0 <Read1.fq.gz> <Read2.fq.gz> <Read3.fq.gz> <output.fq> <threads>\n
+Explanation of parameters:
+
+output.fq	- Output file name. pigz will put the .gz only provide the base name.
+threads		- number of processors to zip.
+Please drop your suggestions and clarifications to <parekh\@bio.lmu.de>\n
+######################################################################################\n\n";
+# A wrong invocation is an error, so report it through the exit status.
+exit 1;
+}
+
+my ($oneread, $tworead, $threeread, $bcreadoutfull, $threads) = @ARGV;
+
+my $fh_one   = open_fastq($oneread);
+my $fh_two   = open_fastq($tworead);
+my $fh_three = open_fastq($threeread);
+
+# "or" (not "||") so that the check applies to open() and not to the file name.
+open(my $out, '>', $bcreadoutfull)
+    or die "Couldn't open file $bcreadoutfull to write\n\n";
+
+# Read1 drives the loop; the mates must deliver one record for each Read1 record.
+while (my @first = read_record($fh_one, $oneread)) {
+    my @second = read_record($fh_two, $tworead)
+        or die "File $tworead contains fewer records than $oneread\n";
+    my @third = read_record($fh_three, $threeread)
+        or die "File $threeread contains fewer records than $oneread\n";
+
+    my ($id, $seq, $separator, $qual) = @first;
+
+    # $id and $separator still carry their newline; sequences and qualities
+    # were chomped so that they can be joined on a single line.
+    print {$out} $id, $seq, $second[1], $third[1], "\n",
+                 $separator, $qual, $second[3], $third[3], "\n";
+}
+
+close_input($fh_one,   $oneread);
+close_input($fh_two,   $tworead);
+close_input($fh_three, $threeread);
+
+# Buffered write errors (e.g. a full disk) only surface when closing.
+close($out) or die "Couldn't finish writing $bcreadoutfull: $!\n";
+
+# List form of system(): no shell is involved, so file names containing spaces
+# or shell metacharacters are passed to pigz unchanged.
+system('pigz', '-f', '-p', $threads, $bcreadoutfull) == 0
+    or die "pigz failed to compress $bcreadoutfull: "
+         . ($? == -1 ? $! : "exit status " . ($? >> 8)) . "\n";
+
+# Open a FASTQ file for reading and return the file handle.
+# A name ending in .gz is decompressed through "gzip -dc"; every file is judged
+# by its own extension. Dies when the file cannot be opened.
+sub open_fastq {
+    my ($path) = @_;
+    my $fh;
+    my $opened = $path =~ /\.gz$/
+        ? open($fh, '-|', 'gzip', '-dc', $path)
+        : open($fh, '<', $path);
+    die "Couldn't open file $path. Check permissions!\n Check if it is differently zipped then .gz\n\n"
+        unless $opened;
+    return $fh;
+}
+
+# Read one four-line FASTQ record from $fh.
+# Returns (header, sequence, separator, quality); header and separator keep
+# their trailing newline, sequence and quality are chomped.
+# Returns an empty list at end of file and dies on an incomplete record.
+sub read_record {
+    my ($fh, $path) = @_;
+
+    my $id = <$fh>;
+    return unless defined $id;
+    # Tolerate a single empty line at the very end of the file.
+    return if $id !~ /\S/ && eof($fh);
+
+    my $seq       = <$fh>;
+    my $separator = <$fh>;
+    my $qual      = <$fh>;
+    die "Truncated FASTQ record at the end of $path\n" unless defined $qual;
+
+    chomp($seq, $qual);
+    return ($id, $seq, $separator, $qual);
+}
+
+# Close an input handle.
+# Warns when the file still holds unread records (it is longer than Read1).
+# Dies when a completely read input reports an error on close, which for a
+# gzip pipe means that decompression failed. The close status of a partially
+# read pipe is ignored because the decompressor is then stopped early.
+sub close_input {
+    my ($fh, $path) = @_;
+
+    my $drained = eof($fh);
+    warn "Warning: $path contains more records than Read1; the extra records were ignored\n"
+        unless $drained;
+
+    return if close($fh);
+    return unless $drained;
+    die "Error while reading $path: "
+      . ($! ? $! : "decompressor exit status " . ($? >> 8)) . "\n";
+}
diff --git a/preprocess_splitseq.pl b/preprocess_splitseq.pl
@@ -0,0 +1,84 @@
+#!/usr/bin/perl
+# LMU Munich. AG Enard
+# A script to preprocess Split-seq data.
+# From every barcode read the three base ranges given on the command line are
+# cut out of the sequence and of the quality string and written back to back
+# in the order Range1, Range2, Range3. A range is written <start>-<end>, with
+# 1-based, inclusive positions. The result is stored in
+# <Outdir>/<StudyName>.barcoderead.preprocess.fastq and compressed with pigz.
+# Author: Swati Parekh & Christoph Ziegenhain
+# Contact: parekh@bio.lmu.de or ziegenhain@bio.lmu.de or hellmann@bio.lmu.de
+
+use strict;
+use warnings;
+
+if (@ARGV != 8)
+{
+print
+"\n#####################################################################################
+Usage: perl $0 <barcode-Read.fq.gz> <Range1> <Range2> <Range3> <Threads> <StudyName> <Outdir> <pigz-executable> \n
+Explanation of parameter:
+
+barcode-Read.fq.gz	- Input barcode reads fastq file name.
+Threads			- Number of threads to use.
+Study       - Study name.
+Ranges 1,2,3	- Barcode Ranges to extract
+OUTDIR      - Output directory.
+pigz-executable - Location of pigz executable
+######################################################################################\n\n";
+# A wrong invocation is an error, so report it through the exit status.
+exit 1;
+}
+
+my ($bcread, $arange, $brange, $crange, $threads, $study, $outdir, $pigz) = @ARGV;
+
+# Each segment is an [offset, length] pair ready to be handed to substr().
+my @segments = map { parse_range($_) } ($arange, $brange, $crange);
+
+my $bcreadoutfull = $outdir."/".$study.".barcoderead.preprocess.fastq";
+
+# "or" (not "||") so that the check applies to open() and not to the file name.
+my $in;
+my $opened = $bcread =~ /\.gz$/
+    ? open($in, '-|', $pigz, '-dc', $bcread)
+    : open($in, '<', $bcread);
+die "Couldn't open file $bcread. Check permissions!\n Check if it is differently zipped then .gz\n\n"
+    unless $opened;
+
+open(my $out, '>', $bcreadoutfull)
+    or die "Couldn't open file $bcreadoutfull to write\n\n";
+
+my $total = 0;
+
+while (my $rid = <$in>) {
+    # Tolerate a single empty line at the very end of the file.
+    last if $rid !~ /\S/ && eof($in);
+
+    my $rseq = <$in>;
+    my $qid  = <$in>;
+    my $qseq = <$in>;
+    die "Truncated FASTQ record at the end of $bcread\n" unless defined $qseq;
+
+    # Strip the line ends first: otherwise a range that reaches beyond the end
+    # of the read would copy the newline into the middle of the barcode.
+    chomp($rseq, $qseq);
+    $total++;
+
+    my $barcode_seq  = join '', map { cut_segment($rseq, @$_) } @segments;
+    my $barcode_qual = join '', map { cut_segment($qseq, @$_) } @segments;
+
+    # $rid and $qid still carry their own newline.
+    print {$out} $rid, $barcode_seq, "\n", $qid, $barcode_qual, "\n";
+}
+
+# The input was read to its end, so a failing close means that reading or
+# decompressing went wrong and the output is incomplete.
+close($in)
+    or die "Error while reading $bcread: "
+         . ($! ? $! : "decompressor exit status " . ($? >> 8)) . "\n";
+
+# Buffered write errors (e.g. a full disk) only surface when closing.
+close($out) or die "Couldn't finish writing $bcreadoutfull: $!\n";
+
+print "Reads processed: $total \n\n";
+
+# List form of system(): no shell is involved, so paths containing spaces or
+# shell metacharacters are passed to pigz unchanged.
+system($pigz, '-f', '-p', $threads, $bcreadoutfull) == 0
+    or die "$pigz failed to compress $bcreadoutfull: "
+         . ($? == -1 ? $! : "exit status " . ($? >> 8)) . "\n";
+
+# Turn a "<start>-<end>" range (1-based, inclusive) into a reference to an
+# [offset, length] pair. Dies when the range is malformed.
+sub parse_range {
+    my ($range) = @_;
+
+    my ($first, $last) = $range =~ /\A(\d+)-(\d+)\z/
+        or die "Invalid barcode range '$range': expected <start>-<end>, e.g. 1-8\n";
+    die "Invalid barcode range '$range': start must be at least 1 and not larger than end\n"
+        if $first < 1 || $last < $first;
+
+    return [ $first - 1, $last - $first + 1 ];
+}
+
+# Return up to $length characters of $string starting at $offset.
+# A read that is too short yields a shorter (possibly empty) piece instead of
+# an undefined value, so every input record still produces an output record.
+sub cut_segment {
+    my ($string, $offset, $length) = @_;
+    return '' if $offset >= length $string;
+    return substr($string, $offset, $length);
+}
diff --git a/zUMIs-master.sh b/zUMIs-master.sh
@@ -3,7 +3,7 @@
 # Pipeline to run UMI-seq analysis from fastq to read count tables.
 # Authors: Swati Parekh &  Christoph Ziegenhain
 # Contact: parekh@bio.lmu.de or christoph.ziegenhain@ki.se or hellmann@bio.lmu.de
-vers=0.0.5b
+vers=0.0.6
 function check_opts() {
     value=$1
     name=$2