From 75aae10ede2b7d4f360c9f5f76509f3099fab5c8 Mon Sep 17 00:00:00 2001
From: Runsheng
Date: Sat, 17 Feb 2024 11:55:26 +0800
Subject: [PATCH 1/6] change default to not find new genes

---
 script/trackrun.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/script/trackrun.py b/script/trackrun.py
index 546fc9c..eb8d6c5 100644
--- a/script/trackrun.py
+++ b/script/trackrun.py
@@ -174,6 +174,8 @@ def cluster(self):
         if args.prefix is None:
             args.prefix=get_file_prefix(args.sample,sep=".")
 
+        find_novelgene=False if args.find_novelgene=="no" else True
+
         #args = parser.parse_args(args=None if sys.argv[2:] else ['--help'])
 
         flow_cluster_all_gene_novel(wkdir=args.folder,
@@ -188,7 +190,7 @@ def cluster(self):
                             intronweight=args.intronweight,
                             cutoff1=args.cutoff1,
                             cutoff2=args.cutoff2,
-                            find_novelgene=args.find_novelgene)
+                            find_novelgene=find_novelgene)
 
 
     def clusterj(self):
@@ -224,6 +226,8 @@ def clusterj(self):
         if args.prefix is None:
             args.prefix=get_file_prefix(args.sample,sep=".")
 
+        find_novelgene=False if args.find_novelgene=="no" else True
+
         flow_clusterj_all_gene_novel(wkdir=args.folder,
                              prefix=args.prefix,
                              nano_bed=args.sample,
@@ -233,7 +237,7 @@ def clusterj(self):
                              f2=args.intersect2,
                              count_cutoff=args.count,
                              batchsize=args.batchsize,
-                             find_novelgene=args.find_novelgene)
+                             find_novelgene=find_novelgene)
 
     def count(self):
         parser = argparse.ArgumentParser(

From 7f8cf169852229930ff286c27cbcf29fb058469d Mon Sep 17 00:00:00 2001
From: Runsheng
Date: Sat, 17 Feb 2024 23:59:02 +0800
Subject: [PATCH 2/6] add CDS finding code

---
 trackcluster/track.py | 56 +++++++++++++++-------------------
 trackcluster/utils.py | 59 ++++++++++++++++++++++++++++++++----
 2 files changed, 75 insertions(+), 40 deletions(-)

diff --git a/trackcluster/track.py b/trackcluster/track.py
index f4622bd..42e05a4 100644
--- a/trackcluster/track.py
+++ b/trackcluster/track.py
@@ -5,7 +5,7 @@
 # @File    : track.py
 
 # third part import
-from trackcluster.utils import chr_select, reverse_complement
+from trackcluster.utils import chr_select, reverse_complement, find_longest_orf
 
 import os
@@ -325,7 +325,6 @@ def mrna_pos_to_chro(self, mrna_pos):
             self.get_exon()
         block_sum=self.get_cumsum(self.blockSizes)
-
         ### sanity check
         if 0<=mrna_pos<=block_sum[-1]:
             pass
@@ -346,7 +345,7 @@ def mrna_pos_to_chro(self, mrna_pos):
 
         chro_pos=self.chromStart+self.chromStarts[exon_num]+exon_offset
 
-        return self.chrom, chro_pos
+        return chro_pos
 
     def bind_chroseq(self, refdic, gap=0, intron=False):
         """
@@ -375,51 +374,38 @@ def bind_chroseq(self, refdic, gap=0, intron=False):
                 _, seq_intron=chr_select(refdic, chro, start_i, end_i)
                 seq_l.append(seq_intron.lower()) # lower case for intron
         seq_raw="".join(seq_l)
+
         if self.strand=="+":
             seq_out=seq_raw
         else:
             seq_out=reverse_complement(seq_raw)
-
         self.seq_chro="".join(seq_out)
 
-    def orf_find(self, refdic):
+    def get_exon_seq(self, refdic):
         """
-        :param: refdic: the reference genome
+        get the exon sequence from the reference genome;
+        a special case of bind_chroseq
+        :param refdic:
+        :return: None; writes the mRNA sequence to self.seq_mrna
         """
-        # need self.seq_chro
-        if self.seq_chro is None:
-            return None
-        else:
-            pass
+        self.bind_chroseq(refdic, gap=0, intron=False)
+        self.seq_mrna= self.seq_chro
 
-    def find_orfs_with_trans(self, trans_table=1, min_protein_length=10):
+    def orf_find(self, refdic, trans_table=1, min_protein_length=0):
         """
-        compare the three orfs, use the longest one as new protein
-        min size may need to be defined as 30aa?,so cds need to >90
-        :param trans_table:
-        :param min_protein_length:
-        :return:
+        :param: refdic: the reference genome
         """
-        if self.seq_chro is None:
-            return None
-        else:
-            from Bio.Seq import Seq
-            seq=Seq(self.seq_chro)
-
-            seq0=seq.translate(table=1)
-            seq1=seq[1:].translate(table=1)
-            seq2=seq[2:].translate(table=1)
-
-            print((str(seq0)))
-            print("======================================")
-            print((str(seq1)))
-            print("======================================")
-            print((str(seq2)))
-
-            answer=[seq0, seq1, seq2]
+        # need self.seq_mrna
+        if self.seq_mrna is None:
+            self.get_exon_seq(refdic)
 
-            return answer
+        start, end, orf_seq=find_longest_orf(self.seq_mrna, trans_table, min_protein_length)
+        self.thickStart=self.mrna_pos_to_chro(start)
+        self.thickEnd=self.mrna_pos_to_chro(end)
 
+        # todo, need to add self.cdsStartStat and self.cdsEndStat
+        # need to be (none, unknown, incomplete, or complete)
+        # with ATG and stop codon should be complete
 
     def is_nmd(self):
         """
diff --git a/trackcluster/utils.py b/trackcluster/utils.py
index f68489e..56f29dc 100644
--- a/trackcluster/utils.py
+++ b/trackcluster/utils.py
@@ -15,7 +15,7 @@
 from collections import OrderedDict
 
 # third part import
-from Bio import SeqIO,Seq
+from Bio import SeqIO,Seq,Data
 
 import gzip
 
@@ -67,8 +67,53 @@ def chr_select(record_dict, chro, start,end):
     seq=str(record_dict[chro][start:end].seq)
     return name,seq
 
+def find_longest_orf(sequence, table_id=1, min_protein_length=0):
+    """
+    Find the longest ORF in a given sequence using BioPython.
+    Note: the "ORF" here is the longest stop-to-stop stretch among the six
+    frames; a leading ATG is not required.
+
+    Args:
+    - sequence (str): The DNA sequence to search within.
+    - table_id (int): The NCBI translation table ID to use.
+    - min_protein_length (int): Minimum protein length to consider as a valid ORF.
 
-def myexe(cmd, timeout=10):
+    Returns:
+    - Tuple of (start, end, orf_sequence) indicating the longest ORF found;
+      start and end are coordinates on the forward input sequence.
+    """
+    seq = Seq.Seq(sequence)
+    standard_table = Data.CodonTable.unambiguous_dna_by_id[table_id]
+
+    longest_orf = ""
+    longest_orf_start = None
+    longest_orf_end = None
+
+    for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
+        for frame in range(3):
+            trans = str(nuc[frame:].translate(table_id))
+            trans_len = len(trans)
+            aa_start = 0
+            aa_end = 0
+            while aa_start < trans_len:
+                aa_end = trans.find("*", aa_start)
+                if aa_end == -1:
+                    aa_end = trans_len
+                    stop_len = 0  # no stop codon found, the ORF runs to the end of the frame
+                else:
+                    stop_len = 3  # include the stop codon in the ORF
+                if aa_end - aa_start >= min_protein_length:
+                    if strand == 1:
+                        start = frame + aa_start * 3
+                        end = min(frame + aa_end * 3 + stop_len, len(sequence))
+                    else:
+                        start = max(len(sequence) - (frame + aa_end * 3 + stop_len), 0)
+                        end = len(sequence) - (frame + aa_start * 3)
+                    orf_seq = sequence[start:end]
+                    if len(orf_seq) > len(longest_orf):
+                        longest_orf = orf_seq
+                        longest_orf_start = start
+                        longest_orf_end = end
+                aa_start = aa_end + 1
+
+    return longest_orf_start, longest_orf_end, longest_orf
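+
+# a minimal usage sketch (hypothetical sequence): start and end index into the
+# forward input string, and for minus-strand hits orf_sequence is the
+# forward-strand slice covering the ORF, not its reverse complement:
+#   start, end, orf = find_longest_orf("AAATGAACTTGACGAGTAATGAG", table_id=1)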
+
+
+def myexe(cmd, timeout=100):
     """
     a simple wrap of the shell
     mainly used to run the bwa mem mapping and samtool orders
@@ -83,8 +128,12 @@ def alarmHandler(signum, frame):
     proc=subprocess.Popen(cmd, shell=True, preexec_fn=setupAlarm,
                           stdout=subprocess.PIPE, stderr=subprocess.PIPE,cwd=os.getcwd())
     #print("Running: ", cmd)
-    out, err=proc.communicate()
-    #print("stderr out: ",err, "\n","return code: ",proc.returncode)
+    try:
+        out, err=proc.communicate(timeout=timeout)
+        # print("stderr out: ",err, "\n","return code: ",proc.returncode)
+    except subprocess.TimeoutExpired:
+        proc.kill()
+        out, err = proc.communicate()
     return out
 
@@ -119,7 +168,7 @@ def is_package_installed(name):
 
 def set_tmp(wkdir=None):
     """
-    set the the tmp dir to the mem
+    set the tmp dir to the mem
     """
     if wkdir is None:
         aa=os.popen("echo $XDG_RUNTIME_DIR")

From b14e3dabea26eba8594b43cf2e7f172efa7e6948 Mon Sep 17 00:00:00 2001
From: Runsheng
Date: Mon, 17 Jun 2024 15:13:57 +0800
Subject: [PATCH 3/6] add CDS annotation for track

---
 test/track_test.py    | 10 ++++++----
 test/utils_test.py    |  4 ++++
 trackcluster/track.py |  6 +++++-
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/test/track_test.py b/test/track_test.py
index 5f00a06..5dd403d 100644
--- a/test/track_test.py
+++ b/test/track_test.py
@@ -148,14 +148,16 @@ def test_orfs(self):
         can only run with ce10 ref
         """
         # ref_dic is a large external file
-        ref_dict=fasta2dic("/data/reference/ce10_ucsc.fa")
+        ref_dict=fasta2dic("/data/reference/ce11.fa")
         bigg_one=self.bigg[30]
+        print(bigg_one)
         bigg_one.bind_chroseq(ref_dict, gap=0, intron=False)
         print((bigg_one.seq_chro))
-        ans=bigg_one.find_orfs_with_trans()
+        bigg_one.get_cds(refdic=ref_dict)
+        print((bigg_one.seq_mrna))
 
-        print(ans)
-        print(bigg_one)
+        print()
+        print(bigg_one.thickStart, bigg_one.thickEnd)
 
     def test_orf(self):
         pass
diff --git a/test/utils_test.py b/test/utils_test.py
index 316ed71..43ccc87 100644
--- a/test/utils_test.py
+++ b/test/utils_test.py
@@ -50,6 +50,10 @@ def test_detail_file(self):
         logger.debug("This is just a test, print number 100 as float" + "," + str(float(1) * 100))
         logger.debug("In total %d line, %s" % (2, "just a test number"))
 
+    def test_find_longest_orf(self):
+        seq="AAATGAACTTGACGAGTAATGAG"
+        print(find_longest_orf(seq))
+
     def tearDown(self):
         self.fafile=None
 
diff --git a/trackcluster/track.py b/trackcluster/track.py
index 42e05a4..d604adf 100644
--- a/trackcluster/track.py
+++ b/trackcluster/track.py
@@ -85,6 +85,9 @@ def __init__(self, prefix=""):
         self.seq=None
         self.subread=set() # use to store the reads contained inside the
         self.coverage=0
+        self.seq_mrna=None
+        self.seq_chro=None
+        self.seq_cds=None
 
         self.junction=[]
 
@@ -391,7 +394,7 @@ def get_exon_seq(self, refdic):
         self.bind_chroseq(refdic, gap=0, intron=False)
         self.seq_mrna= self.seq_chro
 
-    def orf_find(self, refdic, trans_table=1, min_protein_length=0):
+    def get_cds(self, refdic, trans_table=1, min_protein_length=0):
         """
         :param: refdic: the reference genome
         """
@@ -402,6 +405,7 @@
         self.thickStart=self.mrna_pos_to_chro(start)
         self.thickEnd=self.mrna_pos_to_chro(end)
 
+        self.seq_cds=orf_seq
 
         # todo, need to add self.cdsStartStat and self.cdsEndStat
         # need to be (none, unknown, incomplete, or complete)

From 43aac47c5c3408eec08271bb9aaa62cc2afefd05 Mon Sep 17 00:00:00 2001
From: Runsheng
Date: Mon, 17 Jun 2024 15:14:13 +0800
Subject: [PATCH 4/6] minor modifications

---
 Readme.md              |  5 +++-
 doc/index.md           | 27 ++++++++++++++++++++
 doc/source/design.md   |  0
 script/bigg2seq.py     | 58 ++++++++++++++++++++++++++++++++++++++++++
 test/clustercj_test.py |  2 +-
 test/clusterj_test.py  | 11 +++++---
 trackcluster/flow.py   |  4 +--
 7 files changed, 99 insertions(+), 8 deletions(-)
 create mode 100644 doc/index.md
 create mode 100644 doc/source/design.md
 create mode 100644 script/bigg2seq.py

diff --git a/Readme.md b/Readme.md
index 1177ae3..4cf064a 100644
--- a/Readme.md
+++ b/Readme.md
@@ -11,7 +11,10 @@ Trackcluster is an isoform calling and quantification pipeline for long RNA/cDNA
 - [Walkthrough](#walkthrough)
 
 ####
-Hint: the ongoing development can be found in the ["dev"](https://github.com/Runsheng/trackcluster/tree/dev) branch.
+The ongoing development can be found in the ["dev"](https://github.com/Runsheng/trackcluster/tree/dev) branch.
+#### TODO:
+1. fix the "fusion" classification. 2. speed up clusterj. 3. splicing leader/5' indicator finding using pyssw.
+4. add a function to get the CDS start (maybe ATG) and CDS end.
 
 ## Overview
 A pipeline for reference-based isoform identification and quantification using long reads. This pipeline was designed to use **only** long and nosisy reads to make a valid transcriptome. An indicator for the intact 5' could be very helpful to the pipeline, i.e, the splicing leader in the mRNA of nematodes.
diff --git a/doc/index.md b/doc/index.md
new file mode 100644
index 0000000..26bc3ce
--- /dev/null
+++ b/doc/index.md
@@ -0,0 +1,27 @@
+# TrackCluster
+![PyPI](https://img.shields.io/pypi/v/trackcluster?color=green)
+
+Trackcluster is an isoform calling and quantification pipeline for long RNA/cDNA reads.
+
+
+A walkthrough for using trackcluster for isoform calling on worm/mouse/human/virus datasets.
+
+## Preprocessing
+1. Long read QC
+
+Some basic characteristics of the reads should be known before further analysis: for example, the read length and the
+mapped read length; the estimated read quality, the mapping quality and the read mapping rate.
+
+We recommend the following tool for long-read QC:
+
+Giraffe: https://github.com/lrslab/Giraffe_View
+
+
+2. Mapping Nanopore long reads to the genome
+
+Mapping is the first step of isoform calling.
+We recommend minimap2 for mapping the long reads to the genome.
+ - Mapping the reads for eukaryotic genomes
+ ```bash
+ minimap2 -ax splice -uf -k14 --secondary=no --MD -t 8 ref.fa read.fq.gz | samtools view -bS - | samtools sort -o read.bam -
+ ```
diff --git a/doc/source/design.md b/doc/source/design.md
new file mode 100644
index 0000000..e69de29
diff --git a/script/bigg2seq.py b/script/bigg2seq.py
new file mode 100644
index 0000000..c2fc199
--- /dev/null
+++ b/script/bigg2seq.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+#-*- coding: utf-8 -*-
+# @Time : 19/2/2024 2:14pm
+# @Author : Runsheng
+# @File : bigg2seq.py
+
+"""
+This script writes sequences for the tracks in a bigg bed file.
+It can output the transcript (exon) sequence, the CDS sequence,
+or an Ensembl-style EXON(uppercase)+intron(lowercase) sequence.
+"""
+import argparse
+import os,sys,inspect
+
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+parentdir = os.path.dirname(os.path.dirname(currentdir))
+sys.path.insert(0,parentdir)
+
+from trackcluster.utils import fasta2dic
+from trackcluster.tracklist import read_bigg
+
+parser=argparse.ArgumentParser()
+parser.add_argument("-b", "--biggfile",
+                    help="the bigg bed file")
+parser.add_argument("-r", "--reference",
+                    help="the genome reference")
+parser.add_argument("-o", "--out", default="bigg.fasta",
+                    help="the output file name, default is bigg.fasta")
+parser.add_argument("-m", "--mode", default="exon",
+                    help="the format of the output, can be 'exon', 'cds' "
+                         "or 'ensembl' (Ensembl-style EXON(uppercase)+intron(lowercase) sequence), "
+                         "default is 'exon'")
+
+args = parser.parse_args(args=None if sys.argv[1:] else ['--help'])
+
+outfile=args.out
+refdic=fasta2dic(args.reference)
+
+# write one fasta record per track
+bigg_l=read_bigg(args.biggfile)
+with open(outfile, "w") as fw:
+    if args.mode=="exon":
+        for bigg in bigg_l:
+            bigg.get_exon()
+            bigg.bind_chroseq(refdic, gap=0, intron=False)
+            fw.write(">"+bigg.name+"\n"+bigg.seq_chro+"\n")
+    elif args.mode=="cds":
+        for bigg in bigg_l:
+            bigg.get_exon()
+            bigg.get_cds(refdic)
+            fw.write(">"+bigg.name+"\n"+bigg.seq_cds+"\n")
+    elif args.mode=="ensembl":
+        for bigg in bigg_l:
+            bigg.get_exon()
+            bigg.bind_chroseq(refdic, gap=0, intron=True)  # introns are kept in lower case
+            fw.write(">"+bigg.name+"\n"+bigg.seq_chro+"\n")
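+
+# a minimal usage sketch (hypothetical file names):
+#   python script/bigg2seq.py -b isoform.bed -r genome.fa -m cds -o cds.fa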
diff --git a/test/clustercj_test.py b/test/clustercj_test.py
index 2e4bc54..40b9652 100644
--- a/test/clustercj_test.py
+++ b/test/clustercj_test.py
@@ -80,7 +80,7 @@ def test_get_read_junction_dic(self):
         print((df["1f940a05-24a1-4455-a506-b1aa04caf81a"]-df["087b20ed-78ba-48f9-a038-f23a9c4b75c6"]))
         print((df["087b20ed-78ba-48f9-a038-f23a9c4b75c6"]-df["1f940a05-24a1-4455-a506-b1aa04caf81a"]))
 
-    def test_get_arrry_freq_is_junction_inside(self):
+    def __test_get_arrry_freq_is_junction_inside(self):
         self.site_dic=get_junction_dic(self.bigg)
         df=get_read_junction_dic(self.bigg, self.site_dic)
         #j1=df["087b20ed-78ba-48f9-a038-f23a9c4b75c6"]
diff --git a/test/clusterj_test.py b/test/clusterj_test.py
index 3a583c5..672e386 100644
--- a/test/clusterj_test.py
+++ b/test/clusterj_test.py
@@ -3,6 +3,7 @@
 # @Time : 8/9/2018 10:53 AM
 # @Author : Runsheng
 # @File : cluster_test.py
+import os
 
 from trackcluster.clusterj import *
 from trackcluster.tracklist import *
@@ -32,9 +33,10 @@ def setUp(self):
         self.bigg_nano=bigg_nano
         self.bigg_gff=bigg_gff
         self.bigg=bigg_nano+bigg_gff
+        self.gene=gene
 
-    def test_get_corrected_junction(self):
+    def __test_get_corrected_junction(self):
         import random
         from copy import deepcopy
         #bigg=self.bigg[0]
@@ -78,11 +80,12 @@ def test_is_bigg1_inside_bigg2_junction(self):
 
     def test_junction_simple_merge(self):
-        print(len(self.bigg))
+        print(self.gene, "track size is ", len(self.bigg))
         bigg_n=junction_simple_merge(self.bigg)
         print(len(bigg_n))
 
-        write_bigg(bigg_n, "./test/genes/AT2G43410/tt.bed")
+        print(os.getcwd())
+        write_bigg(bigg_n, "./genes/{gene}/tt.bed".format(gene=self.gene))
 
     def test_is_single_exon_in(self):
         # get single and note single
@@ -110,7 +113,7 @@ def test_flow_junction_cluster(self):
         #write_bigg(bigg_subread, "./test/genes/AT2G43410/AT2G43410_subread.bed")
 
     def test_bigg_get_namedic(self):
-        bigg_list=read_bigg("genes/AT2G43410/AT2G43410_subread.bed")
+        bigg_list=read_bigg("./genes/AT2G43410/AT2G43410_nano.bed")
         name_dic=bigg_get_namedic(bigg_list)
         print(name_dic)
 
diff --git a/trackcluster/flow.py b/trackcluster/flow.py
index de79442..88fafb6 100644
--- a/trackcluster/flow.py
+++ b/trackcluster/flow.py
@@ -577,7 +577,7 @@ def process_desc(k):
         fw.write("\n")
 
 
-def flow_files_output(bigg_read_file, bigg_isoform_file, bigg_ref_file, prefix=None):
+def _flow_files_output(bigg_read_file, bigg_isoform_file, bigg_ref_file, prefix=None):
     """
     output all files used for further plotting
    :param bigg_read_file:
@@ -601,7 +601,7 @@
 
 ############
-def flow_map_convert_clusterj_count(wkdir, prefix, ref_fasta, fastq_l, nano_bed, gff_bed, core=30,
+def _flow_map_convert_clusterj_count(wkdir, prefix, ref_fasta, fastq_l, nano_bed, gff_bed, core=30,
                                     f1=0.01, f2=0.01, count_cutoff=5, batchsize=2000):
     """
     The overall full run pipeline for impatient people

From 9d4fd4571bf5736d94012a05c0d2e207e118d6ba Mon Sep 17 00:00:00 2001
From: Runsheng
Date: Tue, 15 Oct 2024 10:52:31 +0800
Subject: [PATCH 5/6] fix early loop exit in desc annotation

---
 trackcluster/flow.py | 32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

diff --git a/trackcluster/flow.py b/trackcluster/flow.py
index 88fafb6..94118f3 100644
--- a/trackcluster/flow.py
+++ b/trackcluster/flow.py
@@ -499,21 +499,27 @@ def flow_desc_annotation(wkdir,isoform_bed, gff_bed, offset=10, prefix=None, cor
     def process_class4(k):
         nanos = gene2isoform[k]
         refs = gene2ref[k]
+        out_class4_s = []
         for bigg in nanos:
             try:
                 out_class4 = flow_class4(bigg, refs, offset=offset)
-                return out_class4
             except IndexError:
-                return None
+                continue
+            out_class4_s.append(out_class4)
+        return out_class4_s
+
 
     def process_desc(k):
         nanos = gene2isoform[k]
         refs = gene2ref[k]
+        out_descs = []
         for bigg in nanos:
             try:
                 out_desc = flow_desc(bigg, refs, offset=offset)
-                return out_desc
             except IndexError:
-                return None
+                continue
+            out_descs.append(out_desc)
+        return out_descs
+
     print("Running class4")
     class4=parmap(process_class4, tqdm(keys), nprocs=core)
     class4=[x for x in class4 if x is not None]
@@ -601,7 +607,3 @@ def _flow_files_output(bigg_read_file, bigg_isoform_file, bigg_ref_file, prefix=
 
 
 ############
-def _flow_map_convert_clusterj_count(wkdir, prefix, ref_fasta, fastq_l, nano_bed, gff_bed, core=30,
-                                    f1=0.01, f2=0.01, count_cutoff=5, batchsize=2000):
-    """
-    The overall full run pipeline for impatient people
-    :param wkdir:
-    :param prefix:
-    :param ref_fasta:
-    :param fastq_l:
-    :param nano_bed:
-    :param gff_bed:
-    :param core:
-    :param f1:
-    :param f2:
-    :param count_cutoff:
-    :param batchsize:
-    :return:
-    """
-    pass

From 45fcb93b5dd82ae767bd40585774d50948ef1423 Mon Sep 17 00:00:00 2001
From: Runsheng
Date: Tue, 15 Oct 2024 10:52:56 +0800
Subject: [PATCH 6/6] add doc and update functions

---
 doc/source/QC.md          |  33 +++++++++++
 doc/source/design.md      |  19 +++++++
 trackcluster/batch.py     |   2 +-
 trackcluster/clustercj.py |  75 ---------------------------
 trackcluster/clusterj.py  | 106 --------------------------------------
 5 files changed, 53 insertions(+), 182 deletions(-)
 create mode 100644 doc/source/QC.md

diff --git a/doc/source/QC.md b/doc/source/QC.md
new file mode 100644
index 0000000..ba1c29a
--- /dev/null
+++ b/doc/source/QC.md
@@ -0,0 +1,33 @@
+# Long-read QC for RNA reads
+The read length and the ratio of read length to the real transcript length (the full-length ratio) are essential for
+downstream analysis. For instance, we do not recommend isoform calling when the full-length ratio is below 10%;
+read counting, however, can still be done.
+
+### Basic statistics
+
+Some basic characteristics of the reads should be known before further analysis: for example, the read length and the
+mapped read length; the estimated read quality, the mapping quality and the read mapping rate.
+
+We recommend the following tool for long-read QC:
+
+Giraffe: https://github.com/lrslab/Giraffe_View
+
+### Full-length read ratio estimation
+
+1. Common case, with no 5' indicator.
+
+Most long reads are sequenced starting from the 3' end (the polyA site), so a 5' indicator such as the splicing leader,
+or an artificial sequence added after de-capping, can show whether a read is likely to be full length. For most
+sequencing reads we do not have such a resource. In that case, we count a read as full length when it covers >95% of
+its annotated transcript, and use this fraction to estimate the full-length ratio (see the sketch at the end of this page).
+
+2. Special case, with a 5' indicator.
+- Splicing leaders: some species use both cis- and trans-splicing, like C. elegans, where ~80% of transcripts are
+trans-spliced and carry the splicing-leader sequence at the 5' end of the mRNA reads. In this case, the splicing leader
+can be used to estimate the full-length ratio. The retained splicing-leader sequences are short, ~22 nt sequences
+hanging at the 5' end of the reads.
+- Artificial sequences: some long reads carry an artificial sequence added after de-capping, which can indicate a
+full-length read. For example, Cappable-seq reads are generated by removing the 5' G cap and adding a 5' artificial
+sequence. Older methods like 5' RACE also give the user a 5' sequence to indicate the start of a transcript.
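+
+### A sketch for the >95% rule
+
+The snippet below is an illustrative sketch only, not part of the trackcluster API; it assumes you already have, for
+each read, its aligned length and the length of the annotated transcript it maps to (hypothetical inputs).
+
+```python
+def full_length_ratio(read_tx_pairs, cutoff=0.95):
+    """read_tx_pairs: list of (aligned_read_length, transcript_length) tuples."""
+    full = sum(1 for read_len, tx_len in read_tx_pairs if read_len >= cutoff * tx_len)
+    return full / len(read_tx_pairs)
+
+# two of these three reads cover >95% of their transcript
+print(full_length_ratio([(980, 1000), (990, 1000), (500, 1000)]))  # ~0.67
+```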
diff --git a/doc/source/design.md b/doc/source/design.md
index e69de29..ae9822b 100644
--- a/doc/source/design.md
+++ b/doc/source/design.md
@@ -0,0 +1,19 @@
+# Trackcluster design
+## History of ONT read accuracy and trackcluster
+Direct-RNA sequencing from ONT used to have very low quality, especially for non-human samples. The raw read accuracy
+is roughly 85% for the RNA001 kit on an R9.4.1 flowcell. With a 15% error rate, the read junctions are also error prone,
+which makes isoform calling and quantification using junctions very difficult. To accommodate the low-quality reads, we
+designed trackcluster to compare the intersections of exon/intron regions to determine the distance between reads,
+and to correct the junctions after clustering all similar reads from one isoform.
+
+The regional intersection method (the original trackcluster) worked well for RNA001/002 data in model organisms like _C. elegans_ and _Arabidopsis thaliana_,
+as the read count for each isoform is limited: the total yield of one flowcell is around 1-2 Gb, the overall yield of one
+experiment is usually below 10 Gb, and the read counts for highly expressed genes are generally below 50,000.
+However, the method is not fast enough, because we have to calculate the intersection for every pair of overlapping reads.
+The time complexity is roughly O(n^2), which is not acceptable for genes with read counts above 50,000 (the computation may take more than 24 h).
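+For a sense of scale, clustering n = 50,000 overlapping reads pairwise takes n(n-1)/2, i.e. about 1.25 x 10^9,
+intersection calculations, while an O(n log n) junction-based pass (the trackclusterj method below) is on the order of
+50,000 x log2(50,000), i.e. roughly 8 x 10^5 operations.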
+
+With the new RNA002 kit and new basecall models, the read quality improved to 92% for most samples, and an 8% error rate
+(instead of >15%) allows junction self-correction before clustering. We therefore added junction self-correction,
+together with a clustering method that uses the junctions directly (the trackclusterj method). The time complexity of
+trackclusterj is roughly O(n log n) in most cases, which is acceptable for highly expressed genes.
\ No newline at end of file
diff --git a/trackcluster/batch.py b/trackcluster/batch.py
index 088a607..cc8cf5b 100644
--- a/trackcluster/batch.py
+++ b/trackcluster/batch.py
@@ -51,7 +51,7 @@ def process_one_junction_corrected_try(key, full=False, batchsize=500):
     if bigg_nano_raw is None:
         return 0
 
-    ### add the bactsize part
+    ### add the batch size part
     n_count = 100
     n = 0
     bigg_nano = junction_pre(bigg_nano_raw, bigg_gff)
diff --git a/trackcluster/clustercj.py b/trackcluster/clustercj.py
index c392d0d..9e181b6 100644
--- a/trackcluster/clustercj.py
+++ b/trackcluster/clustercj.py
@@ -147,81 +147,6 @@ def split_site(junction_array):
     return groups
 
 
-def _is_junction_inside(j1, j2):
-    """
-    test if j1 is in j2, change into comp all junctions except single exon
-    still very slow, unused
-    :param j1: junction numpy array, with all aligned part
-    :param j2:
-    :return: 0 for contain, 3 for differ, 1 for j1 in j2, -1 for j2 in j1, "e" for error
-    """
-    # do not consider the single exon track/read
-    # first to simplify the two unions from the large union
-
-    if numpy.array_equal(j1,j2):
-        return 0
-
-    # select the union of the junction list
-    pos_simple=[pos for pos, value in enumerate(j1+j2) if value>=1]
-    j1_s=j1[pos_simple]
-    j2_s=j2[pos_simple]
-
-    #print(j1_s)
-    #print(j2_s)
-
-    if numpy.sum(j1_s)==0 or numpy.sum(j2_s)==0:
-        return "is_junction_inside error: single exon"
-
-    j12_del=j1_s-j2_s
-    #freq_dic = get_array_freq(j12_del)
-
-    # filter the easy differ ones
-
-    #if freq_dic[-1] > 0 and freq_dic[1] > 0: # has more and less junction, must be 3
-    #    return 3
-
-    # consider 3 types using group_l, this will be slower
-    group_l=split_site(j12_del)
-    #print(group_l)
-
-    # did not consider the len=0 and len=1
-    # single exon and equal junction should have been excluded
-    if len(group_l)>=3:
-        return 3
-    elif len(group_l)==2: # should be all 0, 1 or all 0 ,-1
-        # need to consider 5' or 3' missing
-        # 5' is belong, 3' is differ
-        a,b=group_l
-        if a[0]+ b[0]>=1: #j1_s > j2_s
-            if b[0]==0: # 5' missing
-                return -1
-            else: # 3' missing
-                return 3
-
-        elif a[0]+ b[0]<=-1: #j1_s < j2_s
-            if b[0]==0: # 5' missing
-                return 1
-            else: # 3' missing
-                return 3
-
-
-def __compare_junction(j1, j2):
-    """
-    Two junctions in j1, j2, generated by get_read_junction_dic
-    :param j1: numpy array, contian 0 or 1 only, the position is well aligned for all tracks,
-    :param j2:
-    :return: use 0:"equal", 1:"in", 2:"contain", 3:"diff"
-    does not conserider single exon tracks
-    """
-    # do not consider the single exon track/read
-
-    # single junction first
-    pass
-
-
 def get_corrected_dic(site_cov_dic, cov_cutoff=2, pos_cutoff=10):
     """
 
diff --git a/trackcluster/clusterj.py b/trackcluster/clusterj.py
index 8db8943..2335020 100644
--- a/trackcluster/clusterj.py
+++ b/trackcluster/clusterj.py
@@ -19,30 +19,6 @@ from 
trackcluster.cluster import select_list -def __filter_site_dic(site_dic, ref=None): - """ - filter away the sites which is in other chro or strand - use the junction with highest freq as ref? - :param site_dic: - :param ref: in the format of tuple("chro", "_", "strand") - :return: new_site_dic - """ - site_dic_new={} - if ref is None: - cov_max=max(site_dic.values()) - for k, v in list(site_dic.items()): - if v==cov_max: - ref=k - chro_ref, _, strand_ref=ref - else: - chro_ref, _, strand_ref=ref - - for k, v in list(site_dic.items()): - chro, _, strand=k - if chro==chro_ref and strand==strand_ref: - site_dic_new[k]=v - return site_dic_new - def chose_read_from_list(name, bigg_list): for bigg in bigg_list: @@ -51,48 +27,6 @@ def chose_read_from_list(name, bigg_list): return None -def __get_corrected_junction(bigg, junction_dic, coverage_cutoff=2, offset=5): - """ - too slooooow, as need to iter the full junction set, have revised in clustercj as get_bigg_correct - Use the high confident junctions to correct the low frequent junctions - junction dic like : (chro, pos, strand): coverage - """ - strand=bigg.strand - chro=bigg.chrom - # sanity check - - bigg.get_junction() - - # get the pos that can be used - pos_pass=[] - for k, v in list(junction_dic.items()): - if v>=coverage_cutoff: - chro_d, pos_d, strand_d=k - if chro_d==chro and strand_d==strand: - pos_pass.append(pos_d) - pos_pass_set=set(pos_pass) - - ### - bigg_junction_new=[] - corrected=[] # used for debug - - for pos in bigg.junction: - if pos in pos_pass_set: - bigg_junction_new.append(pos) - corrected.append((pos, pos)) - else: - for pos_d in pos_pass: - if abs(pos_d-pos)<=offset: - bigg_junction_new.append(pos_d) - corrected.append((pos, pos_d)) - break - else: # can not be corrected, keep as it is - bigg_junction_new.append(pos) - # debug - # print("corrected", corrected) - return bigg_junction_new - - def __get_start_end_dic(bigg_list, type="start", ref_weight=1): """ Todo: may need to chop the long isoform to shorter one by refine the 5' and 3' @@ -126,46 +60,6 @@ def __get_start_end_dic(bigg_list, type="start", ref_weight=1): return site_dic -def __chain_site(sites_l, offset=5): - """ - use the sorted sites_l? - merge the sites within 5nt to a range contaning all the fields - """ - - # merge the identical one first - sites_l = list(set(sites_l)) - sites_sorted = sorted(sites_l, key=lambda x: (x[0], x[1], x[2])) - - site_range = [] - - # init the value out of the loop - line_p = sites_sorted[0] - chro_p, start_p, strand_p = line_p - - # init range one - range_one = [chro_p, start_p, start_p + 1, strand_p] - - for n, line in enumerate(sites_sorted): - if n == 0: - pass - else: - line_p = sites_sorted[n - 1] - chro_p, start_p, strand_p = line_p - - chro, start, strand = line - - if chro_p == chro and strand_p == strand and 0 < abs(start - start_p) <= offset: - range_one[2] = start + 1 - # print range_one - # break - - else: - site_range.append(range_one) - # re_init range_one - range_one = [chro, start, start + 1, strand] - - return site_range - def is_junction_5primer(junction): group_j=group_site(junction)