From 75aae10ede2b7d4f360c9f5f76509f3099fab5c8 Mon Sep 17 00:00:00 2001
From: Runsheng
Date: Sat, 17 Feb 2024 11:55:26 +0800
Subject: [PATCH 1/6] change default to not find new genes

---
 script/trackrun.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/script/trackrun.py b/script/trackrun.py
index 546fc9c..eb8d6c5 100644
--- a/script/trackrun.py
+++ b/script/trackrun.py
@@ -174,6 +174,8 @@ def cluster(self):
         if args.prefix is None:
             args.prefix=get_file_prefix(args.sample,sep=".")
 
+        find_novelgene=False if args.find_novelgene=="no" else True
+
         #args = parser.parse_args(args=None if sys.argv[2:] else ['--help'])
 
         flow_cluster_all_gene_novel(wkdir=args.folder,
@@ -188,7 +190,7 @@ def cluster(self):
                             intronweight=args.intronweight,
                             cutoff1=args.cutoff1,
                             cutoff2=args.cutoff2,
-                            find_novelgene=args.find_novelgene)
+                            find_novelgene=find_novelgene)
 
 
     def clusterj(self):
@@ -224,6 +226,8 @@ def clusterj(self):
         if args.prefix is None:
             args.prefix=get_file_prefix(args.sample,sep=".")
 
+        find_novelgene=False if args.find_novelgene=="no" else True
+
         flow_clusterj_all_gene_novel(wkdir=args.folder,
                              prefix=args.prefix,
                              nano_bed=args.sample,
@@ -233,7 +237,7 @@ def clusterj(self):
                              f2=args.intersect2,
                              count_cutoff=args.count,
                              batchsize=args.batchsize,
-                             find_novelgene=args.find_novelgene)
+                             find_novelgene=find_novelgene)
 
     def count(self):
         parser = argparse.ArgumentParser(

From 7f8cf169852229930ff286c27cbcf29fb058469d Mon Sep 17 00:00:00 2001
From: Runsheng
Date: Sat, 17 Feb 2024 23:59:02 +0800
Subject: [PATCH 2/6] add CDS finding code

---
 trackcluster/track.py | 56 +++++++++++++++-------------------
 trackcluster/utils.py | 59 ++++++++++++++++++++++++++++++++----
 2 files changed, 75 insertions(+), 40 deletions(-)

diff --git a/trackcluster/track.py b/trackcluster/track.py
index f4622bd..42e05a4 100644
--- a/trackcluster/track.py
+++ b/trackcluster/track.py
@@ -5,7 +5,7 @@
 # @File    : track.py
 
 # third part import
-from trackcluster.utils import chr_select, reverse_complement
+from trackcluster.utils import chr_select, reverse_complement, find_longest_orf
 
 import os
@@ -325,7 +325,6 @@ def mrna_pos_to_chro(self, mrna_pos):
             self.get_exon()
         block_sum=self.get_cumsum(self.blockSizes)
-
         ### sanity check
         if 0<=mrna_pos<=block_sum[-1]:
             pass
@@ -346,7 +345,7 @@ def mrna_pos_to_chro(self, mrna_pos):
 
         chro_pos=self.chromStart+self.chromStarts[exon_num]+exon_offset
 
-        return self.chrom, chro_pos
+        return chro_pos
 
     def bind_chroseq(self, refdic, gap=0, intron=False):
         """
@@ -375,51 +374,38 @@ def bind_chroseq(self, refdic, gap=0, intron=False):
                 _, seq_intron=chr_select(refdic, chro, start_i, end_i)
                 seq_l.append(seq_intron.lower()) # lower case for intron
         seq_raw="".join(seq_l)
+
         if self.strand=="+":
             seq_out=seq_raw
         else:
             seq_out=reverse_complement(seq_raw)
-
         self.seq_chro="".join(seq_out)
 
-    def orf_find(self, refdic):
+    def get_exon_seq(self, refdic):
         """
-        :param: refdic: the reference genome
+        get the exon sequence from the reference genome;
+        a special case of bind_chroseq
+        :param refdic:
+        :return: None; writes the mRNA sequence to self.seq_mrna
         """
-        # need self.seq_chro
-        if self.seq_chro is None:
-            return None
-        else:
-            pass
+        self.bind_chroseq(refdic, gap=0, intron=False)
+        self.seq_mrna= self.seq_chro
 
-    def find_orfs_with_trans(self, trans_table=1, min_protein_length=10):
+    def orf_find(self, refdic, trans_table=1, min_protein_length=0):
         """
-        compare the three orfs, use the longest one as new protein
-        min size may need to be defined as 30aa?,so cds need to >90
-        :param trans_table:
-        :param min_protein_length:
-        :return:
+        :param: refdic: the reference genome
         """
-        if self.seq_chro is None:
-            return None
-        else:
-            from Bio.Seq import Seq
-            seq=Seq(self.seq_chro)
-
-            seq0=seq.translate(table=1)
-            seq1=seq[1:].translate(table=1)
-            seq2=seq[2:].translate(table=1)
-
-            print((str(seq0)))
-            print("======================================")
-            print((str(seq1)))
-            print("======================================")
-            print((str(seq2)))
-
-            answer=[seq0, seq1, seq2]
+        # need self.seq_mrna
+        if self.seq_mrna is None:
+            self.get_exon_seq(refdic)
 
-            return answer
+        start, end, orf_seq=find_longest_orf(self.seq_mrna, trans_table, min_protein_length)
+        self.thickStart=self.mrna_pos_to_chro(start)
+        self.thickEnd=self.mrna_pos_to_chro(end)
 
+        # todo, need to add self.cdsStartStat and self.cdsEndStat
+        # need to be (none, unknown, incomplete, or complete)
+        # with ATG and stop codon should be complete
 
     def is_nmd(self):
         """
diff --git a/trackcluster/utils.py b/trackcluster/utils.py
index f68489e..56f29dc 100644
--- a/trackcluster/utils.py
+++ b/trackcluster/utils.py
@@ -15,7 +15,7 @@
 from collections import OrderedDict
 
 # third part import
-from Bio import SeqIO,Seq
+from Bio import SeqIO,Seq,Data
 
 import gzip
 
@@ -67,8 +67,53 @@ def chr_select(record_dict, chro, start,end):
     seq=str(record_dict[chro][start:end].seq)
     return name,seq
 
+def find_longest_orf(sequence, table_id=1, min_protein_length=0):
+    """
+    Find the longest ORF in a given sequence using BioPython.
+    Note: the "ORF" here is the longest stop-to-stop stretch among the six
+    frames; a leading ATG is not required.
+
+    Args:
+    - sequence (str): The DNA sequence to search within.
+    - table_id (int): The NCBI translation table ID to use.
+    - min_protein_length (int): Minimum protein length to consider as a valid ORF.
 
-def myexe(cmd, timeout=10):
+    Returns:
+    - Tuple of (start, end, orf_sequence) indicating the longest ORF found;
+      start and end are coordinates on the forward input sequence.
+    """
+    seq = Seq.Seq(sequence)
+    standard_table = Data.CodonTable.unambiguous_dna_by_id[table_id]
+
+    longest_orf = ""
+    longest_orf_start = None
+    longest_orf_end = None
+
+    for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
+        for frame in range(3):
+            trans = str(nuc[frame:].translate(table_id))
+            trans_len = len(trans)
+            aa_start = 0
+            aa_end = 0
+            while aa_start < trans_len:
+                aa_end = trans.find("*", aa_start)
+                if aa_end == -1:
+                    aa_end = trans_len
+                    stop_len = 0  # no stop codon found, the ORF runs to the end of the frame
+                else:
+                    stop_len = 3  # include the stop codon in the ORF
+                if aa_end - aa_start >= min_protein_length:
+                    if strand == 1:
+                        start = frame + aa_start * 3
+                        end = min(frame + aa_end * 3 + stop_len, len(sequence))
+                    else:
+                        start = max(len(sequence) - (frame + aa_end * 3 + stop_len), 0)
+                        end = len(sequence) - (frame + aa_start * 3)
+                    orf_seq = sequence[start:end]
+                    if len(orf_seq) > len(longest_orf):
+                        longest_orf = orf_seq
+                        longest_orf_start = start
+                        longest_orf_end = end
+                aa_start = aa_end + 1
+
+    return longest_orf_start, longest_orf_end, longest_orf
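+
+# a minimal usage sketch (hypothetical sequence): start and end index into the
+# forward input string, and for minus-strand hits orf_sequence is the
+# forward-strand slice covering the ORF, not its reverse complement:
+#   start, end, orf = find_longest_orf("AAATGAACTTGACGAGTAATGAG", table_id=1)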
+
+
+def myexe(cmd, timeout=100):
     """
     a simple wrap of the shell
     mainly used to run the bwa mem mapping and samtool orders
@@ -83,8 +128,12 @@ def alarmHandler(signum, frame):
     proc=subprocess.Popen(cmd, shell=True, preexec_fn=setupAlarm,
                           stdout=subprocess.PIPE, stderr=subprocess.PIPE,cwd=os.getcwd())
     #print("Running: ", cmd)
-    out, err=proc.communicate()
-    #print("stderr out: ",err, "\n","return code: ",proc.returncode)
+    try:
+        out, err=proc.communicate(timeout=timeout)
+        # print("stderr out: ",err, "\n","return code: ",proc.returncode)
+    except subprocess.TimeoutExpired:
+        proc.kill()
+        out, err = proc.communicate()
     return out
 
@@ -119,7 +168,7 @@ def is_package_installed(name):
 
 def set_tmp(wkdir=None):
     """
-    set the the tmp dir to the mem
+    set the tmp dir to the mem
     """
     if wkdir is None:
         aa=os.popen("echo $XDG_RUNTIME_DIR")

From b14e3dabea26eba8594b43cf2e7f172efa7e6948 Mon Sep 17 00:00:00 2001
From: Runsheng
Date: Mon, 17 Jun 2024 15:13:57 +0800
Subject: [PATCH 3/6] add CDS annotation for track

---
 test/track_test.py    | 10 ++++++----
 test/utils_test.py    |  4 ++++
 trackcluster/track.py |  6 +++++-
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/test/track_test.py b/test/track_test.py
index 5f00a06..5dd403d 100644
--- a/test/track_test.py
+++ b/test/track_test.py
@@ -148,14 +148,16 @@ def test_orfs(self):
         can only run with ce10 ref
         """
         # ref_dic is a large external file
-        ref_dict=fasta2dic("/data/reference/ce10_ucsc.fa")
+        ref_dict=fasta2dic("/data/reference/ce11.fa")
         bigg_one=self.bigg[30]
+        print(bigg_one)
         bigg_one.bind_chroseq(ref_dict, gap=0, intron=False)
         print((bigg_one.seq_chro))
-        ans=bigg_one.find_orfs_with_trans()
+        bigg_one.get_cds(refdic=ref_dict)
+        print((bigg_one.seq_mrna))
 
-        print(ans)
-        print(bigg_one)
+        print()
+        print(bigg_one.thickStart, bigg_one.thickEnd)
 
     def test_orf(self):
         pass
diff --git a/test/utils_test.py b/test/utils_test.py
index 316ed71..43ccc87 100644
--- a/test/utils_test.py
+++ b/test/utils_test.py
@@ -50,6 +50,10 @@ def test_detail_file(self):
         logger.debug("This is just a test, print number 100 as float" + "," + str(float(1) * 100))
         logger.debug("In total %d line, %s" % (2, "just a test number"))
 
+    def test_find_longest_orf(self):
+        seq="AAATGAACTTGACGAGTAATGAG"
+        print(find_longest_orf(seq))
+
     def tearDown(self):
         self.fafile=None
 
diff --git a/trackcluster/track.py b/trackcluster/track.py
index 42e05a4..d604adf 100644
--- a/trackcluster/track.py
+++ b/trackcluster/track.py
@@ -85,6 +85,9 @@ def __init__(self, prefix=""):
         self.seq=None
         self.subread=set() # use to store the reads contained inside the
         self.coverage=0
+        self.seq_mrna=None
+        self.seq_chro=None
+        self.seq_cds=None
 
         self.junction=[]
 
@@ -391,7 +394,7 @@ def get_exon_seq(self, refdic):
         self.bind_chroseq(refdic, gap=0, intron=False)
         self.seq_mrna= self.seq_chro
 
-    def orf_find(self, refdic, trans_table=1, min_protein_length=0):
+    def get_cds(self, refdic, trans_table=1, min_protein_length=0):
         """
         :param: refdic: the reference genome
         """
@@ -402,6 +405,7 @@
         self.thickStart=self.mrna_pos_to_chro(start)
         self.thickEnd=self.mrna_pos_to_chro(end)
 
+        self.seq_cds=orf_seq
 
         # todo, need to add self.cdsStartStat and self.cdsEndStat
         # need to be (none, unknown, incomplete, or complete)

From 43aac47c5c3408eec08271bb9aaa62cc2afefd05 Mon Sep 17 00:00:00 2001
From: Runsheng
Date: Mon, 17 Jun 2024 15:14:13 +0800
Subject: [PATCH 4/6] minor modifications

---
 Readme.md              |  5 +++-
 doc/index.md           | 27 ++++++++++++++++++++
 doc/source/design.md   |  0
 script/bigg2seq.py     | 58 ++++++++++++++++++++++++++++++++++++++++++
 test/clustercj_test.py |  2 +-
 test/clusterj_test.py  | 11 +++++---
 trackcluster/flow.py   |  4 +--
 7 files changed, 99 insertions(+), 8 deletions(-)
 create mode 100644 doc/index.md
 create mode 100644 doc/source/design.md
 create mode 100644 script/bigg2seq.py

diff --git a/Readme.md b/Readme.md
index 1177ae3..4cf064a 100644
--- a/Readme.md
+++ b/Readme.md
@@ -11,7 +11,10 @@ Trackcluster is an isoform calling and quantification pipeline for long RNA/cDNA
 - [Walkthrough](#walkthrough)
 
 ####
-Hint: the ongoing development can be found in the ["dev"](https://github.com/Runsheng/trackcluster/tree/dev) branch.
+The ongoing development can be found in the ["dev"](https://github.com/Runsheng/trackcluster/tree/dev) branch.
+#### TODO:
+1. fix the "fusion" classification. 2. speed up clusterj. 3. splicing leader/5' indicator finding using pyssw.
+4. add a function to get the CDS start (maybe ATG) and CDS end.
 
 ## Overview
 A pipeline for reference-based isoform identification and quantification using long reads. This pipeline was designed to use **only** long and nosisy reads to make a valid transcriptome. An indicator for the intact 5' could be very helpful to the pipeline, i.e, the splicing leader in the mRNA of nematodes.
diff --git a/doc/index.md b/doc/index.md
new file mode 100644
index 0000000..26bc3ce
--- /dev/null
+++ b/doc/index.md
@@ -0,0 +1,27 @@
+# TrackCluster
+![PyPI](https://img.shields.io/pypi/v/trackcluster?color=green)
+
+Trackcluster is an isoform calling and quantification pipeline for long RNA/cDNA reads.
+
+
+A walkthrough for using trackcluster for isoform calling on worm/mouse/human/virus datasets.
+
+## Preprocessing
+1. Long read QC
+
+Some basic characteristics of the reads should be known before further analysis: for example, the read length and the
+mapped read length; the estimated read quality, the mapping quality and the read mapping rate.
+
+We recommend the following tool for long-read QC:
+
+Giraffe: https://github.com/lrslab/Giraffe_View
+
+
+2. Mapping Nanopore long reads to the genome
+
+Mapping is the first step of isoform calling.
+We recommend minimap2 for mapping the long reads to the genome.
+ - Mapping the reads for eukaryotic genomes
+ ```bash
+ minimap2 -ax splice -uf -k14 --secondary=no --MD -t 8 ref.fa read.fq.gz | samtools view -bS - | samtools sort -o read.bam -
+ ```
diff --git a/doc/source/design.md b/doc/source/design.md
new file mode 100644
index 0000000..e69de29
diff --git a/script/bigg2seq.py b/script/bigg2seq.py
new file mode 100644
index 0000000..c2fc199
--- /dev/null
+++ b/script/bigg2seq.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+#-*- coding: utf-8 -*-
+# @Time : 19/2/2024 2:14pm
+# @Author : Runsheng
+# @File : bigg2seq.py
+
+"""
+This script writes sequences for the tracks in a bigg bed file.
+It can output the transcript (exon) sequence, the CDS sequence,
+or an Ensembl-style EXON(uppercase)+intron(lowercase) sequence.
+"""
+import argparse
+import os,sys,inspect
+
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+parentdir = os.path.dirname(os.path.dirname(currentdir))
+sys.path.insert(0,parentdir)
+
+from trackcluster.utils import fasta2dic
+from trackcluster.tracklist import read_bigg
+
+parser=argparse.ArgumentParser()
+parser.add_argument("-b", "--biggfile",
+                    help="the bigg bed file")
+parser.add_argument("-r", "--reference",
+                    help="the genome reference")
+parser.add_argument("-o", "--out", default="bigg.fasta",
+                    help="the output file name, default is bigg.fasta")
+parser.add_argument("-m", "--mode", default="exon",
+                    help="the format of the output, can be 'exon', 'cds' "
+                         "or 'ensembl' (Ensembl-style EXON(uppercase)+intron(lowercase) sequence), "
+                         "default is 'exon'")
+
+args = parser.parse_args(args=None if sys.argv[1:] else ['--help'])
+
+outfile=args.out
+refdic=fasta2dic(args.reference)
+
+# write one fasta record per track
+bigg_l=read_bigg(args.biggfile)
+with open(outfile, "w") as fw:
+    if args.mode=="exon":
+        for bigg in bigg_l:
+            bigg.get_exon()
+            bigg.bind_chroseq(refdic, gap=0, intron=False)
+            fw.write(">"+bigg.name+"\n"+bigg.seq_chro+"\n")
+    elif args.mode=="cds":
+        for bigg in bigg_l:
+            bigg.get_exon()
+            bigg.get_cds(refdic)
+            fw.write(">"+bigg.name+"\n"+bigg.seq_cds+"\n")
+    elif args.mode=="ensembl":
+        for bigg in bigg_l:
+            bigg.get_exon()
+            bigg.bind_chroseq(refdic, gap=0, intron=True)  # introns are kept in lower case
+            fw.write(">"+bigg.name+"\n"+bigg.seq_chro+"\n")
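+
+# a minimal usage sketch (hypothetical file names):
+#   python script/bigg2seq.py -b isoform.bed -r genome.fa -m cds -o cds.fa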
diff --git a/test/clustercj_test.py b/test/clustercj_test.py
index 2e4bc54..40b9652 100644
--- a/test/clustercj_test.py
+++ b/test/clustercj_test.py
@@ -80,7 +80,7 @@ def test_get_read_junction_dic(self):
         print((df["1f940a05-24a1-4455-a506-b1aa04caf81a"]-df["087b20ed-78ba-48f9-a038-f23a9c4b75c6"]))
         print((df["087b20ed-78ba-48f9-a038-f23a9c4b75c6"]-df["1f940a05-24a1-4455-a506-b1aa04caf81a"]))
 
-    def test_get_arrry_freq_is_junction_inside(self):
+    def __test_get_arrry_freq_is_junction_inside(self):
         self.site_dic=get_junction_dic(self.bigg)
         df=get_read_junction_dic(self.bigg, self.site_dic)
         #j1=df["087b20ed-78ba-48f9-a038-f23a9c4b75c6"]
diff --git a/test/clusterj_test.py b/test/clusterj_test.py
index 3a583c5..672e386 100644
--- a/test/clusterj_test.py
+++ b/test/clusterj_test.py
@@ -3,6 +3,7 @@
 # @Time : 8/9/2018 10:53 AM
 # @Author : Runsheng
 # @File : cluster_test.py
+import os
 
 from trackcluster.clusterj import *
 from trackcluster.tracklist import *
@@ -32,9 +33,10 @@ def setUp(self):
         self.bigg_nano=bigg_nano
         self.bigg_gff=bigg_gff
         self.bigg=bigg_nano+bigg_gff
+        self.gene=gene
 
-    def test_get_corrected_junction(self):
+    def __test_get_corrected_junction(self):
         import random
         from copy import deepcopy
         #bigg=self.bigg[0]
@@ -78,11 +80,12 @@ def test_is_bigg1_inside_bigg2_junction(self):
 
     def test_junction_simple_merge(self):
-        print(len(self.bigg))
+        print(self.gene, "track size is ", len(self.bigg))
         bigg_n=junction_simple_merge(self.bigg)
         print(len(bigg_n))
 
-        write_bigg(bigg_n, "./test/genes/AT2G43410/tt.bed")
+        print(os.getcwd())
+        write_bigg(bigg_n, "./genes/{gene}/tt.bed".format(gene=self.gene))
 
     def test_is_single_exon_in(self):
         # get single and note single
@@ -110,7 +113,7 @@ def test_flow_junction_cluster(self):
         #write_bigg(bigg_subread, "./test/genes/AT2G43410/AT2G43410_subread.bed")
 
     def test_bigg_get_namedic(self):
-        bigg_list=read_bigg("genes/AT2G43410/AT2G43410_subread.bed")
+        bigg_list=read_bigg("./genes/AT2G43410/AT2G43410_nano.bed")
         name_dic=bigg_get_namedic(bigg_list)
         print(name_dic)
 
diff --git a/trackcluster/flow.py b/trackcluster/flow.py
index de79442..88fafb6 100644
--- a/trackcluster/flow.py
+++ b/trackcluster/flow.py
@@ -577,7 +577,7 @@ def process_desc(k):
         fw.write("\n")
 
 
-def flow_files_output(bigg_read_file, bigg_isoform_file, bigg_ref_file, prefix=None):
+def _flow_files_output(bigg_read_file, bigg_isoform_file, bigg_ref_file, prefix=None):
     """
     output all files used for further plotting
    :param bigg_read_file:
@@ -601,7 +601,7 @@
 
 ############
-def flow_map_convert_clusterj_count(wkdir, prefix, ref_fasta, fastq_l, nano_bed, gff_bed, core=30,
+def _flow_map_convert_clusterj_count(wkdir, prefix, ref_fasta, fastq_l, nano_bed, gff_bed, core=30,
                                     f1=0.01, f2=0.01, count_cutoff=5, batchsize=2000):
     """
     The overall full run pipeline for impatient people

From 9d4fd4571bf5736d94012a05c0d2e207e118d6ba Mon Sep 17 00:00:00 2001
From: Runsheng
Date: Tue, 15 Oct 2024 10:52:31 +0800
Subject: [PATCH 5/6] fix early loop exit in desc annotation

---
 trackcluster/flow.py | 32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

diff --git a/trackcluster/flow.py b/trackcluster/flow.py
index 88fafb6..94118f3 100644
--- a/trackcluster/flow.py
+++ b/trackcluster/flow.py
@@ -499,21 +499,27 @@ def flow_desc_annotation(wkdir,isoform_bed, gff_bed, offset=10, prefix=None, cor
     def process_class4(k):
         nanos = gene2isoform[k]
         refs = gene2ref[k]
+        out_class4_s = []
         for bigg in nanos:
             try:
                 out_class4 = flow_class4(bigg, refs, offset=offset)
-                return out_class4
             except IndexError:
-                return None
+                continue
+            out_class4_s.append(out_class4)
+        return out_class4_s
+
 
     def process_desc(k):
         nanos = gene2isoform[k]
         refs = gene2ref[k]
+        out_descs = []
         for bigg in nanos:
             try:
                 out_desc = flow_desc(bigg, refs, offset=offset)
-                return out_desc
             except IndexError:
-                return None
+                continue
+            out_descs.append(out_desc)
+        return out_descs
+
     print("Running class4")
     class4=parmap(process_class4, tqdm(keys), nprocs=core)
     class4=[x for x in class4 if x is not None]
@@ -601,7 +607,3 @@ def _flow_files_output(bigg_read_file, bigg_isoform_file, bigg_ref_file, prefix=
 
 
 ############
-def _flow_map_convert_clusterj_count(wkdir, prefix, ref_fasta, fastq_l, nano_bed, gff_bed, core=30,
-                                    f1=0.01, f2=0.01, count_cutoff=5, batchsize=2000):
-    """
-    The overall full run pipeline for impatient people
-    :param wkdir:
-    :param prefix:
-    :param ref_fasta:
-    :param fastq_l:
-    :param nano_bed:
-    :param gff_bed:
-    :param core:
-    :param f1:
-    :param f2:
-    :param count_cutoff:
-    :param batchsize:
-    :return:
-    """
-    pass

From 45fcb93b5dd82ae767bd40585774d50948ef1423 Mon Sep 17 00:00:00 2001
From: Runsheng
Date: Tue, 15 Oct 2024 10:52:56 +0800
Subject: [PATCH 6/6] add doc and update functions

---
 doc/source/QC.md          |  33 +++++++++++
 doc/source/design.md      |  19 +++++++
 trackcluster/batch.py     |   2 +-
 trackcluster/clustercj.py |  75 ---------------------------
 trackcluster/clusterj.py  | 106 --------------------------------------
 5 files changed, 53 insertions(+), 182 deletions(-)
 create mode 100644 doc/source/QC.md

diff --git a/doc/source/QC.md b/doc/source/QC.md
new file mode 100644
index 0000000..ba1c29a
--- /dev/null
+++ b/doc/source/QC.md
@@ -0,0 +1,33 @@
+# Long-read QC for RNA reads
+The read length and the ratio of read length to the real transcript length (the full-length ratio) are essential for
+downstream analysis. For instance, we do not recommend isoform calling when the full-length ratio is below 10%;
+read counting, however, can still be done.
+
+### Basic statistics
+
+Some basic characteristics of the reads should be known before further analysis: for example, the read length and the
+mapped read length; the estimated read quality, the mapping quality and the read mapping rate.
+
+We recommend the following tool for long-read QC:
+
+Giraffe: https://github.com/lrslab/Giraffe_View
+
+### Full-length read ratio estimation
+
+1. Common case, with no 5' indicator.
+
+Most long reads are sequenced starting from the 3' end (the polyA site), so a 5' indicator such as the splicing leader,
+or an artificial sequence added after de-capping, can show whether a read is likely to be full length. For most
+sequencing reads we do not have such a resource. In that case, we count a read as full length when it covers >95% of
+its annotated transcript, and use this fraction to estimate the full-length ratio (see the sketch at the end of this page).
+
+2. Special case, with a 5' indicator.
+- Splicing leaders: some species use both cis- and trans-splicing, like C. elegans, where ~80% of transcripts are
+trans-spliced and carry the splicing-leader sequence at the 5' end of the mRNA reads. In this case, the splicing leader
+can be used to estimate the full-length ratio. The retained splicing-leader sequences are short, ~22 nt sequences
+hanging at the 5' end of the reads.
+- Artificial sequences: some long reads carry an artificial sequence added after de-capping, which can indicate a
+full-length read. For example, Cappable-seq reads are generated by removing the 5' G cap and adding a 5' artificial
+sequence. Older methods like 5' RACE also give the user a 5' sequence to indicate the start of a transcript.
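+
+### A sketch for the >95% rule
+
+The snippet below is an illustrative sketch only, not part of the trackcluster API; it assumes you already have, for
+each read, its aligned length and the length of the annotated transcript it maps to (hypothetical inputs).
+
+```python
+def full_length_ratio(read_tx_pairs, cutoff=0.95):
+    """read_tx_pairs: list of (aligned_read_length, transcript_length) tuples."""
+    full = sum(1 for read_len, tx_len in read_tx_pairs if read_len >= cutoff * tx_len)
+    return full / len(read_tx_pairs)
+
+# two of these three reads cover >95% of their transcript
+print(full_length_ratio([(980, 1000), (990, 1000), (500, 1000)]))  # ~0.67
+```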
diff --git a/doc/source/design.md b/doc/source/design.md
index e69de29..ae9822b 100644
--- a/doc/source/design.md
+++ b/doc/source/design.md
@@ -0,0 +1,19 @@
+# Trackcluster design
+## History of ONT read accuracy and trackcluster
+Direct-RNA sequencing from ONT used to have very low quality, especially for non-human samples. The raw read accuracy
+is roughly 85% for the RNA001 kit on an R9.4.1 flowcell. With a 15% error rate, the read junctions are also error prone,
+which makes isoform calling and quantification using junctions very difficult. To accommodate the low-quality reads, we
+designed trackcluster to compare the intersections of exon/intron regions to determine the distance between reads,
+and to correct the junctions after clustering all similar reads from one isoform.
+
+The regional intersection method (the original trackcluster) worked well for RNA001/002 data in model organisms like _C. elegans_ and _Arabidopsis thaliana_,
+as the read count for each isoform is limited: the total yield of one flowcell is around 1-2 Gb, the overall yield of one
+experiment is usually below 10 Gb, and the read counts for highly expressed genes are generally below 50,000.
+However, the method is not fast enough, because we have to calculate the intersection for every pair of overlapping reads.
+The time complexity is roughly O(n^2), which is not acceptable for genes with read counts above 50,000 (the computation may take more than 24 h).
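+For a sense of scale, clustering n = 50,000 overlapping reads pairwise takes n(n-1)/2, i.e. about 1.25 x 10^9,
+intersection calculations, while an O(n log n) junction-based pass (the trackclusterj method below) is on the order of
+50,000 x log2(50,000), i.e. roughly 8 x 10^5 operations.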
+
+With the new RNA002 kit and new basecall models, the read quality improved to 92% for most samples, and an 8% error rate
+(instead of >15%) allows junction self-correction before clustering. We therefore added junction self-correction,
+together with a clustering method that uses the junctions directly (the trackclusterj method). The time complexity of
+trackclusterj is roughly O(n log n) in most cases, which is acceptable for highly expressed genes.
\ No newline at end of file
diff --git a/trackcluster/batch.py b/trackcluster/batch.py
index 088a607..cc8cf5b 100644
--- a/trackcluster/batch.py
+++ b/trackcluster/batch.py
@@ -51,7 +51,7 @@ def process_one_junction_corrected_try(key, full=False, batchsize=500):
     if bigg_nano_raw is None:
         return 0
 
-    ### add the bactsize part
+    ### add the batch size part
     n_count = 100
     n = 0
     bigg_nano = junction_pre(bigg_nano_raw, bigg_gff)
diff --git a/trackcluster/clustercj.py b/trackcluster/clustercj.py
index c392d0d..9e181b6 100644
--- a/trackcluster/clustercj.py
+++ b/trackcluster/clustercj.py
@@ -147,81 +147,6 @@ def split_site(junction_array):
     return groups
 
 
-def _is_junction_inside(j1, j2):
-    """
-    test if j1 is in j2, change into comp all junctions except single exon
-    still very slow, unused
-    :param j1: junction numpy array, with all aligned part
-    :param j2:
-    :return: 0 for contain, 3 for differ, 1 for j1 in j2, -1 for j2 in j1, "e" for error
-    """
-    # do not consider the single exon track/read
-    # first to simplify the two unions from the large union
-
-    if numpy.array_equal(j1,j2):
-        return 0
-
-    # select the union of the junction list
-    pos_simple=[pos for pos, value in enumerate(j1+j2) if value>=1]
-    j1_s=j1[pos_simple]
-    j2_s=j2[pos_simple]
-
-    #print(j1_s)
-    #print(j2_s)
-
-    if numpy.sum(j1_s)==0 or numpy.sum(j2_s)==0:
-        return "is_junction_inside error: single exon"
-
-    j12_del=j1_s-j2_s
-    #freq_dic = get_array_freq(j12_del)
-
-    # filter the easy differ ones
-
-    #if freq_dic[-1] > 0 and freq_dic[1] > 0: # has more and less junction, must be 3
-    #    return 3
-
-    # consider 3 types using group_l, this will be slower
-    group_l=split_site(j12_del)
-    #print(group_l)
-
-    # did not consider the len=0 and len=1
-    # single exon and equal junction should have been excluded
-    if len(group_l)>=3:
-        return 3
-    elif len(group_l)==2: # should be all 0, 1 or all 0 ,-1
-        # need to consider 5' or 3' missing
-        # 5' is belong, 3' is differ
-        a,b=group_l
-        if a[0]+ b[0]>=1: #j1_s > j2_s
-            if b[0]==0: # 5' missing
-                return -1
-            else: # 3' missing
-                return 3
-
-        elif a[0]+ b[0]<=-1: #j1_s < j2_s
-            if b[0]==0: # 5' missing
-                return 1
-            else: # 3' missing
-                return 3
-
-
-def __compare_junction(j1, j2):
-    """
-    Two junctions in j1, j2, generated by get_read_junction_dic
-    :param j1: numpy array, contian 0 or 1 only, the position is well aligned for all tracks,
-    :param j2:
-    :return: use 0:"equal", 1:"in", 2:"contain", 3:"diff"
-    does not conserider single exon tracks
-    """
-    # do not consider the single exon track/read
-
-    # single junction first
-    pass
-
-
 def get_corrected_dic(site_cov_dic, cov_cutoff=2, pos_cutoff=10):
     """
 
diff --git a/trackcluster/clusterj.py b/trackcluster/clusterj.py
index 8db8943..2335020 100644
--- a/trackcluster/clusterj.py
+++ b/trackcluster/clusterj.py
@@ -19,30 +19,6 @@ from 
trackcluster.cluster import select_list -def __filter_site_dic(site_dic, ref=None): - """ - filter away the sites which is in other chro or strand - use the junction with highest freq as ref? - :param site_dic: - :param ref: in the format of tuple("chro", "_", "strand") - :return: new_site_dic - """ - site_dic_new={} - if ref is None: - cov_max=max(site_dic.values()) - for k, v in list(site_dic.items()): - if v==cov_max: - ref=k - chro_ref, _, strand_ref=ref - else: - chro_ref, _, strand_ref=ref - - for k, v in list(site_dic.items()): - chro, _, strand=k - if chro==chro_ref and strand==strand_ref: - site_dic_new[k]=v - return site_dic_new - def chose_read_from_list(name, bigg_list): for bigg in bigg_list: @@ -51,48 +27,6 @@ def chose_read_from_list(name, bigg_list): return None -def __get_corrected_junction(bigg, junction_dic, coverage_cutoff=2, offset=5): - """ - too slooooow, as need to iter the full junction set, have revised in clustercj as get_bigg_correct - Use the high confident junctions to correct the low frequent junctions - junction dic like : (chro, pos, strand): coverage - """ - strand=bigg.strand - chro=bigg.chrom - # sanity check - - bigg.get_junction() - - # get the pos that can be used - pos_pass=[] - for k, v in list(junction_dic.items()): - if v>=coverage_cutoff: - chro_d, pos_d, strand_d=k - if chro_d==chro and strand_d==strand: - pos_pass.append(pos_d) - pos_pass_set=set(pos_pass) - - ### - bigg_junction_new=[] - corrected=[] # used for debug - - for pos in bigg.junction: - if pos in pos_pass_set: - bigg_junction_new.append(pos) - corrected.append((pos, pos)) - else: - for pos_d in pos_pass: - if abs(pos_d-pos)<=offset: - bigg_junction_new.append(pos_d) - corrected.append((pos, pos_d)) - break - else: # can not be corrected, keep as it is - bigg_junction_new.append(pos) - # debug - # print("corrected", corrected) - return bigg_junction_new - - def __get_start_end_dic(bigg_list, type="start", ref_weight=1): """ Todo: may need to chop the long isoform to shorter one by refine the 5' and 3' @@ -126,46 +60,6 @@ def __get_start_end_dic(bigg_list, type="start", ref_weight=1): return site_dic -def __chain_site(sites_l, offset=5): - """ - use the sorted sites_l? - merge the sites within 5nt to a range contaning all the fields - """ - - # merge the identical one first - sites_l = list(set(sites_l)) - sites_sorted = sorted(sites_l, key=lambda x: (x[0], x[1], x[2])) - - site_range = [] - - # init the value out of the loop - line_p = sites_sorted[0] - chro_p, start_p, strand_p = line_p - - # init range one - range_one = [chro_p, start_p, start_p + 1, strand_p] - - for n, line in enumerate(sites_sorted): - if n == 0: - pass - else: - line_p = sites_sorted[n - 1] - chro_p, start_p, strand_p = line_p - - chro, start, strand = line - - if chro_p == chro and strand_p == strand and 0 < abs(start - start_p) <= offset: - range_one[2] = start + 1 - # print range_one - # break - - else: - site_range.append(range_one) - # re_init range_one - range_one = [chro, start, start + 1, strand] - - return site_range - def is_junction_5primer(junction): group_j=group_site(junction)