Merge branch 'SQANTI-reads-devel' of https://github.com/ConesaLab/SQANTI3 into SQANTI-reads-devel Merging
ConesaLab · Dec 4, 2024 · 5892222 · 5892222
2 parents 116da80 + 8a317c6
commit 5892222
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -42,6 +42,8 @@ For detailed documentation, please visit [the SQANTI3 wiki](https://github.com/C
 
 * [Running SQANTI3 rescue](https://github.com/ConesaLab/SQANTI3/wiki/Running-SQANTI3-rescue)
 
+* [Running SQANTI-reads](https://github.com/ConesaLab/SQANTI3/wiki/Running-SQANTI%E2%80%90reads-(under-development))
+
 * [Tutorial: running SQANTI3 on an example dataset](https://github.com/ConesaLab/SQANTI3/wiki/Tutorial:-running-SQANTI3-on-an-example-dataset)
 
 Please, note that we are currently updating and expanding the wiki to provide as much information as possible and 

diff --git a/sqanti3_qc.py b/sqanti3_qc.py
@@ -884,12 +884,33 @@ def calc_exon_overlap(query_exons, ref_exons):
         return sum(q_bases.values())
 
     def get_diff_tss_tts(trec, ref):
+        # Calculating differences between transcript start sites (TSS) and
+        # transcript termination sites (TTS) of two transcripts
         if trec.strand == '+':
-            diff_tss = trec.txStart - ref.txStart
-            diff_tts = trec.txEnd - ref.txEnd
+            # In positive (+) strand transcripts:
+            # TSS is calculated as reference start - transcript start
+            # TTS is calculated as transcript end - reference end
+            # This way,  TSS < 0 means the transcript is shortened, and 
+            # TSS > 0 means that transcript is elongated. Similarly, a
+            # TTS < 0 means that transcript is shortened, and a TTS > 0
+            # means that the transcript is elongated
+            diff_tss = ref.txStart - trec.txStart
+            diff_tts = trec.txEnd  - ref.txEnd
         else:
+            # In negative (-) strand transcripts:
+            # The transcripts in negative strands are loaded with trans.start = end
+            # and trans.end = start, to assure that trans.end > trans.start 
+            # regardless of the transcript. TTS and TSS are calculated with
+            # the same formula, but taking the loading fact into account,
+            # The formulas are inverted
+            # TTS is ref start - transcript start
+            # TSS is transcript end - reference end
+            # Being consistent in that  TSS < 0 means the transcript is shortened, and 
+            # TSS > 0 means that transcript is elongated. Similarly, a
+            # TTS < 0 means that transcript is shortened, and a TTS > 0
+            # means that the transcript is elongated
             diff_tts = ref.txStart - trec.txStart
-            diff_tss = ref.txEnd - trec.txEnd
+            diff_tss = trec.txEnd  - ref.txEnd
         return diff_tss, diff_tts
 
 
@@ -899,7 +920,7 @@ def get_gene_diff_tss_tts(isoform_hit):
         nearest_start_diff, nearest_end_diff = float('inf'), float('inf')
         for ref_gene in isoform_hit.genes:
             for x in start_ends_by_gene[ref_gene]['begin']:
-                d = trec.txStart - x
+                d =  x - trec.txStart
                 if abs(d) < abs(nearest_start_diff):
                     nearest_start_diff = d
             for x in start_ends_by_gene[ref_gene]['end']:
@@ -911,8 +932,8 @@ def get_gene_diff_tss_tts(isoform_hit):
             isoform_hit.tss_gene_diff = nearest_start_diff if nearest_start_diff!=float('inf') else 'NA'
             isoform_hit.tts_gene_diff = nearest_end_diff if nearest_end_diff!=float('inf') else 'NA'
         else:
-            isoform_hit.tss_gene_diff = -nearest_end_diff if nearest_start_diff!=float('inf') else 'NA'
-            isoform_hit.tts_gene_diff = -nearest_start_diff if nearest_end_diff!=float('inf') else 'NA'
+            isoform_hit.tss_gene_diff = nearest_end_diff if nearest_start_diff!=float('inf') else 'NA'
+            isoform_hit.tts_gene_diff = nearest_start_diff if nearest_end_diff!=float('inf') else 'NA'
 
     def categorize_incomplete_matches(trec, ref):
         """

diff --git a/sqanti3_wrapper.conf b/sqanti3_wrapper.conf
@@ -139,4 +139,4 @@ rescue_rules_json_file="${json_for_rules}"
 
 rescue_ml_reference_genome=${reference_fasta}
 rescue_ml_reference_gtf=${reference_gtf}
-rescue_ml_threshold=${threshold}
+rescue_ml_threshold=${threshold}
diff --git a/sqanti3_wrapper.sh b/sqanti3_wrapper.sh
@@ -1,4 +1,4 @@
-#! /bin/bash -xe
+filter_corrected_gtf#! /bin/bash -xe
 
 # Author: Fabián Robledo
 # Email: fabian.robledo@csic.es
@@ -155,7 +155,7 @@ function main () {
             if [ -z ${filter_mode} ] || [ ${filter_mode} == "ml" ] || [ $filter_mode == "both" ];
             then 
                 ${sqanti3_filter} ml ${filter_ml_ouput_folder} \
-                ${filter_ml_prefix} ${filter_isoforms} ${filter_isoannotgff3} \
+                ${filter_ml_prefix} ${filter_corrected_gtf} ${filter_isoforms} ${filter_isoannotgff3} \
                 ${filter_sam} ${filter_faa} ${monoexonic} ${filter_monoexonic} ${filter_ml_percent_training} ${filter_ml_TP} \
                 ${filter_ml_TN} ${filter_ml_threshold} \
                 ${filter_ml_max_class_size} ${filter_ml_intermediate_files} ${filter_ml_intrapriming} \
@@ -207,4 +207,4 @@ function main () {
 source "$1"
 
 # Executing the main function
-main
+main