FIX STAR output, indices, annotation
eboileau committed May 19, 2023
1 parent 0cd8aab commit 63ae787
Showing 5 changed files with 63 additions and 8 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/).

## [Unreleased] - started 2023-02

+## [3.0.2] 2023-05-19
+
+### Added
+
+- `riboseq_sample_name_map`, `riboseq_condition_name_map` to `rpbp_predictions_dashboard.py`
+- Exception for duplicated transcript ids with _de novo_ annotation.
+
+### Fixed
+
+- STAR output
+- Redundant transcripts with _de novo_ annotation in `summarize_rpbp_predictions.py`
+- ORF numbers in labels
+
## [3.0.1] 2023-02-10

### Changed
2 changes: 1 addition & 1 deletion src/rpbp/__init__.py
@@ -1,2 +1,2 @@
-__version_info__ = ("3", "0", "1")
+__version_info__ = ("3", "0", "2")
__version__ = ".".join(__version_info__)
rpbp_predictions_dashboard.py
@@ -17,6 +17,8 @@
import pandas as pd
import plotly.express as px

+import rpbp.ribo_utils.utils as ribo_utils
+
from rpbp.defaults import orf_type_colors, orf_type_labels, orf_type_name_map

# ------------------------------------------------------ Functions ------------------------------------------------------
@@ -154,6 +156,14 @@ def filter_sort_table(filter_query, sort_by):
**Novel**: Translation event inter- or intragenic (only when Rp-Bp is run with a *de novo* assembly)
"""

+# ribo_utils._return_key_dict
+sample_name_map = ribo_utils.get_sample_name_map(
+    config
+)  # default to riboseq_samples.keys()
+condition_name_map = ribo_utils.get_condition_name_map(
+    config
+)  # default to riboseq_biological_replicates.keys()
+
col_rev = {v: k for k, v in orf_type_colors.items()}
row_col = {}
for orf_type, labels in orf_type_labels.items():
@@ -164,6 +174,9 @@ def filter_sort_table(filter_query, sort_by):
# *** load/wrangle data
orfs = pd.read_csv(config["predicted_orfs"], sep="\t", low_memory=False) # bed_utils
orfs.columns = orfs.columns.str.replace("#", "")
orfs["condition"] = orfs["condition"].apply(lambda x: sample_name_map[x])
# apply condition name map, in case we also have conditions
orfs["condition"] = orfs["condition"].apply(lambda x: condition_name_map[x])
orfs["orf_len"] = orfs["orf_len"] / 3
orfs["profile_sum"] = orfs[["x_1_sum", "x_2_sum", "x_3_sum"]].sum(axis=1)
orfs["profile_sum"] = orfs["profile_sum"].astype(int)
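The two name maps let users supply display names via `riboseq_sample_name_map` and `riboseq_condition_name_map` in the config, falling back to the raw keys otherwise. A minimal sketch of that fallback behaviour, assuming `ribo_utils._return_key_dict` acts like a dict that returns the key itself when no mapping is defined (`ReturnKeyDict` is a hypothetical stand-in):

```python
# Sketch (assumption): a mapping that falls back to the key itself,
# mirroring the "default to riboseq_samples.keys()" comments above.
class ReturnKeyDict(dict):
    def __missing__(self, key):
        # no user-supplied display name: return the raw sample/condition name
        return key

sample_name_map = ReturnKeyDict({"wt_rep1": "WT replicate 1"})
print(sample_name_map["wt_rep1"])  # WT replicate 1
print(sample_name_map["kd_rep1"])  # kd_rep1 (no mapping defined)
```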
summarize_rpbp_predictions.py
@@ -687,6 +687,10 @@ def main():
bed_df_dn = bed_utils.read_bed(transcript_bed_dn, low_memory=False)[cols]
bed_df_dn.rename(columns={"id": "transcript_id"}, inplace=True)
bed_df = pd.concat([bed_df, bed_df_dn])
+# bed_df may now contain duplicate transcript ids;
+# for display/annotation purposes we favour the annotated entry, since
+# the novel transcript is generally a variant (matched introns, extension, etc.)
+bed_df.drop_duplicates(subset=["transcript_id"], inplace=True)

labeled_orfs = filenames.get_labels(
config["genome_base_path"], config["genome_name"], note=orf_note
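Since the annotated transcripts are concatenated ahead of the de novo ones, `drop_duplicates` with its default `keep="first"` retains the annotated record for any shared transcript id. A toy illustration of the pattern (hypothetical data, not part of the commit):

```python
import pandas as pd

annotated = pd.DataFrame({"transcript_id": ["t1", "t2"], "source": "annotated"})
de_novo = pd.DataFrame({"transcript_id": ["t2", "t3"], "source": "de_novo"})

bed_df = pd.concat([annotated, de_novo])
# keep="first" (the default) favours the annotated entry for t2
bed_df.drop_duplicates(subset=["transcript_id"], inplace=True)
print(bed_df["source"].tolist())  # ['annotated', 'annotated', 'de_novo']
```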
39 changes: 32 additions & 7 deletions src/rpbp/reference_preprocessing/prepare_rpbp_genome.py
@@ -16,6 +16,8 @@

from pathlib import Path

+import pandas as pd
+
import pbiotools.misc.logging_utils as logging_utils
import pbiotools.misc.shell_utils as shell_utils
import pbiotools.misc.slurm as slurm
@@ -38,6 +40,11 @@
logger = logging.getLogger(__name__)


+class DuplicateIdsError(Exception):
+    def __init__(self, message):
+        self.message = message
+        super().__init__(message)


def get_orfs(gtf, args, config, is_annotated=False, is_de_novo=False):
"""Process a GTF file into its ORFs."""

@@ -333,11 +340,20 @@ def main():
logger.info(msg)

if call:
-    concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
-    concatenated_bed["orf_num"] = range(len(concatenated_bed))
+    concatenated_orfs = bed_utils.concatenate(orfs_files, sort_bed=True)
+    # duplicate ids can occur here and are not handled downstream, so fail early
+    if not concatenated_orfs.id.is_unique:
+        msg = (
+            "Duplicate ORF ids were found when merging annotated and de novo ORFs. "
+            "This is due to matching transcript ids and start/stop boundaries. "
+            "Check your de novo annotation, and remove (or rename) these transcripts."
+        )
+        logger.error(msg)
+        raise DuplicateIdsError(msg)
+    concatenated_orfs["orf_num"] = range(len(concatenated_orfs))
    additional_columns = ["orf_num", "orf_len"]
    fields = bed_utils.bed12_field_names + additional_columns
-    bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
+    bed_utils.write_bed(concatenated_orfs[fields], orfs_genomic)
else:
    msg = "Skipping concatenation due to --call value"
    logger.info(msg)
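If the new `DuplicateIdsError` fires, the offending ids can be inspected with standard pandas calls before fixing the de novo annotation, for example (a hypothetical debugging aid, not part of the commit):

```python
# list each ORF id that appears more than once in the concatenated table
duplicated_ids = concatenated_orfs.loc[
    concatenated_orfs["id"].duplicated(keep=False), "id"
].unique()
print("\n".join(duplicated_ids))
```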
@@ -359,9 +375,9 @@ def main():
logger.info(msg)

if call:
-    concatenated_bed = bed_utils.concatenate(exons_files, sort_bed=True)
+    concatenated_exons = bed_utils.concatenate(exons_files, sort_bed=True)
    fields = bed_utils.bed6_field_names + ["exon_index", "transcript_start"]
-    bed_utils.write_bed(concatenated_bed[fields], exons_file)
+    bed_utils.write_bed(concatenated_exons[fields], exons_file)
else:
msg = "Skipping concatenation due to --call value"
logger.info(msg)
@@ -384,8 +400,17 @@ def main():

if call:
    # not a BED file
-    concatenated_bed = bed_utils.concatenate(label_files, sort_bed=False)
-    bed_utils.write_bed(concatenated_bed, labeled_orfs)
+    concatenated_labels = bed_utils.concatenate(label_files, sort_bed=False)[
+        ["id", "orf_type", "transcripts"]
+    ]
+    # make sure the orf numbering is the same
+    concatenated_labels = pd.merge(
+        concatenated_labels,
+        concatenated_orfs[["id", "orf_num"]],
+        how="left",
+        on="id",
+    )
+    bed_utils.write_bed(concatenated_labels, labeled_orfs)
else:
msg = "Skipping concatenation due to --call value"
logger.info(msg)
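The left merge on `id` re-attaches the `orf_num` assigned during the genomic-ORF concatenation, so the label file and the BED file share one numbering; this is the fix for "ORF numbers in labels" in the changelog. A toy example of the same pattern (hypothetical values):

```python
import pandas as pd

labels = pd.DataFrame({"id": ["orf_b", "orf_a"], "orf_type": ["uORF", "canonical"]})
orfs = pd.DataFrame({"id": ["orf_a", "orf_b"], "orf_num": [0, 1]})

# every label row keeps the orf_num assigned to its id in the ORF table
merged = pd.merge(labels, orfs[["id", "orf_num"]], how="left", on="id")
print(merged)  # orf_b -> 1, orf_a -> 0
```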
