FIX STAR output, indices, annotation
eboileau committed May 19, 2023
1 parent 0cd8aab commit 63ae787
Showing 5 changed files with 63 additions and 8 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/).

## [Unreleased] - started 2023-02

+## [3.0.2] 2023-05-19
+
+### Added
+
+- `riboseq_sample_name_map`, `riboseq_condition_name_map` to `rpbp_predictions_dashboard.py`
+- Exception for duplicated transcript ids with _de novo_ annotation.
+
+### Fixed
+
+- STAR output
+- Redundant transcripts with _de novo_ annotation in `summarize_rpbp_predictions.py`
+- ORF numbers in labels
+
## [3.0.1] 2023-02-10

### Changed
2 changes: 1 addition & 1 deletion src/rpbp/__init__.py
@@ -1,2 +1,2 @@
-__version_info__ = ("3", "0", "1")
+__version_info__ = ("3", "0", "2")
__version__ = ".".join(__version_info__)
rpbp_predictions_dashboard.py
@@ -17,6 +17,8 @@
import pandas as pd
import plotly.express as px

+import rpbp.ribo_utils.utils as ribo_utils
+
from rpbp.defaults import orf_type_colors, orf_type_labels, orf_type_name_map

# ------------------------------------------------------ Functions ------------------------------------------------------
@@ -154,6 +156,14 @@ def filter_sort_table(filter_query, sort_by):
**Novel**: Translation event inter- or intragenic (only when Rp-Bp is run with a *de novo* assembly)
"""

+# ribo_utils._return_key_dict
+sample_name_map = ribo_utils.get_sample_name_map(
+    config
+)  # default to riboseq_samples.keys()
+condition_name_map = ribo_utils.get_condition_name_map(
+    config
+)  # default to riboseq_biological_replicates.keys()
+
col_rev = {v: k for k, v in orf_type_colors.items()}
row_col = {}
for orf_type, labels in orf_type_labels.items():
@@ -164,6 +174,9 @@ def filter_sort_table(filter_query, sort_by):
# *** load/wrangle data
orfs = pd.read_csv(config["predicted_orfs"], sep="\t", low_memory=False) # bed_utils
orfs.columns = orfs.columns.str.replace("#", "")
orfs["condition"] = orfs["condition"].apply(lambda x: sample_name_map[x])
# apply condition name map, in case we also have conditions
orfs["condition"] = orfs["condition"].apply(lambda x: condition_name_map[x])
orfs["orf_len"] = orfs["orf_len"] / 3
orfs["profile_sum"] = orfs[["x_1_sum", "x_2_sum", "x_3_sum"]].sum(axis=1)
orfs["profile_sum"] = orfs["profile_sum"].astype(int)
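The two name maps let users supply display names via `riboseq_sample_name_map` and `riboseq_condition_name_map` in the config, falling back to the raw keys otherwise. A minimal sketch of that fallback behaviour, assuming `ribo_utils._return_key_dict` acts like a dict that returns the key itself when no mapping is defined (`ReturnKeyDict` is a hypothetical stand-in):

```python
# Sketch (assumption): a mapping that falls back to the key itself,
# mirroring the "default to riboseq_samples.keys()" comments above.
class ReturnKeyDict(dict):
    def __missing__(self, key):
        # no user-supplied display name: return the raw sample/condition name
        return key

sample_name_map = ReturnKeyDict({"wt_rep1": "WT replicate 1"})
print(sample_name_map["wt_rep1"])  # WT replicate 1
print(sample_name_map["kd_rep1"])  # kd_rep1 (no mapping defined)
```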
summarize_rpbp_predictions.py
@@ -687,6 +687,10 @@ def main():
bed_df_dn = bed_utils.read_bed(transcript_bed_dn, low_memory=False)[cols]
bed_df_dn.rename(columns={"id": "transcript_id"}, inplace=True)
bed_df = pd.concat([bed_df, bed_df_dn])
+# bed_df may now contain duplicate transcript ids;
+# for display/annotation purposes we favour the annotated entry, since
+# the novel transcript is generally a variant (matched introns, extension, etc.)
+bed_df.drop_duplicates(subset=["transcript_id"], inplace=True)

labeled_orfs = filenames.get_labels(
config["genome_base_path"], config["genome_name"], note=orf_note
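Since the annotated transcripts are concatenated ahead of the de novo ones, `drop_duplicates` with its default `keep="first"` retains the annotated record for any shared transcript id. A toy illustration of the pattern (hypothetical data, not part of the commit):

```python
import pandas as pd

annotated = pd.DataFrame({"transcript_id": ["t1", "t2"], "source": "annotated"})
de_novo = pd.DataFrame({"transcript_id": ["t2", "t3"], "source": "de_novo"})

bed_df = pd.concat([annotated, de_novo])
# keep="first" (the default) favours the annotated entry for t2
bed_df.drop_duplicates(subset=["transcript_id"], inplace=True)
print(bed_df["source"].tolist())  # ['annotated', 'annotated', 'de_novo']
```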
39 changes: 32 additions & 7 deletions src/rpbp/reference_preprocessing/prepare_rpbp_genome.py
@@ -16,6 +16,8 @@

from pathlib import Path

+import pandas as pd
+
import pbiotools.misc.logging_utils as logging_utils
import pbiotools.misc.shell_utils as shell_utils
import pbiotools.misc.slurm as slurm
@@ -38,6 +40,11 @@
logger = logging.getLogger(__name__)


+class DuplicateIdsError(Exception):
+    def __init__(self, message):
+        self.message = message
+        super().__init__(message)


def get_orfs(gtf, args, config, is_annotated=False, is_de_novo=False):
"""Process a GTF file into its ORFs."""

@@ -333,11 +340,20 @@ def main():
logger.info(msg)

if call:
-    concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
-    concatenated_bed["orf_num"] = range(len(concatenated_bed))
+    concatenated_orfs = bed_utils.concatenate(orfs_files, sort_bed=True)
+    # duplicate ids can occur here and are not handled downstream, so fail early
+    if not concatenated_orfs.id.is_unique:
+        msg = (
+            "Duplicate ORF ids were found when merging annotated and de novo ORFs. "
+            "This is due to matching transcript ids and start/stop boundaries. "
+            "Check your de novo annotation, and remove (or rename) these transcripts."
+        )
+        logger.error(msg)
+        raise DuplicateIdsError(msg)
+    concatenated_orfs["orf_num"] = range(len(concatenated_orfs))
    additional_columns = ["orf_num", "orf_len"]
    fields = bed_utils.bed12_field_names + additional_columns
-    bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
+    bed_utils.write_bed(concatenated_orfs[fields], orfs_genomic)
else:
    msg = "Skipping concatenation due to --call value"
    logger.info(msg)
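If the new `DuplicateIdsError` fires, the offending ids can be inspected with standard pandas calls before fixing the de novo annotation, for example (a hypothetical debugging aid, not part of the commit):

```python
# list each ORF id that appears more than once in the concatenated table
duplicated_ids = concatenated_orfs.loc[
    concatenated_orfs["id"].duplicated(keep=False), "id"
].unique()
print("\n".join(duplicated_ids))
```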
@@ -359,9 +375,9 @@ def main():
logger.info(msg)

if call:
-    concatenated_bed = bed_utils.concatenate(exons_files, sort_bed=True)
+    concatenated_exons = bed_utils.concatenate(exons_files, sort_bed=True)
    fields = bed_utils.bed6_field_names + ["exon_index", "transcript_start"]
-    bed_utils.write_bed(concatenated_bed[fields], exons_file)
+    bed_utils.write_bed(concatenated_exons[fields], exons_file)
else:
msg = "Skipping concatenation due to --call value"
logger.info(msg)
@@ -384,8 +400,17 @@ def main():

if call:
    # not a BED file
-    concatenated_bed = bed_utils.concatenate(label_files, sort_bed=False)
-    bed_utils.write_bed(concatenated_bed, labeled_orfs)
+    concatenated_labels = bed_utils.concatenate(label_files, sort_bed=False)[
+        ["id", "orf_type", "transcripts"]
+    ]
+    # make sure the orf numbering is the same
+    concatenated_labels = pd.merge(
+        concatenated_labels,
+        concatenated_orfs[["id", "orf_num"]],
+        how="left",
+        on="id",
+    )
+    bed_utils.write_bed(concatenated_labels, labeled_orfs)
else:
msg = "Skipping concatenation due to --call value"
logger.info(msg)
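The left merge on `id` re-attaches the `orf_num` assigned during the genomic-ORF concatenation, so the label file and the BED file share one numbering; this is the fix for "ORF numbers in labels" in the changelog. A toy example of the same pattern (hypothetical values):

```python
import pandas as pd

labels = pd.DataFrame({"id": ["orf_b", "orf_a"], "orf_type": ["uORF", "canonical"]})
orfs = pd.DataFrame({"id": ["orf_a", "orf_b"], "orf_num": [0, 1]})

# every label row keeps the orf_num assigned to its id in the ORF table
merged = pd.merge(labels, orfs[["id", "orf_num"]], how="left", on="id")
print(merged)  # orf_b -> 1, orf_a -> 0
```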
