Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: use estimated counts instead of TPM #170

Merged
merged 5 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified data/transcripts.fasta.gz
Binary file not shown.
32 changes: 16 additions & 16 deletions htsinfer/get_library_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def get_source(
)

# process expression levels
tpm_df = self.get_source_expression(
counts_df = self.get_source_expression(
kallisto_dir=kallisto_dir,
)

Expand All @@ -173,7 +173,7 @@ def get_source(
Path(self.out_dir) / f"library_source_{fastq.name}.json"
)
LOGGER.debug(f"Writing results to file: {filename}")
tpm_df.to_json(
counts_df.to_json(
filename,
orient='split',
index=False,
Expand All @@ -182,13 +182,13 @@ def get_source(

# validate results
if validate_top_score(
vector=tpm_df['tpm'].to_list(),
vector=counts_df['est_counts'].to_list(),
min_value=self.min_match_pct,
min_ratio=self.min_freq_ratio,
rev_sorted=True,
accept_zero=True,
):
source.short_name, taxon_id = tpm_df.iloc[0]['source_ids']
source.short_name, taxon_id = counts_df.iloc[0]['source_ids']
source.taxon_id = int(taxon_id)

LOGGER.debug(f"Source: {source}")
Expand Down Expand Up @@ -247,17 +247,17 @@ def run_kallisto_quantification(
def get_source_expression(
kallisto_dir: Path,
) -> DataFrame:
"""Return percentages of total expression per read source.
"""Return percentages of total estimated counts per read source.

Args:
kallisto_dir: Directory containing Kallisto quantification results.

Returns:
Data frame with columns `source_ids` (a tuple of source short name
and taxon identifier, e.g., `("hsapiens", 9606)`) and `tpm`,
signifying the percentages of total expression per read source.
The data frame is sorted by total expression in descending
order.
and taxon identifier, e.g., `("hsapiens", 9606)`) and
`est_counts`, signifying the percentages of total estimated
counts per read source. The data frame is sorted by total
estimated counts in descending order.

Raises:
FileProblem: Kallisto quantification results could not be
Expand All @@ -283,9 +283,9 @@ def get_source_expression(
)

# handle case where no alignments are found
dat.tpm.fillna(0, inplace=True)
dat.est_counts.fillna(0, inplace=True)

# aggregate expression by source identifiers
# aggregate counts by source identifiers
dat[[
'gene_symbol',
'gene_id',
Expand All @@ -294,17 +294,17 @@ def get_source_expression(
'taxon_id'
]] = dat.target_id.str.split('|', n=4, expand=True)
dat['source_ids'] = list(zip(dat.short_name, dat.taxon_id))
total_tpm = dat.tpm.sum()
dat_agg = dat.groupby(['source_ids'])[['tpm']].agg('sum')
total_counts = dat.est_counts.sum()
dat_agg = dat.groupby(['source_ids'])[['est_counts']].agg('sum')
dat_agg['source_ids'] = dat_agg.index
dat_agg.reset_index(drop=True, inplace=True)

# calculate percentages
if total_tpm != 0:
dat_agg.tpm = dat_agg.tpm / total_tpm * 100
if total_counts != 0:
dat_agg.est_counts = dat_agg.est_counts / total_counts * 100

# return as dictionary
return dat_agg.sort_values(["tpm"], ascending=False)
return dat_agg.sort_values(["est_counts"], ascending=False)

@staticmethod
def get_source_name(
Expand Down
Loading