Skip to content

Commit

Permalink
feat: use estimated counts instead of tpm for total expression
Browse files Browse the repository at this point in the history
  • Loading branch information
balajtimate committed Oct 22, 2024
1 parent a081c35 commit 1ec956d
Showing 1 changed file with 14 additions and 14 deletions.
28 changes: 14 additions & 14 deletions htsinfer/get_library_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def get_source(
)

# process expression levels
tpm_df = self.get_source_expression(
counts_df = self.get_source_expression(
kallisto_dir=kallisto_dir,
)

Expand All @@ -173,7 +173,7 @@ def get_source(
Path(self.out_dir) / f"library_source_{fastq.name}.json"
)
LOGGER.debug(f"Writing results to file: {filename}")
tpm_df.to_json(
counts_df.to_json(
filename,
orient='split',
index=False,
Expand All @@ -182,13 +182,13 @@ def get_source(

# validate results
if validate_top_score(
vector=tpm_df['tpm'].to_list(),
vector=counts_df['est_counts'].to_list(),
min_value=self.min_match_pct,
min_ratio=self.min_freq_ratio,
rev_sorted=True,
accept_zero=True,
):
source.short_name, taxon_id = tpm_df.iloc[0]['source_ids']
source.short_name, taxon_id = counts_df.iloc[0]['source_ids']
source.taxon_id = int(taxon_id)

LOGGER.debug(f"Source: {source}")
Expand Down Expand Up @@ -254,10 +254,10 @@ def get_source_expression(
Returns:
Data frame with columns `source_ids` (a tuple of source short name
and taxon identifier, e.g., `("hsapiens", 9606)`) and `tpm`,
signifying the percentages of total expression per read source.
The data frame is sorted by total expression in descending
order.
and taxon identifier, e.g., `("hsapiens", 9606)`) and
`est_counts`, signifying the percentages of total expression
per read source. The data frame is sorted by total expression
in descending order.
Raises:
FileProblem: Kallisto quantification results could not be
Expand All @@ -283,7 +283,7 @@ def get_source_expression(
)

# handle case where no alignments are found
dat.tpm.fillna(0, inplace=True)
dat.est_counts.fillna(0, inplace=True)

# aggregate expression by source identifiers
dat[[
Expand All @@ -294,17 +294,17 @@ def get_source_expression(
'taxon_id'
]] = dat.target_id.str.split('|', n=4, expand=True)
dat['source_ids'] = list(zip(dat.short_name, dat.taxon_id))
total_tpm = dat.tpm.sum()
dat_agg = dat.groupby(['source_ids'])[['tpm']].agg('sum')
total_counts = dat.est_counts.sum()
dat_agg = dat.groupby(['source_ids'])[['est_counts']].agg('sum')
dat_agg['source_ids'] = dat_agg.index
dat_agg.reset_index(drop=True, inplace=True)

# calculate percentages
if total_tpm != 0:
dat_agg.tpm = dat_agg.tpm / total_tpm * 100
if total_counts != 0:
dat_agg.est_counts = dat_agg.est_counts / total_counts * 100

# return data frame sorted by expression share in descending order
return dat_agg.sort_values(["tpm"], ascending=False)
return dat_agg.sort_values(["est_counts"], ascending=False)

@staticmethod
def get_source_name(
Expand Down

0 comments on commit 1ec956d

Please sign in to comment.