From d856cb6eb4b4d6f1601797910a4d232e63ac4af6 Mon Sep 17 00:00:00 2001 From: Nick Semenkovich Date: Fri, 19 Jan 2024 13:58:02 -0600 Subject: [PATCH] extra validity tests + debug --- src/bam2tensor/__main__.py | 8 +++----- src/bam2tensor/embedding.py | 22 +++++++++++++++------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/bam2tensor/__main__.py b/src/bam2tensor/__main__.py index 2d01c6b..07994fb 100644 --- a/src/bam2tensor/__main__.py +++ b/src/bam2tensor/__main__.py @@ -132,13 +132,11 @@ def main( """Bam2Tensor.""" time_start = time.time() # Print run information + print(f"Genome name: {genome_name}") print(f"Reference fasta: {reference_fasta}") + print(f"Expected chromosomes: {expected_chromosomes}") print(f"Input path: {input_path}") - - print(f"\nLoading (or generating) methylation embedding named: {reference_fasta}") - - # Convert expected_chromosomes to a list - print(f"\tExpected chromosomes: {expected_chromosomes}") + print(f"\nLoading (or generating) methylation embedding for: {genome_name}") # Create (or load) a GenomeMethylationEmbedding object genome_methylation_embedding = GenomeMethylationEmbedding( diff --git a/src/bam2tensor/embedding.py b/src/bam2tensor/embedding.py index 7754876..62e801f 100644 --- a/src/bam2tensor/embedding.py +++ b/src/bam2tensor/embedding.py @@ -84,6 +84,12 @@ def __init__( if not skip_cache: try: cache_available = self.load_embedding_cache() + assert ( + expected_chromosomes == self.expected_chromosomes + ), "Expected chromosomes do not match cached chromosomes!" + assert ( + window_size == self.window_size + ), "Window size does not match cached window size!" except FileNotFoundError as e: if self.verbose: print("Could not load methylation embedding from cache: " + str(e)) @@ -108,7 +114,10 @@ def __init__( # How many CpG sites are there? self.total_cpg_sites = sum([len(v) for v in self.cpg_sites_dict.values()]) if self.verbose: - print(f"\t\tTotal CpG sites: {self.total_cpg_sites:,}") + print(f"\tTotal CpG sites: {self.total_cpg_sites:,}") + print( + f"\tTotal number of windows (at window_size = {self.window_size}): {len(self.windowed_cpg_sites_dict):,}" + ) # Create a dictionary of chromosome -> CpG site -> index (embedding) for efficient lookup self.chr_to_cpg_to_embedding_dict = { @@ -127,7 +136,7 @@ def __init__( ) if verbose: - print(f"Loaded methylation embedding for: {self.genome_name}") + print("Loaded methylation embedding.") def save_embedding_cache(self): """Save a cache of expensive objects as our methylation embedding.""" @@ -163,12 +172,9 @@ def load_embedding_cache(self) -> bool: If the cached CpG site file cannot be found. """ - if self.verbose: - print(f"\tLoading embedding data for: {self.genome_name}") - if os.path.exists(self.cache_file): if self.verbose: - print(f"\t\tReading embedding from cache: {self.cache_file}") + print(f"\tReading embedding from cache: {self.cache_file}") # TODO: Add type hinting via TypedDicts? # e.g. https://stackoverflow.com/questions/51291722/define-a-jsonable-type-using-mypy-pep-526 @@ -195,8 +201,10 @@ def load_embedding_cache(self) -> bool: ].items() } + if self.verbose: + print(f"\tCached genome fasta source: {self.fasta_source}") else: - raise FileNotFoundError("\tNo cache of embedding found.") + raise FileNotFoundError("No cache of embedding found.") return True