Add extra cache-validity assertions and debug output

mcwdsi · Jan 19, 2024 · d856cb6
1 parent 7cd4f04
commit d856cb6
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 12 deletions.
diff --git a/src/bam2tensor/__main__.py b/src/bam2tensor/__main__.py
@@ -132,13 +132,11 @@ def main(
     """Bam2Tensor."""
     time_start = time.time()
     # Print run information
+    print(f"Genome name: {genome_name}")
     print(f"Reference fasta: {reference_fasta}")
+    print(f"Expected chromosomes: {expected_chromosomes}")
     print(f"Input path: {input_path}")
-
-    print(f"\nLoading (or generating) methylation embedding named: {reference_fasta}")
-
-    # Convert expected_chromosomes to a list
-    print(f"\tExpected chromosomes: {expected_chromosomes}")
+    print(f"\nLoading (or generating) methylation embedding for: {genome_name}")
 
     # Create (or load) a GenomeMethylationEmbedding object
     genome_methylation_embedding = GenomeMethylationEmbedding(

diff --git a/src/bam2tensor/embedding.py b/src/bam2tensor/embedding.py
@@ -84,6 +84,12 @@ def __init__(
         if not skip_cache:
             try:
                 cache_available = self.load_embedding_cache()
+                assert (
+                    expected_chromosomes == self.expected_chromosomes
+                ), "Expected chromosomes do not match cached chromosomes!"
+                assert (
+                    window_size == self.window_size
+                ), "Window size does not match cached window size!"
             except FileNotFoundError as e:
                 if self.verbose:
                     print("Could not load methylation embedding from cache: " + str(e))
@@ -108,7 +114,10 @@ def __init__(
         # How many CpG sites are there?
         self.total_cpg_sites = sum([len(v) for v in self.cpg_sites_dict.values()])
         if self.verbose:
-            print(f"\t\tTotal CpG sites: {self.total_cpg_sites:,}")
+            print(f"\tTotal CpG sites: {self.total_cpg_sites:,}")
+            print(
+                f"\tTotal number of windows (at window_size = {self.window_size}): {len(self.windowed_cpg_sites_dict):,}"
+            )
 
         # Create a dictionary of chromosome -> CpG site -> index (embedding) for efficient lookup
         self.chr_to_cpg_to_embedding_dict = {
@@ -127,7 +136,7 @@ def __init__(
         )
 
         if verbose:
-            print(f"Loaded methylation embedding for: {self.genome_name}")
+            print("Loaded methylation embedding.")
 
     def save_embedding_cache(self):
         """Save a cache of expensive objects as our methylation embedding."""
@@ -163,12 +172,9 @@ def load_embedding_cache(self) -> bool:
             If the cached CpG site file cannot be found.
         """
 
-        if self.verbose:
-            print(f"\tLoading embedding data for: {self.genome_name}")
-
         if os.path.exists(self.cache_file):
             if self.verbose:
-                print(f"\t\tReading embedding from cache: {self.cache_file}")
+                print(f"\tReading embedding from cache: {self.cache_file}")
 
             # TODO: Add type hinting via TypedDicts?
             # e.g. https://stackoverflow.com/questions/51291722/define-a-jsonable-type-using-mypy-pep-526
@@ -195,8 +201,10 @@ def load_embedding_cache(self) -> bool:
                 ].items()
             }
 
+            if self.verbose:
+                print(f"\tCached genome fasta source: {self.fasta_source}")
         else:
-            raise FileNotFoundError("\tNo cache of embedding found.")
+            raise FileNotFoundError("No cache of embedding found.")
 
         return True