
Commit

Merge branch 'main' into feature/wgs-preprocessing
endast committed Jan 2, 2024
2 parents 66c8afd + fc14c51 commit 9b2371c
Showing 283 changed files with 2,622 additions and 668 deletions.
13 changes: 13 additions & 0 deletions .github/pull_request_template.md
@@ -0,0 +1,13 @@
# What

*What does this PR do, and preferably how*

# Testing
*Testing this PR involves X,Y,Z*

## Test scenarios
*Test these things*

1. Instructions
2. ...
3. ...
96 changes: 61 additions & 35 deletions .github/workflows/github-actions.yml
@@ -8,26 +8,33 @@ jobs:
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Training Association Testing smoke test
uses: snakemake/snakemake-github-action@v1.24.0
- uses: mamba-org/setup-micromamba@v1.4.3
with:
directory: "example"
snakefile: "pipelines/training_association_testing.snakefile"
args: "-j 2 -n"
environment-name: deeprvat-gh-action
environment-file: ${{ github.workspace }}/deeprvat_env_no_gpu.yml
cache-environment: true
cache-downloads: true
- name: Smoketest training_association_testing pipeline
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \
--snakefile ${{ github.workspace }}/pipelines/training_association_testing.snakefile --show-failed-logs
shell: micromamba-shell {0}
- name: Link pretrained models
run: cd ${{ github.workspace }}/example && ln -s ../pretrained_models
- name: Association Testing Pretrained Smoke Test
uses: snakemake/snakemake-github-action@v1.24.0
with:
directory: "example"
snakefile: "pipelines/association_testing_pretrained.snakefile"
args: "-j 2 -n"
- name: Seed Gene Discovery Smoke Test
uses: snakemake/snakemake-github-action@v1.24.0
with:
directory: "example"
snakefile: "pipelines/seed_gene_discovery.snakefile"
args: "-j 2 -n"
shell: bash -el {0}
- name: Smoketest association_testing_pretrained pipeline
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \
--snakefile ${{ github.workspace }}/pipelines/association_testing_pretrained.snakefile --show-failed-logs
shell: micromamba-shell {0}
- name: Copy seed gene discovery snakemake config
run: cd ${{ github.workspace }}/example && cp ../deeprvat/seed_gene_discovery/config.yaml .
shell: bash -el {0}
- name: Smoketest seed_gene_discovery pipeline
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \
--snakefile ${{ github.workspace }}/pipelines/seed_gene_discovery.snakefile --show-failed-logs
shell: micromamba-shell {0}

DeepRVAT-Pipeline-Tests:
runs-on: ubuntu-latest
@@ -75,33 +82,52 @@ jobs:
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Preprocessing Smoke Test With QC
uses: snakemake/snakemake-github-action@v1.24.0
- uses: mamba-org/setup-micromamba@v1.4.3
with:
directory: "example/preprocess"
snakefile: "pipelines/preprocess_with_qc.snakefile"
args: "-j 2 -n --configfile pipelines/config/deeprvat_preprocess_config.yaml"
stagein: "touch example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa example/preprocess/workdir/reference/gencode.v44.annotation.gtf.gz"
environment-name: deeprvat-preprocess-gh-action
environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml
cache-environment: true
cache-downloads: true

- name: Fake fasta data
if: steps.cache-fasta.outputs.cache-hit != 'true'
run: |
cd ${{ github.workspace }}/example/preprocess && touch workdir/reference/GRCh38.primary_assembly.genome.fa
- name: Run preprocessing pipeline no qc Smoke Test
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \
--snakefile ${{ github.workspace }}/pipelines/preprocess_no_qc.snakefile \
--configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs
shell: micromamba-shell {0}


- name: Preprocessing pipeline with qc Smoke Test
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \
--snakefile ${{ github.workspace }}/pipelines/preprocess_with_qc.snakefile \
--configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs
shell: micromamba-shell {0}

- name: Preprocessing Smoke Test No QC
uses: snakemake/snakemake-github-action@v1.24.0
with:
directory: "example/preprocess"
snakefile: "pipelines/preprocess_no_qc.snakefile"
args: "-j 2 -n --configfile pipelines/config/deeprvat_preprocess_config.yaml"
stagein: "touch example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa example/preprocess/workdir/reference/gencode.v44.annotation.gtf.gz"

DeepRVAT-Annotation-Pipeline-Smoke-Tests:
runs-on: ubuntu-latest
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Annotations Smoke Test
uses: snakemake/snakemake-github-action@v1.25.1
- uses: mamba-org/setup-micromamba@v1.4.3
with:
directory: "example/annotations"
snakefile: "pipelines/annotations.snakefile"
args: "-j 2 -n --configfile pipelines/config/deeprvat_annotation_config.yaml"
environment-name: deeprvat-preprocess-gh-action
environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml
cache-environment: true
cache-downloads: true
- name: Annotations Smoke Test
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/annotations \
--snakefile ${{ github.workspace }}/pipelines/annotations.snakefile \
--configfile ${{ github.workspace }}/pipelines/config/deeprvat_annotation_config.yaml --show-failed-logs
shell: micromamba-shell {0}


DeepRVAT-Preprocessing-Pipeline-Tests-No-QC:
runs-on: ubuntu-latest
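Note on the hunks above: the snakemake-github-action steps were replaced by setup-micromamba environments plus direct snakemake dry-run invocations. A minimal local equivalent of one of these smoke tests, as a sketch (paths taken from the diff; assumes snakemake is installed and the command is run from the repository root):

    import subprocess

    # Dry-run (-n) the training/association pipeline, mirroring the CI step above.
    subprocess.run(
        [
            "python", "-m", "snakemake",
            "-n", "-j", "2",
            "--directory", "example",
            "--snakefile", "pipelines/training_association_testing.snakefile",
            "--show-failed-logs",
        ],
        check=True,
    )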
23 changes: 21 additions & 2 deletions .github/workflows/test-runner.yml
@@ -4,6 +4,25 @@ on: [ push ]

jobs:
DeepRVAT-Tests-Runner:
runs-on: ubuntu-latest
steps:
- name: Check out repository code
uses: actions/checkout@v3
- uses: mamba-org/setup-micromamba@v1.4.3
with:
environment-name: deeprvat-preprocess-gh-action
environment-file: ${{ github.workspace }}/deeprvat_env_no_gpu.yml
cache-environment: true
cache-downloads: true

- name: Install DeepRVAT
run: pip install -e ${{ github.workspace }}
shell: micromamba-shell {0}
- name: Run pytest deeprvat
run: pytest -v ${{ github.workspace }}/tests/deeprvat
shell: micromamba-shell {0}

DeepRVAT-Tests-Runner-Preprocessing:
runs-on: ubuntu-latest
steps:

@@ -20,6 +39,6 @@ jobs:
run: pip install -e ${{ github.workspace }}
shell: micromamba-shell {0}

- name: Run pytest
run: pytest -v ${{ github.workspace }}/tests
- name: Run pytest preprocessing
run: pytest -v ${{ github.workspace }}/tests/preprocessing
shell: micromamba-shell {0}
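The previous single pytest step is split into two runners, one per test suite. A local equivalent, as a sketch (suite paths taken from the diff; assumes DeepRVAT was installed with pip install -e .):

    import subprocess

    # Run the two suites separately, as the split CI runners now do.
    for suite in ["tests/deeprvat", "tests/preprocessing"]:
        subprocess.run(["pytest", "-v", suite], check=True)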
38 changes: 26 additions & 12 deletions deeprvat/deeprvat/associate.py
@@ -50,7 +50,7 @@ def get_burden(
agg_models: Dict[str, List[nn.Module]],
device: torch.device = torch.device("cpu"),
skip_burdens=False,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Compute burden scores for rare variants.
@@ -63,8 +63,8 @@ def get_burden(
:type device: torch.device
:param skip_burdens: Flag to skip burden computation, defaults to False.
:type skip_burdens: bool
:return: Tuple containing burden scores, target y phenotype values, and x phenotypes.
:rtype: Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
:return: Tuple containing burden scores, target y phenotype values, x phenotypes and sample ids.
:rtype: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
.. note::
Checkpoint models all corresponding to the same repeat are averaged for that repeat.
@@ -87,8 +87,9 @@ def get_burden(

y = batch["y"]
x = batch["x_phenotypes"]
sample_ids = batch["sample"]

return burden, y, x
return burden, y, x, sample_ids


def separate_parallel_results(results: List) -> Tuple[List, ...]:
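For reference, get_burden now also reads batch["sample"] and returns it as a fourth element. A sketch of the batch mapping it consumes (key names from the diff above; the shapes are illustrative assumptions, not from the source):

    import torch

    # Hypothetical batch, shaped only for illustration.
    batch = {
        "rare_variant_annotations": torch.zeros(32, 1, 100, 34),
        "y": torch.zeros(32, 1),              # target phenotypes
        "x_phenotypes": torch.zeros(32, 10),  # covariate phenotypes
        "sample": torch.arange(32),           # sample ids, newly returned
    }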
@@ -196,7 +197,9 @@ def compute_burdens_(
bottleneck: bool = False,
compression_level: int = 1,
skip_burdens: bool = False,
) -> Tuple[np.ndarray, zarr.core.Array, zarr.core.Array, zarr.core.Array]:
) -> Tuple[
np.ndarray, zarr.core.Array, zarr.core.Array, zarr.core.Array, zarr.core.Array
]:
"""
Compute burdens using the PyTorch model for each repeat.
@@ -223,8 +226,8 @@ def compute_burdens_(
:type compression_level: int
:param skip_burdens: Flag to skip burden computation, defaults to False.
:type skip_burdens: bool
:return: Tuple containing genes, burdens, target y phenotypes, and x phenotypes.
:rtype: Tuple[np.ndarray, zarr.core.Array, zarr.core.Array, zarr.core.Array]
:return: Tuple containing genes, burdens, target y phenotypes, x phenotypes and sample ids.
:rtype: Tuple[np.ndarray, zarr.core.Array, zarr.core.Array, zarr.core.Array, zarr.core.Array]
.. note::
Checkpoint models all corresponding to the same repeat are averaged for that repeat.
@@ -280,14 +283,15 @@ def compute_burdens_(
file=sys.stdout,
total=(n_samples // batch_size + (n_samples % batch_size != 0)),
):
this_burdens, this_y, this_x = get_burden(
this_burdens, this_y, this_x, this_sampleid = get_burden(
batch, agg_models, device=device, skip_burdens=skip_burdens
)
if i == 0:
if not skip_burdens:
chunk_burden = np.zeros(shape=(n_samples,) + this_burdens.shape[1:])
chunk_y = np.zeros(shape=(n_samples,) + this_y.shape[1:])
chunk_x = np.zeros(shape=(n_samples,) + this_x.shape[1:])
chunk_sampleid = np.zeros(shape=(n_samples))

logger.info(f"Batch size: {batch['rare_variant_annotations'].shape}")

@@ -320,6 +324,14 @@ def compute_burdens_(
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
sample_ids = zarr.open(
Path(cache_dir) / "sample_ids.zarr",
mode="a",
shape=(n_total_samples),
chunks=(None),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)

start_idx = i * batch_size
end_idx = min(start_idx + batch_size, chunk_end) # read from chunk shape
@@ -329,6 +341,7 @@ def compute_burdens_(

chunk_y[start_idx:end_idx] = this_y
chunk_x[start_idx:end_idx] = this_x
chunk_sampleid[start_idx:end_idx] = this_sampleid

if debug:
logger.info(
@@ -343,13 +356,14 @@ def compute_burdens_(

y[chunk_start:chunk_end] = chunk_y
x[chunk_start:chunk_end] = chunk_x
sample_ids[chunk_start:chunk_end] = chunk_sampleid

if torch.cuda.is_available():
logger.info(
"Max GPU memory allocated: " f"{torch.cuda.max_memory_allocated(0)} bytes"
)

return ds_full.rare_embedding.genes, burdens, y, x
return ds_full.rare_embedding.genes, burdens, y, x, sample_ids


def load_one_model(
@@ -580,8 +594,8 @@ def compute_burdens(
:type checkpoint_files: Tuple[str]
:param out_dir: Path to the output directory.
:type out_dir: str
:return: Corresonding genes, computed burdens, y phenotypes, and x phenotypes are saved in the out_dir.
:rtype: [np.ndarray], [zarr.core.Array], [zarr.core.Array], [zarr.core.Array]
:return: Corresponding genes, computed burdens, y phenotypes, x phenotypes, and sample ids are saved in the out_dir.
:rtype: [np.ndarray], [zarr.core.Array], [zarr.core.Array], [zarr.core.Array], [zarr.core.Array]
.. note::
Checkpoint models all corresponding to the same repeat are averaged for that repeat.
@@ -614,7 +628,7 @@ def compute_burdens(
else:
agg_models = None

genes, _, _, _ = compute_burdens_(
genes, _, _, _, _ = compute_burdens_(
debug,
data_config,
dataset,
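With this change, compute_burdens_ persists sample ids alongside the burdens, y, and x arrays. A minimal sketch of reading them back (assumes the sample_ids.zarr store written above; the cache directory path is a placeholder):

    from pathlib import Path

    import zarr

    cache_dir = Path("burdens")  # placeholder; use the pipeline's actual cache_dir
    sample_ids = zarr.open(cache_dir / "sample_ids.zarr", mode="r")[:]
    print(sample_ids.shape)  # (n_total_samples,)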
(Diffs for the remaining 279 changed files not shown.)
