
Commit

Merge branch 'main' into feature/wgs-preprocessing
endast committed Jan 2, 2024
2 parents 66c8afd + fc14c51 commit 9b2371c
Showing 283 changed files with 2,622 additions and 668 deletions.
13 changes: 13 additions & 0 deletions .github/pull_request_template.md
@@ -0,0 +1,13 @@
# What

*What does this PR do, and preferably how*

# Testing
*Testing this PR involves X,Y,Z*

## Test scenarios
*Test these things*

1. Instructions
2. ...
3. ...
96 changes: 61 additions & 35 deletions .github/workflows/github-actions.yml
@@ -8,26 +8,33 @@ jobs:
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Training Association Testing smoke test
uses: snakemake/snakemake-github-action@v1.24.0
- uses: mamba-org/setup-micromamba@v1.4.3
with:
directory: "example"
snakefile: "pipelines/training_association_testing.snakefile"
args: "-j 2 -n"
environment-name: deeprvat-gh-action
environment-file: ${{ github.workspace }}/deeprvat_env_no_gpu.yml
cache-environment: true
cache-downloads: true
- name: Smoketest training_association_testing pipeline
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \
--snakefile ${{ github.workspace }}/pipelines/training_association_testing.snakefile --show-failed-logs
shell: micromamba-shell {0}
- name: Link pretrained models
run: cd ${{ github.workspace }}/example && ln -s ../pretrained_models
- name: Association Testing Pretrained Smoke Test
uses: snakemake/snakemake-github-action@v1.24.0
with:
directory: "example"
snakefile: "pipelines/association_testing_pretrained.snakefile"
args: "-j 2 -n"
- name: Seed Gene Discovery Smoke Test
uses: snakemake/snakemake-github-action@v1.24.0
with:
directory: "example"
snakefile: "pipelines/seed_gene_discovery.snakefile"
args: "-j 2 -n"
shell: bash -el {0}
- name: Smoketest association_testing_pretrained pipeline
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \
--snakefile ${{ github.workspace }}/pipelines/association_testing_pretrained.snakefile --show-failed-logs
shell: micromamba-shell {0}
- name: Copy seed gene discovery snakemake config
run: cd ${{ github.workspace }}/example && cp ../deeprvat/seed_gene_discovery/config.yaml .
shell: bash -el {0}
- name: Smoketest seed_gene_discovery pipeline
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \
--snakefile ${{ github.workspace }}/pipelines/seed_gene_discovery.snakefile --show-failed-logs
shell: micromamba-shell {0}

DeepRVAT-Pipeline-Tests:
runs-on: ubuntu-latest
@@ -75,33 +82,52 @@ jobs:
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Preprocessing Smoke Test With QC
uses: snakemake/snakemake-github-action@v1.24.0
- uses: mamba-org/setup-micromamba@v1.4.3
with:
directory: "example/preprocess"
snakefile: "pipelines/preprocess_with_qc.snakefile"
args: "-j 2 -n --configfile pipelines/config/deeprvat_preprocess_config.yaml"
stagein: "touch example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa example/preprocess/workdir/reference/gencode.v44.annotation.gtf.gz"
environment-name: deeprvat-preprocess-gh-action
environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml
cache-environment: true
cache-downloads: true

- name: Fake fasta data
if: steps.cache-fasta.outputs.cache-hit != 'true'
run: |
cd ${{ github.workspace }}/example/preprocess && touch workdir/reference/GRCh38.primary_assembly.genome.fa
- name: Run preprocessing pipeline no qc Smoke Test
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \
--snakefile ${{ github.workspace }}/pipelines/preprocess_no_qc.snakefile \
--configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs
shell: micromamba-shell {0}


- name: Preprocessing pipeline with qc Smoke Test
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \
--snakefile ${{ github.workspace }}/pipelines/preprocess_with_qc.snakefile \
--configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs
shell: micromamba-shell {0}

- name: Preprocessing Smoke Test No QC
uses: snakemake/snakemake-github-action@v1.24.0
with:
directory: "example/preprocess"
snakefile: "pipelines/preprocess_no_qc.snakefile"
args: "-j 2 -n --configfile pipelines/config/deeprvat_preprocess_config.yaml"
stagein: "touch example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa example/preprocess/workdir/reference/gencode.v44.annotation.gtf.gz"

DeepRVAT-Annotation-Pipeline-Smoke-Tests:
runs-on: ubuntu-latest
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Annotations Smoke Test
uses: snakemake/snakemake-github-action@v1.25.1
- uses: mamba-org/setup-micromamba@v1.4.3
with:
directory: "example/annotations"
snakefile: "pipelines/annotations.snakefile"
args: "-j 2 -n --configfile pipelines/config/deeprvat_annotation_config.yaml"
environment-name: deeprvat-preprocess-gh-action
environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml
cache-environment: true
cache-downloads: true
- name: Annotations Smoke Test
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/annotations \
--snakefile ${{ github.workspace }}/pipelines/annotations.snakefile \
--configfile ${{ github.workspace }}/pipelines/config/deeprvat_annotation_config.yaml --show-failed-logs
shell: micromamba-shell {0}


DeepRVAT-Preprocessing-Pipeline-Tests-No-QC:
runs-on: ubuntu-latest
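Note on the hunks above: the snakemake-github-action steps were replaced by setup-micromamba environments plus direct snakemake dry-run invocations. A minimal local equivalent of one of these smoke tests, as a sketch (paths taken from the diff; assumes snakemake is installed and the command is run from the repository root):

    import subprocess

    # Dry-run (-n) the training/association pipeline, mirroring the CI step above.
    subprocess.run(
        [
            "python", "-m", "snakemake",
            "-n", "-j", "2",
            "--directory", "example",
            "--snakefile", "pipelines/training_association_testing.snakefile",
            "--show-failed-logs",
        ],
        check=True,
    )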
23 changes: 21 additions & 2 deletions .github/workflows/test-runner.yml
@@ -4,6 +4,25 @@ on: [ push ]

jobs:
DeepRVAT-Tests-Runner:
runs-on: ubuntu-latest
steps:
- name: Check out repository code
uses: actions/checkout@v3
- uses: mamba-org/setup-micromamba@v1.4.3
with:
environment-name: deeprvat-preprocess-gh-action
environment-file: ${{ github.workspace }}/deeprvat_env_no_gpu.yml
cache-environment: true
cache-downloads: true

- name: Install DeepRVAT
run: pip install -e ${{ github.workspace }}
shell: micromamba-shell {0}
- name: Run pytest deeprvat
run: pytest -v ${{ github.workspace }}/tests/deeprvat
shell: micromamba-shell {0}

DeepRVAT-Tests-Runner-Preprocessing:
runs-on: ubuntu-latest
steps:

@@ -20,6 +39,6 @@ jobs:
run: pip install -e ${{ github.workspace }}
shell: micromamba-shell {0}

- name: Run pytest
run: pytest -v ${{ github.workspace }}/tests
- name: Run pytest preprocessing
run: pytest -v ${{ github.workspace }}/tests/preprocessing
shell: micromamba-shell {0}
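The previous single pytest step is split into two runners, one per test suite. A local equivalent, as a sketch (suite paths taken from the diff; assumes DeepRVAT was installed with pip install -e .):

    import subprocess

    # Run the two suites separately, as the split CI runners now do.
    for suite in ["tests/deeprvat", "tests/preprocessing"]:
        subprocess.run(["pytest", "-v", suite], check=True)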
38 changes: 26 additions & 12 deletions deeprvat/deeprvat/associate.py
@@ -50,7 +50,7 @@ def get_burden(
agg_models: Dict[str, List[nn.Module]],
device: torch.device = torch.device("cpu"),
skip_burdens=False,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Compute burden scores for rare variants.
@@ -63,8 +63,8 @@ def get_burden(
:type device: torch.device
:param skip_burdens: Flag to skip burden computation, defaults to False.
:type skip_burdens: bool
:return: Tuple containing burden scores, target y phenotype values, and x phenotypes.
:rtype: Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
:return: Tuple containing burden scores, target y phenotype values, x phenotypes and sample ids.
:rtype: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
.. note::
Checkpoint models all corresponding to the same repeat are averaged for that repeat.
@@ -87,8 +87,9 @@ def get_burden(

y = batch["y"]
x = batch["x_phenotypes"]
sample_ids = batch["sample"]

return burden, y, x
return burden, y, x, sample_ids


def separate_parallel_results(results: List) -> Tuple[List, ...]:
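For reference, get_burden now also reads batch["sample"] and returns it as a fourth element. A sketch of the batch mapping it consumes (key names from the diff above; the shapes are illustrative assumptions, not from the source):

    import torch

    # Hypothetical batch, shaped only for illustration.
    batch = {
        "rare_variant_annotations": torch.zeros(32, 1, 100, 34),
        "y": torch.zeros(32, 1),              # target phenotypes
        "x_phenotypes": torch.zeros(32, 10),  # covariate phenotypes
        "sample": torch.arange(32),           # sample ids, newly returned
    }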
@@ -196,7 +197,9 @@ def compute_burdens_(
bottleneck: bool = False,
compression_level: int = 1,
skip_burdens: bool = False,
) -> Tuple[np.ndarray, zarr.core.Array, zarr.core.Array, zarr.core.Array]:
) -> Tuple[
np.ndarray, zarr.core.Array, zarr.core.Array, zarr.core.Array, zarr.core.Array
]:
"""
Compute burdens using the PyTorch model for each repeat.
@@ -223,8 +226,8 @@ def compute_burdens_(
:type compression_level: int
:param skip_burdens: Flag to skip burden computation, defaults to False.
:type skip_burdens: bool
:return: Tuple containing genes, burdens, target y phenotypes, and x phenotypes.
:rtype: Tuple[np.ndarray, zarr.core.Array, zarr.core.Array, zarr.core.Array]
:return: Tuple containing genes, burdens, target y phenotypes, x phenotypes and sample ids.
:rtype: Tuple[np.ndarray, zarr.core.Array, zarr.core.Array, zarr.core.Array, zarr.core.Array]
.. note::
Checkpoint models all corresponding to the same repeat are averaged for that repeat.
@@ -280,14 +283,15 @@ def compute_burdens_(
file=sys.stdout,
total=(n_samples // batch_size + (n_samples % batch_size != 0)),
):
this_burdens, this_y, this_x = get_burden(
this_burdens, this_y, this_x, this_sampleid = get_burden(
batch, agg_models, device=device, skip_burdens=skip_burdens
)
if i == 0:
if not skip_burdens:
chunk_burden = np.zeros(shape=(n_samples,) + this_burdens.shape[1:])
chunk_y = np.zeros(shape=(n_samples,) + this_y.shape[1:])
chunk_x = np.zeros(shape=(n_samples,) + this_x.shape[1:])
chunk_sampleid = np.zeros(shape=(n_samples))

logger.info(f"Batch size: {batch['rare_variant_annotations'].shape}")

@@ -320,6 +324,14 @@ def compute_burdens_(
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
sample_ids = zarr.open(
Path(cache_dir) / "sample_ids.zarr",
mode="a",
shape=(n_total_samples),
chunks=(None),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)

start_idx = i * batch_size
end_idx = min(start_idx + batch_size, chunk_end) # read from chunk shape
@@ -329,6 +341,7 @@ def compute_burdens_(

chunk_y[start_idx:end_idx] = this_y
chunk_x[start_idx:end_idx] = this_x
chunk_sampleid[start_idx:end_idx] = this_sampleid

if debug:
logger.info(
@@ -343,13 +356,14 @@ def compute_burdens_(

y[chunk_start:chunk_end] = chunk_y
x[chunk_start:chunk_end] = chunk_x
sample_ids[chunk_start:chunk_end] = chunk_sampleid

if torch.cuda.is_available():
logger.info(
"Max GPU memory allocated: " f"{torch.cuda.max_memory_allocated(0)} bytes"
)

return ds_full.rare_embedding.genes, burdens, y, x
return ds_full.rare_embedding.genes, burdens, y, x, sample_ids


def load_one_model(
@@ -580,8 +594,8 @@ def compute_burdens(
:type checkpoint_files: Tuple[str]
:param out_dir: Path to the output directory.
:type out_dir: str
:return: Corresonding genes, computed burdens, y phenotypes, and x phenotypes are saved in the out_dir.
:rtype: [np.ndarray], [zarr.core.Array], [zarr.core.Array], [zarr.core.Array]
:return: Corresponding genes, computed burdens, y phenotypes, x phenotypes, and sample ids are saved in the out_dir.
:rtype: [np.ndarray], [zarr.core.Array], [zarr.core.Array], [zarr.core.Array], [zarr.core.Array]
.. note::
Checkpoint models all corresponding to the same repeat are averaged for that repeat.
@@ -614,7 +628,7 @@ def compute_burdens(
else:
agg_models = None

genes, _, _, _ = compute_burdens_(
genes, _, _, _, _ = compute_burdens_(
debug,
data_config,
dataset,
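With this change, compute_burdens_ persists sample ids alongside the burdens, y, and x arrays. A minimal sketch of reading them back (assumes the sample_ids.zarr store written above; the cache directory path is a placeholder):

    from pathlib import Path

    import zarr

    cache_dir = Path("burdens")  # placeholder; use the pipeline's actual cache_dir
    sample_ids = zarr.open(cache_dir / "sample_ids.zarr", mode="r")[:]
    print(sample_ids.shape)  # (n_total_samples,)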
(Diffs for the remaining 279 changed files not shown.)
