diff --git a/.github/workflows/code-tests.yml b/.github/workflows/code-tests.yml new file mode 100644 index 00000000..968b9b59 --- /dev/null +++ b/.github/workflows/code-tests.yml @@ -0,0 +1,24 @@ +name: DeepRVAT code test runner +run-name: DeepRVAT Code Tests ๐Ÿง‘๐Ÿผโ€๐Ÿ’ปโœ… +on: [ push ] + +jobs: + DeepRVAT-Tests-Runner-Preprocessing: + uses: ./.github/workflows/run-pytest.yml + with: + environment_file: ./deeprvat_preprocessing_env.yml + test_path: ./tests/preprocessing + + DeepRVAT-Tests-Runner-Annotations: + uses: ./.github/workflows/run-pytest.yml + with: + environment_file: ./deeprvat_annotations.yml + test_path: ./tests/annotations + parallel_tests: true + + DeepRVAT-Tests-Runner: + uses: ./.github/workflows/run-pytest.yml + with: + environment_file: ./deeprvat_env_no_gpu.yml + parallel_tests: true + test_path: ./tests/deeprvat diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml deleted file mode 100644 index 3708d48b..00000000 --- a/.github/workflows/github-actions.yml +++ /dev/null @@ -1,211 +0,0 @@ -name: DeepRVAT -run-name: DeepRVAT ๐Ÿงฌ๐Ÿงช๐Ÿ’ป๐Ÿง‘โ€๐Ÿ”ฌ -on: [ push ] - -jobs: - DeepRVAT-Pipeline-Smoke-Tests: - runs-on: ubuntu-latest - steps: - - name: Check out repository code - uses: actions/checkout@v4 - - uses: mamba-org/setup-micromamba@v1.8.1 - with: - environment-name: deeprvat-gh-action - environment-file: ${{ github.workspace }}/deeprvat_env_no_gpu.yml - cache-environment: true - cache-downloads: true - - name: Smoketest training_association_testing pipeline - run: | - python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \ - --snakefile ${{ github.workspace }}/pipelines/training_association_testing.snakefile --show-failed-logs - shell: micromamba-shell {0} - - name: Link pretrained models - run: cd ${{ github.workspace }}/example && ln -s ../pretrained_models - shell: bash -el {0} - - name: Smoketest association_testing_pretrained pipeline - run: | - python -m snakemake -n -j 2 
--directory ${{ github.workspace }}/example \ - --snakefile ${{ github.workspace }}/pipelines/association_testing_pretrained.snakefile --show-failed-logs - shell: micromamba-shell {0} - - name: Copy seed gene discovery snakemake config - run: cd ${{ github.workspace }}/example && cp ../deeprvat/seed_gene_discovery/config.yaml . - shell: bash -el {0} - - name: Smoketest seed_gene_discovery pipeline - run: | - python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \ - --snakefile ${{ github.workspace }}/pipelines/seed_gene_discovery.snakefile --show-failed-logs - shell: micromamba-shell {0} - - DeepRVAT-Pipeline-Tests: - runs-on: ubuntu-latest - needs: DeepRVAT-Pipeline-Smoke-Tests - steps: - - name: Check out repository code - uses: actions/checkout@v4 - - uses: mamba-org/setup-micromamba@v1.8.1 - with: - environment-name: deeprvat-gh-action - environment-file: ${{ github.workspace }}/deeprvat_env_no_gpu.yml - cache-environment: true - cache-downloads: true - - name: Install DeepRVAT - run: pip install -e ${{ github.workspace }} - shell: micromamba-shell {0} - # There are no GPUs on the gh worker, so we disable it in the config - - name: Update config to use no gpus - run: "sed -i 's/gpus: 1/gpus: 0/' ${{ github.workspace }}/example/config.yaml" - shell: bash -el {0} - - name: Run training_association_testing pipeline - run: | - python -m snakemake -j 2 --directory ${{ github.workspace }}/example \ - --snakefile ${{ github.workspace }}/pipelines/training_association_testing.snakefile --show-failed-logs - shell: micromamba-shell {0} - - name: Link pretrained models - run: cd ${{ github.workspace }}/example && ln -s ../pretrained_models - shell: bash -el {0} - - name: Run association_testing_pretrained pipeline - run: | - python -m snakemake -j 2 --directory ${{ github.workspace }}/example \ - --snakefile ${{ github.workspace }}/pipelines/association_testing_pretrained.snakefile --show-failed-logs - shell: micromamba-shell {0} - - name: Copy seed gene 
discovery snakemake config - run: cd ${{ github.workspace }}/example && cp ../deeprvat/seed_gene_discovery/config.yaml . - shell: bash -el {0} - - name: Run seed_gene_discovery pipeline - run: | - python -m snakemake -j 2 --directory ${{ github.workspace }}/example \ - --snakefile ${{ github.workspace }}/pipelines/seed_gene_discovery.snakefile --show-failed-logs - shell: micromamba-shell {0} - - - DeepRVAT-Preprocessing-Pipeline-Smoke-Tests: - runs-on: ubuntu-latest - steps: - - name: Check out repository code - uses: actions/checkout@v4 - - uses: mamba-org/setup-micromamba@v1.8.1 - with: - environment-name: deeprvat-preprocess-gh-action - environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml - cache-environment: true - cache-downloads: true - - - name: Fake fasta data - if: steps.cache-fasta.outputs.cache-hit != 'true' - run: | - cd ${{ github.workspace }}/example/preprocess && touch workdir/reference/GRCh38.primary_assembly.genome.fa - - - name: Run preprocessing pipeline no qc Smoke Test - run: | - python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \ - --snakefile ${{ github.workspace }}/pipelines/preprocess_no_qc.snakefile \ - --configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs - shell: micromamba-shell {0} - - - - name: Preprocessing pipeline with qc Smoke Test - run: | - python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \ - --snakefile ${{ github.workspace }}/pipelines/preprocess_with_qc.snakefile \ - --configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs - shell: micromamba-shell {0} - - - DeepRVAT-Annotation-Pipeline-Smoke-Tests: - runs-on: ubuntu-latest - steps: - - name: Check out repository code - uses: actions/checkout@v4 - - uses: mamba-org/setup-micromamba@v1.8.1 - with: - environment-name: deeprvat-preprocess-gh-action - environment-file: ${{ github.workspace 
}}/deeprvat_preprocessing_env.yml - cache-environment: true - cache-downloads: true - - name: Annotations Smoke Test - run: | - python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/annotations \ - --snakefile ${{ github.workspace }}/pipelines/annotations.snakefile \ - --configfile ${{ github.workspace }}/pipelines/config/deeprvat_annotation_config.yaml --show-failed-logs - shell: micromamba-shell {0} - - - DeepRVAT-Preprocessing-Pipeline-Tests-No-QC: - runs-on: ubuntu-latest - needs: DeepRVAT-Preprocessing-Pipeline-Smoke-Tests - steps: - - name: Check out repository code - uses: actions/checkout@v4 - - uses: mamba-org/setup-micromamba@v1.8.1 - with: - environment-name: deeprvat-preprocess-gh-action - environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml - cache-environment: true - cache-downloads: true - - - name: Install DeepRVAT - run: pip install -e ${{ github.workspace }} - shell: micromamba-shell {0} - - - name: Cache Fasta file - id: cache-fasta - uses: actions/cache@v4 - with: - path: example/preprocess/workdir/reference - key: ${{ runner.os }}-reference-fasta - - - name: Download and unpack fasta data - if: steps.cache-fasta.outputs.cache-hit != 'true' - run: | - cd ${{ github.workspace }}/example/preprocess && \ - wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh38.primary_assembly.genome.fa.gz \ - -O workdir/reference/GRCh38.primary_assembly.genome.fa.gz \ - && gzip -d workdir/reference/GRCh38.primary_assembly.genome.fa.gz - - - name: Run preprocessing pipeline - run: | - python -m snakemake -j 2 --directory ${{ github.workspace }}/example/preprocess \ - --snakefile ${{ github.workspace }}/pipelines/preprocess_no_qc.snakefile \ - --configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs - shell: micromamba-shell {0} - - - DeepRVAT-Preprocessing-Pipeline-Tests-With-QC: - runs-on: ubuntu-latest - needs: 
DeepRVAT-Preprocessing-Pipeline-Smoke-Tests - steps: - - - name: Check out repository code - uses: actions/checkout@v4 - - uses: mamba-org/setup-micromamba@v1.8.1 - with: - environment-name: deeprvat-preprocess-gh-action - environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml - cache-environment: true - cache-downloads: true - - - name: Install DeepRVAT - run: pip install -e ${{ github.workspace }} - shell: micromamba-shell {0} - - - name: Cache Fasta file - id: cache-fasta - uses: actions/cache@v4 - with: - path: example/preprocess/workdir/reference - key: ${{ runner.os }}-reference-fasta - - - name: Download and unpack fasta data - if: steps.cache-fasta.outputs.cache-hit != 'true' - run: | - cd ${{ github.workspace }}/example/preprocess && \ - wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh38.primary_assembly.genome.fa.gz \ - -O workdir/reference/GRCh38.primary_assembly.genome.fa.gz \ - && gzip -d workdir/reference/GRCh38.primary_assembly.genome.fa.gz - - - name: Run preprocessing pipeline - run: | - python -m snakemake -j 2 --directory ${{ github.workspace }}/example/preprocess \ - --snakefile ${{ github.workspace }}/pipelines/preprocess_with_qc.snakefile \ - --configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs - shell: micromamba-shell {0} diff --git a/.github/workflows/pipeline-tests.yml b/.github/workflows/pipeline-tests.yml new file mode 100644 index 00000000..0cf7e64c --- /dev/null +++ b/.github/workflows/pipeline-tests.yml @@ -0,0 +1,155 @@ +name: DeepRVAT Pipeline Tests +run-name: DeepRVAT Pipeline Tests ๐Ÿงฌ๐Ÿงช๐Ÿ’ป๐Ÿง‘โ€๐Ÿ”ฌ +on: [ push ] + +jobs: + # Training Pipeline + Smoke-RunTraining: + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/run_training.snakefile + environment_file: ./deeprvat_env_no_gpu.yml + + Pipeline-Tests-RunTraining: + needs: Smoke-RunTraining + uses: ./.github/workflows/run-pipeline.yml + with: + 
pipeline_file: ./pipelines/run_training.snakefile + environment_file: ./deeprvat_env_no_gpu.yml + dry_run: false + + # Association Testing Pretrained Pipeline + Smoke-Association-Testing-Pretrained: + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/association_testing_pretrained.snakefile + environment_file: ./deeprvat_env_no_gpu.yml + prerun_cmd: cd ./example && ln -s ../pretrained_models + + Pipeline-Tests-Association-Testing-Pretrained: + needs: Smoke-Association-Testing-Pretrained + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/association_testing_pretrained.snakefile + environment_file: ./deeprvat_env_no_gpu.yml + prerun_cmd: cd ./example && ln -s ../pretrained_models + dry_run: false + + # Association Testing Pretrained Regenie + Smoke-Association-Testing-Pretrained-Regenie: + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/association_testing_pretrained_regenie.snakefile + environment_file: ./deeprvat_env_no_gpu.yml + prerun_cmd: cd ./example && ln -s ../pretrained_models + + Pipeline-Tests-Association-Testing-Pretrained-Regenie: + needs: Smoke-Association-Testing-Pretrained-Regenie + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/association_testing_pretrained_regenie.snakefile + environment_file: ./deeprvat_env_no_gpu.yml + prerun_cmd: cd ./example && ln -s ../pretrained_models + dry_run: false + + # Association Testing Training + Smoke-Association-Testing-Training: + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/training_association_testing.snakefile + environment_file: ./deeprvat_env_no_gpu.yml + + Pipeline-Tests-Association-Testing-Training: + needs: Smoke-Association-Testing-Training + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/training_association_testing.snakefile + environment_file: ./deeprvat_env_no_gpu.yml + dry_run: false + + # Association Testing 
Training Regenie + Smoke-Association-Testing-Training-Regenie: + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/training_association_testing_regenie.snakefile + environment_file: ./deeprvat_env_no_gpu.yml + + Pipeline-Tests-Association-Testing-Training-Regenie: + needs: Smoke-Association-Testing-Training-Regenie + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/training_association_testing_regenie.snakefile + environment_file: ./deeprvat_env_no_gpu.yml + dry_run: false + + # Seed Gene Discovery + Smoke-Seed-Gene-Discovery: + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/seed_gene_discovery.snakefile + environment_file: ./deeprvat_env_no_gpu.yml + prerun_cmd: cd ./example && cp ../deeprvat/seed_gene_discovery/config.yaml . + + Pipeline-Tests-Seed-Gene-Discovery: + needs: Smoke-Seed-Gene-Discovery + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/seed_gene_discovery.snakefile + environment_file: ./deeprvat_env_no_gpu.yml + prerun_cmd: cd ./example && cp ../deeprvat/seed_gene_discovery/config.yaml . 
+ dry_run: false + + # Preprocessing With QC + Smoke-Preprocessing-With-QC: + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/preprocess_with_qc.snakefile + environment_file: ./deeprvat_preprocessing_env.yml + pipeline_directory: ./example/preprocess + pipeline_config: ./pipelines/config/deeprvat_preprocess_config.yaml + download_fasta_data: true + fasta_download_path: ./example/preprocess/workdir/reference + + Pipeline-Tests-Preprocessing-With-QC: + needs: Smoke-Preprocessing-With-QC + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/preprocess_with_qc.snakefile + environment_file: ./deeprvat_preprocessing_env.yml + pipeline_directory: ./example/preprocess + pipeline_config: ./pipelines/config/deeprvat_preprocess_config.yaml + dry_run: false + download_fasta_data: true + fasta_download_path: ./example/preprocess/workdir/reference + + # Preprocessing-No-QC + Smoke-Preprocessing-No-QC: + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/preprocess_no_qc.snakefile + environment_file: ./deeprvat_preprocessing_env.yml + pipeline_directory: ./example/preprocess + pipeline_config: ./pipelines/config/deeprvat_preprocess_config.yaml + download_fasta_data: true + fasta_download_path: ./example/preprocess/workdir/reference + + Pipeline-Tests-Preprocessing-No-QC: + needs: Smoke-Preprocessing-No-QC + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/preprocess_no_qc.snakefile + environment_file: ./deeprvat_preprocessing_env.yml + pipeline_directory: ./example/preprocess + pipeline_config: ./pipelines/config/deeprvat_preprocess_config.yaml + dry_run: false + download_fasta_data: true + fasta_download_path: ./example/preprocess/workdir/reference + + # Annotation Pipeline + Smoke-Annotation-Pipeline: + uses: ./.github/workflows/run-pipeline.yml + with: + pipeline_file: ./pipelines/annotations.snakefile + environment_file: ./deeprvat_annotations.yml + 
pipeline_config: ./pipelines/config/deeprvat_annotation_config.yaml + pipeline_directory: ./example/annotations diff --git a/.github/workflows/run-pipeline.yml b/.github/workflows/run-pipeline.yml new file mode 100644 index 00000000..6971a7fd --- /dev/null +++ b/.github/workflows/run-pipeline.yml @@ -0,0 +1,87 @@ +name: Run snakemake pipeline + +on: + workflow_call: + inputs: + environment_file: + required: true + type: string + prerun_cmd: + required: false + type: string + pipeline_file: + required: true + type: string + no_gpu: + required: false + type: boolean + default: true + pipeline_directory: + required: false + type: string + default: ./example + pipeline_config: + required: false + type: string + dry_run: + required: false + default: true + type: boolean + download_fasta_data: + required: false + default: false + type: boolean + fasta_download_path: + required: false + type: string + postrun_cmd: + required: false + type: string + +jobs: + Run-Pipeline: + runs-on: ubuntu-latest + steps: + - name: Check out repository code + uses: actions/checkout@v4 + - uses: mamba-org/setup-micromamba@v1.8.1 + with: + environment-file: ${{inputs.environment_file}} + cache-environment: true + cache-downloads: true + - name: Install DeepRVAT + run: pip install -e ${{ github.workspace }} + shell: micromamba-shell {0} + - name: Cache Fasta file + if: inputs.download_fasta_data + id: cache-fasta + uses: actions/cache@v4 + with: + path: ${{ inputs.fasta_download_path}} + key: cache-reference-fasta-${{ inputs.fasta_download_path}} + - name: Download and unpack fasta data + if: inputs.download_fasta_data && steps.cache-fasta.outputs.cache-hit != 'true' + run: | + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh38.primary_assembly.genome.fa.gz \ + -O ${{ inputs.fasta_download_path}}/GRCh38.primary_assembly.genome.fa.gz \ + && gzip -d ${{ inputs.fasta_download_path}}/GRCh38.primary_assembly.genome.fa.gz + - name: Run pre pipeline cmd + if: 
inputs.prerun_cmd + run: ${{inputs.prerun_cmd}} + shell: bash -el {0} + - name: Set to 0 GPUs in config + if: inputs.no_gpu + # There are no GPUs on the gh worker, so we can disable it in the config + run: "sed -i 's/gpus: 1/gpus: 0/' ./example/config.yaml" + shell: bash -el {0} + - name: "Running pipeline ${{ inputs.pipeline_file }}" + run: | + python -m snakemake ${{ (inputs.dry_run && '-n') || '' }} \ + -j 2 --directory ${{inputs.pipeline_directory}} \ + ${{ (endsWith(inputs.pipeline_config, 'ml') && '--configfile') || '' }} ${{ inputs.pipeline_config }} \ + --snakefile ${{inputs.pipeline_file}} --show-failed-logs -F + shell: micromamba-shell {0} + - name: Run post pipeline cmd + if: inputs.postrun_cmd + run: ${{inputs.postrun_cmd}} + shell: micromamba-shell {0} diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml new file mode 100644 index 00000000..09309904 --- /dev/null +++ b/.github/workflows/run-pytest.yml @@ -0,0 +1,33 @@ +name: Run pytest + +on: + workflow_call: + inputs: + environment_file: + required: true + type: string + test_path: + required: true + type: string + parallel_tests: + required: false + default: false + type: boolean + +jobs: + Run-Pytest: + runs-on: ubuntu-latest + steps: + - name: Check out repository code + uses: actions/checkout@v4 + - uses: mamba-org/setup-micromamba@v1.8.1 + with: + environment-file: ${{inputs.environment_file}} + cache-environment: true + cache-downloads: true + - name: Install DeepRVAT + run: pip install -e ${{ github.workspace }} + shell: micromamba-shell {0} + - name: Run pytest + run: pytest ${{ (inputs.parallel_tests && '-n auto') || '' }} -v ${{ inputs.test_path }} + shell: micromamba-shell {0} diff --git a/.github/workflows/test-runner.yml b/.github/workflows/test-runner.yml deleted file mode 100644 index 32e33474..00000000 --- a/.github/workflows/test-runner.yml +++ /dev/null @@ -1,65 +0,0 @@ -name: DeepRVAT test runner -run-name: DeepRVAT Tests ๐Ÿง‘๐Ÿผโ€๐Ÿ’ปโœ… -on: [ 
push ] - -jobs: - DeepRVAT-Tests-Runner: - runs-on: ubuntu-latest - steps: - - name: Check out repository code - uses: actions/checkout@v4 - - - uses: mamba-org/setup-micromamba@v1.8.1 - with: - environment-name: deeprvat-preprocess-gh-action - environment-file: ${{ github.workspace }}/deeprvat_env_no_gpu.yml - cache-environment: true - cache-downloads: true - - - name: Install DeepRVAT - run: pip install -e ${{ github.workspace }} - shell: micromamba-shell {0} - - name: Run pytest deeprvat - run: pytest -n auto -v ${{ github.workspace }}/tests/deeprvat - shell: micromamba-shell {0} - - DeepRVAT-Tests-Runner-Preprocessing: - runs-on: ubuntu-latest - steps: - - name: Check out repository code - uses: actions/checkout@v4 - - - uses: mamba-org/setup-micromamba@v1.8.1 - with: - environment-name: deeprvat-preprocess-gh-action - environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml - cache-environment: true - cache-downloads: true - - - name: Install DeepRVAT - run: pip install -e ${{ github.workspace }} - shell: micromamba-shell {0} - - - name: Run pytest preprocessing - run: pytest -v ${{ github.workspace }}/tests/preprocessing - shell: micromamba-shell {0} - - DeepRVAT-Tests-Runner-Annotations: - runs-on: ubuntu-latest - steps: - - name: Check out repository code - uses: actions/checkout@v4 - - uses: mamba-org/setup-micromamba@v1.8.1 - with: - environment-name: deeprvat-annotation-gh-action - environment-file: ${{ github.workspace }}/deeprvat_annotations.yml - cache-environment: true - cache-downloads: true - - - name: Install DeepRVAT - run: pip install -e ${{ github.workspace }} - shell: micromamba-shell {0} - - - name: Run pytest annotations - run: pytest -n auto -v ${{ github.workspace }}/tests/annotations - shell: micromamba-shell {0} diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py index 5af2e770..f8d18c97 100644 --- a/deeprvat/deeprvat/associate.py +++ b/deeprvat/deeprvat/associate.py @@ -19,7 +19,7 @@ import 
statsmodels.api as sm import yaml from bgen import BgenWriter -from numcodecs import Blosc +from numcodecs import Blosc, JSON from seak import scoretest from statsmodels.tools.tools import add_constant from torch.utils.data import DataLoader, Dataset, Subset @@ -295,7 +295,7 @@ def compute_burdens_( chunk_burden = np.zeros(shape=(n_samples,) + this_burdens.shape[1:]) chunk_y = np.zeros(shape=(n_samples,) + this_y.shape[1:]) chunk_x = np.zeros(shape=(n_samples,) + this_x.shape[1:]) - chunk_sampleid = np.zeros(shape=(n_samples)) + chunk_sampleid = [""] * n_samples logger.info(f"Batch size: {batch['rare_variant_annotations'].shape}") @@ -333,8 +333,8 @@ def compute_burdens_( mode="a", shape=(n_total_samples), chunks=(None), - dtype=np.float32, - compressor=Blosc(clevel=compression_level), + dtype=object, + object_codec=JSON(), ) start_idx = i * batch_size end_idx = min(start_idx + batch_size, chunk_end) # read from chunk shape @@ -513,7 +513,7 @@ def make_regenie_input_( with BgenWriter( bgen, n_samples, - samples=list(sample_ids), + samples=list(sample_ids.astype(str)), metadata="Pseudovariants containing DeepRVAT gene impairment scores. 
One pseudovariant per gene.", ) as f: for i in trange(n_genes): diff --git a/deeprvat/preprocessing/preprocess.py b/deeprvat/preprocessing/preprocess.py index e979912b..9f5f22d1 100644 --- a/deeprvat/preprocessing/preprocess.py +++ b/deeprvat/preprocessing/preprocess.py @@ -277,6 +277,8 @@ def process_sparse_gt( variants = variants[~variants["id"].isin(variant_ids_to_exclude)] if not skip_sanity_checks: assert total_variants - len(variants) == len(variant_ids_to_exclude) + if variants.empty: + raise ValueError("All variants have been filtered out.") logging.info(f"Dropped {total_variants - len(variants)} variants") logging.info(f"...done ({time.time() - start_time} s)") @@ -313,6 +315,9 @@ def process_sparse_gt( samples = sorted(list(samples)) + if len(samples) == 0: + raise ValueError("All samples have been excluded.") + logging.info("Processing sparse GT files by chromosome") total_calls_dropped = 0 variant_groups = variants.groupby("chrom") diff --git a/example/config.yaml b/example/config.yaml index 33ffb03b..9eca68be 100644 --- a/example/config.yaml +++ b/example/config.yaml @@ -32,6 +32,21 @@ n_repeats: 2 do_scoretest: True +gtf_file: gencode.v38.basic.annotation.gtf.gz + +regenie: + step_1: + bgen: imputation.bgen + snplist: imputation.snplist + bsize: 1000 + options: + - "--sample imputation.sample" + - "--qt" + step_2: + bsize: 400 + options: + - "--qt" + training: min_variant_count: 1 n_bags: 1 diff --git a/example/gencode.v38.basic.annotation.gtf.gz b/example/gencode.v38.basic.annotation.gtf.gz new file mode 100644 index 00000000..ab6059d5 Binary files /dev/null and b/example/gencode.v38.basic.annotation.gtf.gz differ diff --git a/example/imputation.bgen b/example/imputation.bgen new file mode 100644 index 00000000..7ae8d5ea Binary files /dev/null and b/example/imputation.bgen differ diff --git a/example/imputation.bgen.bgi b/example/imputation.bgen.bgi new file mode 100644 index 00000000..2a537d70 Binary files /dev/null and b/example/imputation.bgen.bgi 
differ diff --git a/example/imputation.sample b/example/imputation.sample new file mode 100644 index 00000000..f3b2f396 --- /dev/null +++ b/example/imputation.sample @@ -0,0 +1,1002 @@ +ID_1 ID_2 missing sex +0 0 0 D +0 0 0 0 +1 1 0 1 +2 2 0 0 +3 3 0 1 +4 4 0 0 +5 5 0 1 +6 6 0 0 +7 7 0 1 +8 8 0 0 +9 9 0 1 +10 10 0 0 +11 11 0 1 +12 12 0 0 +13 13 0 1 +14 14 0 0 +15 15 0 1 +16 16 0 0 +17 17 0 1 +18 18 0 0 +19 19 0 1 +20 20 0 0 +21 21 0 1 +22 22 0 0 +23 23 0 1 +24 24 0 0 +25 25 0 1 +26 26 0 0 +27 27 0 1 +28 28 0 0 +29 29 0 1 +30 30 0 0 +31 31 0 1 +32 32 0 0 +33 33 0 1 +34 34 0 0 +35 35 0 1 +36 36 0 0 +37 37 0 1 +38 38 0 0 +39 39 0 1 +40 40 0 0 +41 41 0 1 +42 42 0 0 +43 43 0 1 +44 44 0 0 +45 45 0 1 +46 46 0 0 +47 47 0 1 +48 48 0 0 +49 49 0 1 +50 50 0 0 +51 51 0 1 +52 52 0 0 +53 53 0 1 +54 54 0 0 +55 55 0 1 +56 56 0 0 +57 57 0 1 +58 58 0 0 +59 59 0 1 +60 60 0 0 +61 61 0 1 +62 62 0 0 +63 63 0 1 +64 64 0 0 +65 65 0 1 +66 66 0 0 +67 67 0 1 +68 68 0 0 +69 69 0 1 +70 70 0 0 +71 71 0 1 +72 72 0 0 +73 73 0 1 +74 74 0 0 +75 75 0 1 +76 76 0 0 +77 77 0 1 +78 78 0 0 +79 79 0 1 +80 80 0 0 +81 81 0 1 +82 82 0 0 +83 83 0 1 +84 84 0 0 +85 85 0 1 +86 86 0 0 +87 87 0 1 +88 88 0 0 +89 89 0 1 +90 90 0 0 +91 91 0 1 +92 92 0 0 +93 93 0 1 +94 94 0 0 +95 95 0 1 +96 96 0 0 +97 97 0 1 +98 98 0 0 +99 99 0 1 +100 100 0 0 +101 101 0 1 +102 102 0 0 +103 103 0 1 +104 104 0 0 +105 105 0 1 +106 106 0 0 +107 107 0 1 +108 108 0 0 +109 109 0 1 +110 110 0 0 +111 111 0 1 +112 112 0 0 +113 113 0 1 +114 114 0 0 +115 115 0 1 +116 116 0 0 +117 117 0 1 +118 118 0 0 +119 119 0 1 +120 120 0 0 +121 121 0 1 +122 122 0 0 +123 123 0 1 +124 124 0 0 +125 125 0 1 +126 126 0 0 +127 127 0 1 +128 128 0 0 +129 129 0 1 +130 130 0 0 +131 131 0 1 +132 132 0 0 +133 133 0 1 +134 134 0 0 +135 135 0 1 +136 136 0 0 +137 137 0 1 +138 138 0 0 +139 139 0 1 +140 140 0 0 +141 141 0 1 +142 142 0 0 +143 143 0 1 +144 144 0 0 +145 145 0 1 +146 146 0 0 +147 147 0 1 +148 148 0 0 +149 149 0 1 +150 150 0 0 +151 151 0 1 +152 152 0 0 +153 153 0 1 
+154 154 0 0 +155 155 0 1 +156 156 0 0 +157 157 0 1 +158 158 0 0 +159 159 0 1 +160 160 0 0 +161 161 0 1 +162 162 0 0 +163 163 0 1 +164 164 0 0 +165 165 0 1 +166 166 0 0 +167 167 0 1 +168 168 0 0 +169 169 0 1 +170 170 0 0 +171 171 0 1 +172 172 0 0 +173 173 0 1 +174 174 0 0 +175 175 0 1 +176 176 0 0 +177 177 0 1 +178 178 0 0 +179 179 0 1 +180 180 0 0 +181 181 0 1 +182 182 0 0 +183 183 0 1 +184 184 0 0 +185 185 0 1 +186 186 0 0 +187 187 0 1 +188 188 0 0 +189 189 0 1 +190 190 0 0 +191 191 0 1 +192 192 0 0 +193 193 0 1 +194 194 0 0 +195 195 0 1 +196 196 0 0 +197 197 0 1 +198 198 0 0 +199 199 0 1 +200 200 0 0 +201 201 0 1 +202 202 0 0 +203 203 0 1 +204 204 0 0 +205 205 0 1 +206 206 0 0 +207 207 0 1 +208 208 0 0 +209 209 0 1 +210 210 0 0 +211 211 0 1 +212 212 0 0 +213 213 0 1 +214 214 0 0 +215 215 0 1 +216 216 0 0 +217 217 0 1 +218 218 0 0 +219 219 0 1 +220 220 0 0 +221 221 0 1 +222 222 0 0 +223 223 0 1 +224 224 0 0 +225 225 0 1 +226 226 0 0 +227 227 0 1 +228 228 0 0 +229 229 0 1 +230 230 0 0 +231 231 0 1 +232 232 0 0 +233 233 0 1 +234 234 0 0 +235 235 0 1 +236 236 0 0 +237 237 0 1 +238 238 0 0 +239 239 0 1 +240 240 0 0 +241 241 0 1 +242 242 0 0 +243 243 0 1 +244 244 0 0 +245 245 0 1 +246 246 0 0 +247 247 0 1 +248 248 0 0 +249 249 0 1 +250 250 0 0 +251 251 0 1 +252 252 0 0 +253 253 0 1 +254 254 0 0 +255 255 0 1 +256 256 0 0 +257 257 0 1 +258 258 0 0 +259 259 0 1 +260 260 0 0 +261 261 0 1 +262 262 0 0 +263 263 0 1 +264 264 0 0 +265 265 0 1 +266 266 0 0 +267 267 0 1 +268 268 0 0 +269 269 0 1 +270 270 0 0 +271 271 0 1 +272 272 0 0 +273 273 0 1 +274 274 0 0 +275 275 0 1 +276 276 0 0 +277 277 0 1 +278 278 0 0 +279 279 0 1 +280 280 0 0 +281 281 0 1 +282 282 0 0 +283 283 0 1 +284 284 0 0 +285 285 0 1 +286 286 0 0 +287 287 0 1 +288 288 0 0 +289 289 0 1 +290 290 0 0 +291 291 0 1 +292 292 0 0 +293 293 0 1 +294 294 0 0 +295 295 0 1 +296 296 0 0 +297 297 0 1 +298 298 0 0 +299 299 0 1 +300 300 0 0 +301 301 0 1 +302 302 0 0 +303 303 0 1 +304 304 0 0 +305 305 0 1 +306 306 0 0 +307 307 0 
1 +308 308 0 0 +309 309 0 1 +310 310 0 0 +311 311 0 1 +312 312 0 0 +313 313 0 1 +314 314 0 0 +315 315 0 1 +316 316 0 0 +317 317 0 1 +318 318 0 0 +319 319 0 1 +320 320 0 0 +321 321 0 1 +322 322 0 0 +323 323 0 1 +324 324 0 0 +325 325 0 1 +326 326 0 0 +327 327 0 1 +328 328 0 0 +329 329 0 1 +330 330 0 0 +331 331 0 1 +332 332 0 0 +333 333 0 1 +334 334 0 0 +335 335 0 1 +336 336 0 0 +337 337 0 1 +338 338 0 0 +339 339 0 1 +340 340 0 0 +341 341 0 1 +342 342 0 0 +343 343 0 1 +344 344 0 0 +345 345 0 1 +346 346 0 0 +347 347 0 1 +348 348 0 0 +349 349 0 1 +350 350 0 0 +351 351 0 1 +352 352 0 0 +353 353 0 1 +354 354 0 0 +355 355 0 1 +356 356 0 0 +357 357 0 1 +358 358 0 0 +359 359 0 1 +360 360 0 0 +361 361 0 1 +362 362 0 0 +363 363 0 1 +364 364 0 0 +365 365 0 1 +366 366 0 0 +367 367 0 1 +368 368 0 0 +369 369 0 1 +370 370 0 0 +371 371 0 1 +372 372 0 0 +373 373 0 1 +374 374 0 0 +375 375 0 1 +376 376 0 0 +377 377 0 1 +378 378 0 0 +379 379 0 1 +380 380 0 0 +381 381 0 1 +382 382 0 0 +383 383 0 1 +384 384 0 0 +385 385 0 1 +386 386 0 0 +387 387 0 1 +388 388 0 0 +389 389 0 1 +390 390 0 0 +391 391 0 1 +392 392 0 0 +393 393 0 1 +394 394 0 0 +395 395 0 1 +396 396 0 0 +397 397 0 1 +398 398 0 0 +399 399 0 1 +400 400 0 0 +401 401 0 1 +402 402 0 0 +403 403 0 1 +404 404 0 0 +405 405 0 1 +406 406 0 0 +407 407 0 1 +408 408 0 0 +409 409 0 1 +410 410 0 0 +411 411 0 1 +412 412 0 0 +413 413 0 1 +414 414 0 0 +415 415 0 1 +416 416 0 0 +417 417 0 1 +418 418 0 0 +419 419 0 1 +420 420 0 0 +421 421 0 1 +422 422 0 0 +423 423 0 1 +424 424 0 0 +425 425 0 1 +426 426 0 0 +427 427 0 1 +428 428 0 0 +429 429 0 1 +430 430 0 0 +431 431 0 1 +432 432 0 0 +433 433 0 1 +434 434 0 0 +435 435 0 1 +436 436 0 0 +437 437 0 1 +438 438 0 0 +439 439 0 1 +440 440 0 0 +441 441 0 1 +442 442 0 0 +443 443 0 1 +444 444 0 0 +445 445 0 1 +446 446 0 0 +447 447 0 1 +448 448 0 0 +449 449 0 1 +450 450 0 0 +451 451 0 1 +452 452 0 0 +453 453 0 1 +454 454 0 0 +455 455 0 1 +456 456 0 0 +457 457 0 1 +458 458 0 0 +459 459 0 1 +460 460 0 0 +461 461 
0 1 +462 462 0 0 +463 463 0 1 +464 464 0 0 +465 465 0 1 +466 466 0 0 +467 467 0 1 +468 468 0 0 +469 469 0 1 +470 470 0 0 +471 471 0 1 +472 472 0 0 +473 473 0 1 +474 474 0 0 +475 475 0 1 +476 476 0 0 +477 477 0 1 +478 478 0 0 +479 479 0 1 +480 480 0 0 +481 481 0 1 +482 482 0 0 +483 483 0 1 +484 484 0 0 +485 485 0 1 +486 486 0 0 +487 487 0 1 +488 488 0 0 +489 489 0 1 +490 490 0 0 +491 491 0 1 +492 492 0 0 +493 493 0 1 +494 494 0 0 +495 495 0 1 +496 496 0 0 +497 497 0 1 +498 498 0 0 +499 499 0 1 +500 500 0 0 +501 501 0 1 +502 502 0 0 +503 503 0 1 +504 504 0 0 +505 505 0 1 +506 506 0 0 +507 507 0 1 +508 508 0 0 +509 509 0 1 +510 510 0 0 +511 511 0 1 +512 512 0 0 +513 513 0 1 +514 514 0 0 +515 515 0 1 +516 516 0 0 +517 517 0 1 +518 518 0 0 +519 519 0 1 +520 520 0 0 +521 521 0 1 +522 522 0 0 +523 523 0 1 +524 524 0 0 +525 525 0 1 +526 526 0 0 +527 527 0 1 +528 528 0 0 +529 529 0 1 +530 530 0 0 +531 531 0 1 +532 532 0 0 +533 533 0 1 +534 534 0 0 +535 535 0 1 +536 536 0 0 +537 537 0 1 +538 538 0 0 +539 539 0 1 +540 540 0 0 +541 541 0 1 +542 542 0 0 +543 543 0 1 +544 544 0 0 +545 545 0 1 +546 546 0 0 +547 547 0 1 +548 548 0 0 +549 549 0 1 +550 550 0 0 +551 551 0 1 +552 552 0 0 +553 553 0 1 +554 554 0 0 +555 555 0 1 +556 556 0 0 +557 557 0 1 +558 558 0 0 +559 559 0 1 +560 560 0 0 +561 561 0 1 +562 562 0 0 +563 563 0 1 +564 564 0 0 +565 565 0 1 +566 566 0 0 +567 567 0 1 +568 568 0 0 +569 569 0 1 +570 570 0 0 +571 571 0 1 +572 572 0 0 +573 573 0 1 +574 574 0 0 +575 575 0 1 +576 576 0 0 +577 577 0 1 +578 578 0 0 +579 579 0 1 +580 580 0 0 +581 581 0 1 +582 582 0 0 +583 583 0 1 +584 584 0 0 +585 585 0 1 +586 586 0 0 +587 587 0 1 +588 588 0 0 +589 589 0 1 +590 590 0 0 +591 591 0 1 +592 592 0 0 +593 593 0 1 +594 594 0 0 +595 595 0 1 +596 596 0 0 +597 597 0 1 +598 598 0 0 +599 599 0 1 +600 600 0 0 +601 601 0 1 +602 602 0 0 +603 603 0 1 +604 604 0 0 +605 605 0 1 +606 606 0 0 +607 607 0 1 +608 608 0 0 +609 609 0 1 +610 610 0 0 +611 611 0 1 +612 612 0 0 +613 613 0 1 +614 614 0 0 +615 
615 0 1 +616 616 0 0 +617 617 0 1 +618 618 0 0 +619 619 0 1 +620 620 0 0 +621 621 0 1 +622 622 0 0 +623 623 0 1 +624 624 0 0 +625 625 0 1 +626 626 0 0 +627 627 0 1 +628 628 0 0 +629 629 0 1 +630 630 0 0 +631 631 0 1 +632 632 0 0 +633 633 0 1 +634 634 0 0 +635 635 0 1 +636 636 0 0 +637 637 0 1 +638 638 0 0 +639 639 0 1 +640 640 0 0 +641 641 0 1 +642 642 0 0 +643 643 0 1 +644 644 0 0 +645 645 0 1 +646 646 0 0 +647 647 0 1 +648 648 0 0 +649 649 0 1 +650 650 0 0 +651 651 0 1 +652 652 0 0 +653 653 0 1 +654 654 0 0 +655 655 0 1 +656 656 0 0 +657 657 0 1 +658 658 0 0 +659 659 0 1 +660 660 0 0 +661 661 0 1 +662 662 0 0 +663 663 0 1 +664 664 0 0 +665 665 0 1 +666 666 0 0 +667 667 0 1 +668 668 0 0 +669 669 0 1 +670 670 0 0 +671 671 0 1 +672 672 0 0 +673 673 0 1 +674 674 0 0 +675 675 0 1 +676 676 0 0 +677 677 0 1 +678 678 0 0 +679 679 0 1 +680 680 0 0 +681 681 0 1 +682 682 0 0 +683 683 0 1 +684 684 0 0 +685 685 0 1 +686 686 0 0 +687 687 0 1 +688 688 0 0 +689 689 0 1 +690 690 0 0 +691 691 0 1 +692 692 0 0 +693 693 0 1 +694 694 0 0 +695 695 0 1 +696 696 0 0 +697 697 0 1 +698 698 0 0 +699 699 0 1 +700 700 0 0 +701 701 0 1 +702 702 0 0 +703 703 0 1 +704 704 0 0 +705 705 0 1 +706 706 0 0 +707 707 0 1 +708 708 0 0 +709 709 0 1 +710 710 0 0 +711 711 0 1 +712 712 0 0 +713 713 0 1 +714 714 0 0 +715 715 0 1 +716 716 0 0 +717 717 0 1 +718 718 0 0 +719 719 0 1 +720 720 0 0 +721 721 0 1 +722 722 0 0 +723 723 0 1 +724 724 0 0 +725 725 0 1 +726 726 0 0 +727 727 0 1 +728 728 0 0 +729 729 0 1 +730 730 0 0 +731 731 0 1 +732 732 0 0 +733 733 0 1 +734 734 0 0 +735 735 0 1 +736 736 0 0 +737 737 0 1 +738 738 0 0 +739 739 0 1 +740 740 0 0 +741 741 0 1 +742 742 0 0 +743 743 0 1 +744 744 0 0 +745 745 0 1 +746 746 0 0 +747 747 0 1 +748 748 0 0 +749 749 0 1 +750 750 0 0 +751 751 0 1 +752 752 0 0 +753 753 0 1 +754 754 0 0 +755 755 0 1 +756 756 0 0 +757 757 0 1 +758 758 0 0 +759 759 0 1 +760 760 0 0 +761 761 0 1 +762 762 0 0 +763 763 0 1 +764 764 0 0 +765 765 0 1 +766 766 0 0 +767 767 0 1 +768 768 0 0 
+769 769 0 1 +770 770 0 0 +771 771 0 1 +772 772 0 0 +773 773 0 1 +774 774 0 0 +775 775 0 1 +776 776 0 0 +777 777 0 1 +778 778 0 0 +779 779 0 1 +780 780 0 0 +781 781 0 1 +782 782 0 0 +783 783 0 1 +784 784 0 0 +785 785 0 1 +786 786 0 0 +787 787 0 1 +788 788 0 0 +789 789 0 1 +790 790 0 0 +791 791 0 1 +792 792 0 0 +793 793 0 1 +794 794 0 0 +795 795 0 1 +796 796 0 0 +797 797 0 1 +798 798 0 0 +799 799 0 1 +800 800 0 0 +801 801 0 1 +802 802 0 0 +803 803 0 1 +804 804 0 0 +805 805 0 1 +806 806 0 0 +807 807 0 1 +808 808 0 0 +809 809 0 1 +810 810 0 0 +811 811 0 1 +812 812 0 0 +813 813 0 1 +814 814 0 0 +815 815 0 1 +816 816 0 0 +817 817 0 1 +818 818 0 0 +819 819 0 1 +820 820 0 0 +821 821 0 1 +822 822 0 0 +823 823 0 1 +824 824 0 0 +825 825 0 1 +826 826 0 0 +827 827 0 1 +828 828 0 0 +829 829 0 1 +830 830 0 0 +831 831 0 1 +832 832 0 0 +833 833 0 1 +834 834 0 0 +835 835 0 1 +836 836 0 0 +837 837 0 1 +838 838 0 0 +839 839 0 1 +840 840 0 0 +841 841 0 1 +842 842 0 0 +843 843 0 1 +844 844 0 0 +845 845 0 1 +846 846 0 0 +847 847 0 1 +848 848 0 0 +849 849 0 1 +850 850 0 0 +851 851 0 1 +852 852 0 0 +853 853 0 1 +854 854 0 0 +855 855 0 1 +856 856 0 0 +857 857 0 1 +858 858 0 0 +859 859 0 1 +860 860 0 0 +861 861 0 1 +862 862 0 0 +863 863 0 1 +864 864 0 0 +865 865 0 1 +866 866 0 0 +867 867 0 1 +868 868 0 0 +869 869 0 1 +870 870 0 0 +871 871 0 1 +872 872 0 0 +873 873 0 1 +874 874 0 0 +875 875 0 1 +876 876 0 0 +877 877 0 1 +878 878 0 0 +879 879 0 1 +880 880 0 0 +881 881 0 1 +882 882 0 0 +883 883 0 1 +884 884 0 0 +885 885 0 1 +886 886 0 0 +887 887 0 1 +888 888 0 0 +889 889 0 1 +890 890 0 0 +891 891 0 1 +892 892 0 0 +893 893 0 1 +894 894 0 0 +895 895 0 1 +896 896 0 0 +897 897 0 1 +898 898 0 0 +899 899 0 1 +900 900 0 0 +901 901 0 1 +902 902 0 0 +903 903 0 1 +904 904 0 0 +905 905 0 1 +906 906 0 0 +907 907 0 1 +908 908 0 0 +909 909 0 1 +910 910 0 0 +911 911 0 1 +912 912 0 0 +913 913 0 1 +914 914 0 0 +915 915 0 1 +916 916 0 0 +917 917 0 1 +918 918 0 0 +919 919 0 1 +920 920 0 0 +921 921 0 1 +922 922 0 
0 +923 923 0 1 +924 924 0 0 +925 925 0 1 +926 926 0 0 +927 927 0 1 +928 928 0 0 +929 929 0 1 +930 930 0 0 +931 931 0 1 +932 932 0 0 +933 933 0 1 +934 934 0 0 +935 935 0 1 +936 936 0 0 +937 937 0 1 +938 938 0 0 +939 939 0 1 +940 940 0 0 +941 941 0 1 +942 942 0 0 +943 943 0 1 +944 944 0 0 +945 945 0 1 +946 946 0 0 +947 947 0 1 +948 948 0 0 +949 949 0 1 +950 950 0 0 +951 951 0 1 +952 952 0 0 +953 953 0 1 +954 954 0 0 +955 955 0 1 +956 956 0 0 +957 957 0 1 +958 958 0 0 +959 959 0 1 +960 960 0 0 +961 961 0 1 +962 962 0 0 +963 963 0 1 +964 964 0 0 +965 965 0 1 +966 966 0 0 +967 967 0 1 +968 968 0 0 +969 969 0 1 +970 970 0 0 +971 971 0 1 +972 972 0 0 +973 973 0 1 +974 974 0 0 +975 975 0 1 +976 976 0 0 +977 977 0 1 +978 978 0 0 +979 979 0 1 +980 980 0 0 +981 981 0 1 +982 982 0 0 +983 983 0 1 +984 984 0 0 +985 985 0 1 +986 986 0 0 +987 987 0 1 +988 988 0 0 +989 989 0 1 +990 990 0 0 +991 991 0 1 +992 992 0 0 +993 993 0 1 +994 994 0 0 +995 995 0 1 +996 996 0 0 +997 997 0 1 +998 998 0 0 +999 999 0 1 diff --git a/example/imputation.snplist b/example/imputation.snplist new file mode 100644 index 00000000..231104ce --- /dev/null +++ b/example/imputation.snplist @@ -0,0 +1,100 @@ +var0 +var1 +var2 +var3 +var4 +var5 +var6 +var7 +var8 +var9 +var10 +var11 +var12 +var13 +var14 +var15 +var16 +var17 +var18 +var19 +var20 +var21 +var22 +var23 +var24 +var25 +var26 +var27 +var28 +var29 +var30 +var31 +var32 +var33 +var34 +var35 +var36 +var37 +var38 +var39 +var40 +var41 +var42 +var43 +var44 +var45 +var46 +var47 +var48 +var49 +var50 +var51 +var52 +var53 +var54 +var55 +var56 +var57 +var58 +var59 +var60 +var61 +var62 +var63 +var64 +var65 +var66 +var67 +var68 +var69 +var70 +var71 +var72 +var73 +var74 +var75 +var76 +var77 +var78 +var79 +var80 +var81 +var82 +var83 +var84 +var85 +var86 +var87 +var88 +var89 +var90 +var91 +var92 +var93 +var94 +var95 +var96 +var97 +var98 +var99 diff --git a/pipelines/association_testing/regress_eval_regenie.snakefile 
b/pipelines/association_testing/regress_eval_regenie.snakefile index 7cad2da4..a37ffca1 100644 --- a/pipelines/association_testing/regress_eval_regenie.snakefile +++ b/pipelines/association_testing/regress_eval_regenie.snakefile @@ -17,33 +17,41 @@ regenie_step2_bsize = regenie_config_step2["bsize"] regenie_njobs = regenie_config_step1.get("njobs", 1) regenie_joblist = range(1, regenie_njobs) +config_file_prefix = ( + "cv_split0/deeprvat/" if cv_exp else "" +) + wildcard_constraints: job="\d+" -# rule evaluate: -# input: -# associations = expand('{{phenotype}}/deeprvat/mean_agg_results/burden_associations.parquet', -# repeat=range(n_repeats)), -# config = '{phenotype}/deeprvat/hpopt_config.yaml', -# output: -# "{phenotype}/deeprvat/eval/significant.parquet", -# "{phenotype}/deeprvat/eval/all_results.parquet" -# threads: 1 -# shell: -# 'deeprvat_evaluate ' -# + debug + -# '--use-seed-genes ' -# '--n-repeats {n_repeats} ' -# '--correction-method FDR ' -# '{input.associations} ' -# '{input.config} ' -# '{wildcards.phenotype}/deeprvat/eval' +rule evaluate: + input: + associations ='{phenotype}/deeprvat/average_regression_results/burden_associations.parquet', + config = f"{config_file_prefix}{{phenotype}}/deeprvat/hpopt_config.yaml" + output: + "{phenotype}/deeprvat/eval/significant.parquet", + "{phenotype}/deeprvat/eval/all_results.parquet" + threads: 1 + resources: + mem_mb = 16000, + load = 16000 + params: + use_baseline_results = '--use-baseline-results' + shell: + 'deeprvat_evaluate ' + + debug + + '{params.use_baseline_results} ' + '--correction-method Bonferroni ' + '--phenotype {wildcards.phenotype} ' + '{input.associations} ' + '{input.config} ' + '{wildcards.phenotype}/deeprvat/eval' rule all_regenie: input: - expand('{phenotype}/deeprvat/mean_agg_results/burden_associations.parquet', + expand('{phenotype}/deeprvat/average_regression_results/burden_associations.parquet', phenotype=phenotypes), rule convert_regenie_output: @@ -51,12 +59,12 @@ rule 
convert_regenie_output: expand("regenie_output/step2/deeprvat_{phenotype}.regenie", phenotype=phenotypes) output: - expand('{phenotype}/deeprvat/mean_agg_results/burden_associations.parquet', + expand('{phenotype}/deeprvat/average_regression_results/burden_associations.parquet', phenotype=phenotypes) params: pheno_options = " ".join([ f"--phenotype {phenotype} regenie_output/step2/deeprvat_{phenotype}.regenie " - f"{phenotype}/deeprvat/mean_agg_results/burden_associations.parquet" + f"{phenotype}/deeprvat/average_regression_results/burden_associations.parquet" for phenotype in phenotypes]), gene_file = config["data"]["dataset_config"]["rare_embedding"]["config"]["gene_file"] threads: 1 diff --git a/pipelines/association_testing_pretrained_regenie.snakefile b/pipelines/association_testing_pretrained_regenie.snakefile index f3eb0b0e..87050d87 100644 --- a/pipelines/association_testing_pretrained_regenie.snakefile +++ b/pipelines/association_testing_pretrained_regenie.snakefile @@ -5,20 +5,27 @@ configfile: 'config.yaml' debug_flag = config.get('debug', False) phenotypes = config['phenotypes'] phenotypes = list(phenotypes.keys()) if type(phenotypes) == dict else phenotypes +training_phenotypes = config["training"].get("phenotypes", phenotypes) n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2 +n_avg_chunks = config.get('n_avg_chunks', 1) n_bags = config['training']['n_bags'] if not debug_flag else 3 n_repeats = config['n_repeats'] debug = '--debug ' if debug_flag else '' do_scoretest = '--do-scoretest ' if config.get('do_scoretest', False) else '' model_path = Path(config.get("pretrained_model_path", "pretrained_models")) +cv_exp = False +config_file_prefix = ( + "cv_split0/deeprvat/" if cv_exp else "" +) + wildcard_constraints: repeat="\d+", trial="\d+", -include: "association_testing/config.snakefile" +include: "training/config.snakefile" include: 
"association_testing/association_dataset.snakefile" include: "association_testing/burdens.snakefile" include: "association_testing/regress_eval_regenie.snakefile" diff --git a/pipelines/training_association_testing_regenie.snakefile b/pipelines/training_association_testing_regenie.snakefile index 3f8a4e01..ce4dd990 100644 --- a/pipelines/training_association_testing_regenie.snakefile +++ b/pipelines/training_association_testing_regenie.snakefile @@ -9,6 +9,7 @@ training_phenotypes = config["training"].get("phenotypes", phenotypes) n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2 +n_avg_chunks = config.get('n_avg_chunks', 1) n_trials = config['hyperparameter_optimization']['n_trials'] n_bags = config['training']['n_bags'] if not debug_flag else 3 n_repeats = config['n_repeats'] @@ -17,6 +18,7 @@ do_scoretest = '--do-scoretest ' if config.get('do_scoretest', False) else '' tensor_compression_level = config['training'].get('tensor_compression_level', 1) model_path = Path("models") n_parallel_training_jobs = config["training"].get("n_parallel_jobs", 1) +cv_exp = False wildcard_constraints: repeat="\d+", diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/expected/expected_data.npz b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/expected/expected_data.npz new file mode 100644 index 00000000..607f68df Binary files /dev/null and b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/expected/expected_data.npz differ diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/qc/excluded_samples.csv b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/qc/excluded_samples.csv new file mode 100644 index 00000000..ea0a0082 --- /dev/null +++ b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/qc/excluded_samples.csv @@ -0,0 +1,11 @@ 
+100096 +100097 +100099 +100100 +100101 +100102 +100103 +100104 +100105 +100106 +100107 diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/samples_chr.csv b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/samples_chr.csv new file mode 100644 index 00000000..ea0a0082 --- /dev/null +++ b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/samples_chr.csv @@ -0,0 +1,11 @@ +100096 +100097 +100099 +100100 +100101 +100102 +100103 +100104 +100105 +100106 +100107 diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/sparse_gt/chr1/input_c1_b1.tsv.gz b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/sparse_gt/chr1/input_c1_b1.tsv.gz new file mode 100644 index 00000000..0fee2c66 Binary files /dev/null and b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/sparse_gt/chr1/input_c1_b1.tsv.gz differ diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.parquet b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.parquet new file mode 100644 index 00000000..df779fb3 Binary files /dev/null and b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.parquet differ diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.tsv.gz b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.tsv.gz new file mode 100644 index 00000000..6da9e9cd Binary files /dev/null and b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_all/input/variants.tsv.gz differ diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/expected/expected_data.npz b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/expected/expected_data.npz new file mode 100644 index 00000000..e69de29b diff --git 
a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/qc/input_c1_b1.tsv b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/qc/input_c1_b1.tsv new file mode 100644 index 00000000..0f863dc6 --- /dev/null +++ b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/qc/input_c1_b1.tsv @@ -0,0 +1,20 @@ +chr1 16103 T G +chr1 51479 T A +chr1 51898 C A +chr1 51928 G A +chr1 51954 G C +chr1 54490 G A +chr1 54669 C T +chr1 54708 G C +chr1 54716 C T +chr1 54725 T G +chr1 54727 T C +chr1 54753 T G +chr1 55299 C T +chr1 55326 T C +chr1 55330 G A +chr1 55351 T A +chr1 55365 A G +chr1 55367 G A +chr1 55385 A G +chr1 55388 C T diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/samples_chr.csv b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/samples_chr.csv new file mode 100644 index 00000000..ea0a0082 --- /dev/null +++ b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/samples_chr.csv @@ -0,0 +1,11 @@ +100096 +100097 +100099 +100100 +100101 +100102 +100103 +100104 +100105 +100106 +100107 diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/sparse_gt/chr1/input_c1_b1.tsv.gz b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/sparse_gt/chr1/input_c1_b1.tsv.gz new file mode 100644 index 00000000..e69de29b diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.parquet b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.parquet new file mode 100644 index 00000000..df779fb3 Binary files /dev/null and b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.parquet differ diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.tsv.gz b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_all/input/variants.tsv.gz new file mode 
100644 index 00000000..e69de29b diff --git a/tests/preprocessing/test_preprocess.py b/tests/preprocessing/test_preprocess.py index a8fc0415..cd5828b4 100644 --- a/tests/preprocessing/test_preprocess.py +++ b/tests/preprocessing/test_preprocess.py @@ -23,7 +23,7 @@ def load_h5_archive(h5_path): @pytest.mark.parametrize( - "test_data_name_dir, extra_cli_params, genotype_file_name", + "test_data_name_dir, extra_cli_params, genotype_file_name, should_fail", [ ( "no_filters_minimal", @@ -32,6 +32,7 @@ def load_h5_archive(h5_path): "1", ], "genotypes_chr1.h5", + False, ), ( "no_filters_minimal_str_samples", @@ -40,6 +41,7 @@ def load_h5_archive(h5_path): "1", ], "genotypes_chr1.h5", + False, ), ( "filter_variants_minimal", @@ -50,6 +52,18 @@ def load_h5_archive(h5_path): f"{(tests_data_dir / 'process_sparse_gt/filter_variants_minimal/input/qc').as_posix()}", ], "genotypes_chr1.h5", + False, + ), + ( + "filter_variants_all", + [ + "--chromosomes", + "1", + "--exclude-variants", + f"{(tests_data_dir / 'process_sparse_gt/filter_variants_all/input/qc').as_posix()}", + ], + "genotypes_chr1.h5", + True, ), ( "filter_variants_multiple", @@ -60,6 +74,7 @@ def load_h5_archive(h5_path): f"{(tests_data_dir / 'process_sparse_gt/filter_variants_multiple/input/qc').as_posix()}", ], "genotypes_chr1.h5", + False, ), ( "filter_samples_minimal", @@ -70,6 +85,18 @@ def load_h5_archive(h5_path): f"{(tests_data_dir / 'process_sparse_gt/filter_samples_minimal/input/qc').as_posix()}", ], "genotypes_chr1.h5", + False, + ), + ( + "filter_samples_all", + [ + "--chromosomes", + "1", + "--exclude-samples", + f"{(tests_data_dir / 'process_sparse_gt/filter_samples_all/input/qc').as_posix()}", + ], + "genotypes_chr1.h5", + True, ), ( "filter_calls_minimal", @@ -80,6 +107,7 @@ def load_h5_archive(h5_path): f"{(tests_data_dir / 'process_sparse_gt/filter_calls_minimal/input/qc').as_posix()}", ], "genotypes_chr1.h5", + False, ), ( "filter_calls_vars_samples_minimal", @@ -94,11 +122,12 @@ def 
load_h5_archive(h5_path): f"{(tests_data_dir / 'process_sparse_gt/filter_calls_vars_samples_minimal/input/qc/variants/').as_posix()}", ], "genotypes_chr1.h5", + False, ), ], ) def test_process_sparse_gt_file( - test_data_name_dir, extra_cli_params, genotype_file_name, tmp_path + test_data_name_dir, extra_cli_params, genotype_file_name, should_fail, tmp_path ): cli_runner = CliRunner() @@ -127,7 +156,14 @@ def test_process_sparse_gt_file( out_file_base.as_posix(), ] - result = cli_runner.invoke(preprocess_cli, cli_parameters, catch_exceptions=False) + result = cli_runner.invoke(preprocess_cli, cli_parameters, catch_exceptions=True) + + if should_fail: + assert isinstance(result.exception, ValueError) + return + else: + assert result.exception is None + assert result.exit_code == 0 h5_file = out_file_base.as_posix().replace("genotypes", genotype_file_name)