From 4e9b47d440eaf20652235a22d65f500ab80ee0f7 Mon Sep 17 00:00:00 2001
From: Magnus Wahlberg
Date: Tue, 16 Apr 2024 10:44:54 +0200
Subject: [PATCH] Squashed commit of the following:
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 24b3af57cb7895f74f78f3d741ff3fa23b7b61f2
Author: Magnus Wahlberg
Date:   Tue Apr 16 10:40:45 2024 +0200

    Optimize preprocessing (#65)

    * Add new test files
    * Update test_preprocess.py
    * Use parquet
    * Add Brian's code
    * Update preprocess.py
    * Sort samples
    * Remove threads
    * Update exclude-calls logic
    * Squashed commit of the following:

    commit 101feb2ca95aa8b99132ba11c79669463f006c5d
    Author: Marcel Mück
    Date:   Tue Apr 9 11:56:54 2024 +0200

        Annotations new features (#54)

        * added all changes from annotation-speedups branch
        * added gtf and genotype mock file for github tests
        * Delete example/annotations/preprocessing_workdir/preprocessed directory
        * Update annotation_colnames_filling_values.yaml
        * Corrected fill values for maf columns
        * Changed protein_id merging and exon distance filtering, s.t. no annotations are dropped
        * included rulegraph instead of dag
        * based on suggestions from @endast
        * added version info for rockdb.yaml file
        * updated rulegraph; updated documentation; corrected nonfunctional links
        * added support for X/Y chromosomes, removed dependency on pvcf file
        * excluded mkl version 2024.1.0 since it is crashing pytorch (https://github.com/pytorch/pytorch/issues/123097)
        * changed the way file stems are assumed to include 'double ending' on input files
        * removed unused lines, removed pvcf from config file
        * changed if statement for gene_id_file

        ---------

        Co-authored-by: “Marcel-Mueck” <“mueckm1@gmail.com”>
        Co-authored-by: PMBio

    commit 628af87bce2d22ced001dceef58c0854410edbef
    Author: Marcel Mück
    Date:   Thu Apr 4 14:09:22 2024 +0200

        Update preprocessing.md (#60)

        Corrected a small spelling mistake

    commit 1356ed2f2026eaa9e35c1ef08fa82da8c96e10b0
    Author: Eva Holtkamp <59055511+HolEv@users.noreply.github.com>
    Date:   Fri Mar 1 14:55:55 2024 +0100

        Update dense_gt.py (#56)

        Bugfix (had forgotten to remove sample_file = None); the sample file is needed during cv training

    commit 4d9ef64dac1b0db36d4e468e8e9755201e6af342
    Author: Eva Holtkamp <59055511+HolEv@users.noreply.github.com>
    Date:   Fri Feb 23 12:21:49 2024 +0100

        Feature cv training (#55)

        * performance optimizations
        * train multiple repeats on a single node in parallel
        * bug fix
        * fix bug in indexing when subset_samples() removed something
        * sleep between jobs; stop if any job fails
        * format with black
        * bug fixes
        * add test for MultiphenoDataloader
        * update environments
        * uncomment rules
        * bug fixes
        * subset samples in training_dataset rule
        * example config.yaml
        * use gpu queue for compute_burdens
        * bugfix since dask reading didn't work any more
        * allow evaluation of all repeat combinations
        * allow analysis of each n_repeats and for all repeat combinations
        * option to provide burden file
        * allow seed gene alpha to be defined in config
        * change sorting order to get the best model
        * adaptations to analyze multiple repeats and use script without seed genes
        * allow providing a sample file and do separate indexing for pheno and geno to ensure indices are correct
        * automate generation of figure 3 (associations & replication)
        * generate cv splits with related samples in the same split
        * average burdens
        * cross-validation-like training
        * add missing cv_utils
        * write average burdens for each combination to a single zarr file to avoid zarr issues
        * add logging information
        * make maf column a param
        * add logging
        * pipeline replication and plotting
        * evaluate all repeat combinations with and without seed genes
        * update lsf.yaml
        * small updates
        * per-gene pval aggregation
        * aggregate pval per gene
        * bugfix: only load burdens if not skip-burdens
        * logging info
        * updates and fixes
        * load burdens only for genes analysed in the current chunk to save memory
        * small changes to pipeline
        * standardizing/qt-transform of combined test set x/y arrays
        * my_quantile_transform for numpy arrays
        * bugfix
        * remove unnecessary code
        * remove unnecessary wildcards
        * make averaging part of associate.py
        * allow seed genes/baselines to be missing (to allow assoc. testing for non-training phenotypes)
        * updates
        * gene-specific common variant covariates for conditional analysis
        * bugfix
        * post-hoc conditioning on common variants
        * restructure pipelines
        * removing redundant options
        * add cv_utils cli
        * simplify script (only evaluate one repeat combination/average burdens); aggregate baseline pvalues; make bonferroni correction the default
        * removal of redundant wildcards, updates and fixes
        * bugfixes
        * baseline discoveries only required for training phenotypes
        * remove unneeded code
        * update configs
        * formatting
        * manually merge changes from feature-regenie to account for gene-specific annotations
        * allow different sample orders in phenotype_df and genotypes.h5
        * change sample ids to bytes, as in the real data
        * update pipelines
        * update gitignore
        * pipeline updates
        * manually update github actions to be like master
        * bug fixes
        * checkout tests from master
        * make phenotype indices strings, as they are in real data
        * add 'gene_id' column
        * manually merge with master so tests can pass
        * bugfixes
        * use gene_id column instead of gene_ids
        * pipeline updates and fixes
        * update test config
        * adding age2 and age_sex to example data
        * update config
        * set tests folder to main version
        * checkout preprocessing files from main
        * checkout from main
        * manually merge sample_id changes from main
        * pipeline bugfixes and renamings
        * fixup! Format Python code with psf/black pull_request
        * remove gene_ids column
        * integrating suggested PR changes
        * fixup! Format Python code with psf/black pull_request

        ---------

        Co-authored-by: Brian Clarke
        Co-authored-by: PMBio

    commit ada0aaa57924ad67390e599365636bde5b46de05
    Author: Brian Clarke <9725212+bfclarke@users.noreply.github.com>
    Date:   Wed Feb 21 15:56:14 2024 +0100

        Feature regenie (#52)

        * convert burdens and phenotypes to SAIGE format
        * add function to make regenie input
        * modifications for regenie
        * bug fixes
        * update to use regenie
        * add function for mapping samples
        * implement burden export
        * add function to convert REGENIE output
        * don't show all unmapped samples if the list is long
        * don't parallelize REGENIE step 1
        * separate pipelines with and without REGENIE
        * support gene-specific annotation
        * bug fixes
        * correct regenie_step1 --lowmem-prefix
        * modify to work standalone
        * add --association-only option
        * allow gene-specific annotation
        * go back to SEAK/statsmodels
        * bug fixes
        * remove SAIGE code, fix imports and conda envs
        * make pipelines more self-contained
        * don't require burdens.zarr when --skip-burdens is passed
        * update utils

        ---------

        Co-authored-by: Brian Clarke

    * Revert "Squashed commit of the following:"
      This reverts commit ebde7c1637bec98c281f7e69ea94529e63cdb574.
    * Remove unused import
    * don't use mkl 2024.1.0
    * update micromamba@v1.8.1
    * Isolate failing test
    * test genotype matrix
    * Revert "test genotype matrix"
      This reverts commit 6deee9be70b9a68f367dd4a6e337fb36a710549c.
    * Revert "Isolate failing test"
      This reverts commit 6a11fe39122248aea24019484ecb5ef1dd8d9eb3.
    * fixup! Format Python code with psf/black pull_request
    * remove files
    * Delete variants.tsv.gz
    * Update test_preprocess.py
    * Update test_preprocess.py
    * fixup! Format Python code with psf/black pull_request
    * Update test_preprocess.py
    * Update test-runner.yml
    * one test
    * Revert "one test"
      This reverts commit 05e45786d52dee15a1769ef76acbe0034c3fd512.
    * Revert "Update test-runner.yml"
      This reverts commit ff78d30724a899393af0c535e3c44a29412470e2.
    * update call filter test data
    * Update expected data
    * Update deeprvat_preprocessing_env.yml: remove joblib
    * Revert change of micromamba
    * Ruff check
    * Squashed commit of the following:

    commit ae5c83e4e89612507e9e5de67164747c242cea77
    Author: Marcel Mück
    Date:   Mon Apr 15 11:01:03 2024 +0200

        fixed bugs in the annotation pipeline based on issues #61, #62 and #63 (#64)

        * fixed bugs in the annotation pipeline based on issues #61, #62 and #63
        * fixup! Format Python code with psf/black pull_request

        ---------

        Co-authored-by: “Marcel-Mueck” <“mueckm1@gmail.com”>
        Co-authored-by: PMBio

    ---------

    Co-authored-by: PMBio

commit ae5c83e4e89612507e9e5de67164747c242cea77
Author: Marcel Mück
Date:   Mon Apr 15 11:01:03 2024 +0200

    fixed bugs in the annotation pipeline based on issues #61, #62 and #63 (#64)

commit 101feb2ca95aa8b99132ba11c79669463f006c5d
Author: Marcel Mück
Date:   Tue Apr 9 11:56:54 2024 +0200

    Annotations new features (#54)
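The headline change of "Optimize preprocessing (#65)" shows up in the diff below: instead of padding all per-sample variant arrays into one giant in-memory matrix (previously fanned out with joblib), process-sparse-gt now preallocates its HDF5 datasets, stores a count_variants dataset, and pads and writes samples in chunks of --chunksize. A minimal sketch of that write pattern, on toy data (the helper name write_chunked and the output file name are illustrative, not part of the patch):

import h5py
import numpy as np
from typing import List, Optional


def ragged_to_matrix(
    rows: List[np.ndarray], pad_value: int = -1, max_len: Optional[int] = None
) -> np.ndarray:
    # Pad every ragged row to max_len with pad_value, mirroring preprocess.py
    if max_len is None:
        max_len = max(r.shape[0] for r in rows)
    return np.stack(
        [
            np.pad(r, (0, max_len - r.shape[0]), "constant", constant_values=pad_value)
            for r in rows
        ]
    )


def write_chunked(out_file: str, variant_ids: List[np.ndarray], chunksize: int) -> None:
    # Illustrative helper: preallocate the padded matrix on disk, then pad and
    # write only `chunksize` samples at a time.
    count_variants = np.array([v.shape[0] for v in variant_ids])
    max_len = int(count_variants.max())
    n_samples = len(variant_ids)
    with h5py.File(out_file, "w") as f:
        f.create_dataset("variant_matrix", (n_samples, max_len), dtype=np.int32)
        f.create_dataset("count_variants", data=count_variants, dtype=np.int32)
        for start in range(0, n_samples, chunksize):
            end = min(start + chunksize, n_samples)
            f["variant_matrix"][start:end] = ragged_to_matrix(
                variant_ids[start:end], max_len=max_len
            )


write_chunked(
    "genotypes_demo.h5",  # illustrative file name
    [np.array([0, 2], dtype=np.int32), np.array([1], dtype=np.int32)],
    chunksize=1,
)

Because the datasets are preallocated to the global maximum row length, each chunk can be padded and written independently, so peak memory scales with the chunk size rather than with the number of samples.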
---
 deeprvat/preprocessing/preprocess.py          | 294 +++++++++++-------
 deeprvat_preprocessing_env.yml                |   1 -
 docs/preprocessing.md                         |   3 -
 .../config/deeprvat_preprocess_config.yaml    |   2 -
 pipelines/preprocess_no_qc.snakefile          |   3 +-
 pipelines/preprocess_with_qc.snakefile        |   3 +-
 pipelines/preprocessing/preprocess.snakefile  |   2 -
 .../combine_chr1_chr2/input/genotypes_chr1.h5 | Bin 6354 -> 8442 bytes
 .../combine_chr1_chr2/input/genotypes_chr2.h5 | Bin 6354 -> 8442 bytes
 .../expected/expected_data.npz                | Bin 747 -> 1564 bytes
 .../calls/chr1/excluded_calls_untidy.tsv.gz   | Bin 112 -> 0 bytes
 .../input/qc/calls/chr1/input_c1_b1.tsv.gz    | Bin 0 -> 98 bytes
 .../input/qc/calls/chr1/input_c1_b2.tsv.gz    | Bin 0 -> 98 bytes
 ...calls_untidy.tsv.gz => input_c2_b1.tsv.gz} | Bin
 .../input/variants.parquet                    | Bin 0 -> 4049 bytes
 .../input/variants.tsv.gz                     | Bin 193 -> 0 bytes
 .../input/variants.parquet                    | Bin 0 -> 4049 bytes
 .../input/variants.tsv.gz                     | Bin 193 -> 0 bytes
 .../input/qc/chr1/input_c1_b1.tsv             |   7 +
 .../input/variants.parquet                    | Bin 0 -> 4039 bytes
 .../input/qc/calls/chr1/input_c1_b1.tsv.gz    | Bin 0 -> 112 bytes
 .../input/variants.parquet                    | Bin 0 -> 4039 bytes
 .../input/variants.parquet                    | Bin 0 -> 4039 bytes
 .../input/variants.parquet                    | Bin 0 -> 4039 bytes
 .../no_filters_minimal/input/variants.parquet | Bin 0 -> 4039 bytes
 tests/preprocessing/test_preprocess.py        |  24 +-
 26 files changed, 212 insertions(+), 127 deletions(-)
 delete mode 100644 tests/preprocessing/test_data/process_and_combine_sparse_gt/filter_calls_variants_samples_minimal_split/input/qc/calls/chr1/excluded_calls_untidy.tsv.gz
 create mode 100644 tests/preprocessing/test_data/process_and_combine_sparse_gt/filter_calls_variants_samples_minimal_split/input/qc/calls/chr1/input_c1_b1.tsv.gz
 create mode 100644 tests/preprocessing/test_data/process_and_combine_sparse_gt/filter_calls_variants_samples_minimal_split/input/qc/calls/chr1/input_c1_b2.tsv.gz
 rename tests/preprocessing/test_data/process_and_combine_sparse_gt/filter_calls_variants_samples_minimal_split/input/qc/calls/chr2/{excluded_calls_untidy.tsv.gz => input_c2_b1.tsv.gz} (100%)
 create mode 100644 tests/preprocessing/test_data/process_and_combine_sparse_gt/filter_calls_variants_samples_minimal_split/input/variants.parquet
 delete mode 100644 tests/preprocessing/test_data/process_and_combine_sparse_gt/filter_calls_variants_samples_minimal_split/input/variants.tsv.gz
 create mode 100644 tests/preprocessing/test_data/process_and_combine_sparse_gt/no_filters_minimal_split/input/variants.parquet
 delete mode 100644 tests/preprocessing/test_data/process_and_combine_sparse_gt/no_filters_minimal_split/input/variants.tsv.gz
 create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_calls_minimal/input/qc/chr1/input_c1_b1.tsv
 create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_calls_minimal/input/variants.parquet
 create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_calls_vars_samples_minimal/input/qc/calls/chr1/input_c1_b1.tsv.gz
 create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_calls_vars_samples_minimal/input/variants.parquet
 create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_samples_minimal/input/variants.parquet
 create mode 100644 tests/preprocessing/test_data/process_sparse_gt/filter_variants_minimal/input/variants.parquet
 create mode 100644 tests/preprocessing/test_data/process_sparse_gt/no_filters_minimal/input/variants.parquet

diff --git a/deeprvat/preprocessing/preprocess.py b/deeprvat/preprocessing/preprocess.py
index f38cb628..330ce42e 100644
--- a/deeprvat/preprocessing/preprocess.py
+++ b/deeprvat/preprocessing/preprocess.py
@@ -4,12 +4,12 @@
 import time
 from pathlib import Path
 from typing import List, Optional, Tuple
+from contextlib import ExitStack
 
 import click
 import h5py
 import numpy as np
 import pandas as pd
-from joblib import Parallel, delayed
 from tqdm import tqdm, trange
 
 logging.basicConfig(
@@ -26,8 +26,11 @@ def drop_rows(df: pd.DataFrame, df_to_drop: pd.DataFrame) -> pd.DataFrame:
     return df.drop_duplicates(keep=False)
 
 
-def ragged_to_matrix(rows: List[np.ndarray], pad_value: int = -1) -> np.ndarray:
-    max_len = max([r.shape[0] for r in rows])
+def ragged_to_matrix(
+    rows: List[np.ndarray], pad_value: int = -1, max_len: Optional[int] = None
+) -> np.ndarray:
+    if max_len is None:
+        max_len = max([r.shape[0] for r in rows])
     matrix = np.stack(
         [
             np.pad(
@@ -36,7 +39,7 @@ def ragged_to_matrix(rows: List[np.ndarray], pad_value: int = -1) -> np.ndarray:
                 "constant",
                 constant_values=(pad_value, pad_value),
             )
-            for r in tqdm(rows)
+            for r in tqdm(rows, desc="Ragged to matrix", file=sys.stdout)
         ]
     )
     return matrix
@@ -56,7 +59,8 @@ def process_sparse_gt_file(
     )
     sparse_gt = sparse_gt[sparse_gt["sample"].isin(samples)]
 
-    sparse_gt = drop_rows(sparse_gt, calls_to_exclude)
+    if calls_to_exclude is not None:
+        sparse_gt = drop_rows(sparse_gt, calls_to_exclude)
 
     try:
         sparse_gt = pd.merge(sparse_gt, variants, validate="m:1")
@@ -97,10 +101,13 @@ def write_genotype_file(
     samples: np.ndarray,
     variant_matrix: np.ndarray,
     genotype_matrix: np.ndarray,
+    count_variants: Optional[np.ndarray] = None,
 ):
     f.create_dataset("samples", data=samples, dtype=h5py.string_dtype())
     f.create_dataset("variant_matrix", data=variant_matrix, dtype=np.int32)
     f.create_dataset("genotype_matrix", data=genotype_matrix, dtype=np.int8)
+    if count_variants is not None:
+        f.create_dataset("count_variants", data=count_variants, dtype=np.int32)
 
 
 @click.group()
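Two behavior changes sit in the hunks above: ragged_to_matrix now accepts a precomputed max_len, so separately padded chunks all share one width, and process_sparse_gt_file skips exclusion entirely when calls_to_exclude is None. The exclusion itself goes through drop_rows, of which only the closing return is visible here; it reads like the standard pandas anti-join idiom, sketched standalone below (the toy frames are illustrative, and the concat line is an assumption based on that idiom):

# Sketch of the concat + drop_duplicates(keep=False) anti-join that drop_rows'
# visible return line suggests. Assumes df has no duplicate rows of its own and
# that every row of df_to_drop also occurs in df.
import pandas as pd

def drop_rows(df: pd.DataFrame, df_to_drop: pd.DataFrame) -> pd.DataFrame:
    df = pd.concat([df, df_to_drop])  # rows to drop now appear twice
    return df.drop_duplicates(keep=False)  # ...and are removed entirely

calls = pd.DataFrame({"sample": ["S1", "S2", "S3"], "pos": [11, 29, 47]})
excluded = pd.DataFrame({"sample": ["S2"], "pos": [29]})
print(drop_rows(calls, excluded))  # keeps only the S1/11 and S3/47 rows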
@@ -209,49 +216,43 @@ def process_individual_missingness(
 @cli.command()
+@click.option("--chunksize", type=int, default=1000)
 @click.option("--exclude-variants", type=click.Path(exists=True), multiple=True)
 @click.option("--exclude-samples", type=click.Path(exists=True))
 @click.option("--exclude-calls", type=click.Path(exists=True))
 @click.option("--chromosomes", type=str)
-@click.option("--threads", type=int, default=1)
 @click.option("--skip-sanity-checks", is_flag=True)
 @click.argument("variant-file", type=click.Path(exists=True))
 @click.argument("samples-path", type=click.Path(exists=True))
 @click.argument("sparse-gt", type=click.Path(exists=True))
 @click.argument("out-file", type=click.Path())
 def process_sparse_gt(
+    chunksize: int,
     exclude_variants: List[str],
     exclude_samples: Optional[str],
     exclude_calls: Optional[str],
     chromosomes: Optional[str],
-    threads: int,
     skip_sanity_checks: bool,
     variant_file: str,
     samples_path: str,
     sparse_gt: str,
     out_file: str,
 ):
-    logging.info("Reading variants...")
+    logging.info("Reading and processing variants...")
     start_time = time.time()
-
-    variants = pd.read_table(variant_file, engine="pyarrow")
-
-    # Filter all variants based on chromosome
+    variants = pd.read_parquet(variant_file, engine="pyarrow")
     if chromosomes is not None:
         chromosomes = [f"chr{chrom}" for chrom in chromosomes.split(",")]
         variants = variants[variants["chrom"].isin(chromosomes)]
 
     total_variants = len(variants)
-
     if len(exclude_variants) > 0:
         variant_exclusion_files = [
             v for directory in exclude_variants for v in Path(directory).rglob("*.tsv*")
         ]
-
-        exclusion_file_cols = ["chrom", "pos", "ref", "alt"]
         variants_to_exclude = pd.concat(
             [
-                pd.read_csv(v, sep="\t", names=exclusion_file_cols)
-                for v in tqdm(variant_exclusion_files)
+                pd.read_csv(v, sep="\t", names=["chrom", "pos", "ref", "alt"])
+                for v in tqdm(variant_exclusion_files, file=sys.stdout)
             ],
             ignore_index=True,
         )
@@ -275,14 +276,14 @@ def process_sparse_gt(
     if exclude_samples is not None:
         total_samples = len(samples)
 
-        if sample_exclusion_files := list(Path(exclude_samples).rglob("*.csv")):
-
-            sample_exclusion_files = [
-                s for s in sample_exclusion_files if s.stat().st_size > 0
-            ]
-            if sample_exclusion_files:
-                logging.info(
-                    f"Found {len(sample_exclusion_files)} sample exclusion files"
+        if sample_exclusion_files := list(Path(exclude_samples).glob("*.csv")):
+            samples_to_exclude = set(
+                pd.concat(
+                    [
+                        pd.read_csv(s, header=None).loc[:, 0]
+                        for s in sample_exclusion_files
+                    ],
+                    ignore_index=True,
                 )
             samples_to_exclude = set(
                 pd.concat(
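Above, process-sparse-gt now loads the variant table with pd.read_parquet(..., engine="pyarrow") instead of pd.read_table, and the test fixtures were switched from variants.tsv.gz to variants.parquet accordingly. A one-off conversion of such a fixture might look like the following sketch (file names follow the fixtures seen in this patch; the conversion script itself is not part of it):

# Hypothetical one-off conversion of a variants.tsv.gz fixture to Parquet.
# The real fixture's exact columns may differ; the format switch is the point.
import pandas as pd

variants = pd.read_table("variants.tsv.gz")
variants.to_parquet("variants.parquet", engine="pyarrow", index=False)

# process-sparse-gt then reads it back exactly as in the hunk above:
variants = pd.read_parquet("variants.parquet", engine="pyarrow")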
@@ -300,47 +301,20 @@
         else:
             logging.info(f"Found no samples to exclude in {exclude_samples}")
 
-    samples = list(samples)
+    samples = sorted(list(samples))
 
     logging.info("Processing sparse GT files by chromosome")
     total_calls_dropped = 0
     variant_groups = variants.groupby("chrom")
     logger.info(f"variant groups are {variant_groups.groups.keys()}")
-    for chrom in tqdm(variant_groups.groups.keys()):
+    for chrom in tqdm(
+        variant_groups.groups.keys(), desc="Chromosomes", file=sys.stdout
+    ):
         logging.info(f"Processing chromosome {chrom}")
 
         logging.info("Reading in filtered calls")
-        exclude_calls_file_cols = ["chrom", "pos", "ref", "alt", "sample"]
-
-        calls_to_exclude = pd.DataFrame(columns=exclude_calls_file_cols)
-
         if exclude_calls is not None:
             exclude_calls_chrom = Path(exclude_calls).rglob("*.tsv*")
-            exclude_calls_chrom_files = []
-            for exclude_call_file in exclude_calls_chrom:
-                file_chrom = get_file_chromosome(
-                    exclude_call_file, col_names=exclude_calls_file_cols
-                )
-
-                if file_chrom == chrom:
-                    exclude_calls_chrom_files.append(exclude_call_file)
-
-            if exclude_calls_chrom_files:
-                calls_to_exclude = pd.concat(
-                    [
-                        pd.read_csv(
-                            c,
-                            names=exclude_calls_file_cols,
-                            sep="\t",
-                            index_col=None,
-                        )
-                        for c in tqdm(exclude_calls_chrom_files)
-                    ],
-                    ignore_index=True,
-                )
-            calls_dropped = len(calls_to_exclude)
-            total_calls_dropped += calls_dropped
-            logging.info(f"Dropped {calls_dropped} calls")
 
         logging.info("Processing sparse GT files")
 
@@ -354,83 +328,185 @@ def process_sparse_gt(
             if get_file_chromosome(f, col_names=sparse_file_cols) == chrom
         ]
 
-        logging.info(f"sparse gt chrom(es) are: {sparse_gt_chrom}")
-
-        processed = Parallel(n_jobs=threads, verbose=50)(
-            delayed(process_sparse_gt_file)(
-                f.as_posix(), variants_chrom, samples, calls_to_exclude
-            )
-            for f in sparse_gt_chrom
+        logging.info(
+            f"{len(sparse_gt_chrom)} sparse GT files: {[str(s) for s in sparse_gt_chrom]}"
         )
-        postprocessed_gt = postprocess_sparse_gt(processed, 1, len(samples))
-        postprocessed_variants = postprocess_sparse_gt(processed, 0, len(samples))
+        variant_ids = [np.array([], dtype=np.int32) for _ in samples]
+        genotypes = [np.array([], dtype=np.int8) for _ in samples]
+        for sparse_gt_file in tqdm(sparse_gt_chrom, desc="Files", file=sys.stdout):
+            # Load calls to exclude that correspond to current file
+            file_stem = sparse_gt_file
+            while str(file_stem).find(".") != -1:
+                file_stem = Path(file_stem.stem)
+
+            calls_to_exclude = None
+            if exclude_calls is not None:
+                exclude_call_files = [
+                    f for f in exclude_calls_chrom if f.name.startswith(str(file_stem))
+                ]
+                if len(exclude_call_files) > 0:
+                    calls_to_exclude = pd.concat(
+                        [
+                            pd.read_csv(
+                                c,
+                                names=["chrom", "pos", "ref", "alt", "sample"],
+                                sep="\t",
+                                index_col=None,
+                            )
+                            for c in tqdm(
+                                exclude_call_files,
+                                desc="Filtered calls",
+                                file=sys.stdout,
+                            )
+                        ],
+                        ignore_index=True,
+                    )
+
+                    calls_dropped = len(calls_to_exclude)
+                    total_calls_dropped += calls_dropped
+                    logging.info(f"Dropped {calls_dropped} calls")
 
-        logging.info("Ordering gt and variant matrices")
-        order = [np.argsort(i) for i in postprocessed_variants]
+            # Load sparse_gt_file as data frame, add variant IDs, remove excluded calls
+            this_variant_ids, this_genotypes = process_sparse_gt_file(
+                sparse_gt_file.as_posix(), variants_chrom, samples, calls_to_exclude
+            )
+            assert len(this_variant_ids) == len(samples)
+            assert len(this_genotypes) == len(samples)
 
-        postprocessed_variants = [
-            postprocessed_variants[i][order[i]]
-            for i in range(len(postprocessed_variants))
-        ]
-        postprocessed_gt = [
-            postprocessed_gt[i][order[i]] for i in range(len(postprocessed_gt))
-        ]
+            # Concatenate to existing results
+            for i, (v, g) in enumerate(zip(this_variant_ids, this_genotypes)):
+                variant_ids[i] = np.append(variant_ids[i], v)
+                genotypes[i] = np.append(genotypes[i], g)
 
-        logging.info("Preparing GT arrays for storage")
+            gc.collect()
 
-        logger.info("padding ragged matrix")
-        variant_matrix = ragged_to_matrix(postprocessed_variants)
-        gt_matrix = ragged_to_matrix(postprocessed_gt)
+        gc.collect()
 
         out_file_chrom = f"{out_file}_{chrom}.h5"
         Path(out_file_chrom).parents[0].mkdir(exist_ok=True, parents=True)
         logging.info(f"Writing to {out_file_chrom}")
+        count_variants = np.array([g.shape[0] for g in genotypes])
+        max_len = np.max(count_variants)
         with h5py.File(out_file_chrom, "w") as f:
             sample_array = np.array(samples, dtype="S")
-            write_genotype_file(f, sample_array, variant_matrix, gt_matrix)
+            f.create_dataset("samples", data=sample_array, dtype=h5py.string_dtype())
+            f.create_dataset("variant_matrix", (len(samples), max_len), dtype=np.int32)
+            f.create_dataset("genotype_matrix", (len(samples), max_len), dtype=np.int8)
+            f.create_dataset("count_variants", data=count_variants, dtype=np.int32)
+
+            # Operate in chunks to reduce memory usage
+            for start_sample in trange(
+                0, len(samples), chunksize, desc="Chunks", file=sys.stdout
+            ):
+                end_sample = min(start_sample + chunksize, len(samples))
+
+                order = [np.argsort(v) for v in variant_ids[start_sample:end_sample]]
+
+                this_variant_ids = [
+                    variant_ids[i][order[i - start_sample]]
+                    for i in range(start_sample, end_sample)
+                ]
+                this_genotypes = [
+                    genotypes[i][order[i - start_sample]]
+                    for i in range(start_sample, end_sample)
+                ]
+
+                gc.collect()
+
+                variant_matrix = ragged_to_matrix(this_variant_ids, max_len=max_len)
+                gt_matrix = ragged_to_matrix(this_genotypes, max_len=max_len)
+
+                f["variant_matrix"][start_sample:end_sample] = variant_matrix
+                f["genotype_matrix"][start_sample:end_sample] = gt_matrix
+
+                del variant_matrix
+                del gt_matrix
+                gc.collect()
 
     logging.info(f"Dropped {total_calls_dropped} calls in total")
     logging.info("Finished!")
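combine-genotypes (rewritten below) sizes its output datasets from the summed per-file count_variants and fills them chunk by chunk; within a chunk, all input HDF5 files are held open at once through contextlib.ExitStack instead of being reopened per sample. That pattern in isolation (file names and slice bounds are illustrative):

# ExitStack keeps an arbitrary number of HDF5 files open for one chunk and
# closes them together when the block exits.
from contextlib import ExitStack
import h5py

genotype_files = ["genotypes_chr1.h5", "genotypes_chr2.h5"]  # illustrative
start_sample, end_sample = 0, 1000

with ExitStack() as stack:
    handles = [stack.enter_context(h5py.File(p, "r")) for p in genotype_files]
    chunks = [h["variant_matrix"][start_sample:end_sample, :] for h in handles]
# every file is closed here, after a single pass over the sample chunk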
 
 
 @cli.command()
+@click.option("--chunksize", type=int)
 @click.argument("genotype-files", nargs=-1, type=click.Path(exists=True))
 @click.argument("out-file", type=click.Path())
-def combine_genotypes(genotype_files: List[str], out_file: str):
+def combine_genotypes(
+    chunksize: Optional[int], genotype_files: List[str], out_file: str
+):
     with h5py.File(genotype_files[0]) as f:
         samples = f["samples"][:]
-        n_samples = samples.shape[0]
-    variant_matrix_list: List[List] = [[] for _ in range(n_samples)]
-    genotype_matrix_list: List[List] = [[] for _ in range(n_samples)]
-    mat_idx_offset = 0
-    for file in tqdm(genotype_files, desc="Files"):
+
+    for gt_file in genotype_files:
+        with h5py.File(gt_file) as f:
+            if not np.array_equal(samples, f["samples"][:]):
+                raise ValueError(
+                    f"Error when processing {gt_file}: "
+                    "All genotype files must contain the same samples in the same order"
+                )
+
+    n_samples: int = samples.shape[0]
+
+    if chunksize is None:
+        chunksize = n_samples
+
+    count_variants = np.zeros(n_samples, dtype=np.int32)
+    for file in genotype_files:
         with h5py.File(file) as f:
-            var_matrix = f["variant_matrix"][:]
-            gt_matrix = f["genotype_matrix"][:]
-            for i in trange(n_samples, desc="Samples"):
-                this_var = var_matrix[i, var_matrix[i, :] != -1]
-                this_gt = gt_matrix[i, gt_matrix[i, :] != -1]
-                assert this_var.shape == this_gt.shape
-
-                variant_matrix_list[i].append(this_var + mat_idx_offset)
-                genotype_matrix_list[i].append(this_gt)
-
-    ragged_variant_matrix = [np.concatenate(vm) for vm in tqdm(variant_matrix_list)]
-    del variant_matrix_list
-    gc.collect()
-    ragged_genotype_matrix = [np.concatenate(gt) for gt in tqdm(genotype_matrix_list)]
-    del genotype_matrix_list
-    gc.collect()
-    variant_matrix = ragged_to_matrix(ragged_variant_matrix)
-    del ragged_variant_matrix
-    gc.collect()
-    genotype_matrix = ragged_to_matrix(ragged_genotype_matrix)
-    del ragged_genotype_matrix
-    gc.collect()
-    with h5py.File(out_file, "a") as f:
-        write_genotype_file(f, samples, variant_matrix, genotype_matrix)
+            count_variants += f["count_variants"][:]
+
+    max_n_variants = int(np.max(count_variants))
+
+    with h5py.File(out_file, "w") as f:
+        f.create_dataset("samples", data=samples, dtype=h5py.string_dtype())
+        f.create_dataset("variant_matrix", (n_samples, max_n_variants), dtype=np.int32)
+        f.create_dataset("genotype_matrix", (n_samples, max_n_variants), dtype=np.int8)
+
+    with h5py.File(out_file, "r+") as g:
+        for start_sample in trange(
+            0, n_samples, chunksize, desc="Chunks", file=sys.stdout
+        ):
+            end_sample = min(start_sample + chunksize, n_samples)
+            this_chunksize = end_sample - start_sample
+
+            # Use ExitStack so we don't have to reopen every HDF5 file for each sample
+            variant_matrix_list = [[] for _ in range(this_chunksize)]
+            genotype_matrix_list = [[] for _ in range(this_chunksize)]
+            with ExitStack() as stack:
+                genotype_file_objs = [
+                    stack.enter_context(h5py.File(g)) for g in genotype_files
+                ]
+                for f in tqdm(genotype_file_objs, desc="Files", file=sys.stdout):
+                    var_matrix = f["variant_matrix"][start_sample:end_sample, :]
+                    gt_matrix = f["genotype_matrix"][start_sample:end_sample, :]
+                    for i in trange(this_chunksize, desc="Samples", file=sys.stdout):
+                        this_var = var_matrix[i, var_matrix[i, :] != -1]
+                        this_gt = gt_matrix[i, gt_matrix[i, :] != -1]
+                        assert this_var.shape == this_gt.shape
+
+                        variant_matrix_list[i].append(this_var)
+                        genotype_matrix_list[i].append(this_gt)
+
+            ragged_variant_matrix = [np.concatenate(vm) for vm in variant_matrix_list]
+            del variant_matrix_list
+            gc.collect()
+            ragged_genotype_matrix = [np.concatenate(gt) for gt in genotype_matrix_list]
+            del genotype_matrix_list
+            gc.collect()
+            variant_matrix = ragged_to_matrix(
+                ragged_variant_matrix, max_len=max_n_variants
+            )
+            del ragged_variant_matrix
+            gc.collect()
+            genotype_matrix = ragged_to_matrix(
+                ragged_genotype_matrix, max_len=max_n_variants
+            )
+            del ragged_genotype_matrix
+            gc.collect()
+
+            g["variant_matrix"][start_sample:end_sample, :] = variant_matrix
+            g["genotype_matrix"][start_sample:end_sample, :] = genotype_matrix
 
 
 if __name__ == "__main__":
diff --git a/deeprvat_preprocessing_env.yml b/deeprvat_preprocessing_env.yml
index bbf17e8b..771b3a29 100644
--- a/deeprvat_preprocessing_env.yml
+++ b/deeprvat_preprocessing_env.yml
@@ -6,7 +6,6 @@ dependencies:
   - python=3.8
   - click=8.1.3
   - h5py=3.8.0
-  - joblib=1.0.1
   - numpy=1.21.2
   - pandas=1.5.1
   - tqdm=4.59.0
diff --git a/docs/preprocessing.md b/docs/preprocessing.md
index 2af3ddb9..b9564cf1 100644
--- a/docs/preprocessing.md
+++ b/docs/preprocessing.md
@@ -53,9 +53,6 @@ included_chromosomes : [21,22]
 # The format of the name of the "raw" vcf files
 vcf_files_list: vcf_files_list.txt
 
-# Number of threads to use in the preprocessing script, separate from snakemake threads
-preprocess_threads: 16
-
 # If you need to run a cmd to load bcf and samtools specify it here, see example
 bcftools_load_cmd : # module load bcftools/1.10.2 &&
 samtools_load_cmd : # module load samtools/1.9 &&
diff --git a/pipelines/config/deeprvat_preprocess_config.yaml b/pipelines/config/deeprvat_preprocess_config.yaml
index 0f1b146c..335dd6c9 100644
--- a/pipelines/config/deeprvat_preprocess_config.yaml
+++ b/pipelines/config/deeprvat_preprocess_config.yaml
@@ -4,8 +4,6 @@ included_chromosomes : [21,22]
 # The format of the name of the "raw" vcf files
 vcf_files_list: vcf_files_list.txt
 
-# Number of threads to use in the preprocessing script, separate from snakemake threads
-preprocess_threads: 16
 
 # If you need to run a cmd to load bcf and samtools specify it here, see example
 bcftools_load_cmd : # module load bcftools/1.10.2 &&
diff --git a/pipelines/preprocess_no_qc.snakefile b/pipelines/preprocess_no_qc.snakefile
index a98c60d6..4675972f 100644
--- a/pipelines/preprocess_no_qc.snakefile
+++ b/pipelines/preprocess_no_qc.snakefile
@@ -24,8 +24,7 @@ rule preprocess_no_qc:
             f"--exclude-variants {qc_duplicate_vars_dir}",
             "--chromosomes ",
             ",".join(str(chr) for chr in set(chromosomes)),
-            f"--threads {preprocess_threads}",
-            "{input.variants}",
+            "{input.variants_parquet}",
"{input.samples}", f"{sparse_dir}", f"{preprocessed_dir / 'genotypes'}", diff --git a/pipelines/preprocess_with_qc.snakefile b/pipelines/preprocess_with_qc.snakefile index 4cbb22f4..70b778bb 100644 --- a/pipelines/preprocess_with_qc.snakefile +++ b/pipelines/preprocess_with_qc.snakefile @@ -39,8 +39,7 @@ rule preprocess_with_qc: f"--exclude-samples {qc_filtered_samples_dir}", "--chromosomes ", ",".join(str(chr) for chr in set(chromosomes)), - f"--threads {preprocess_threads}", - "{input.variants}", + "{input.variants_parquet}", "{input.samples}", f"{sparse_dir}", f"{preprocessed_dir / 'genotypes'}", diff --git a/pipelines/preprocessing/preprocess.snakefile b/pipelines/preprocessing/preprocess.snakefile index 9c4097e3..20eb0c89 100644 --- a/pipelines/preprocessing/preprocess.snakefile +++ b/pipelines/preprocessing/preprocess.snakefile @@ -15,8 +15,6 @@ working_dir = Path(config["working_dir"]) preprocessed_dir = working_dir / config["preprocessed_dir_name"] reference_dir = working_dir / config["reference_dir_name"] -preprocess_threads = config["preprocess_threads"] - fasta_file = reference_dir / config["reference_fasta_file"] fasta_index_file = reference_dir / f"{config['reference_fasta_file']}.fai" diff --git a/tests/preprocessing/test_data/combine_genotypes/combine_chr1_chr2/input/genotypes_chr1.h5 b/tests/preprocessing/test_data/combine_genotypes/combine_chr1_chr2/input/genotypes_chr1.h5 index 8bc4588ac9a1efb58153290b5f6722c5e24fd9a3..38a4670df27a69aa73dda053cce777c7e900c501 100644 GIT binary patch delta 145 zcmca)_{(vE2GcKvjaq$7j1H4`GTBQe=a=S{#Fr%&WhUm86f-a|GC+XDWLJX`$iOyPnOT4GE>;7zO9~K$8c_8RN{E32%m5kt XWx<=2$pY*jHXq=e!p6uraicT<6Eq+c delta 56 zcmez6c*$^r2Gb>pjaq$7j0Tf;GTC!7GC+XF#EtTs6_~{s8JRc7vq&;dJfJaIfmvvC I7psRb0LFO^o&W#< diff --git a/tests/preprocessing/test_data/combine_genotypes/combine_chr1_chr2/input/genotypes_chr2.h5 b/tests/preprocessing/test_data/combine_genotypes/combine_chr1_chr2/input/genotypes_chr2.h5 index 73cbbf0160896784a8d0eb001c3a07499b63e973..5af842813e66e06548174fc86769db0423823c9a 100644 GIT binary patch delta 145 zcmca)_{(vE2GcKvjaq$7j1H4`GTBQe=a=S{#Fr%&WhUm86f-a|GC+XDWLJX`$iOyPnOT4GE>;7zO9~K$8c_8RN{E32%m5kt Xcfp&K$pY*jHXq=e!p6uraicT<6V@Od delta 56 zcmez6c*$^r2Gb>pjaq$7j0Tf;GTC!7GC+XF#EtTs6_~{s8JRc7vq&;dJfJaIfmvvC I7psRb0LFO^o&W#< diff --git a/tests/preprocessing/test_data/process_and_combine_sparse_gt/filter_calls_variants_samples_minimal_split/expected/expected_data.npz b/tests/preprocessing/test_data/process_and_combine_sparse_gt/filter_calls_variants_samples_minimal_split/expected/expected_data.npz index f27e3f15e9526dc33452d8d20494bb486103fdc4..7f324c91a434f12db38c75c375dbe1c0740b8997 100644 GIT binary patch literal 1564 zcmWIWW@Zs#fB;2?3BP3CZ3A*Zn43X_p*S(OASbn0FR!4IkwE|~3{nb`27$?bp}ql; zj0|NA)#@p!#mPnLRtoAh!Di|@3hHV3MI}XvdGYy0DXAcFx5S*{RG@fqMq)uKkgs8> zqp71%t3UzZGBhwSurvdc=3vqiNE#X#f=MGVX$&S!z@#adGz&oSvc}A?b1xVf7(iGU z=;id(y!?{Ng4Foj#FC=S3YgbH(jWk%v3lJm(}GB^E0|K|c`m4P7zoi!5NE^H{l@`d z=Af&G@nPCv>TuC8bufJ}8kc%p;xKbyG%j^8acZlBxf?aHgzTDY0!u7Hz{FCPSd^KV zS3ix(mP;s2Dhlwl!U0VSFdF7Z7!8$%rxjS(!sKBzA5dunVZ% H2ORVOTf93@ literal 747 zcmWIWW@Zs#U|`??Vnqi3kSw`QAnOGq1A`EQ2t!$7QD$OZNqlZ%Nl|8nUS2^ZBZB}~ z7O02;q!I*XzY%{WEpYNgz_Ea}^Cm^jiCL1jbV1(a%!TU~FPJwiJ|t+$4DtE#Qzmf< zm3yDj^keN3PgfCJ!t_&WnibcwSyo(Ixl)+*+}Jo0nl7CQm?i2}yx?L@z%P3~hT!AO zOIa6e5twCH^83P9!60>xrLB?yjA!f(zAxU*uXesIO-`@$#lr)hOMdUT@Q0Zp0L3$l z!v!<5fzAVYMi}Us^whlklFEWq9KHc51p#XLM#05+;m7}v8FpMTUieZuMczkNcvfA~ z-R{$Kr>t@g_s`kEi00l2zhvGe0v)goh`E98El$iW$Vn}Rg*r$W1VBna0INIWgmab= 
z4s~#ti!kUpGcz-L8yjTusp&Q@S=gvJ!-Q$_v4l%3KC_xUje)LXWD;Se;LXYgl4b_NT|ioz3B&^c Dw=>uB diff --git a/tests/preprocessing/test_data/process_and_combine_sparse_gt/filter_calls_variants_samples_minimal_split/input/qc/calls/chr1/excluded_calls_untidy.tsv.gz b/tests/preprocessing/test_data/process_and_combine_sparse_gt/filter_calls_variants_samples_minimal_split/input/qc/calls/chr1/excluded_calls_untidy.tsv.gz deleted file mode 100644 index cd87e135b7daed9006d282dbc93f5b711fc02fcd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 112 zcmV-$0FVD4iwFptyQX9S17&z)Y;|O1WM5-pY;1F1b#8QNWO*)hb9Mkr&L}eEG&Qs| zvfy;*bmTNNFaRR1WU#1-xdBMj86s*36E(3kK&Ul=iW*v2S^!l-Ma}V>U~X#683GhX SRf}S-1s4GF4nTaG0001;%PI;0 diff --git a/tests/preprocessing/test_data/process_and_combine_sparse_gt/filter_calls_variants_samples_minimal_split/input/qc/calls/chr1/input_c1_b1.tsv.gz b/tests/preprocessing/test_data/process_and_combine_sparse_gt/filter_calls_variants_samples_minimal_split/input/qc/calls/chr1/input_c1_b1.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..c6833adc6bda9c18a39a4bc22d63b0a35b59d558 GIT binary patch literal 98 zcmb2|=HPJtCy~a$oS9cpS`wdZ7@uUQS5jQY;C)WR=!uEx7L!d6N}iWIF)=Y7%Q6rOdQ#BP!jpzGQwLcvhWhcx&nPV1^5t=-sO5Ta7m8z(NDICA36?D{9MgHp9+)i*oy z-pqU7d+*zEdWp#eX*)ejzmb6#dI_QF^LB)gSg8fAcZf!`i+*K}_S4h!7(F(#i2Qc2 zhunA!!sZFT4MsmjxzRM``+}OGCq0pAH0E)S!j7s&+gyIvN0jSR%9rvv4xabS;mEWN z!7vRkgkb%hjsB6!Ku8SfbIS4O+h~-spZs}r+=0G*;B=rzQx5d^l*9h`ccT!) zAzCnj}m!Tb`N+&WDU|GdO91lr6Fp$^`J=;qRnn$g=DF$rE7fjf;@te1)*Y=R_6GyLe zBQ<%0p#7L~eMb4dANT!3I&U668Ws~l7trdFu&myF{mKu>_w}(pEWFz2!}8eayXC|z z`VTC?pSykhL1>CX6!hK(G6G@;83h>w83%EII6;nqOn^*+Oo1E+c>!b^MbS!R%wFP0`3~Ir8IB&i>-s6o;kvg<_f5i9Rl<&?2&}Pu#Az_BK%A-b*d#o;2 zWR0hfA$szZ&H7YNq09^*qwS!^#JI|2>qtnp5Oh+KZpjOXez`1+soNApxH ze=1t~o@-ZKzfQRPK8Ncs=Lv*7bM7&N*CoF0{%vf}tLT!vvo2QErl!}u=Vq_$dAC3Z zGjEO8%d&TF)~oZH3<|xes<5gz_*w&&0Wz-|Fb$b=8P9{)-vxu|aGiLUEx-HSY$&oS zt?$>FxYiqO1IP0=bw#X7CWNMDw7Z6?uqlfT3{8_8yu>$n*uLkLb=a=xWqfvRHF5f^ z_j(_g{V2TT=Lfsk2c)NdDlE8E*XIC|m7f>Bm%&(;@<1d-Z!9j5vhT~etL{sAAPjtr zgJCS?fiO>2+<{2&nGZ&?l+QMjA`J#IbqC^fE2JN-aD=BdnQv5Ugg&Yw)c9H(v&U%j zwOX}>18>WD7#HW-`_%m=*CSg#ej-#8zc4|}g8_au|cUF6Q0SE&1!$v5$HhFM}5 zW(5R4?=Z{)6DurA7@w%c*iQzl-B=;kqJd ziAL!%oJ~f8Lb6q7<7tiO^7T?8+AMNG74H-Dd|ilxA3X-RAa*bskP9(lWA%KK=d#FuIssKxM#Y3K5RBA2U3$t9c<)Fa<4hN4@LyH0Xf^J_UJ zu)ylU{n&eT>f(aDs-hNcm~|={C2nV{5bD&6QHr5TTL=Y6Trrt|x+iuE;ao%ntWY~e z;vxo)4LEV$P~Tun&iCb}MHODlZ%T1uZVyYtS^<1broxHSI&V(%-*fH3>@i`zVIfD%5IQRN$M#HAI-|jz0>k-F{Ew_*k_#G$!xAQm)q$m z+&W2o^>pfrZy~(i#RH2kt(b_XSv-(SELOxn-@u3GD*1in};Et%pa!z_cq{gGN>#7nSsaq}qq+a<2ZsCfU`PQ0bK>A!Oy6XzZO zEg`&&K|-)H6Kmmn*m_7NkR+f5JJdyF@CA;O>w;WaBdMgw5i%t+l3>!jYkEhB{Vn94 pHTc@KrrbC!8pdhzBQ9OYoohe*qlqY^dCd469D^^b^3KL*(A?uE7%Q6rOdQ#BP!jpzGQwLcvhWhcx&nPV1^5t=-sO5Ta7m8z(NDICA36?D{9MgHp9+)i*oy z-pqU7d+*zEdWp#eX*)ejzmb6#dI_QF^LB)gSg8fAcZf!`i+*K}_S4h!7(F(#i2Qc2 zhunA!!sZFT4MsmjxzRM``+}OGCq0pAH0E)S!j7s&+gyIvN0jSR%9rvv4xabS;mEWN z!7vRkgkb%hjsB6!Ku8SfbIS4O+h~-spZs}r+=0G*;B=rzQx5d^l*9h`ccT!) 
zAzCnj}m!Tb`N+&WDU|GdO91lr6Fp$^`J=;qRnn$g=DF$rE7fjf;@te1)*Y=R_6GyLe zBQ<%0p#7L~eMb4dANT!3I&U668Ws~l7trdFu&myF{mKu>_w}(pEWFz2!}8eayXC|z z`VTC?pSykhL1>CX6!hK(G6G@;83h>w83%EII6;nqOn^*+Oo1E+c>!b^MbS!R%wFP0`3~Ir8IB&i>-s6o;kvg<_f5i9Rl<&?2&}Pu#Az_BK%A-b*d#o;2 zWR0hfA$szZ&H7YNq09^*qwS!^#JI|2>qtnp5Oh+KZpjOXez`1+soNApxH ze=1t~o@-ZKzfQRPK8Ncs=Lv*7bM7&N*CoF0{%vf}tLT!vvo2QErl!}u=Vq_$dAC3Z zGjEO8%d&TF)~oZH3<|xes<5gz_*w&&0Wz-|Fb$b=8P9{)-vxu|aGiLUEx-HSY$&oS zt?$>FxYiqO1IP0=bw#X7CWNMDw7Z6?uqlfT3{8_8yu>$n*uLkLb=a=xWqfvRHF5f^ z_j(_g{V2TT=Lfsk2c)NdDlE8E*XIC|m7f>Bm%&(;@<1d-Z!9j5vhT~etL{sAAPjtr zgJCS?fiO>2+<{2&nGZ&?l+QMjA`J#IbqC^fE2JN-aD=BdnQv5Ugg&Yw)c9H(v&U%j zwOX}>18>WD7#HW-`_%m=*CSg#ej-#8zc4|}g8_au|cUF6Q0SE&1!$v5$HhFM}5 zW(5R4?=Z{)6DurA7@w%c*iQzl-B=;kqJd ziAL!%oJ~f8Lb6q7<7tiO^7T?8+AMNG74H-Dd|ilxA3X-RAa*bskP9(lWA%KK=d#FuIssKxM#Y3K5RBA2U3$t9c<)Fa<4hN4@LyH0Xf^J_UJ zu)ylU{n&eT>f(aDs-hNcm~|={C2nV{5bD&6QHr5TTL=Y6Trrt|x+iuE;ao%ntWY~e z;vxo)4LEV$P~Tun&iCb}MHODlZ%T1uZVyYtS^<1broxHSI&V(%-*fH3>@i`zVIfD%5IQRN$M#HAI-|jz0>k-F{Ew_*k_#G$!xAQm)q$m z+&W2o^>pfrZy~(i#RH2kt(b_XSv-(SELOxn-@u3GD*1in};Et%pa!z_cq{gGN>#7nSsaq}qq+a<2ZsCfU`PQ0bK>A!Oy6XzZO zEg`&&K|-)H6Kmmn*m_7NkR+f5JJdyF@CA;O>w;WaBdMgw5i%t+l3>!jYkEhB{Vn94 pHTc@KrrbC!8pdhzBQ9OYoohe*qlqY^dCd469D^^b^3KL*(A?uE7%Q6dwO1v74lX(sgagLdK|;4{7jEoW>O(t=-sJhmgZ-M+gbkdYD4gIm$Il1;IQ{je8eR(2n(PGQuXO z&kAGEVhN(aU5l5R^2BD)xW_#PyJ{+Bbp>7TSzLE4fyIF1_-W59ghiAU!7u|agkb%h zm3m~!LO=}U3yb5Ax6zoze&NTl2?zT2vD1MLryc08X@~vSUz~3HM^8ZW4QL*KJOudx zT!75n3=^C>q(1 zP5AE6AI5`UJ^XY(^w^0|;L9;DWjAYK(`yY+ov78-14=MC7_iMyX#UfV~34;=l@+3M0JL3_{Q z`rH!udLr;U>AP|GbXW`oJwR)x!m@V%^_$6_*g)(cV<6)o6Ce%{C&(F)NsuX!X^^uZFM-T}ybQbnHj?KR^F7R)uEp&z zw;;@$VJ%os=FKN(`@FHrVwX1W+X8=11@27(ZH6755N1@XJW33?#~VUbQaS1jqNXld z%}@0r%6b7AWd{u=tYO1s=vwez8#JW5#ybHbas@qJ&#mY1^}pbc=9yUjOtcKV(5|{3 zOuB*rhwIRJ9wE=HdtB!O^3VGG z8mCI2&{~QDt45QnH(?nf^O_FRh%uM&Jbdsj7z~FS#JgnrJ?dstmK1U0sLsH((d_6r zp1-BZLQOOvv{b#*Gt~GkNoZnds?_8}uF1jneZQo^c3G?7v+HZAON;)40We2V_{q-? 
zc5wtqU;Rv2aH$Sv0g{=YAHJ92Sf=t=Bw1@N%#*So$+@Qp>e2l|kOy#jKPgmTb zNbs2tM>3VqHH z=oC#?=wyZH5{V=BHdewJ)6P_`8fzGd7h-25+ZShbPDR8+ABu$F#)hb?PQUS={Ppv#Cbz~ z!)>WBkeeEpIkm7QCW*aSF6OJ?BUI|uYd4-v;vD)hg>-!HBs@Las5$~K#H*!{+4C@8 zOvDDULVZHHt&ue(yx4?31Bd#x(K1R|Cb)))X+;*)R7b*`N@m&;o9Y0kql5=q zSrN3H&PV16Usu+aS}V!QZoyyaga6QzNrbM$f8qZHwH4x< literal 0 HcmV?d00001 diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_calls_vars_samples_minimal/input/qc/calls/chr1/input_c1_b1.tsv.gz b/tests/preprocessing/test_data/process_sparse_gt/filter_calls_vars_samples_minimal/input/qc/calls/chr1/input_c1_b1.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..799ada83f26bb58b694b36a734280f499e0dfd24 GIT binary patch literal 112 zcmV-$0FVD4iwFovyQX9S17&z)Y;|O1WM5-pY;1F1b#8QNWO*)hb9Mkr&L}eEG&Qs| zvfy;*bmTNNFaRR1WU#1-xdBMj86s*36E(3kK&Ul=iW*v2S^!l-Ma}V>U~X#683GhX SRf}S-1s4GF4nTaG0001b$0@e} literal 0 HcmV?d00001 diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_calls_vars_samples_minimal/input/variants.parquet b/tests/preprocessing/test_data/process_sparse_gt/filter_calls_vars_samples_minimal/input/variants.parquet new file mode 100644 index 0000000000000000000000000000000000000000..df779fb34a6a6da307133e7fc4234f7e1f7631fb GIT binary patch literal 4039 zcmcgvO>7%Q6dwO1v74lX(sgagLdK|;4{7jEoW>O(t=-sJhmgZ-M+gbkdYD4gIm$Il1;IQ{je8eR(2n(PGQuXO z&kAGEVhN(aU5l5R^2BD)xW_#PyJ{+Bbp>7TSzLE4fyIF1_-W59ghiAU!7u|agkb%h zm3m~!LO=}U3yb5Ax6zoze&NTl2?zT2vD1MLryc08X@~vSUz~3HM^8ZW4QL*KJOudx zT!75n3=^C>q(1 zP5AE6AI5`UJ^XY(^w^0|;L9;DWjAYK(`yY+ov78-14=MC7_iMyX#UfV~34;=l@+3M0JL3_{Q z`rH!udLr;U>AP|GbXW`oJwR)x!m@V%^_$6_*g)(cV<6)o6Ce%{C&(F)NsuX!X^^uZFM-T}ybQbnHj?KR^F7R)uEp&z zw;;@$VJ%os=FKN(`@FHrVwX1W+X8=11@27(ZH6755N1@XJW33?#~VUbQaS1jqNXld z%}@0r%6b7AWd{u=tYO1s=vwez8#JW5#ybHbas@qJ&#mY1^}pbc=9yUjOtcKV(5|{3 zOuB*rhwIRJ9wE=HdtB!O^3VGG z8mCI2&{~QDt45QnH(?nf^O_FRh%uM&Jbdsj7z~FS#JgnrJ?dstmK1U0sLsH((d_6r zp1-BZLQOOvv{b#*Gt~GkNoZnds?_8}uF1jneZQo^c3G?7v+HZAON;)40We2V_{q-? zc5wtqU;Rv2aH$Sv0g{=YAHJ92Sf=t=Bw1@N%#*So$+@Qp>e2l|kOy#jKPgmTb zNbs2tM>3VqHH z=oC#?=wyZH5{V=BHdewJ)6P_`8fzGd7h-25+ZShbPDR8+ABu$F#)hb?PQUS={Ppv#Cbz~ z!)>WBkeeEpIkm7QCW*aSF6OJ?BUI|uYd4-v;vD)hg>-!HBs@Las5$~K#H*!{+4C@8 zOvDDULVZHHt&ue(yx4?31Bd#x(K1R|Cb)))X+;*)R7b*`N@m&;o9Y0kql5=q zSrN3H&PV16Usu+aS}V!QZoyyaga6QzNrbM$f8qZHwH4x< literal 0 HcmV?d00001 diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_samples_minimal/input/variants.parquet b/tests/preprocessing/test_data/process_sparse_gt/filter_samples_minimal/input/variants.parquet new file mode 100644 index 0000000000000000000000000000000000000000..df779fb34a6a6da307133e7fc4234f7e1f7631fb GIT binary patch literal 4039 zcmcgvO>7%Q6dwO1v74lX(sgagLdK|;4{7jEoW>O(t=-sJhmgZ-M+gbkdYD4gIm$Il1;IQ{je8eR(2n(PGQuXO z&kAGEVhN(aU5l5R^2BD)xW_#PyJ{+Bbp>7TSzLE4fyIF1_-W59ghiAU!7u|agkb%h zm3m~!LO=}U3yb5Ax6zoze&NTl2?zT2vD1MLryc08X@~vSUz~3HM^8ZW4QL*KJOudx zT!75n3=^C>q(1 zP5AE6AI5`UJ^XY(^w^0|;L9;DWjAYK(`yY+ov78-14=MC7_iMyX#UfV~34;=l@+3M0JL3_{Q z`rH!udLr;U>AP|GbXW`oJwR)x!m@V%^_$6_*g)(cV<6)o6Ce%{C&(F)NsuX!X^^uZFM-T}ybQbnHj?KR^F7R)uEp&z zw;;@$VJ%os=FKN(`@FHrVwX1W+X8=11@27(ZH6755N1@XJW33?#~VUbQaS1jqNXld z%}@0r%6b7AWd{u=tYO1s=vwez8#JW5#ybHbas@qJ&#mY1^}pbc=9yUjOtcKV(5|{3 zOuB*rhwIRJ9wE=HdtB!O^3VGG z8mCI2&{~QDt45QnH(?nf^O_FRh%uM&Jbdsj7z~FS#JgnrJ?dstmK1U0sLsH((d_6r zp1-BZLQOOvv{b#*Gt~GkNoZnds?_8}uF1jneZQo^c3G?7v+HZAON;)40We2V_{q-? 
zc5wtqU;Rv2aH$Sv0g{=YAHJ92Sf=t=Bw1@N%#*So$+@Qp>e2l|kOy#jKPgmTb zNbs2tM>3VqHH z=oC#?=wyZH5{V=BHdewJ)6P_`8fzGd7h-25+ZShbPDR8+ABu$F#)hb?PQUS={Ppv#Cbz~ z!)>WBkeeEpIkm7QCW*aSF6OJ?BUI|uYd4-v;vD)hg>-!HBs@Las5$~K#H*!{+4C@8 zOvDDULVZHHt&ue(yx4?31Bd#x(K1R|Cb)))X+;*)R7b*`N@m&;o9Y0kql5=q zSrN3H&PV16Usu+aS}V!QZoyyaga6QzNrbM$f8qZHwH4x< literal 0 HcmV?d00001 diff --git a/tests/preprocessing/test_data/process_sparse_gt/filter_variants_minimal/input/variants.parquet b/tests/preprocessing/test_data/process_sparse_gt/filter_variants_minimal/input/variants.parquet new file mode 100644 index 0000000000000000000000000000000000000000..df779fb34a6a6da307133e7fc4234f7e1f7631fb GIT binary patch literal 4039 zcmcgvO>7%Q6dwO1v74lX(sgagLdK|;4{7jEoW>O(t=-sJhmgZ-M+gbkdYD4gIm$Il1;IQ{je8eR(2n(PGQuXO z&kAGEVhN(aU5l5R^2BD)xW_#PyJ{+Bbp>7TSzLE4fyIF1_-W59ghiAU!7u|agkb%h zm3m~!LO=}U3yb5Ax6zoze&NTl2?zT2vD1MLryc08X@~vSUz~3HM^8ZW4QL*KJOudx zT!75n3=^C>q(1 zP5AE6AI5`UJ^XY(^w^0|;L9;DWjAYK(`yY+ov78-14=MC7_iMyX#UfV~34;=l@+3M0JL3_{Q z`rH!udLr;U>AP|GbXW`oJwR)x!m@V%^_$6_*g)(cV<6)o6Ce%{C&(F)NsuX!X^^uZFM-T}ybQbnHj?KR^F7R)uEp&z zw;;@$VJ%os=FKN(`@FHrVwX1W+X8=11@27(ZH6755N1@XJW33?#~VUbQaS1jqNXld z%}@0r%6b7AWd{u=tYO1s=vwez8#JW5#ybHbas@qJ&#mY1^}pbc=9yUjOtcKV(5|{3 zOuB*rhwIRJ9wE=HdtB!O^3VGG z8mCI2&{~QDt45QnH(?nf^O_FRh%uM&Jbdsj7z~FS#JgnrJ?dstmK1U0sLsH((d_6r zp1-BZLQOOvv{b#*Gt~GkNoZnds?_8}uF1jneZQo^c3G?7v+HZAON;)40We2V_{q-? zc5wtqU;Rv2aH$Sv0g{=YAHJ92Sf=t=Bw1@N%#*So$+@Qp>e2l|kOy#jKPgmTb zNbs2tM>3VqHH z=oC#?=wyZH5{V=BHdewJ)6P_`8fzGd7h-25+ZShbPDR8+ABu$F#)hb?PQUS={Ppv#Cbz~ z!)>WBkeeEpIkm7QCW*aSF6OJ?BUI|uYd4-v;vD)hg>-!HBs@Las5$~K#H*!{+4C@8 zOvDDULVZHHt&ue(yx4?31Bd#x(K1R|Cb)))X+;*)R7b*`N@m&;o9Y0kql5=q zSrN3H&PV16Usu+aS}V!QZoyyaga6QzNrbM$f8qZHwH4x< literal 0 HcmV?d00001 diff --git a/tests/preprocessing/test_data/process_sparse_gt/no_filters_minimal/input/variants.parquet b/tests/preprocessing/test_data/process_sparse_gt/no_filters_minimal/input/variants.parquet new file mode 100644 index 0000000000000000000000000000000000000000..df779fb34a6a6da307133e7fc4234f7e1f7631fb GIT binary patch literal 4039 zcmcgvO>7%Q6dwO1v74lX(sgagLdK|;4{7jEoW>O(t=-sJhmgZ-M+gbkdYD4gIm$Il1;IQ{je8eR(2n(PGQuXO z&kAGEVhN(aU5l5R^2BD)xW_#PyJ{+Bbp>7TSzLE4fyIF1_-W59ghiAU!7u|agkb%h zm3m~!LO=}U3yb5Ax6zoze&NTl2?zT2vD1MLryc08X@~vSUz~3HM^8ZW4QL*KJOudx zT!75n3=^C>q(1 zP5AE6AI5`UJ^XY(^w^0|;L9;DWjAYK(`yY+ov78-14=MC7_iMyX#UfV~34;=l@+3M0JL3_{Q z`rH!udLr;U>AP|GbXW`oJwR)x!m@V%^_$6_*g)(cV<6)o6Ce%{C&(F)NsuX!X^^uZFM-T}ybQbnHj?KR^F7R)uEp&z zw;;@$VJ%os=FKN(`@FHrVwX1W+X8=11@27(ZH6755N1@XJW33?#~VUbQaS1jqNXld z%}@0r%6b7AWd{u=tYO1s=vwez8#JW5#ybHbas@qJ&#mY1^}pbc=9yUjOtcKV(5|{3 zOuB*rhwIRJ9wE=HdtB!O^3VGG z8mCI2&{~QDt45QnH(?nf^O_FRh%uM&Jbdsj7z~FS#JgnrJ?dstmK1U0sLsH((d_6r zp1-BZLQOOvv{b#*Gt~GkNoZnds?_8}uF1jneZQo^c3G?7v+HZAON;)40We2V_{q-? 
zc5wtqU;Rv2aH$Sv0g{=YAHJ92Sf=t=Bw1@N%#*So$+@Qp>e2l|kOy#jKPgmTb
zNbs2tM>3VqHH
z=oC#?=wyZH5{V=BHdewJ)6P_`8fzGd7h-25+ZShbPDR8+ABu$F#)hb?PQUS={Ppv#Cbz~
z!)>WBkeeEpIkm7QCW*aSF6OJ?BUI|uYd4-v;vD)hg>-!HBs@Las5$~K#H*!{+4C@8
zOvDDULVZHHt&ue(yx4?31Bd#x(K1R|Cb)))X+;*)R7b*`N@m&;o9Y0kql5=q
zSrN3H&PV16Usu+aS}V!QZoyyaga6QzNrbM$f8qZHwH4x<

literal 0
HcmV?d00001

diff --git a/tests/preprocessing/test_preprocess.py b/tests/preprocessing/test_preprocess.py
index dfd3a152..d1cda827 100644
--- a/tests/preprocessing/test_preprocess.py
+++ b/tests/preprocessing/test_preprocess.py
@@ -86,7 +86,7 @@ def test_process_sparse_gt_file(
 
     test_data_input_dir = current_test_data_dir / "input"
 
-    variant_file = test_data_input_dir / "variants.tsv.gz"
+    variant_file = test_data_input_dir / "variants.parquet"
     samples_file = test_data_input_dir / "samples_chr.csv"
     sparse_gt_dir = test_data_input_dir / "sparse_gt"
 
@@ -99,13 +99,15 @@
     cli_parameters = [
         "process-sparse-gt",
         *extra_cli_params,
+        "--chunksize",
+        "3",
         variant_file.as_posix(),
         samples_file.as_posix(),
         sparse_gt_dir.as_posix(),
         out_file_base.as_posix(),
     ]
 
-    result = cli_runner.invoke(preprocess_cli, cli_parameters)
+    result = cli_runner.invoke(preprocess_cli, cli_parameters, catch_exceptions=False)
     assert result.exit_code == 0
 
     h5_file = out_file_base.as_posix().replace("genotypes", genotype_file_name)
@@ -149,11 +151,13 @@ def test_combine_genotypes(test_data_name_dir, input_h5, result_h5, tmp_path):
 
     cli_parameters = [
         "combine-genotypes",
+        "--chunksize",
+        3,
         *[(test_data_input_dir / h5f).as_posix() for h5f in input_h5],
         combined_output_h5.as_posix(),
     ]
 
-    result = cli_runner.invoke(preprocess_cli, cli_parameters)
+    result = cli_runner.invoke(preprocess_cli, cli_parameters, catch_exceptions=False)
     assert result.exit_code == 0
 
     written_samples, written_variant_matrix, written_genotype_matrix = load_h5_archive(
@@ -268,7 +272,7 @@ def test_process_and_combine_sparse_gt(
 
     test_data_input_dir = current_test_data_dir / "input"
 
-    variant_file = test_data_input_dir / "variants.tsv.gz"
+    variant_file = test_data_input_dir / "variants.parquet"
     samples_file = test_data_input_dir / "samples_chr.csv"
     sparse_gt_dir = test_data_input_dir / "sparse_gt"
 
@@ -282,22 +286,30 @@
     cli_parameters_process = [
         "process-sparse-gt",
         *extra_cli_params,
+        "--chunksize",
+        "3",
         variant_file.as_posix(),
         samples_file.as_posix(),
         sparse_gt_dir.as_posix(),
         out_file_base.as_posix(),
     ]
 
-    result_process = cli_runner.invoke(preprocess_cli, cli_parameters_process)
+    result_process = cli_runner.invoke(
+        preprocess_cli, cli_parameters_process, catch_exceptions=False
+    )
    assert result_process.exit_code == 0
 
     cli_parameters_combine = [
         "combine-genotypes",
+        "--chunksize",
+        3,
         *[(preprocessed_dir / h5f).as_posix() for h5f in input_h5],
         combined_output_h5.as_posix(),
     ]
 
-    result_combine = cli_runner.invoke(preprocess_cli, cli_parameters_combine)
+    result_combine = cli_runner.invoke(
+        preprocess_cli, cli_parameters_combine, catch_exceptions=False
+    )
     assert result_combine.exit_code == 0
 
     written_samples, written_variant_matrix, written_genotype_matrix = load_h5_archive(