Skip to content

Commit

Permalink
Fix assert in process_sparse_gt (#95)
Browse files: browse the repository at this point in the history
* Fix assert in process_sparse_gt

* Add filter_variants_multiple test

* Fix path to test data
  • Loading branch information
endast authored May 22, 2024
1 parent aa5c87a commit 6198999
Show file tree
Hide file tree
Showing 9 changed files with 25 additions and 5 deletions.
9 changes: 4 additions & 5 deletions deeprvat/preprocessing/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ def process_sparse_gt(
if chromosomes is not None:
chromosomes = [f"chr{chrom}" for chrom in chromosomes.split(",")]
variants = variants[variants["chrom"].isin(chromosomes)]

total_variants = len(variants)
if len(exclude_variants) > 0:
variant_exclusion_files = [
Expand All @@ -267,17 +268,15 @@ def process_sparse_gt(
],
ignore_index=True,
)
if chromosomes is not None:
variants_to_exclude = variants_to_exclude[
variants_to_exclude["chrom"].isin(chromosomes)
]

variants_to_exclude = variants_to_exclude.drop_duplicates(ignore_index=True)
variant_ids_to_exclude = pd.merge(
variants_to_exclude, variants, validate="1:1"
)["id"]

variants = variants[~variants["id"].isin(variant_ids_to_exclude)]
if not skip_sanity_checks:
assert total_variants - len(variants) == len(variants_to_exclude)
assert total_variants - len(variants) == len(variant_ids_to_exclude)

logging.info(f"Dropped {total_variants - len(variants)} variants")
logging.info(f"...done ({time.time() - start_time} s)")
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
100096
100097
100099
100100
100101
100102
100103
100104
100105
100106
100107
Binary file not shown.
Binary file not shown.
Binary file not shown.
10 changes: 10 additions & 0 deletions tests/preprocessing/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,16 @@ def load_h5_archive(h5_path):
],
"genotypes_chr1.h5",
),
(
"filter_variants_multiple",
[
"--chromosomes",
"1",
"--exclude-variants",
f"{(tests_data_dir / 'process_sparse_gt/filter_variants_multiple/input/qc').as_posix()}",
],
"genotypes_chr1.h5",
),
(
"filter_samples_minimal",
[
Expand Down

0 comments on commit 6198999

Please sign in to comment.