Skip to content

Commit

Permalink
Add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
endast committed May 7, 2024
1 parent b539cd3 commit c28b69d
Show file tree
Hide file tree
Showing 2 changed files with 312 additions and 0 deletions.
130 changes: 130 additions & 0 deletions deeprvat/deeprvat/associate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1060,6 +1060,136 @@ def compute_burdens(
source_path.symlink_to(link_burdens)



@cli.command()
@click.option("--n-chunks", type=int, required=True)
@click.option("--skip-burdens", is_flag=True, default=False)
@click.option("--overwrite", is_flag=True, default=False)
@click.argument("burdens-chunks-dir", type=click.Path(exists=True))
@click.argument("result-dir", type=click.Path(exists=True))
def combine_burden_chunks(
    n_chunks: int,
    skip_burdens: bool,
    overwrite: bool,
    burdens_chunks_dir: Path,
    result_dir: Path,
):
    """CLI entry point: merge per-chunk zarr outputs into combined stores.

    Thin wrapper that normalizes the click path arguments to
    ``pathlib.Path`` and delegates to :func:`combine_burden_chunks_`.
    """
    chunks_dir = Path(burdens_chunks_dir)
    out_dir = Path(result_dir)
    combine_burden_chunks_(
        n_chunks=n_chunks,
        skip_burdens=skip_burdens,
        overwrite=overwrite,
        burdens_chunks_dir=chunks_dir,
        result_dir=out_dir,
    )


def combine_burden_chunks_(
    n_chunks: int,
    burdens_chunks_dir: Path,
    skip_burdens: bool,
    overwrite: bool,
    result_dir: Path,
):
    """Concatenate per-chunk zarr outputs into single combined zarr arrays.

    Reads the ``chunk_0`` .. ``chunk_{n_chunks-1}`` subdirectories of
    *burdens_chunks_dir* — each holding ``x.zarr``, ``y.zarr``,
    ``sample_ids.zarr`` and (unless *skip_burdens*) ``burdens.zarr`` —
    and writes combined arrays of the same names into *result_dir*.
    Chunks are copied in order into consecutive row slices of the output.

    :param n_chunks: Number of chunk directories to merge.
    :param burdens_chunks_dir: Directory containing the ``chunk_*`` subdirs.
    :param skip_burdens: If True, do not merge the (large) burdens array.
    :param overwrite: If False, assert that the output burdens store does
        not already exist before writing.
    :param result_dir: Output directory for the combined zarr stores.
    """
    compression_level = 1
    burdens_chunks_dir = Path(burdens_chunks_dir)

    burdens, x, y, sample_ids = None, None, None, None
    end_id = 0

    # `chunk` is both the loop index and the chunk number; the original
    # `enumerate(range(0, n_chunks))` produced two identical variables.
    for chunk in tqdm(range(n_chunks), desc=f"Merging {n_chunks} chunks"):
        chunk_dir = burdens_chunks_dir / f"chunk_{chunk}"

        if not skip_burdens:
            burdens_chunk = zarr.open((chunk_dir / "burdens.zarr").as_posix(), mode="r")
            assert burdens_chunk.attrs["chunk"] == chunk

        y_chunk = zarr.open((chunk_dir / "y.zarr").as_posix(), mode="r")

        x_chunk = zarr.open((chunk_dir / "x.zarr").as_posix(), mode="r")
        sample_ids_chunk = zarr.open(
            (chunk_dir / "sample_ids.zarr").as_posix(), mode="r"
        )

        # Every chunk carries the total sample count across all chunks,
        # used to size the combined output arrays.
        total_samples = sample_ids_chunk.attrs["n_total_samples"]

        assert y_chunk.attrs["chunk"] == chunk
        assert x_chunk.attrs["chunk"] == chunk
        assert sample_ids_chunk.attrs["chunk"] == chunk

        burdens_path = result_dir / "burdens.zarr"
        x_path = result_dir / "x.zarr"
        y_path = result_dir / "y.zarr"
        sample_ids_path = result_dir / "sample_ids.zarr"

        if chunk == 0:
            # First chunk: create the output stores, sized for all samples.
            if not skip_burdens:
                burdens_shape = (total_samples,) + burdens_chunk.shape[1:]

                if not overwrite:
                    # Guard against silently appending into a stale store.
                    assert not burdens_path.exists()
                else:
                    logger.debug("Overwriting existing files")

                logger.debug(f"Opening {burdens_path} in append mode")
                burdens = zarr.open(
                    burdens_path.as_posix(),
                    mode="a",
                    shape=burdens_shape,
                    chunks=(1000, 1000, 1),
                    dtype=np.float32,
                    compressor=Blosc(clevel=compression_level),
                )
                assert burdens_path.exists()

            logger.debug(f"Opening {y_path} in append mode")
            y = zarr.open(
                y_path,
                mode="a",
                shape=(total_samples,) + y_chunk.shape[1:],
                chunks=(None, None),
                dtype=np.float32,
                compressor=Blosc(clevel=compression_level),
            )
            logger.debug(f"Opening {x_path} in append mode")
            x = zarr.open(
                x_path,
                mode="a",
                shape=(total_samples,) + x_chunk.shape[1:],
                chunks=(None, None),
                dtype=np.float32,
                compressor=Blosc(clevel=compression_level),
            )
            logger.debug(f"Opening {sample_ids_path} in append mode")
            # NOTE(review): sample IDs are stored as float32, which loses
            # precision for IDs above 2**24 — confirm this is intended.
            sample_ids = zarr.open(
                sample_ids_path,
                mode="a",
                # 1-tuple shape: the original "(total_samples)" was a bare int.
                shape=(total_samples,),
                # (None,) means one chunk spanning the full axis, matching the
                # (None, None) chunking of x/y. The original "(None)" was the
                # scalar None (zarr auto-chunking) — almost certainly a typo.
                chunks=(None,),
                dtype=np.float32,
                compressor=Blosc(clevel=compression_level),
            )

            assert x_path.exists()
            assert y_path.exists()
            assert sample_ids_path.exists()

        # Copy this chunk's rows into the consecutive slice it occupies.
        start_id = end_id
        end_id += len(sample_ids_chunk)

        y[start_id:end_id] = y_chunk[:]
        x[start_id:end_id] = x_chunk[:]
        sample_ids[start_id:end_id] = sample_ids_chunk[:]

        if not skip_burdens:
            burdens[start_id:end_id] = burdens_chunk[:]

    logger.info(f"Done merging {n_chunks} chunks.")



def regress_on_gene_scoretest(
gene: str,
burdens: np.ndarray,
Expand Down
182 changes: 182 additions & 0 deletions tests/deeprvat/test_associate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
import zipfile
from pathlib import Path

import numpy as np
import pytest
import zarr

from deeprvat.deeprvat.associate import combine_burden_chunks_

script_dir = Path(__file__).resolve().parent
tests_data_dir = script_dir / "test_data" / "associate"


def open_zarr(zarr_path: Path):
    """Open the zarr store at *zarr_path* read-only and return it."""
    return zarr.open(zarr_path.as_posix(), mode="r")


def unzip_data(zip_path, out_path):
    """Extract the zip archive at *zip_path* into *out_path*.

    Returns *out_path* for convenient chaining.
    """
    archive = zipfile.ZipFile(zip_path, "r")
    try:
        archive.extractall(out_path)
    finally:
        archive.close()
    return out_path


@pytest.fixture
def chunks_data(request, tmp_path) -> Path:
    """Unpack the zipped chunk directories (path given via ``request.param``)
    into a temporary ``chunks`` directory and yield its path."""
    target = tmp_path / "chunks"
    unzip_data(zip_path=Path(request.param), out_path=target)

    yield target


@pytest.fixture
def expected_array(request, tmp_path) -> Path:
    """Unpack the zipped expected-output arrays (path given via
    ``request.param``) into a temporary ``expected`` directory and yield it."""
    target = tmp_path / "expected"
    unzip_data(zip_path=Path(request.param), out_path=target)

    yield target


@pytest.mark.parametrize(
    "n_chunks, skip_burdens, overwrite, chunks_data, expected_array",
    [
        (
            n_chunks,
            False,
            False,
            tests_data_dir / f"combine_burden_chunks/input/chunks_{n_chunks}.zip",
            tests_data_dir / f"combine_burden_chunks/expected/burdens_{n_chunks}.zip",
        )
        for n_chunks in range(2, 5)
    ]
    + [
        (
            n_chunks,
            True,
            True,
            tests_data_dir / f"combine_burden_chunks/input/chunks_{n_chunks}.zip",
            tests_data_dir / f"combine_burden_chunks/expected/burdens_{n_chunks}.zip",
        )
        for n_chunks in range(2, 5)
    ]
    + [
        (
            n_chunks,
            True,
            False,
            tests_data_dir / f"combine_burden_chunks/input/chunks_{n_chunks}.zip",
            tests_data_dir / f"combine_burden_chunks/expected/burdens_{n_chunks}.zip",
        )
        for n_chunks in range(2, 5)
    ]
    + [
        (
            n_chunks,
            False,
            True,
            tests_data_dir / f"combine_burden_chunks/input/chunks_{n_chunks}.zip",
            tests_data_dir / f"combine_burden_chunks/expected/burdens_{n_chunks}.zip",
        )
        for n_chunks in range(2, 5)
    ],
    indirect=["chunks_data", "expected_array"],
)
def test_combine_burden_chunks_data_same(
    n_chunks,
    skip_burdens,
    overwrite,
    tmp_path,
    chunks_data,
    expected_array,
):
    """Merged arrays must match the pre-computed expected arrays exactly."""
    combine_burden_chunks_(
        n_chunks=n_chunks,
        burdens_chunks_dir=chunks_data,
        skip_burdens=skip_burdens,
        overwrite=overwrite,
        result_dir=tmp_path,
    )

    zarr_files = ["x.zarr", "y.zarr", "sample_ids.zarr", "burdens.zarr"]
    if skip_burdens:
        zarr_files.remove("burdens.zarr")

    for zarr_file in zarr_files:

        expected_data = open_zarr(zarr_path=(expected_array / zarr_file))
        written_data = open_zarr(zarr_path=(tmp_path / zarr_file))
        expected_data_arr, written_data_arr = expected_data[:], written_data[:]
        assert written_data_arr.dtype == expected_data.dtype
        assert expected_data_arr.shape == written_data_arr.shape
        assert np.array_equal(expected_data_arr, written_data_arr, equal_nan=True)

        # Sanity check: no more than 10% zeros. Divide by .size (total number
        # of elements) — the original divided by len(), which is only the
        # first-axis length and inflates the ratio for multi-dim arrays.
        nr_zeros = np.count_nonzero(written_data_arr == 0)
        zero_percentage = nr_zeros / written_data_arr.size
        assert zero_percentage < 0.1


@pytest.mark.parametrize(
    "n_chunks, skip_burdens, overwrite, chunks_data",
    [
        (
            n_chunks,
            False,
            False,
            tests_data_dir / f"combine_burden_chunks/input/chunks_{n_chunks}.zip",
        )
        for n_chunks in range(2, 5)
    ]
    + [
        (
            n_chunks,
            True,
            True,
            tests_data_dir / f"combine_burden_chunks/input/chunks_{n_chunks}.zip",
        )
        for n_chunks in range(2, 5)
    ]
    + [
        (
            n_chunks,
            True,
            False,
            tests_data_dir / f"combine_burden_chunks/input/chunks_{n_chunks}.zip",
        )
        for n_chunks in range(2, 5)
    ]
    + [
        (
            n_chunks,
            False,
            True,
            tests_data_dir / f"combine_burden_chunks/input/chunks_{n_chunks}.zip",
        )
        for n_chunks in range(2, 5)
    ],
    indirect=["chunks_data"],
)
def test_combine_burden_chunks_file_exists(
    n_chunks, skip_burdens, overwrite, tmp_path, chunks_data
):
    """All output stores must exist; burdens only when not skipped."""
    combine_burden_chunks_(
        n_chunks=n_chunks,
        burdens_chunks_dir=chunks_data,
        skip_burdens=skip_burdens,
        overwrite=overwrite,
        result_dir=tmp_path,
    )

    assert (tmp_path / "burdens.zarr").exists() == (not skip_burdens)
    for store_name in ("x.zarr", "y.zarr", "sample_ids.zarr"):
        assert (tmp_path / store_name).exists()

0 comments on commit c28b69d

Please sign in to comment.