Skip to content

Commit

Permalink
write chunks separately
Browse files Browse the repository at this point in the history
  • Loading branch information
bfclarke committed Oct 4, 2023
1 parent dc6232e commit 7d07db2
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 34 deletions.
49 changes: 32 additions & 17 deletions deeprvat/deeprvat/associate.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def compute_burdens_(
skip_burdens: bool = False,
) -> Tuple[np.ndarray, zarr.core.Array, zarr.core.Array, zarr.core.Array]:
if not skip_burdens:
logger.info("agg_models[*][*].reverse:")
print("agg_models[*][*].reverse:")
pprint(
{
repeat: [m.reverse for m in models]
Expand All @@ -171,7 +171,7 @@ def compute_burdens_(
n_samples = len(samples)
ds = Subset(ds, samples)

logger.info(f"Processing samples in {samples} from {n_total_samples} in total")
print(f"Processing samples in {samples} from {n_total_samples} in total")
else:
n_samples = n_total_samples
chunk_start = 0
Expand All @@ -182,12 +182,12 @@ def compute_burdens_(
if torch.cuda.is_available():
pin_memory = dataloader_config.get("pin_memory", True)

logger.info(f"CUDA is available, setting pin_memory={pin_memory}")
print(f"CUDA is available, setting pin_memory={pin_memory}")
dataloader_config["pin_memory"] = pin_memory

dl = DataLoader(ds, collate_fn=collate_fn, **dataloader_config)

logger.info("Computing burden scores")
print("Computing burden scores")
batch_size = data_config["dataloader_config"]["batch_size"]
with torch.no_grad():
for i, batch in tqdm(
Expand All @@ -204,33 +204,33 @@ def compute_burdens_(
chunk_y = np.zeros(shape=(n_samples,) + this_y.shape[1:])
chunk_x = np.zeros(shape=(n_samples,) + this_x.shape[1:])

logger.info(f"Batch size: {batch['rare_variant_annotations'].shape}")
print(f"Batch size: {batch['rare_variant_annotations'].shape}")

if not skip_burdens:
burdens = zarr.open(
Path(cache_dir) / "burdens.zarr",
mode="a",
shape=(n_total_samples,) + this_burdens.shape[1:],
shape=(n_samples,) + this_burdens.shape[1:],
chunks=(1000, 1000),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
logger.info(f"burdens shape: {burdens.shape}")
print(f"burdens shape: {burdens.shape}")
else:
burdens = None

y = zarr.open(
Path(cache_dir) / "y.zarr",
mode="a",
shape=(n_total_samples,) + this_y.shape[1:],
shape=(n_samples,) + this_y.shape[1:],
chunks=(None, None),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
x = zarr.open(
Path(cache_dir) / "x.zarr",
mode="a",
shape=(n_total_samples,) + this_x.shape[1:],
shape=(n_samples,) + this_x.shape[1:],
chunks=(None, None),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
Expand All @@ -253,14 +253,22 @@ def compute_burdens_(
if bottleneck and i > 20:
break

if i % 100 == 99:
print(f"Finished {i + 1} batches")

write_slice = slice(0, chunk_end - chunk_start)
print(f"Writing to slice: {write_slice}")
if not skip_burdens:
burdens[chunk_start:chunk_end] = chunk_burden
burdens[write_slice] = chunk_burden

y[write_slice] = chunk_y
x[write_slice] = chunk_x

y[chunk_start:chunk_end] = chunk_y
x[chunk_start:chunk_end] = chunk_x
# Record the chunk_start:chunk_end sample range handled by this chunk.
with open(f"chunk_{chunk}_samples.txt", "w") as f:
    # Must be an f-string: a plain literal would write the placeholder
    # text "{chunk_start}:{chunk_end}" instead of the actual indices.
    f.write(f"{chunk_start}:{chunk_end}")

if torch.cuda.is_available():
logger.info(
print(
"Max GPU memory allocated: " f"{torch.cuda.max_memory_allocated(0)} bytes"
)

Expand Down Expand Up @@ -440,17 +448,17 @@ def compute_burdens(
model_config = yaml.safe_load(f)

if dataset_file is not None:
logger.info("Loading pickled dataset")
print("Loading pickled dataset")
with open(dataset_file, "rb") as f:
dataset = pickle.load(f)
else:
dataset = make_dataset_(config)

if torch.cuda.is_available():
logger.info("Using GPU")
print("Using GPU")
device = torch.device("cuda")
else:
logger.info("Using CPU")
print("Using CPU")
device = torch.device("cpu")

if link_burdens is None:
Expand All @@ -471,14 +479,21 @@ def compute_burdens(
skip_burdens=(link_burdens is not None),
)

logger.info("Saving computed burdens, corresponding genes, and targets")
print("Saving computed burdens, corresponding genes, and targets")
np.save(Path(out_dir) / "genes.npy", genes)
if link_burdens is not None:
source_path = Path(out_dir) / "burdens.zarr"
source_path.unlink(missing_ok=True)
source_path.symlink_to(link_burdens)


@cli.command()
@click.option(
    "--chunk",
    type=(click.Path(exists=True), click.Path(exists=True)),
    multiple=True,
)
@click.argument("out_dir", type=click.Path(exists=True))
def combine_burden_chunks(chunk: Tuple[Tuple[str, str], ...], out_dir: str):
    """Combine separately written burden chunks (not yet implemented).

    Args:
        chunk: One entry per ``--chunk`` occurrence on the command line.
            Each entry is a pair of existing paths, as declared by the
            two-element click option type.
            NOTE(review): what each of the two paths refers to is not
            established by this stub -- confirm when implementing.
        out_dir: Existing directory, presumably where the combined result
            should be written -- TODO confirm.
    """
    # Placeholder: the command is registered but performs no work yet.
    pass


def regress_on_gene_scoretest(gene: str, burdens: np.ndarray, model_score):
burdens = burdens.reshape(burdens.shape[0], -1)
logger.info(f"Burdens shape: {burdens.shape}")
Expand Down
4 changes: 2 additions & 2 deletions dnanexus/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -159,5 +159,5 @@ data:
low_memory: True
verbose: True
dataloader_config:
batch_size: 16
num_workers: 16
batch_size: 8
num_workers: 4
16 changes: 13 additions & 3 deletions dnanexus/deeprvat_compute_burdens/dxapp.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,18 @@
"version": "0.0.1",
"inputSpec": [
{
"name": "config",
"label": "config",
"name": "n_chunks",
"label": "n_chunks",
"class": "string",
"optional": false,
"patterns": [
"*"
],
"help": ""
},
{
"name": "chunk",
"label": "chunk",
"class": "string",
"optional": false,
"patterns": [
Expand All @@ -30,7 +40,7 @@
"runSpec": {
"timeoutPolicy": {
"*": {
"hours": 1
"hours": 10
}
},
"interpreter": "bash",
Expand Down
23 changes: 11 additions & 12 deletions dnanexus/deeprvat_compute_burdens/src/run.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
main() {
BASE=/mnt/project/DeepRVAT/DeepRVAT
WORKDIR=workdir/pretrained_scoring_debug # TODO: Change
WORKDIR=workdir/pretrained_scoring # TODO: Change

echo "Mounting via dxfuse"
mkdir -pv /mnt/project
Expand All @@ -25,31 +25,30 @@ main() {

echo "Downloading data"
echo "dx download DeepRVAT/workdir/preprocessed/genotypes.h5"
# cp $BASE/workdir/preprocessed/genotypes.h5 .
cp $BASE/data/genotypes-head1000.h5 .
mv genotypes-head1000.h5 genotypes.h5
cp $BASE/workdir/preprocessed/genotypes.h5 .
# cp $BASE/data/genotypes-head1000.h5 .
# mv genotypes-head1000.h5 genotypes.h5
echo "dx download DeepRVAT/data/variants_90pct10dp_qc.parquet"
cp $BASE/data/variants_90pct10dp_qc.parquet .
echo "dx download DeepRVAT/data/phenotypes.parquet"
# cp $BASE/data/phenotypes.parquet .
cp $BASE/data/phenotypes-head1000.parquet .
mv phenotypes-head1000.parquet phenotypes.parquet
cp $BASE/data/phenotypes.parquet .
# cp $BASE/data/phenotypes-head1000.parquet .
# mv phenotypes-head1000.parquet phenotypes.parquet
echo "dx download DeepRVAT/data/annotations.parquet"
cp $BASE/data/annotations.parquet .
echo "dx download DeepRVAT/data/protein_coding_genes.parquet"
cp $BASE/data/protein_coding_genes.parquet .

echo "Executing command: $command using config $config"
echo "dx download $config"
cp /mnt/project/DeepRVAT/$config .
echo "Run deeprvat_associate compute-burdens"
mkdir -p Calcium/deeprvat/burdens
python deeprvat/deeprvat/deeprvat/associate.py compute-burdens --debug \
mkdir -p Calcium/deeprvat/burdens_chunk_$chunk
python deeprvat/deeprvat/deeprvat/associate.py compute-burdens \
--n-chunks $n_chunks --chunk $chunk \
--dataset-file $BASE/$WORKDIR/Calcium/deeprvat/association_dataset.pkl \
$BASE/workdir/pretrained_scoring/Calcium/deeprvat/hpopt_config.yaml \
$BASE/workdir/pretrained_scoring/pretrained_models/config.yaml \
$BASE/workdir/pretrained_scoring/pretrained_models/repeat_0/best/bag_0.ckpt $BASE/workdir/pretrained_scoring/pretrained_models/repeat_1/best/bag_0.ckpt $BASE/workdir/pretrained_scoring/pretrained_models/repeat_2/best/bag_0.ckpt $BASE/workdir/pretrained_scoring/pretrained_models/repeat_3/best/bag_0.ckpt $BASE/workdir/pretrained_scoring/pretrained_models/repeat_4/best/bag_0.ckpt $BASE/workdir/pretrained_scoring/pretrained_models/repeat_5/best/bag_0.ckpt \
Calcium/deeprvat/burdens
Calcium/deeprvat/burdens_chunk_$chunk

echo "Uploading outputs"
echo "rm config.yaml"
Expand Down

0 comments on commit 7d07db2

Please sign in to comment.