
Commit

Merge branch 'main' into deepripe-batch
endast committed Oct 18, 2023
2 parents c7a21dc + 955f535 commit 66eb97c
Showing 96 changed files with 245 additions and 62 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/autoblack_pull_request.yml
@@ -0,0 +1,35 @@
# GitHub Action that uses Black to reformat the Python code in an incoming pull request.
# If all Python code in the pull request is compliant with Black then this Action does nothing.
# Otherwise, Black is run and its changes are committed back to the incoming pull request.
# https://github.com/cclauss/autoblack

name: autoblack_pull_request
on: [ pull_request ]
jobs:
  black-code:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          ref: ${{ github.head_ref }}
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - run: pip install black
      - run: black --check .
      - name: If needed, commit black changes to the pull request
        if: failure()
        run: |
          printenv | grep GITHUB
          git config --global user.name 'PMBio'
          git config --global user.email 'PMBio@users.noreply.github.com'
          git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/$GITHUB_REPOSITORY
          git remote -v
          git branch
          git status
          black .
          git status
          echo ready to commit
          git commit -am "fixup! Format Python code with psf/black pull_request"
          echo ready to push
          git push
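
The job commits back to the PR branch only when `black --check` fails. A minimal local sketch of the same check-then-format flow, so the bot has nothing left to fix (assumes `black` is installed on PATH; this script is illustrative and not part of the repository):

```python
import subprocess

# Mirror the CI job: check formatting first, reformat only on violations.
result = subprocess.run(["black", "--check", "."])
if result.returncode != 0:
    subprocess.run(["black", "."], check=True)
    print("Files reformatted; review and commit the changes.")
```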
4 changes: 3 additions & 1 deletion README.md
@@ -10,7 +10,9 @@ Rare variant association testing using deep learning and data-driven burden scores
git clone git@github.com:PMBio/deeprvat.git
```
1. Change directory to the repository: `cd deeprvat`
1. Install the conda environment. We recommend using `mamba`, though you may also replace `mamba` with `conda`:
1. Install the conda environment. We recommend using [mamba](https://mamba.readthedocs.io/en/latest/index.html), though you may also replace `mamba` with `conda`

*Note: [the current deeprvat env does not support CUDA when installed with conda](https://github.com/PMBio/deeprvat/issues/16); install with mamba for CUDA support.*
```
mamba env create -n deeprvat -f deeprvat_env.yaml
```
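
Given the CUDA caveat added above, a quick post-install sanity check (assuming the environment provides PyTorch, which DeepRVAT uses; this snippet is not part of the README):

```python
import torch

# Should print True when the mamba-installed environment can see a GPU.
print(torch.cuda.is_available())
```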
15 changes: 14 additions & 1 deletion deeprvat/data/dense_gt.py
@@ -409,7 +409,18 @@ def transform_data(self):
                    self.phenotype_df[col] = rng.permutation(
                        self.phenotype_df[col].to_numpy()
                    )

        if len(self.y_phenotypes) > 0:
            unique_y_val = self.phenotype_df[self.y_phenotypes[0]].unique()
            n_unique_y_val = np.count_nonzero(~np.isnan(unique_y_val))
            logger.info(f"unique y values {unique_y_val}")
            logger.info(n_unique_y_val)
        else:
            n_unique_y_val = 0
        if n_unique_y_val == 2:
            logger.warning(
                "Not applying y transformation because y only has two values and seems to be binary"
            )
            self.y_transformation = None
        if self.y_transformation is not None:
            if self.y_transformation == "standardize":
                logger.debug(" Standardizing target phenotype")
@@ -425,6 +436,8 @@
                )
            else:
                raise ValueError(f"Unknown y_transformation: {self.y_transformation}")
        else:
            logger.warning("Not transforming phenotype")

    def setup_annotations(
        self,
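The added block counts distinct non-NaN values of the first target phenotype and disables `y_transformation` when exactly two remain. A standalone sketch of that heuristic (function name and example data are invented for illustration):

```python
import numpy as np
import pandas as pd

def looks_binary(phenotype: pd.Series) -> bool:
    # Count distinct non-NaN values; exactly two suggests a binary trait.
    unique_vals = phenotype.unique()
    return np.count_nonzero(~np.isnan(unique_vals)) == 2

y = pd.Series([0.0, 1.0, np.nan, 1.0, 0.0])
print(looks_binary(y))  # True -> leave the phenotype untransformed
```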
19 changes: 12 additions & 7 deletions deeprvat/deeprvat/associate.py
@@ -96,12 +96,9 @@ def make_dataset_(
        with open(ds_pickled, "rb") as f:
            ds = pickle.load(f)
    else:
        variant_file = data_config.get(
            "variant_file", f'{data_config["gt_file"][:-3]}_variants.parquet'
        )
        ds = DenseGTDataset(
            data_config["gt_file"],
            variant_file=variant_file,
            variant_file=data_config["variant_file"],
            split="",
            skip_y_na=False,
            **copy.deepcopy(data_config["dataset_config"]),
@@ -499,8 +496,11 @@ def regress_on_gene_scoretest(gene: str, burdens: np.ndarray, model_score):
            f"gene {gene}, p-value: {pv}, using saddle instead."
        )
        pv = model_score.pv_alt_model(burdens, method="saddle")

    beta = model_score.coef(burdens)["beta"][0, 0]
    # beta only for linear models
    try:
        beta = model_score.coef(burdens)["beta"][0, 0]
    except:
        beta = None

    genes_params_pvalues = ([], [], [])
    genes_params_pvalues[0].append(gene)
@@ -579,7 +579,12 @@ def regress_(
    logger.info(f"X shape: {X.shape}, Y shape: {y.shape}")

    # compute null_model for score test
    model_score = scoretest.ScoretestNoK(y, X)
    if len(np.unique(y)) == 2:
        logger.info("Fitting binary model since only found two distinct y values")
        model_score = scoretest.ScoretestLogit(y, X)
    else:
        logger.info("Fitting linear model")
        model_score = scoretest.ScoretestNoK(y, X)
    genes_betas_pvals = [
        regress_on_gene_scoretest(gene, burdens[mask, i], model_score)
        for i, gene in tqdm(
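Together, these two changes make the association step phenotype-aware: the null model switches to a logistic score test for binary traits, and `beta` falls back to `None` where the model exposes no linear coefficient. A sketch of the combined control flow (assumes the same `scoretest` module already imported by `associate.py`; the helper names are invented):

```python
import numpy as np

def fit_null_model(y: np.ndarray, X: np.ndarray):
    # Two distinct y values -> binary trait -> logistic score test.
    if len(np.unique(y)) == 2:
        return scoretest.ScoretestLogit(y, X)
    return scoretest.ScoretestNoK(y, X)

def safe_beta(model_score, burdens: np.ndarray):
    # Effect sizes are only defined for the linear model.
    try:
        return model_score.coef(burdens)["beta"][0, 0]
    except (KeyError, TypeError, AttributeError):
        return None
```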
2 changes: 1 addition & 1 deletion deeprvat/deeprvat/evaluate.py
@@ -77,7 +77,7 @@ def get_baseline_results(
            (
                r["type"].split("/")[0],
                r["type"].split("/")[1],
            ): f"{r['base']}/{pheno}/{r['type']}/eval/burden_associations_testing.parquet"
            ): f"{r['base']}/{pheno}/{r['type']}/eval/burden_associations.parquet"
            for r in config["baseline_results"]
        }
8 changes: 2 additions & 6 deletions deeprvat/deeprvat/train.py
@@ -113,13 +113,9 @@ def make_dataset_(
        or training_dataset_file is None
        or not Path(training_dataset_file).is_file()
    ):
        variant_file = config["training_data"].get(
            "variant_file",
            f'{config["training_data"]["gt_file"][:-3]}_variants.parquet',
        )
        ds = DenseGTDataset(
            gt_file=config["training_data"]["gt_file"],
            variant_file=variant_file,
            variant_file=config["training_data"]["variant_file"],
            split="",
            skip_y_na=True,
            **config["training_data"]["dataset_config"],
@@ -289,7 +285,7 @@ def __getitem__(self, index):
        start_idx = index * self.batch_size
        end_idx = min(self.total_samples, start_idx + self.batch_size)
        batch_samples = self.sample_order.iloc[start_idx:end_idx]
        samples_by_pheno = batch_samples.groupby("phenotype")
        samples_by_pheno = batch_samples.groupby("phenotype", observed=True)

        result = dict()
        for pheno, df in samples_by_pheno:
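The `observed=True` change matters when `phenotype` is a categorical column: without it, `groupby` also yields empty groups for categories absent from the batch. A small illustration (the data here is made up):

```python
import pandas as pd

batch = pd.DataFrame(
    {"phenotype": pd.Categorical(["BMI", "BMI"], categories=["BMI", "Glucose"])}
)

# Default groupby iterates over every declared category, including empty ones.
print(len(list(batch.groupby("phenotype"))))                 # 2
# observed=True restricts iteration to categories actually present.
print(len(list(batch.groupby("phenotype", observed=True))))  # 1
```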
3 changes: 1 addition & 2 deletions deeprvat/preprocessing/preprocess.py
@@ -227,8 +227,7 @@ def process_sparse_gt(
    else:
        logging.info(f"Found no samples to exclude in {exclude_samples}")

    # Assumes only numeric sample names
    samples = sorted([s for s in samples if int(s) > 0])
    samples = list(samples)

    logging.info("Processing sparse GT files by chromosome")
    total_calls_dropped = 0
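The removed filter cast every sample ID to `int`, which crashes on alphanumeric identifiers. A minimal illustration of the failure the change avoids (sample IDs are invented):

```python
samples = {"HG00096", "HG00097", "12345"}

# Old behaviour: assumes numeric sample names.
try:
    kept = sorted([s for s in samples if int(s) > 0])
except ValueError as e:
    print(f"numeric filter fails: {e}")  # invalid literal for int() ...

# New behaviour: keep every sample ID regardless of format.
kept = list(samples)
```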
18 changes: 16 additions & 2 deletions deeprvat/seed_gene_discovery/config.yaml
@@ -20,7 +20,20 @@ phenotypes:
# - Platelet_crit
# - Platelet_distribution_width
# - Red_blood_cell_erythrocyte_count

# - Body_mass_index_BMI
# - Glucose
# - Vitamin_D
# - Albumin
# - Total_protein
# - Cystatin_C
# - Gamma_glutamyltransferase
# - Alkaline_phosphatase
# - Creatinine
# - Whole_body_fat_free_mass
# - Forced_expiratory_volume_in_1_second_FEV1
# - Glycated_haemoglobin_HbA1c
# - WHR_Body_mass_index_BMI_corrected

variant_types:
- missense
- plof
@@ -42,7 +55,7 @@ test_config:
  neglect_homozygous: False
  collapse_method: sum # collapsing method for burden
  var_weight_function: beta_maf

  min_mac: 10
  variant_file: variants.parquet

data:
@@ -99,3 +112,4 @@ data:
    num_workers: 10
    #batch_size: 20
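
The new `min_mac` key adds a minor-allele-count threshold to the seed gene tests. A hedged sketch of what such a filter does (this helper is illustrative and simplified, not the seed_gene_discovery implementation):

```python
import numpy as np

def filter_by_mac(genotypes: np.ndarray, min_mac: int = 10) -> np.ndarray:
    # genotypes: samples x variants matrix of alternate-allele counts (0/1/2).
    # Keep only variants whose total alternate-allele count meets the threshold.
    mac = np.nansum(genotypes, axis=0)
    return genotypes[:, mac >= min_mac]
```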

