From 92d033bdbf7c9f58e7ec4d63e7819a5a8a34b4b3 Mon Sep 17 00:00:00 2001
From: Eva Holtkamp <eva.holtkamp@gmx.de>
Date: Tue, 10 Oct 2023 15:28:59 +0200
Subject: [PATCH] Allow separate training and testing-only phenotype lists

---
 .../training_association_testing.snakefile    | 60 ++++++++++++++++---
 1 file changed, 51 insertions(+), 9 deletions(-)

diff --git a/pipelines/training_association_testing.snakefile b/pipelines/training_association_testing.snakefile
index 3270290a..502723dd 100644
--- a/pipelines/training_association_testing.snakefile
+++ b/pipelines/training_association_testing.snakefile
@@ -19,12 +19,54 @@ wildcard_constraints:
     repeat="\d+",
     trial="\d+",
 
+phenotypes = [
+    'Apolipoprotein_A',
+    'Apolipoprotein_B',
+    'Calcium',
+    'Cholesterol',
+    'HDL_cholesterol',
+    'IGF_1',
+    'LDL_direct',
+    'SHBG',
+    'Total_bilirubin',
+    'Triglycerides',
+    'Urate',
+    'Standing_height',
+    'Lymphocyte_percentage',
+    'Mean_platelet_thrombocyte_volume',
+    'Mean_corpuscular_volume',
+    'Mean_reticulocyte_volume',
+    'Neutrophill_count',
+    'Platelet_count',
+    'Platelet_crit',
+    'Platelet_distribution_width',
+    'Red_blood_cell_erythrocyte_count']
+
+new_phenotypes = [
+    'Body_mass_index_BMI',
+    'Glucose',
+    'Vitamin_D',
+    'Albumin',
+    'Total_protein',
+    'Cystatin_C',
+    'Gamma_glutamyltransferase',
+    'Alkaline_phosphatase',
+    'Creatinine',
+    'Whole_body_fat_free_mass',
+    'Forced_expiratory_volume_in_1_second_FEV1',
+    'QTC_interval',
+    'Glycated_haemoglobin_HbA1c',
+    # 'WHR',  (disabled; only the BMI-corrected WHR entry below is active)
+    'WHR_Body_mass_index_BMI_corrected'
+]
+
+phenotypes_testing = [*new_phenotypes, *phenotypes]  # new_phenotypes first, then phenotypes; expanded by the aggregate targets
 rule all:
     input:
         expand("{phenotype}/deeprvat/eval/significant.parquet",
-               phenotype=phenotypes),
+               phenotype=phenotypes_testing),
         expand("{phenotype}/deeprvat/eval/all_results.parquet",
-               phenotype=phenotypes)
+               phenotype=phenotypes_testing)
 
 rule evaluate:
     input:
@@ -48,7 +90,7 @@ rule evaluate:
 rule all_regression:
     input:
         expand('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
-               phenotype=phenotypes, type=['deeprvat'], repeat=range(n_repeats)),
+               phenotype=phenotypes_testing, type=['deeprvat'], repeat=range(n_repeats)),
 
 rule combine_regression_chunks:
     input:
@@ -94,7 +136,7 @@ rule all_burdens:
         [
             (f'{p}/deeprvat/burdens/chunk{c}.' +
              ("finished" if p == phenotypes[0] else "linked"))
-            for p in phenotypes
+            for p in phenotypes_testing
             for c in range(n_burden_chunks)
         ]
 
@@ -157,7 +199,7 @@ rule compute_burdens:
 rule all_association_dataset:
     input:
         expand('{phenotype}/deeprvat/association_dataset.pkl',
-               phenotype=phenotypes)
+               phenotype=phenotypes_testing)
 
 rule association_dataset:
     input:
@@ -305,18 +347,18 @@ rule training_dataset_pickle:
 rule all_config:
     input:
         seed_genes = expand('{phenotype}/deeprvat/seed_genes.parquet',
-                            phenotype=phenotypes),
+                            phenotype=phenotypes_testing),
         config = expand('{phenotype}/deeprvat/hpopt_config.yaml',
-                        phenotype=phenotypes),
+                        phenotype=phenotypes_testing),
         baseline = expand('{phenotype}/deeprvat/baseline_results.parquet',
-                          phenotype=phenotypes),
+                          phenotype=phenotypes_testing),
 
 rule config:
     input:
         config = 'config.yaml',
         baseline = lambda wildcards: [
             str(Path(r['base']) / wildcards.phenotype / r['type'] /
-                'eval/burden_associations_testing.parquet')
+                'eval/burden_associations.parquet')
             for r in config['baseline_results']
         ]
     output: