Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

properly branched #131

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions dwi_2_environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Conda environment for the dwi work (dipy, fury, nilearn, pybids + Jupyter).
# Create it with: conda env create -f dwi_2_environment.yaml


# Name used by `conda activate`.
name: witty
# All packages are resolved from conda-forge only.
channels:
- conda-forge

# Entries with `=` are pinned to an exact version; jupyter, jupyterlab and
# pybids are left unpinned, and python only has a lower bound.
dependencies:
- dipy=1.7.0
- fury=0.7.1
- jupyter
- jupyterlab
- matplotlib=3.5.3
- nilearn=0.7.0
- osfclient=0.0.5
- python>=3.10
- pybids

# If the environment does not resolve, try installing everything except pybids
# first, then add it afterwards with `conda install conda-forge::pybids`.


22 changes: 22 additions & 0 deletions dwi_3_environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Conda environment for the dwi work (dipy, fury, nilearn, pybids + Jupyter),
# with numexpr pinned as well.
# Create it with: conda env create -f dwi_3_environment.yaml


# Name used by `conda activate`.
name: wittier
# All packages are resolved from conda-forge only.
channels:
- conda-forge

# Entries with `=` are pinned to an exact version; jupyter, jupyterlab and
# pybids are left unpinned, and python only has a lower bound.
dependencies:
- dipy=1.7.0
- fury=0.7.1
- jupyter
- jupyterlab
- matplotlib=3.5.3
- nilearn=0.7.0
- numexpr=2.8.4
- osfclient=0.0.5
- python>=3.10
- pybids

# If the environment does not resolve, try installing everything except pybids
# first, then add it afterwards with `conda install conda-forge::pybids`.


64 changes: 64 additions & 0 deletions r_scripts/1112024_try_replace_colnames_scorecsvs.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Goal of this script: replace the participant ids in the SCORE repeating-data
# exports with BRICK participant ids, so the files can be uploaded to the
# BRICK Castor study.
library(dplyr)

# Load the key table. Its Participant.Id column is renamed to BRICK_Id so it
# cannot be confused with the participant id column of the SCORE exports.
subset_key_table <- rename(
  read.csv("Z:/castor_proof_files/csv_castor/current/brick_subset_key_table102024.csv"),
  BRICK_Id = Participant.Id
)

# Folder holding the SCORE repeating-data exports. "Z:" is the Windows drive
# mapping; the original note mentions /mnt/data for Linux, so this is the one
# line to change when running on another machine.
score_dir <- "Z:/score_clinical_manifestationTOBEREVIEWED"

# Export file names, keyed by the short name that is later used for the data
# frames and for the output file names.
score_files <- c(
  visual_hearing = "SCORE_RADeep-registry__Visual_and_hearing_disease_Medical_History_Clinical_manifestati_export_20240717.csv",
  acute_complications = "SCORE_RADeep-registry__Acute_complications_export_20240717.csv",
  bone_extremities = "SCORE_RADeep-registry__Bone_and_extremities_Medical_History_Clinical_manifestations_export_20240717.csv",
  cardiac_pulmonary = "SCORE_RADeep-registry__Cardiac_and_pulmonary_disease_Medical_History_Clinical_manifest_export_20240717.csv",
  comorbidities = "SCORE_RADeep-registry__Comorbidities_export_20240717.csv",
  endocrinological = "SCORE_RADeep-registry__Endocrinological_disease_Medical_History_Clinical_manifestation_export_20240717.csv",
  registry = "SCORE_RADeep-registry__export_20240717.csv",
  liver_kidney = "SCORE_RADeep-registry__Liver_and_kidney_disease_Medical_History_Clinical_manifestation_export_20240717.csv",
  neurological = "SCORE_RADeep-registry__Neurological_disease_Medical_History_Clinical_manifestations_export_20240717.csv",
  specific_treatment = "SCORE_RADeep-registry__Treatment_Use_of_specific_treatment_or_inclusion_in_Clinical_Tr_export_20240717.csv",
  chelation_treatment = "SCORE_RADeep-registry__Treatments_chelation_export_20240717.csv",
  hydroxyurea_treatment = "SCORE_RADeep-registry__Treatments_hydroxyurea_export_20240717.csv",
  visit = "SCORE_RADeep-registry__Visit_export_20240717.csv"
)

# Named list of full paths (one element per export), the same shape and the
# same path strings as the previous hand-written list.
file_paths <- lapply(score_files, function(file_name) {
  file.path(score_dir, file_name)
})

# Fail early, naming every missing export, instead of stopping halfway through
# the reads with a generic connection error.
missing_files <- Filter(function(path) !file.exists(path), unlist(file_paths))
if (length(missing_files) > 0) {
  stop(
    "SCORE export file(s) not found:\n",
    paste(missing_files, collapse = "\n"),
    call. = FALSE
  )
}

# A RADeep id that occurs more than once in the key table would make the left
# join below duplicate SCORE rows, so flag that before merging.
key_ids <- subset_key_table[["RADeep_id"]]
duplicated_keys <- unique(key_ids[!is.na(key_ids) & duplicated(key_ids)])
if (length(duplicated_keys) > 0) {
  warning(
    "RADeep id(s) occurring more than once in the key table: ",
    paste(duplicated_keys, collapse = ", "),
    call. = FALSE
  )
}

# Read one SCORE export and replace its participant id by the BRICK id.
#
# path: full path to a semicolon-separated SCORE export.
# Returns the export as a data frame with `Participant Id` (the BRICK id) as
# first column, followed by RADeep_id and all remaining columns; the helper
# column BRICK_Id is dropped.
read_score_export <- function(path) {
  score_df <- read.csv(path, sep = ";", stringsAsFactors = FALSE)

  # The first column holds the SCORE participant id; it is renamed by position
  # so the exact header text of the export does not matter.
  colnames(score_df)[1] <- "RADeep_id"

  merged <- left_join(score_df, subset_key_table, by = "RADeep_id")

  # Rows without a match would be written with an empty participant id; make
  # that visible instead of passing it on silently.
  unmatched <- unique(merged$RADeep_id[is.na(merged$BRICK_Id)])
  if (length(unmatched) > 0) {
    warning(
      basename(path), ": no BRICK id found for RADeep id(s) ",
      paste(unmatched, collapse = ", "),
      call. = FALSE
    )
  }

  merged %>%
    mutate(`Participant Id` = BRICK_Id) %>%
    select(`Participant Id`, everything(), -BRICK_Id)
}

# One merged data frame per export, named like the entries of file_paths.
df_list <- lapply(file_paths, read_score_export)

# lapply() normally carries the names of file_paths over to df_list already;
# the explicit assignment keeps the naming independent of that.
names(df_list) <- names(file_paths)

# Also expose every data frame as a global variable (visual_hearing,
# acute_complications, ...) for interactive inspection.
list2env(df_list, envir = .GlobalEnv)

# Folder that receives the updated CSV files.
output_dir <- "Z:/castor_proof_files/csv_castor/current/"

# recursive = TRUE also creates missing parent folders. Without it the call
# fails when a parent is missing, and showWarnings = FALSE hides that failure
# until write.csv() stops with a less helpful error.
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

# Write one "<name>_updated.csv" per data frame.
for (name in names(df_list)) {
  write.csv(
    df_list[[name]],
    file = paste0(output_dir, name, "_updated.csv"),
    row.names = FALSE
  )
}

# Reached only when every write succeeded (write.csv stops on error).
cat("CSV files have been created in:", output_dir, "\n")
154 changes: 154 additions & 0 deletions r_scripts/1162024Tables_paper_1.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
## This script builds the tables for paper 1 from the Castor data export of
## November 2024. At the time of writing, the SCORE repeated data, volBrain
## data and FreeSurfer data had not been uploaded to Castor yet.
## NOTE(review): the original note dates the export to November 11th, but the
## file read below is named BRICK_export_20241106 - confirm which export this is.
# Castor exports age_T0_MRI_months incorrectly; the plan is to limit the next
# export to just the baseline_MRI form at T0 and merge it in later.
library(dplyr)
library(writexl) # to write the tables to Excel files

# Raw Castor export. It lives in a personal folder for now and will move once
# the final data structure is settled.
raw <- read.csv("Z:/Aida_experiment/6112024_brick_castor_data/BRICK_export_20241106.csv", sep = ";")
# Ages and recon-all output, because calculated fields apparently cannot be
# exported from Castor.
ages <- read.csv("Z:/Aida_experiment/combined_BRICK_marjolein.csv", sep = ",")
# volBrain output.
volbrain <- read.csv("Z:/castor_proof_files/csv_castor/current/volbrains_castor.csv")

# Give all three data frames the same id column name so they can be merged.
#
# The Castor file starts with a byte-order mark, which read.csv() glues onto
# the first header. The resulting name depends on the locale and R version
# (it was "ï..Participant.Id" on the original machine), so the column is
# renamed by position after checking that it really is the participant id.
raw_id_col <- names(raw)[1]
if (!grepl("Participant\\.Id$", raw_id_col)) {
  stop(
    "Unexpected first column in the Castor export: ", raw_id_col,
    call. = FALSE
  )
}
names(raw)[1] <- "Participant_Id"

ages <- ages %>%
  rename(Participant_Id = Participant.Id)

volbrain <- volbrain %>%
  rename(Participant_Id = Participant.Id)

# No separate clean-up step: the tables are built straight from the exports.

# Table 1 ingredients: treatment, genotype and parental birth fields from the
# Castor export, age in months and gender from the ages data frame, plus the
# age converted to years (rounded to one decimal).
table1 <- raw %>%
  select(Participant_Id, Hydrea_at_scan_T0, brick_genotype, mothersbirth, fathersbirth) %>%
  left_join(
    select(ages, Participant_Id, Age_at_scan_m_T0X, gender_BRICK),
    by = "Participant_Id"
  ) %>%
  mutate(Age_at_scan_years = round(Age_at_scan_m_T0X / 12, 1))

# Table 2: lab values including HbF (all taken from the Castor export).
table2 <- select(
  raw,
  Participant_Id,
  ERY0, HB0, MCV0, HT0, reticulocyte_count_percentage_1,
  LEU0, TROMBO0,
  FE0, FERT0, TRAF0, TSAT0,
  ALAT0, LDH0, TBIL0, DBIL0,
  KREA0, SCHWARTZ_Bedsite0,
  FOLZ0, UREU0, NA0, K00, VITD0,
  ASAT0, AFOS0, GGT0, CRP0,
  KREA_U0, TE_U0, TE_U_KR0, ALB_U0, ALB_U_KR0,
  B120,
  HPLC_HBF_T0, HPLC_HBF_date_T0,
  HPLC_HbS_T0, HPLC_HbS_date_T0
)

# Table 3: descriptives on the radiology reports (Castor export).
table3 <- select(
  raw,
  Participant_Id,
  Measurement_moment_QC_T1w_T0, Score_QC_T1w_T0,
  Exclude_Score_QC_T1w_T0, Remarks_Score_QC_T1w_T0,
  Screened_for_WMH_T0, WMH_observed_T0,
  Vasculature_examined_T0, Vascular_malformations_T0,
  Microbleeds_screened_T0, Microbleeds_present_T0,
  Incidental_finding_T0, Marjolein_T0, Remarks_MRI_T0
)

# Table 4: white matter hyperintensities (volBrain output). For the total and
# for each region the columns are: lesion count, absolute volume (cm3),
# normalized volume and lesion burden.
table4 <- select(
  volbrain,
  Participant_Id, dl_Sex_T0, dl_Age_T0,
  # Total
  dl_Total_lesion_count_T0,
  dl_Total_lesion_volume_.absolute._cm3_T0,
  dl_Total_lesion_volume_.normalized._._T0,
  dl_Total_lesion_burden_T0,
  # Periventricular
  dl_Periventricular_lesion_count_T0,
  dl_Periventricular_lesion_volume_.absolute._cm3_T0,
  dl_Periventricular_lesion_volume_.normalized._._T0,
  dl_Periventricular_lesion_burden_T0,
  # Deep white
  dl_Deep_white_lesion_count_T0,
  dl_Deep_white_lesion_volume_.absolute._cm3_T0,
  dl_Deep_white_lesion_volume_.normalized._._T0,
  dl_Deep_white_lesion_burden_T0,
  # Juxtacortical
  dl_Juxtacortical_lesion_count_T0,
  dl_Juxtacortical_lesion_volume_.absolute._cm3_T0,
  dl_Juxtacortical_lesion_volume_.normalized._._T0,
  dl_Juxtacortical_lesion_burden_T0,
  # Infratentorial
  dl_Infratentorial_lesion_count_T0,
  dl_Infratentorial_lesion_volume_.absolute._cm3_T0,
  dl_Infratentorial_lesion_volume_.normalized._._T0,
  dl_Infratentorial_lesion_burden_T0
)
# Table 5: recon-all volumetrics (white, grey and subcortical matter).
# The z-scores against the two kinds of growth charts mentioned in the plan
# for this table are not computed in this step.

# Step 1: derive the two summed volumes.
# NOTE(review): Total_Cortical_Volume adds the cerebellar cortex to CortexVol -
# confirm that this is the intended definition of "total cortical volume".
table5 <- mutate(
  ages,
  Total_Cortical_Volume = CortexVol + Left.Cerebellum.Cortex + Right.Cerebellum.Cortex,
  Total_Cerebellar_Volume = Left.Cerebellum.Cortex + Right.Cerebellum.Cortex
)

# Step 2: keep the columns for the table and give them readable names.
table5 <- select(
  table5,
  `Participant ID` = Participant_Id,
  `Age (years)` = Age_at_scan_y_T0X,
  `Gender` = gender_BRICK,

  # Summed / global volumes
  `Total Cortical Volume` = Total_Cortical_Volume,
  `Total White Matter Volume` = CerebralWhiteMatterVol,
  `Total Gray Matter Volume` = TotalGrayVol,
  `Total Cerebellar Volume` = Total_Cerebellar_Volume,

  # Subcortical structures, left and right
  `Thalamus Left` = Left.Thalamus.Proper,
  `Thalamus Right` = Right.Thalamus.Proper,
  `Caudate Left` = Left.Caudate,
  `Caudate Right` = Right.Caudate,
  `Putamen Left` = Left.Putamen,
  `Putamen Right` = Right.Putamen,
  `Pallidum Left` = Left.Pallidum,
  `Pallidum Right` = Right.Pallidum,
  `Hippocampus Left` = Left.Hippocampus,
  `Hippocampus Right` = Right.Hippocampus
)
# Table 6: neuropsychological outcomes (WISC-V and WAIS-IV).
# The original plan for this table also lists the education level of the
# parents; no such columns are selected here yet.

# WISC-V: primary and secondary index scores.
table6_wisc <- raw %>%
  select(
    Participant_Id,
    scorePrimIndex_TIQ_S_1,  # Full Scale IQ
    scorePrimIndex_VBI_IQ_1, # Verbal Comprehension Index
    scorePrimIndex_VRI_IQ_1, # Visual Spatial Index
    scorePrimIndex_FRI_IQ_1, # Fluid Reasoning Index
    scorePrimIndex_WgI_IQ_1, # Working Memory Index
    scorePrimIndex_VsI_IQ_1, # Processing Speed Index
    scoreSecIndex_KRI_IQ_1,  # Quantitative Reasoning Index
    scoreSecIndex_AWI_IQ_1,  # Auditory Working Memory Index
    scoreSecIndex_NVI_IQ_1,  # Nonverbal Index
    scoreSecIndex_AVI_IQ_1   # General Ability Index
  )

# WAIS-IV.
# NOTE(review): ScoreSomVBI_1, ScorePerVBI_1 and ScoreBIVBI_1 all carry the VBI
# suffix, so they may be different statistics of the same verbal comprehension
# index rather than separate indices - confirm against the Castor data
# dictionary before labelling them in the paper. The labels below are the
# original author's.
table6_wais <- raw %>%
  select(
    Participant_Id,
    ScoreTIQ_1,    # Full Scale IQ (FSIQ)
    ScoreSomVBI_1, # Verbal Comprehension Index (VCI)
    ScorePerVBI_1, # labelled "Perceptual Verbal Comprehension Index" - confirm
    ScoreBIVBI_1,  # labelled "Blocked Verbal Comprehension Index" - confirm
    ScoreWgI_1,    # Working Memory Index (WMI)
    ScoreVsI_1     # Processing Speed Index (PSI)
  )

# Build the actual Table 1 summary.
library(tableone)

# combined_data was used below without ever being created in this script, so a
# fresh run stopped with "object 'combined_data' not found". Everything the
# summaries need is in table1 (treatment, genotype, gender, age in years) and
# in table2 (HPLC_HbS_T0, HB0), so the two are joined on the participant id.
combined_data <- table1 %>%
  left_join(
    select(table2, Participant_Id, HPLC_HbS_T0, HB0),
    by = "Participant_Id"
  )

# Categorical and continuous variables for CreateTableOne.
categorical_vars <- c("Hydrea_at_scan_T0", "brick_genotype", "gender_BRICK")
continuous_vars <- c("Age_at_scan_years")
vars <- c(categorical_vars, continuous_vars)

# factorVars makes tableone treat the categorical variables as factors.
table1_summary <- CreateTableOne(
  vars = vars,
  data = combined_data,
  factorVars = categorical_vars
)

print(table1_summary)

# HbS is reported as median and IQR, computed by hand (missing values dropped).
hplc_summary <- combined_data %>%
  summarise(
    Median_HPLC_HbS_T0 = median(HPLC_HbS_T0, na.rm = TRUE),
    IQR_HPLC_HbS_T0 = IQR(HPLC_HbS_T0, na.rm = TRUE)
  )

# Haemoglobin (HB0) is reported as mean and SD (missing values dropped).
hb0_summary <- combined_data %>%
  summarise(
    Mean_HB0 = mean(HB0, na.rm = TRUE),
    SD_HB0 = sd(HB0, na.rm = TRUE)
  )

# Print the two hand-made summaries.
print("HPLC_HbS_T0 - Median and IQR:")
print(hplc_summary)

print("HB0 - Mean and SD:")
print(hb0_summary)


78 changes: 78 additions & 0 deletions r_scripts/1172024_Table2_Growthcurves_script_attempt.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
library(dplyr)
library(tableone)
library(writexl)
library(tibble) # For rownames_to_column()

# This script expects a data frame called table2 to exist already; it is not
# created here.

# The lab values that are summarised as continuous variables. The list is
# written once and reused for the column selection below.
continuous_vars <- c(
  "ERY0", "HB0", "MCV0", "HT0", "reticulocyte_count_percentage_1",
  "LEU0", "TROMBO0",
  "FE0", "FERT0", "TRAF0", "TSAT0",
  "ALAT0", "LDH0", "TBIL0", "DBIL0", "KREA0",
  "FOLZ0", "UREU0", "NA0", "K00", "VITD0",
  "ASAT0", "AFOS0", "GGT0", "CRP0",
  "KREA_U0", "TE_U0", "TE_U_KR0", "ALB_U0", "ALB_U_KR0",
  "B120"
)

# Keep the participant id plus exactly those lab columns, in that order.
table2_filtered <- select(table2, Participant_Id, all_of(continuous_vars))

# Display labels (with units) for the continuous lab variables: the names are
# the column names, the values are the text to show in the table.
# NOTE(review): the units are typed in by hand and are not checked against the
# lab export anywhere in this script - confirm each unit before publication.
# NOTE(review): TE_U0 / TE_U_KR0 are labelled as urine iron and ALB_U_KR0 as
# urine albumin; the _KR suffix may denote a creatinine ratio - confirm the
# analytes against the Castor data dictionary.
# NOTE(review): every other variable is named "<analyte>0", so B120 may be
# vitamin B12 at T0 rather than a bilirubin measure - confirm.
labels <- c(
"ERY0" = "Red blood cell (RBC) count (x10^12/L)",
"HB0" = "Hemoglobin (g/dL)",
"MCV0" = "Mean corpuscular volume (MCV) (fL)",
"HT0" = "Hematocrit (HCT) (%)",
"reticulocyte_count_percentage_1" = "Reticulocyte count (%)",
"LEU0" = "White blood cell (WBC) count (x10^9/L)",
"TROMBO0" = "Platelet count (x10^9/L)",
"FE0" = "Serum iron (µg/dL)",
"FERT0" = "Ferritin serum (ng/mL)",
"TRAF0" = "Transferrin (mg/dL)",
"TSAT0" = "Transferrin saturation (%)",
"ALAT0" = "Alanine transaminase (ALT) (U/L)",
"LDH0" = "Lactate Dehydrogenase (LDH) (U/L)",
"TBIL0" = "Total bilirubin (mg/dL)",
"DBIL0" = "Conjugated/direct bilirubin (mg/dL)",
"KREA0" = "Creatinine (mg/dL)",
"FOLZ0" = "Folate (ng/mL)",
"UREU0" = "Urea (mg/dL)",
"NA0" = "Sodium (mmol/L)",
"K00" = "Potassium (mmol/L)",
"VITD0" = "Vitamin D (ng/mL)",
"ASAT0" = "Aspartate transaminase (AST) (U/L)",
"AFOS0" = "Alkaline phosphatase (ALP) (U/L)",
"GGT0" = "Gamma-glutamyl transferase (GGT) (U/L)",
"CRP0" = "C-reactive protein (CRP) (mg/L)",
"KREA_U0" = "Creatinine (Urine) (mg/dL)",
"TE_U0" = "Iron (Urine) (µg/dL)",
"TE_U_KR0" = "Iron (Urine, Kr) (µg/dL)",
"ALB_U0" = "Albumin (Urine) (g/dL)",
"ALB_U_KR0" = "Albumin (Urine, Kr) (g/dL)",
"B120" = "Bilirubin 120 (mg/dL)"
)

# Summarise every lab value as a continuous variable (no factor variables).
table2_summary <- CreateTableOne(
  vars = continuous_vars,
  data = table2_filtered,
  factorVars = character(0)
)

# print() on a TableOne object returns the formatted character matrix.
# printToggle = FALSE stops it from also being echoed to the console here;
# the finished table is printed once at the end.
table2_matrix <- print(
  table2_summary,
  quote = FALSE,
  noSpaces = TRUE,
  printToggle = FALSE
)
table2_df <- as.data.frame(table2_matrix, stringsAsFactors = FALSE)

# Replace the row names by the readable labels.
# The formatted table has an extra "n" row and its row names carry a suffix
# such as "HB0 (mean (SD))", so the labels cannot simply be assigned by
# position: that gives one label too few and stops with a size error.
# Each row is instead matched on its variable name (the text before the first
# " ("); rows without a label, such as "n", keep their original text.
table2_df <- table2_df %>%
  rownames_to_column(var = "Variable") %>%
  mutate(
    variable_key = sub(" \\(.*$", "", Variable),
    Variable = coalesce(unname(labels[variable_key]), Variable)
  ) %>%
  select(-variable_key)

# Rename the 'Overall' column to "Mean (SD)" to indicate what the values are.
colnames(table2_df)[colnames(table2_df) == "Overall"] <- "Mean (SD)"


# View the final table.
print(table2_df)
# Open question: the results look odd - how should missing values be imputed?

Loading
Loading