Merge pull request #20 from VectorInstitute/setup_package_structure
amrit110 authored Mar 28, 2024
2 parents 85e8d5d + 420066e commit f344ce5
Showing 31 changed files with 760 additions and 399 deletions.
10 changes: 5 additions & 5 deletions finetune.py
@@ -17,17 +17,17 @@
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

from lib.data import FinetuneDataset
from lib.tokenizer import ConceptTokenizer
from lib.utils import (
from odyssey.data.dataset import FinetuneDataset
from odyssey.data.tokenizer import ConceptTokenizer
from odyssey.models.cehr_big_bird.model import BigBirdFinetune, BigBirdPretrain
from odyssey.models.cehr_bert.model import BertFinetune, BertPretrain
from odyssey.models.utils import (
get_latest_checkpoint,
get_run_id,
load_config,
load_finetune_data,
seed_everything,
)
from models.big_bird_cehr.model import BigBirdFinetune, BigBirdPretrain
from models.cehr_bert.model import BertFinetune, BertPretrain


def main(
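For downstream scripts, the import changes above amount to the following mapping from the old flat modules to the new odyssey package (a usage sketch; the old paths survive only as comments):

# New package-qualified imports introduced by this PR; old paths shown as comments.
from odyssey.data.dataset import FinetuneDataset  # was: from lib.data import FinetuneDataset
from odyssey.data.tokenizer import ConceptTokenizer  # was: from lib.tokenizer import ConceptTokenizer
from odyssey.models.utils import load_config, seed_everything  # was: from lib.utils import ...
from odyssey.models.cehr_bert.model import BertFinetune, BertPretrain  # was: from models.cehr_bert.model import ...
from odyssey.models.cehr_big_bird.model import BigBirdFinetune, BigBirdPretrain  # was: from models.big_bird_cehr.model import ...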
6 changes: 0 additions & 6 deletions models/__init__.py

This file was deleted.

1 change: 1 addition & 0 deletions odyssey/__init__.py
@@ -0,0 +1 @@
"""Odyssey package."""
1 change: 1 addition & 0 deletions odyssey/data/__init__.py
@@ -0,0 +1 @@
"""Data sub-package."""
File renamed without changes.
2 changes: 1 addition & 1 deletion lib/data.py → odyssey/data/dataset.py
@@ -6,7 +6,7 @@
import torch
from torch.utils.data import Dataset

from lib.tokenizer import ConceptTokenizer
from odyssey.data.tokenizer import ConceptTokenizer


class PretrainDataset(Dataset):
1 change: 1 addition & 0 deletions odyssey/data/mimiciv/__init__.py
@@ -0,0 +1 @@
"""MIMICIV specific data processing module."""
84 changes: 52 additions & 32 deletions data/collect.py → odyssey/data/mimiciv/collect.py
@@ -1,6 +1,7 @@
"""Collect data from the FHIR database and save to csv files."""

import json
import logging
import os
from ast import literal_eval
from typing import Any, Dict, List, Optional
@@ -17,6 +18,13 @@
from sqlalchemy import MetaData, Table, create_engine, select
from tqdm import tqdm

from odyssey.utils.log import setup_logging


# Logging.
LOGGER = logging.getLogger(__name__)
setup_logging(print_level="INFO", logger=LOGGER)


PATIENT = "patient"
ENCOUNTER = "encounter"
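setup_logging is imported from odyssey.utils.log, which is added elsewhere in this PR and is not shown in this diff. A minimal sketch of what such a helper might do, assuming it only attaches a formatted stream handler and sets the level:

import logging
import sys
from typing import Optional


def setup_logging(print_level: str = "INFO", logger: Optional[logging.Logger] = None) -> None:
    """Hypothetical sketch: attach a stdout handler with a standard format."""
    logger = logger or logging.getLogger(__name__)
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(
        logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"),
    )
    logger.addHandler(handler)
    logger.setLevel(getattr(logging, print_level))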
@@ -212,7 +220,8 @@ def get_patient_data(self) -> None:
)
buffer = []
results = self.execute_query(DATA_COLLECTION_CONFIG[PATIENT]["table_name"])
for p in tqdm(results, desc="Processing patients", unit=PATIENT):
LOGGER.info("Fetching patient data ...")
for p in tqdm(results, desc="Processing patients", unit="patients"):
patient = Patient(p)
patient_data = {
"patient_id": patient.id,
Expand All @@ -239,7 +248,7 @@ def get_patient_data(self) -> None:
def get_encounter_data(self) -> None:
"""Get encounter data from the database and save to a csv file."""
try:
patients = pd.read_csv(self.csv_dir + "/patients.csv")
patients = pd.read_csv(os.path.join(self.csv_dir, "patients.csv"))
except FileNotFoundError:
print("Patients file not found. Please run get_patient_data() first.")
return
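A recurring change in this file replaces manual path concatenation with os.path.join, which uses the platform separator and avoids missing or doubled slashes when csv_dir comes from configuration. For example:

import os

csv_dir = "data/csv"  # illustrative directory
old_path = csv_dir + "/patients.csv"  # hard-codes the separator
new_path = os.path.join(csv_dir, "patients.csv")  # joins with the platform separator
print(old_path, new_path)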
@@ -249,10 +258,11 @@ def get_encounter_data(self) -> None:
)
buffer = []
outpatient_ids = []
LOGGER.info("Fetching encounter data ...")
for _, patient_id in tqdm(
patients["patient_id"].items(),
desc="Processing patients",
unit="patient",
unit="patients",
):
results = self.execute_query(
DATA_COLLECTION_CONFIG[ENCOUNTER]["table_name"],
@@ -295,12 +305,12 @@ def get_encounter_data(self) -> None:
flush=True,
)
patients = patients[~patients["patient_id"].isin(outpatient_ids)]
patients.to_csv(self.csv_dir + "/inpatient.csv", index=False)
patients.to_csv(os.path.join(self.csv_dir, "inpatient.csv"), index=False)

def get_procedure_data(self) -> None:
"""Get procedure data from the database and save to a csv file."""
try:
patients = pd.read_csv(self.csv_dir + "/inpatient.csv")
patients = pd.read_csv(os.path.join(self.csv_dir, "inpatient.csv"))
except FileNotFoundError:
print(
"Encounters (inpatient) file not found. Please run get_encounter_data() first.",
@@ -312,10 +322,11 @@ def get_procedure_data(self) -> None:
)
procedure_vocab = set()
buffer = []
LOGGER.info("Fetching procedure data ...")
for _, patient_id in tqdm(
patients["patient_id"].items(),
desc="Processing patients",
unit="patient",
unit="patients",
):
results = self.execute_query("procedure", patient_id)
proc_codes = []
@@ -358,13 +369,13 @@ def get_procedure_data(self) -> None:
save_path,
flush=True,
)
with open(self.vocab_dir + "/procedure_vocab.json", "w") as f:
with open(os.path.join(self.vocab_dir, "procedure_vocab.json"), "w") as f:
json.dump(list(procedure_vocab), f)

def get_medication_data(self) -> None:
"""Get medication data from the database and save to a csv file."""
try:
patients = pd.read_csv(self.csv_dir + "/inpatient.csv")
patients = pd.read_csv(os.path.join(self.csv_dir, "inpatient.csv"))
except FileNotFoundError:
print("Patients file not found. Please run get_encounter_data() first.")
return
@@ -377,11 +388,12 @@ def get_medication_data(self) -> None:
save_path = os.path.join(self.csv_dir, "med_requests.csv")
med_vocab = set()
buffer = []
LOGGER.info("Fetching medication data ...")
with self.engine.connect() as connection:
for _, patient_id in tqdm(
patients["patient_id"].items(),
desc="Processing patients",
unit="patient",
unit="patients",
):
results = self.execute_query(
DATA_COLLECTION_CONFIG[MEDICATION]["table_name"],
@@ -437,24 +449,25 @@ def get_medication_data(self) -> None:
save_path,
flush=True,
)
with open(self.vocab_dir + "/med_vocab.json", "w") as f:
with open(os.path.join(self.vocab_dir, "med_vocab.json"), "w") as f:
json.dump(list(med_vocab), f)
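Each collector accumulates rows in a buffer and periodically writes them to the target CSV, ending with a flush=True call. The helper doing the writes is not part of this diff; a hypothetical sketch of the pattern (name, threshold, and signature are assumptions):

import os
from typing import Any, Dict, List

import pandas as pd


def flush_buffer_to_csv(
    buffer: List[Dict[str, Any]],
    save_path: str,
    flush: bool = False,
) -> None:
    """Hypothetical sketch: append buffered rows to a CSV, writing the header only once."""
    if not buffer or (len(buffer) < 1000 and not flush):
        return
    pd.DataFrame(buffer).to_csv(
        save_path,
        mode="a",
        header=not os.path.exists(save_path),
        index=False,
    )
    buffer.clear()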

def get_lab_data(self) -> None:
"""Get lab data from the database and save to a csv file."""
try:
patients = pd.read_csv(self.csv_dir + "/inpatient.csv")
patients = pd.read_csv(os.path.join(self.csv_dir, "inpatient.csv"))
except FileNotFoundError:
print("Patients file not found. Please run get_encounter_data() first.")
return
save_path = os.path.join(self.csv_dir, "labs.csv")
lab_vocab = set()
all_units = {}
buffer = []
LOGGER.info("Fetching lab data ...")
for _, patient_id in tqdm(
patients["patient_id"].items(),
desc="Processing patients",
unit="patient",
unit="patients",
):
results = self.execute_query(
DATA_COLLECTION_CONFIG[LAB]["table_name"],
@@ -506,31 +519,32 @@ def get_lab_data(self) -> None:
save_path,
flush=True,
)
with open(self.vocab_dir + "/lab_vocab.json", "w") as f:
with open(os.path.join(self.vocab_dir, "lab_vocab.json"), "w") as f:
json.dump(list(lab_vocab), f)
all_units = {k: list(v) for k, v in all_units.items()}
with open(self.vocab_dir + "/lab_units.json", "w") as f:
with open(os.path.join(self.vocab_dir, "lab_units.json"), "w") as f:
json.dump(all_units, f)

def filter_lab_data(
self,
) -> None:
"""Filter out lab codes that have more than one units."""
try:
labs = pd.read_csv(self.csv_dir + "/labs.csv")
with open(self.vocab_dir + "/lab_vocab.json", "r") as f:
labs = pd.read_csv(os.path.join(self.csv_dir, "labs.csv"))
with open(os.path.join(self.vocab_dir, "lab_vocab.json"), "r") as f:
lab_vocab = json.load(f)
with open(self.vocab_dir + "/lab_units.json", "r") as f:
with open(os.path.join(self.vocab_dir, "lab_units.json"), "r") as f:
lab_units = json.load(f)
except FileNotFoundError:
print("Labs file not found. Please run get_lab_data() first.")
return
LOGGER.info("Filtering lab data ...")
for code, units in lab_units.items():
if len(units) > 1:
lab_vocab.remove(code)
labs = labs.apply(lambda x: filter_lab_codes(x, lab_vocab), axis=1)
labs.to_csv(self.csv_dir + "/filtered_labs.csv", index=False)
with open(self.vocab_dir + "/lab_vocab.json", "w") as f:
labs.to_csv(os.path.join(self.csv_dir, "filtered_labs.csv"), index=False)
with open(os.path.join(self.vocab_dir, "lab_vocab.json"), "w") as f:
json.dump(list(lab_vocab), f)
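filter_lab_data drops any lab code that was observed with more than one unit, since mixed units make the raw values incomparable. A standalone illustration of that step with made-up codes and units (the real mappings come from lab_units.json and lab_vocab.json):

# Made-up example data only.
lab_units = {"LAB_A": ["mg/dL"], "LAB_B": ["mg/dL", "mmol/L"]}
lab_vocab = ["LAB_A", "LAB_B"]

for code, units in lab_units.items():
    if len(units) > 1:  # ambiguous: same code reported in different units
        lab_vocab.remove(code)

print(lab_vocab)  # ["LAB_A"]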

def process_lab_values(self, num_bins: int = 5) -> None:
@@ -540,10 +554,11 @@ def process_lab_values(self, num_bins: int = 5) -> None:
----------
num_bins : int, optional
number of bins, by default 5
"""
try:
labs = pd.read_csv(self.csv_dir + "/filtered_labs.csv")
with open(self.vocab_dir + "/lab_vocab.json", "r") as f:
labs = pd.read_csv(os.path.join(self.csv_dir, "filtered_labs.csv"))
with open(os.path.join(self.vocab_dir, "lab_vocab.json"), "r") as f:
lab_vocab = json.load(f)
except FileNotFoundError:
print("Labs file not found. Please run get_lab_data() first.")
@@ -565,6 +580,7 @@ def assign_to_quantile_bins(row: pd.Series) -> pd.Series:
row["binned_values"] = binned_values
return row

LOGGER.info("Processing lab values ...")
labs = labs.apply(apply_eval, axis=1)
quantile_bins = {}
for code in lab_vocab:
@@ -582,19 +598,19 @@ def assign_to_quantile_bins(row: pd.Series) -> pd.Series:
).categories

labs = labs.apply(assign_to_quantile_bins, axis=1)
labs.to_csv(self.csv_dir + "/processed_labs.csv", index=False)
labs.to_csv(os.path.join(self.csv_dir, "processed_labs.csv"), index=False)

lab_vocab_binned = []
lab_vocab_binned.extend(
[f"{code}_{i}" for code in lab_vocab for i in range(num_bins)],
)
with open(self.vocab_dir + "/lab_vocab.json", "w") as f:
with open(os.path.join(self.vocab_dir, "lab_vocab.json"), "w") as f:
json.dump(lab_vocab_binned, f)
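process_lab_values discretizes each lab code's numeric values into num_bins quantile bins and expands the vocabulary into per-bin tokens of the form <code>_<bin>. The body of the binning loop is collapsed in this view; a sketch of the idea using pandas.qcut, which is an assumption about the elided implementation:

import pandas as pd

num_bins = 5
values = [1.2, 3.4, 2.2, 5.0, 4.1, 0.9, 2.8, 3.9]  # illustrative values for one lab code

# Quantile bin edges for this code; duplicates="drop" guards against repeated edges.
bins = pd.qcut(values, q=num_bins, duplicates="drop").categories

# Map a new measurement onto its bin to build a token such as "LAB_A_2".
bin_index = bins.get_indexer([3.0])[0]
token = f"LAB_A_{bin_index}"
print(token)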

def get_condition_data(self) -> None:
"""Get condition data from the database and save to a csv file."""
try:
patients = pd.read_csv(self.csv_dir + "/inpatient.csv")
patients = pd.read_csv(os.path.join(self.csv_dir, "inpatient.csv"))
except FileNotFoundError:
print("Patients file not found. Please run get_encounter_data() first.")
return
@@ -603,10 +619,11 @@ def get_condition_data(self) -> None:
condition_counts = {}
condition_systems = {}
buffer = []
LOGGER.info("Fetching condition data ...")
for _, patient_id in tqdm(
patients["patient_id"].items(),
desc="Processing patients",
unit="patient",
unit="patients",
):
patient_conditions_counted = set()
results = self.execute_query(
@@ -650,26 +667,26 @@ def get_condition_data(self) -> None:
save_path,
flush=True,
)
with open(self.vocab_dir + "/condition_vocab.json", "w") as f:
with open(os.path.join(self.vocab_dir, "condition_vocab.json"), "w") as f:
json.dump(list(condition_vocab), f)
sorted_conditions = sorted(
condition_counts.items(),
key=lambda x: x[1]["count"],
reverse=True,
)
sorted_dict = dict(sorted_conditions)
with open(self.vocab_dir + "/condition_counts.json", "w") as f:
with open(os.path.join(self.vocab_dir, "condition_counts.json"), "w") as f:
json.dump(sorted_dict, f)

with open(self.vocab_dir + "/condition_systems.json", "w") as f:
with open(os.path.join(self.vocab_dir, "condition_systems.json"), "w") as f:
json.dump(condition_systems, f)

def group_conditions(self) -> None:
"""Group conditions into categories."""
with open(self.vocab_dir + "/condition_counts.json", "r") as file:
with open(os.path.join(self.vocab_dir, "condition_counts.json"), "r") as file:
data = json.load(file)
with open(self.vocab_dir + "/condition_systems.json", "r") as file:
with open(os.path.join(self.vocab_dir, "condition_systems.json"), "r") as file:
systems = json.load(file)
LOGGER.info("Grouping conditions ...")
grouped_data = {}
for code, info in data.items():
prefix = code[:3]
@@ -686,7 +703,10 @@ def group_conditions(self) -> None:
reverse=True,
),
)
with open(self.vocab_dir + "condition_categories.json", "w") as file:
with open(
os.path.join(self.vocab_dir, "condition_categories.json"),
"w",
) as file:
json.dump(sorted_grouped_data, file, indent=4)
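group_conditions collapses condition codes into categories by their three-character prefix and sorts the categories by count. A standalone version of that logic with made-up codes (the real input is condition_counts.json):

# Made-up codes and counts for illustration.
data = {"I109": {"count": 40}, "I101": {"count": 10}, "E119": {"count": 25}}

grouped = {}
for code, info in data.items():
    prefix = code[:3]  # three-character category prefix
    grouped[prefix] = grouped.get(prefix, 0) + info["count"]

sorted_grouped = dict(sorted(grouped.items(), key=lambda x: x[1], reverse=True))
print(sorted_grouped)  # {"I10": 50, "E11": 25}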


