Add odyssey package and move files #20

Merged · 1 commit · Mar 28, 2024
10 changes: 5 additions & 5 deletions finetune.py
@@ -17,17 +17,17 @@
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

-from lib.data import FinetuneDataset
-from lib.tokenizer import ConceptTokenizer
-from lib.utils import (
+from odyssey.data.dataset import FinetuneDataset
+from odyssey.data.tokenizer import ConceptTokenizer
+from odyssey.models.cehr_big_bird.model import BigBirdFinetune, BigBirdPretrain
+from odyssey.models.cehr_bert.model import BertFinetune, BertPretrain
+from odyssey.models.utils import (
get_latest_checkpoint,
get_run_id,
load_config,
load_finetune_data,
seed_everything,
)
-from models.big_bird_cehr.model import BigBirdFinetune, BigBirdPretrain
-from models.cehr_bert.model import BertFinetune, BertPretrain


def main(
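The rewritten imports above assume `odyssey` resolves as a proper package, e.g. after an editable install (`pip install -e .`) from the repo root; the packaging setup itself is not part of this diff. A quick, hypothetical smoke test for the new layout, using only names that appear in this PR:

# Hypothetical smoke test; assumes `odyssey` is importable, e.g. after
# an editable install (`pip install -e .`) from the repo root.
from odyssey.data.dataset import FinetuneDataset
from odyssey.data.tokenizer import ConceptTokenizer
from odyssey.models.cehr_bert.model import BertFinetune, BertPretrain
from odyssey.models.cehr_big_bird.model import BigBirdFinetune, BigBirdPretrain
from odyssey.models.utils import load_config, seed_everything

print("odyssey package imports resolve")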
6 changes: 0 additions & 6 deletions models/__init__.py

This file was deleted.

1 change: 1 addition & 0 deletions odyssey/__init__.py
@@ -0,0 +1 @@
"""Odyssey package."""
1 change: 1 addition & 0 deletions odyssey/data/__init__.py
@@ -0,0 +1 @@
"""Data sub-package."""
2 changes: 1 addition & 1 deletion lib/data.py → odyssey/data/dataset.py
@@ -6,7 +6,7 @@
import torch
from torch.utils.data import Dataset

-from lib.tokenizer import ConceptTokenizer
+from odyssey.data.tokenizer import ConceptTokenizer


class PretrainDataset(Dataset):
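Since lib/data.py moves to odyssey/data/dataset.py, any downstream code still importing the old path will break at import time. If a transition period were needed, a throwaway shim could re-export the moved classes; this is a hypothetical sketch, not part of this PR:

# lib/data.py -- hypothetical compatibility shim, NOT included in this PR.
import warnings

from odyssey.data.dataset import FinetuneDataset, PretrainDataset  # noqa: F401

warnings.warn(
    "lib.data moved to odyssey.data.dataset; update your imports.",
    DeprecationWarning,
    stacklevel=2,
)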
1 change: 1 addition & 0 deletions odyssey/data/mimiciv/__init__.py
@@ -0,0 +1 @@
"""MIMICIV specific data processing module."""
84 changes: 52 additions & 32 deletions data/collect.py → odyssey/data/mimiciv/collect.py
@@ -1,6 +1,7 @@
"""Collect data from the FHIR database and save to csv files."""

import json
+import logging
import os
from ast import literal_eval
from typing import Any, Dict, List, Optional
@@ -17,6 +18,13 @@
from sqlalchemy import MetaData, Table, create_engine, select
from tqdm import tqdm

+from odyssey.utils.log import setup_logging
+
+
+# Logging.
+LOGGER = logging.getLogger(__name__)
+setup_logging(print_level="INFO", logger=LOGGER)
+

PATIENT = "patient"
ENCOUNTER = "encounter"
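The `setup_logging` helper imported from odyssey.utils.log is not shown in this diff; judging from the call signature above, it most likely attaches a handler and sets the level on the given logger. A minimal sketch under that assumption (the body is illustrative, not the actual module):

# odyssey/utils/log.py -- hypothetical sketch; the real implementation is not in this diff.
import logging
import sys
from typing import Optional


def setup_logging(
    print_level: str = "INFO",
    logger: Optional[logging.Logger] = None,
) -> None:
    """Attach a stdout handler with a timestamped format and set the level."""
    logger = logger or logging.getLogger()
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(
        logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s"),
    )
    logger.addHandler(handler)
    logger.setLevel(getattr(logging, print_level.upper(), logging.INFO))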
@@ -212,7 +220,8 @@ def get_patient_data(self) -> None:
)
buffer = []
results = self.execute_query(DATA_COLLECTION_CONFIG[PATIENT]["table_name"])
-        for p in tqdm(results, desc="Processing patients", unit=PATIENT):
+        LOGGER.info("Fetching patient data ...")
+        for p in tqdm(results, desc="Processing patients", unit="patients"):
patient = Patient(p)
patient_data = {
"patient_id": patient.id,
@@ -239,7 +248,7 @@ def get_encounter_data(self) -> None:
def get_encounter_data(self) -> None:
"""Get encounter data from the database and save to a csv file."""
try:
-            patients = pd.read_csv(self.csv_dir + "/patients.csv")
+            patients = pd.read_csv(os.path.join(self.csv_dir, "patients.csv"))
except FileNotFoundError:
print("Patients file not found. Please run get_patient_data() first.")
return
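The recurring change through the rest of this file replaces manual path concatenation with os.path.join, which handles separators itself instead of relying on hard-coded "/" prefixes. For instance:

import os

csv_dir = "/data/csv/"  # note the trailing slash
print(csv_dir + "/patients.csv")              # '/data/csv//patients.csv' (doubled separator)
print(os.path.join(csv_dir, "patients.csv"))  # '/data/csv/patients.csv'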
@@ -249,10 +258,11 @@ def get_encounter_data(self) -> None:
)
buffer = []
outpatient_ids = []
+        LOGGER.info("Fetching encounter data ...")
for _, patient_id in tqdm(
patients["patient_id"].items(),
desc="Processing patients",
unit="patient",
unit="patients",
):
results = self.execute_query(
DATA_COLLECTION_CONFIG[ENCOUNTER]["table_name"],
@@ -295,12 +305,12 @@ def get_encounter_data(self) -> None:
flush=True,
)
patients = patients[~patients["patient_id"].isin(outpatient_ids)]
-        patients.to_csv(self.csv_dir + "/inpatient.csv", index=False)
+        patients.to_csv(os.path.join(self.csv_dir, "inpatient.csv"), index=False)

def get_procedure_data(self) -> None:
"""Get procedure data from the database and save to a csv file."""
try:
-            patients = pd.read_csv(self.csv_dir + "/inpatient.csv")
+            patients = pd.read_csv(os.path.join(self.csv_dir, "inpatient.csv"))
except FileNotFoundError:
print(
"Encounters (inpatient) file not found. Please run get_encounter_data() first.",
@@ -312,10 +322,11 @@ def get_procedure_data(self) -> None:
)
procedure_vocab = set()
buffer = []
+        LOGGER.info("Fetching procedure data ...")
for _, patient_id in tqdm(
patients["patient_id"].items(),
desc="Processing patients",
unit="patient",
unit="patients",
):
results = self.execute_query("procedure", patient_id)
proc_codes = []
@@ -358,13 +369,13 @@ def get_procedure_data(self) -> None:
save_path,
flush=True,
)
-        with open(self.vocab_dir + "/procedure_vocab.json", "w") as f:
+        with open(os.path.join(self.vocab_dir, "procedure_vocab.json"), "w") as f:
json.dump(list(procedure_vocab), f)

def get_medication_data(self) -> None:
"""Get medication data from the database and save to a csv file."""
try:
-            patients = pd.read_csv(self.csv_dir + "/inpatient.csv")
+            patients = pd.read_csv(os.path.join(self.csv_dir, "inpatient.csv"))
except FileNotFoundError:
print("Patients file not found. Please run get_encounter_data() first.")
return
@@ -377,11 +388,12 @@ def get_medication_data(self) -> None:
save_path = os.path.join(self.csv_dir, "med_requests.csv")
med_vocab = set()
buffer = []
+        LOGGER.info("Fetching medication data ...")
with self.engine.connect() as connection:
for _, patient_id in tqdm(
patients["patient_id"].items(),
desc="Processing patients",
unit="patient",
unit="patients",
):
results = self.execute_query(
DATA_COLLECTION_CONFIG[MEDICATION]["table_name"],
@@ -437,24 +449,25 @@ def get_medication_data(self) -> None:
save_path,
flush=True,
)
-        with open(self.vocab_dir + "/med_vocab.json", "w") as f:
+        with open(os.path.join(self.vocab_dir, "med_vocab.json"), "w") as f:
json.dump(list(med_vocab), f)

def get_lab_data(self) -> None:
"""Get lab data from the database and save to a csv file."""
try:
-            patients = pd.read_csv(self.csv_dir + "/inpatient.csv")
+            patients = pd.read_csv(os.path.join(self.csv_dir, "inpatient.csv"))
except FileNotFoundError:
print("Patients file not found. Please run get_encounter_data() first.")
return
save_path = os.path.join(self.csv_dir, "labs.csv")
lab_vocab = set()
all_units = {}
buffer = []
+        LOGGER.info("Fetching lab data ...")
for _, patient_id in tqdm(
patients["patient_id"].items(),
desc="Processing patients",
unit="patient",
unit="patients",
):
results = self.execute_query(
DATA_COLLECTION_CONFIG[LAB]["table_name"],
@@ -506,31 +519,32 @@ def get_lab_data(self) -> None:
save_path,
flush=True,
)
-        with open(self.vocab_dir + "/lab_vocab.json", "w") as f:
+        with open(os.path.join(self.vocab_dir, "lab_vocab.json"), "w") as f:
json.dump(list(lab_vocab), f)
all_units = {k: list(v) for k, v in all_units.items()}
-        with open(self.vocab_dir + "/lab_units.json", "w") as f:
+        with open(os.path.join(self.vocab_dir, "lab_units.json"), "w") as f:
json.dump(all_units, f)

def filter_lab_data(
self,
) -> None:
"""Filter out lab codes that have more than one units."""
try:
-            labs = pd.read_csv(self.csv_dir + "/labs.csv")
-            with open(self.vocab_dir + "/lab_vocab.json", "r") as f:
+            labs = pd.read_csv(os.path.join(self.csv_dir, "labs.csv"))
+            with open(os.path.join(self.vocab_dir, "lab_vocab.json"), "r") as f:
lab_vocab = json.load(f)
-            with open(self.vocab_dir + "/lab_units.json", "r") as f:
+            with open(os.path.join(self.vocab_dir, "lab_units.json"), "r") as f:
lab_units = json.load(f)
except FileNotFoundError:
print("Labs file not found. Please run get_lab_data() first.")
return
+        LOGGER.info("Filtering lab data ...")
for code, units in lab_units.items():
if len(units) > 1:
lab_vocab.remove(code)
labs = labs.apply(lambda x: filter_lab_codes(x, lab_vocab), axis=1)
-        labs.to_csv(self.csv_dir + "/filtered_labs.csv", index=False)
-        with open(self.vocab_dir + "/lab_vocab.json", "w") as f:
+        labs.to_csv(os.path.join(self.csv_dir, "filtered_labs.csv"), index=False)
+        with open(os.path.join(self.vocab_dir, "lab_vocab.json"), "w") as f:
json.dump(list(lab_vocab), f)

def process_lab_values(self, num_bins: int = 5) -> None:
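filter_lab_data relies on the lab_units mapping written by get_lab_data: any lab code observed with more than one unit is removed from the vocabulary, since its values would not be directly comparable. The core of that check, with example data:

lab_units = {"50912": ["mg/dL"], "50971": ["mEq/L", "mmol/L"]}  # example mapping
lab_vocab = ["50912", "50971"]

for code, units in lab_units.items():
    if len(units) > 1:  # reported in inconsistent units -> drop the code
        lab_vocab.remove(code)

print(lab_vocab)  # ['50912']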
@@ -540,10 +554,11 @@ def process_lab_values(self, num_bins: int = 5) -> None:
----------
num_bins : int, optional
number of bins, by default 5
+
"""
try:
-            labs = pd.read_csv(self.csv_dir + "/filtered_labs.csv")
-            with open(self.vocab_dir + "/lab_vocab.json", "r") as f:
+            labs = pd.read_csv(os.path.join(self.csv_dir, "filtered_labs.csv"))
+            with open(os.path.join(self.vocab_dir, "lab_vocab.json"), "r") as f:
lab_vocab = json.load(f)
except FileNotFoundError:
print("Labs file not found. Please run get_lab_data() first.")
@@ -565,6 +580,7 @@ def assign_to_quantile_bins(row: pd.Series) -> pd.Series:
row["binned_values"] = binned_values
return row

+        LOGGER.info("Processing lab values ...")
labs = labs.apply(apply_eval, axis=1)
quantile_bins = {}
for code in lab_vocab:
@@ -582,19 +598,19 @@ def assign_to_quantile_bins(row: pd.Series) -> pd.Series:
).categories

labs = labs.apply(assign_to_quantile_bins, axis=1)
-        labs.to_csv(self.csv_dir + "/processed_labs.csv", index=False)
+        labs.to_csv(os.path.join(self.csv_dir, "processed_labs.csv"), index=False)

lab_vocab_binned = []
lab_vocab_binned.extend(
[f"{code}_{i}" for code in lab_vocab for i in range(num_bins)],
)
-        with open(self.vocab_dir + "/lab_vocab.json", "w") as f:
+        with open(os.path.join(self.vocab_dir, "lab_vocab.json"), "w") as f:
json.dump(lab_vocab_binned, f)

def get_condition_data(self) -> None:
"""Get condition data from the database and save to a csv file."""
try:
-            patients = pd.read_csv(self.csv_dir + "/inpatient.csv")
+            patients = pd.read_csv(os.path.join(self.csv_dir, "inpatient.csv"))
except FileNotFoundError:
print("Patients file not found. Please run get_encounter_data() first.")
return
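process_lab_values discretizes each code's numeric values into num_bins equal-frequency bins; the call producing the `.categories` seen near the top of the hunk above is collapsed in this view, but it is consistent with pandas quantile binning. A standalone sketch:

import pandas as pd

values = pd.Series([1.0, 2.0, 2.5, 3.0, 4.0, 5.0, 7.0, 9.0, 10.0, 12.0])
num_bins = 5

binned = pd.qcut(values, q=num_bins, duplicates="drop")
print(binned.cat.categories)  # the per-code bin intervals stored in quantile_bins
print(pd.qcut(values, q=num_bins, labels=False, duplicates="drop").tolist())
# Binned vocabulary tokens then take the form f"{code}_{i}" for i in range(num_bins).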
@@ -603,10 +619,11 @@ def get_condition_data(self) -> None:
condition_counts = {}
condition_systems = {}
buffer = []
+        LOGGER.info("Fetching condition data ...")
for _, patient_id in tqdm(
patients["patient_id"].items(),
desc="Processing patients",
unit="patient",
unit="patients",
):
patient_conditions_counted = set()
results = self.execute_query(
@@ -650,26 +667,26 @@ def get_condition_data(self) -> None:
save_path,
flush=True,
)
-        with open(self.vocab_dir + "/condition_vocab.json", "w") as f:
+        with open(os.path.join(self.vocab_dir, "condition_vocab.json"), "w") as f:
json.dump(list(condition_vocab), f)
sorted_conditions = sorted(
condition_counts.items(),
key=lambda x: x[1]["count"],
reverse=True,
)
sorted_dict = dict(sorted_conditions)
-        with open(self.vocab_dir + "/condition_counts.json", "w") as f:
+        with open(os.path.join(self.vocab_dir, "condition_counts.json"), "w") as f:
json.dump(sorted_dict, f)

-        with open(self.vocab_dir + "/condition_systems.json", "w") as f:
+        with open(os.path.join(self.vocab_dir, "condition_systems.json"), "w") as f:
json.dump(condition_systems, f)

def group_conditions(self) -> None:
"""Group conditions into categories."""
-        with open(self.vocab_dir + "/condition_counts.json", "r") as file:
+        with open(os.path.join(self.vocab_dir, "condition_counts.json"), "r") as file:
data = json.load(file)
-        with open(self.vocab_dir + "/condition_systems.json", "r") as file:
+        with open(os.path.join(self.vocab_dir, "condition_systems.json"), "r") as file:
systems = json.load(file)
+        LOGGER.info("Grouping conditions ...")
grouped_data = {}
for code, info in data.items():
prefix = code[:3]
Expand All @@ -686,7 +703,10 @@ def group_conditions(self) -> None:
reverse=True,
),
)
-        with open(self.vocab_dir + "condition_categories.json", "w") as file:
+        with open(
+            os.path.join(self.vocab_dir, "condition_categories.json"),
+            "w",
+        ) as file:
json.dump(sorted_grouped_data, file, indent=4)


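group_conditions buckets condition codes by their first three characters (the ICD category prefix) and re-sorts the groups by aggregate count, mirroring the sort key x[1]["count"] above. A standalone sketch of that aggregation, with example codes; only the "count" field is visible in this hunk, so the record shape here is an assumption:

data = {"E119": {"count": 40}, "E118": {"count": 10}, "I509": {"count": 30}}

grouped_data: dict = {}
for code, info in data.items():
    prefix = code[:3]  # "E11" groups the type 2 diabetes codes together
    grouped_data.setdefault(prefix, {"count": 0})
    grouped_data[prefix]["count"] += info["count"]

sorted_grouped_data = dict(
    sorted(grouped_data.items(), key=lambda x: x[1]["count"], reverse=True),
)
print(sorted_grouped_data)  # {'E11': {'count': 50}, 'I50': {'count': 30}}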