diff --git a/odyssey/data/DataProcessor.ipynb b/odyssey/data/DataProcessor.ipynb index 1192cc9..0e24f83 100644 --- a/odyssey/data/DataProcessor.ipynb +++ b/odyssey/data/DataProcessor.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-03-13T16:14:45.546088300Z", @@ -13,23 +13,39 @@ "outputs": [], "source": [ "import os\n", + "import sys\n", "import pickle\n", "import random\n", - "import sys\n", "from typing import Any, Dict, List, Optional\n", "\n", "import numpy as np\n", "import pandas as pd\n", - "from sklearn.model_selection import train_test_split\n", - "from skmultilearn.model_selection import iterative_train_test_split\n", - "\n", - "from odyssey.utils.utils import save_object_to_disk, seed_everything\n", "\n", - "DATA_ROOT = \"/h/afallah/odyssey/odyssey/data/bigbird_data\"\n", + "ROOT = \"/h/afallah/odyssey/odyssey\"\n", + "DATA_ROOT = f\"{ROOT}/odyssey/data/bigbird_data\"\n", "DATASET = f\"{DATA_ROOT}/patient_sequences/patient_sequences_2048.parquet\"\n", "MAX_LEN = 2048\n", "\n", - "os.chdir(DATA_ROOT)\n", + "os.chdir(ROOT)\n", + "\n", + "from odyssey.utils.utils import seed_everything\n", + "from odyssey.data.processor import (\n", + " filter_by_num_visit,\n", + " filter_by_length_of_stay,\n", + " get_last_occurence_index,\n", + " check_readmission_label,\n", + " get_length_of_stay,\n", + " get_visit_cutoff_at_threshold,\n", + " process_length_of_stay_dataset,\n", + " process_condition_dataset,\n", + " process_mortality_dataset,\n", + " process_readmission_dataset,\n", + " process_multi_dataset,\n", + " stratified_train_test_split,\n", + " sample_balanced_subset, \n", + " get_pretrain_test_split,\n", + " get_finetune_split\n", + ")\n", "\n", "SEED = 23\n", "seed_everything(seed=SEED)" @@ -37,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-03-13T16:15:12.321718600Z", @@ -45,235 +61,7 @@ }, "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Current columns: Index(['patient_id', 'num_visits', 'deceased', 'death_after_start',\n", - " 'death_after_end', 'length', 'token_length', 'event_tokens_2048',\n", - " 'type_tokens_2048', 'age_tokens_2048', 'time_tokens_2048',\n", - " 'visit_tokens_2048', 'position_tokens_2048', 'elapsed_tokens_2048',\n", - " 'common_conditions', 'rare_conditions'],\n", - " dtype='object')\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - " | patient_id | \n", - "num_visits | \n", - "deceased | \n", - "death_after_start | \n", - "death_after_end | \n", - "length | \n", - "token_length | \n", - "event_tokens_2048 | \n", - "type_tokens_2048 | \n", - "age_tokens_2048 | \n", - "time_tokens_2048 | \n", - "visit_tokens_2048 | \n", - "position_tokens_2048 | \n", - "elapsed_tokens_2048 | \n", - "common_conditions | \n", - "rare_conditions | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "35581927-9c95-5ae9-af76-7d74870a349c | \n", - "1 | \n", - "0 | \n", - "NaN | \n", - "NaN | \n", - "50 | \n", - "54 | \n", - "[[CLS], [VS], 00006473900, 00904516561, 510790... | \n", - "[1, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, ... | \n", - "[0, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85... | \n", - "[0, 5902, 5902, 5902, 5902, 5902, 5902, 5902, ... | \n", - "[0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ... | \n", - "[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | \n", - "[-2.0, -1.0, 1.97, 2.02, 2.02, 2.02, 2.02, 2.0... | \n", - "[1, 0, 0, 0, 0, 0, 0, 0, 0, 0] | \n", - "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | \n", - "
1 | \n", - "f5bba8dd-25c0-5336-8d3d-37424c185026 | \n", - "2 | \n", - "0 | \n", - "NaN | \n", - "NaN | \n", - "148 | \n", - "156 | \n", - "[[CLS], [VS], 52135_2, 52075_2, 52074_2, 52073... | \n", - "[1, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ... | \n", - "[0, 83, 83, 83, 83, 83, 83, 83, 83, 83, 83, 83... | \n", - "[0, 6594, 6594, 6594, 6594, 6594, 6594, 6594, ... | \n", - "[0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ... | \n", - "[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | \n", - "[-2.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0... | \n", - "[0, 0, 0, 0, 0, 0, 0, 1, 0, 0] | \n", - "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | \n", - "
2 | \n", - "f4938f91-cadb-5133-8541-a52fb0916cea | \n", - "2 | \n", - "0 | \n", - "NaN | \n", - "NaN | \n", - "78 | \n", - "86 | \n", - "[[CLS], [VS], 0RB30ZZ, 0RG10A0, 00071101441, 0... | \n", - "[1, 2, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ... | \n", - "[0, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44... | \n", - "[0, 8150, 8150, 8150, 8150, 8150, 8150, 8150, ... | \n", - "[0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ... | \n", - "[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | \n", - "[-2.0, -1.0, 0.0, 0.0, 1.08, 1.08, 13.89, 13.8... | \n", - "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | \n", - "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | \n", - "
3 | \n", - "6fe2371b-a6f0-5436-aade-7795005b0c66 | \n", - "2 | \n", - "0 | \n", - "NaN | \n", - "NaN | \n", - "86 | \n", - "94 | \n", - "[[CLS], [VS], 63739057310, 49281041688, 005970... | \n", - "[1, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ... | \n", - "[0, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72... | \n", - "[0, 6093, 6093, 6093, 6093, 6093, 6093, 6093, ... | \n", - "[0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ... | \n", - "[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | \n", - "[-2.0, -1.0, 0.75, 0.75, 0.75, 0.75, 0.75, 0.7... | \n", - "[1, 0, 0, 0, 0, 0, 0, 1, 0, 0] | \n", - "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | \n", - "
4 | \n", - "6f7590ae-f3b9-50e5-9e41-d4bb1000887a | \n", - "1 | \n", - "0 | \n", - "NaN | \n", - "NaN | \n", - "72 | \n", - "76 | \n", - "[[CLS], [VS], 50813_0, 52135_0, 52075_3, 52074... | \n", - "[1, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ... | \n", - "[0, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47... | \n", - "[0, 6379, 6379, 6379, 6379, 6379, 6379, 6379, ... | \n", - "[0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ... | \n", - "[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | \n", - "[-2.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0... | \n", - "[1, 0, 0, 0, 0, 0, 0, 0, 0, 1] | \n", - "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | \n", - "