Commit

[pre-commit.ci] Add auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Nov 4, 2024
1 parent 988f46a commit 084021a
Showing 1 changed file with 19 additions and 18 deletions.
37 changes: 19 additions & 18 deletions odyssey/data/DataProcessor.ipynb
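
The hunks below are pure formatting rewrites: single quotes become double quotes, calls are re-wrapped, and inline comments get normalized spacing. The repository's actual hook list is not shown in this commit, but these are the kinds of rewrites a Black-style formatter produces. A quick, purely illustrative check (not part of the commit):

import black

# One of the lines removed in the diff below, before the auto-fix.
original = "dataset = dataset.filter(pl.col('event_tokens').map_elements(len) > 5)\n"

# Black normalizes the quotes; the result matches the added line in the diff.
print(black.format_str(original, mode=black.Mode()))
# dataset = dataset.filter(pl.col("event_tokens").map_elements(len) > 5)
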
@@ -25,8 +25,8 @@
"\n",
"SEED = 23\n",
"ROOT = \"/h/afallah/odyssey/odyssey\"\n",
"DATA_ROOT = f\"{ROOT}/odyssey/data/meds_data\" # bigbird_data\n",
"DATASET = f\"{DATA_ROOT}/patient_sequences/patient_sequences.parquet\" #patient_sequences_2048.parquet\\\n",
"DATA_ROOT = f\"{ROOT}/odyssey/data/meds_data\" # bigbird_data\n",
"DATASET = f\"{DATA_ROOT}/patient_sequences/patient_sequences.parquet\" # patient_sequences_2048.parquet\\\n",
"DATASET_2048 = f\"{DATA_ROOT}/patient_sequences/patient_sequences_2048.parquet\"\n",
"MAX_LEN = 2048\n",
"\n",
@@ -66,19 +66,22 @@
"source": [
"dataset = pl.read_parquet(DATASET)\n",
"dataset = dataset.rename({\"subject_id\": \"patient_id\", \"code\": \"event_tokens\"})\n",
"dataset = dataset.filter(pl.col('event_tokens').map_elements(len) > 5)\n",
"\n",
"dataset = dataset.with_columns([\n",
" pl.col('patient_id').cast(pl.String).alias('patient_id'),\n",
" pl.concat_list([\n",
" pl.col('event_tokens').list.slice(0, 2047),\n",
" pl.lit(['[EOS]'])\n",
" ]).alias('event_tokens'),\n",
"])\n",
"dataset = dataset.filter(pl.col(\"event_tokens\").map_elements(len) > 5)\n",
"\n",
"dataset = dataset.with_columns(\n",
" [\n",
" pl.col(\"patient_id\").cast(pl.String).alias(\"patient_id\"),\n",
" pl.concat_list(\n",
" [pl.col(\"event_tokens\").list.slice(0, 2047), pl.lit([\"[EOS]\"])]\n",
" ).alias(\"event_tokens\"),\n",
" ]\n",
")\n",
"\n",
"dataset = dataset.with_columns([\n",
" pl.col('event_tokens').map_elements(len).alias('token_length'),\n",
"])\n",
"dataset = dataset.with_columns(\n",
" [\n",
" pl.col(\"event_tokens\").map_elements(len).alias(\"token_length\"),\n",
" ]\n",
")\n",
"\n",
"print(dataset.head())\n",
"print(dataset.schema)\n",
@@ -175,7 +178,7 @@
"metadata": {},
"outputs": [],
"source": [
"dataset['event_tokens_2048'].iloc[0]"
"dataset[\"event_tokens_2048\"].iloc[0]"
]
},
{
@@ -229,9 +232,7 @@
"outputs": [],
"source": [
"# Process the dataset for hospital readmission in one month task\n",
"dataset_readmission = process_readmission_dataset(\n",
" dataset.copy(), max_len=MAX_LEN\n",
")"
"dataset_readmission = process_readmission_dataset(dataset.copy(), max_len=MAX_LEN)"
]
},
{
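
For context, the reformatted cell in the second hunk keeps at most 2047 event tokens per patient, appends an [EOS] marker, and records the resulting sequence length. A minimal standalone sketch of that Polars logic, using toy data and a small MAX_LEN for illustration, and the native .list.len() in place of the notebook's map_elements(len):

import polars as pl

MAX_LEN = 8  # illustrative only; the notebook uses 2048

# Toy frame with made-up tokens; column names mirror the notebook.
toy = pl.DataFrame(
    {
        "patient_id": [1, 2],
        "event_tokens": [
            ["[CLS]", "a", "b", "c", "d", "e", "f", "g", "h", "i"],
            ["[CLS]", "x", "y", "z", "q", "r"],
        ],
    }
)

# Drop very short sequences, as the notebook's filter does.
toy = toy.filter(pl.col("event_tokens").list.len() > 5)

toy = toy.with_columns(
    [
        pl.col("patient_id").cast(pl.String).alias("patient_id"),
        # Keep at most MAX_LEN - 1 tokens, then append the end-of-sequence marker.
        pl.concat_list(
            [pl.col("event_tokens").list.slice(0, MAX_LEN - 1), pl.lit(["[EOS]"])]
        ).alias("event_tokens"),
    ]
)

# Record the final sequence length per patient.
toy = toy.with_columns(pl.col("event_tokens").list.len().alias("token_length"))

print(toy)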
