diff --git a/3.0_topic_model_transfused_filtered_vocab.ipynb b/3.0_topic_model_transfused_filtered_vocab.ipynb new file mode 100644 index 0000000..2cff9f9 --- /dev/null +++ b/3.0_topic_model_transfused_filtered_vocab.ipynb @@ -0,0 +1,892 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3.0 topic modeling on transfused admissions\n", + "+ python 3.7.x" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/summerrankin/opt/anaconda3/envs/env_py3/lib/python3.7/site-packages/past/types/oldstr.py:5: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", + " from collections import Iterable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "last ran: 2020-08-04 07:18:14.409672\n", + "Python Version: 3.7.3 (\n", + "operating system: darwin\n", + "pandas version: 0.24.2\n", + "sqlalchemy version: 1.3.3\n", + "psycopg2 version: 2.7.6.1\n", + "tqdm version: 4.32.1\n", + "scipy version: 1.2.1\n", + "numpy version: 1.16.2\n", + "nltk version: 3.4\n", + "matplotlib version: 3.0.3\n" + ] + } + ], + "source": [ + "import pickle \n", + "import time\n", + "import os\n", + "import re\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from datetime import datetime\n", + "import sys\n", + "\n", + "from nltk.tokenize import wordpunct_tokenize\n", + "\n", + "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer\n", + "from sklearn.decomposition import LatentDirichletAllocation\n", + "\n", + "from scipy.sparse import csr_matrix, vstack\n", + "\n", + "import pyLDAvis, pyLDAvis.sklearn\n", + "from IPython.display import display\n", + "from tqdm import tnrange, tqdm_notebook\n", + "from time import sleep\n", + "\n", + "from importlib_metadata import version\n", + "\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# things to connect to the posgres database\n", + "import psycopg2\n", + "from sqlalchemy import create_engine\n", + "POSTGRES_CONNECT = os.environ.get(\"POSTGRES_CONNECT\")\n", + "POSTGRES_ENGINE = os.environ.get(\"POSTGRES_ENGINE\")\n", + "conn = psycopg2.connect(POSTGRES_CONNECT)\n", + "cur = conn.cursor();\n", + "cur.execute(\"\"\"SET search_path = mimiciii;\"\"\")\n", + "engine = create_engine(POSTGRES_ENGINE)\n", + "\n", + "\n", + "libraries = ['pandas','sqlalchemy','psycopg2','tqdm','scipy','numpy','nltk','matplotlib']\n", + "print('last ran: ',datetime.now() )\n", + "print(\"Python Version:\", sys.version[0:7])\n", + "print( \"operating system:\", sys.platform)\n", + "\n", + "for lib in libraries:\n", + " print(lib + ' version: ' + version(lib))" + ] + }, + { + "source": [ + "Get the date (which will become the filename) and set a path for the output" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import date\n", + "y = date.today().year\n", + "d = date.today().day\n", + "m = date.today().month\n", + "day = str(y) + str(m) + str(d) + '_'\n", + "\n", + "path1 =\"./\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# vectorization and topic modeling functions\n", + "def get_xf_notes():\n", + " \"\"\"\n", + " Function to retrieve the data for transfused patients from the mimic postgres database.\n", + " Returns a pandas dataframe of the hadmid, and text of the concatenated notes.\n", + " \"\"\"\n", + " mimic_notes = pd.read_sql(\"\"\"SELECT * FROM mimiciii.transfused_notes_unique \"\"\", engine)\n", + " \n", + " return mimic_notes\n", + "#================================================================================================\n", + "def clean_text(text, xf_term_ver=9):\n", + " \"\"\" \n", + " Takes in a corpus of documents and cleans. Needs multiple docs. \n", + " \n", + " IN: corpus of documents\n", + " \n", + " 1. remove id masks\n", + " 2. tokenize into words using wordpunct\n", + " 3. lowercase and remove stop words\n", + " 4. put into a list for the modeling\n", + " \n", + " OUT: cleaned text = a list (documents) of lists (cleaned words in each doc)\n", + " \"\"\"\n", + " stop = ['I'] \n", + " \n", + " cleaned_text = []\n", + " \n", + " for post in tqdm_notebook(text):\n", + " cleaned_words = []\n", + " \n", + " #remove masked ids\n", + " clean_ids = re.sub(re.compile(r'\\[\\*\\*(.*?)\\*\\*\\]'), ' ', post)\n", + " \n", + " # tokenize into words\n", + " for word in wordpunct_tokenize(clean_ids): \n", + " \n", + " # lowercase and throw out any words in stop words (doing it here and later makes it faster)\n", + " if word.strip().lower() not in stop:\n", + " \n", + " # put into a list of words for each document\n", + " cleaned_words.append(word.strip().lower())\n", + " \n", + " # keep corpus of cleaned words for each document \n", + " cleaned_text.append(' '.join(cleaned_words))\n", + " \n", + " return cleaned_text\n", + "#================================================================================================\n", + "def topic_mod_mimic(vectorizer, vect_data, model='LDA', topics=20,iters=5,no_top_words=50):\n", + " \"\"\"\n", + " Run a topic modeling function and display the top terms from each topic.\n", + "\n", + " INPUT: \n", + " vectorizer=the model from the vectorization (needs to be from sklearn)\n", + " vect_data=output from the vectorization step (sklearn)\n", + " model='LDA' or 'NMF'\n", + " topics=number of topics\n", + " iters=number of iterations\n", + " no_top_words=number of top terms to print for each topic\n", + "\n", + " OUTPUT:\n", + " mod=the trained topic model\n", + " mod_dat=output of the topic model\n", + " topics=number of topics\n", + " iters=number of iterations\n", + " \"\"\"\n", + " \n", + " if model == 'NMF':\n", + " mod = NMF(n_components=topics,\n", + " # init='nndsvd',\n", + " # max_iter=iters,\n", + " random_state=42,\n", + " verbose=False,\n", + " beta_loss='kullback-leibler', \n", + " solver='mu',\n", + " max_iter=1000, \n", + " alpha=.1,\n", + " l1_ratio=.5)\n", + " \n", + " else:\n", + " mod = LatentDirichletAllocation(n_components=topics,\n", + " max_iter=iters,\n", + " random_state=42,\n", + " learning_method='online',\n", + " n_jobs=-1)\n", + " \n", + " mod_dat = mod.fit_transform(vect_data)\n", + " \n", + " \n", + " # to display a list of topic words and their scores \n", + " def display_topics(model, feature_names, no_top_words):\n", + " for ix, topic in enumerate(model.components_):\n", + " print(\"Topic \", ix)\n", + " print(\" \".join([feature_names[i]\n", + " for i in topic.argsort()[:-no_top_words - 1:-1]]) + '\\n')\n", + " \n", + " display_topics(mod, vectorizer.get_feature_names() , no_top_words)\n", + "\n", + " \n", + " return mod, mod_dat, topics, iters" + ] + }, + { + "source": [ + "# 1.0 Import data\n", + "Get the transfused notes from postgres" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mimic_notes = get_xf_notes()\n", + "corpus=mimic_notes.text.values" + ] + }, + { + "source": [ + "# 1.1 Clean" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clean_corpus = clean_text(corpus)" + ] + }, + { + "source": [ + "# 1.2 Import features\n", + "Retrieve the list of features created in the previous steps" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features1 = pd.read_csv(path1 + 'all_filtered_features.csv')\n", + "features = features1.vocab.to_list()" + ] + }, + { + "source": [ + "# 1.3 Vectorize \n", + "Count vectorize the data using the existing vocabulary that was created in steps 2.0, 2.1, 2.2, 2.3, 2.4" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vect_mod = CountVectorizer(\n", + " stop_words=None, \n", + " lowercase=True, \n", + " strip_accents='ascii', \n", + " vocabulary=features\n", + " )\n", + "vect_data = vect_mod.fit_transform(clean_corpus)" + ] + }, + { + "source": [ + "# Set Hyperparameters\n", + "Set the type of topic model and the number of topics" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model='LDA'\n", + "num = 45" + ] + }, + { + "source": [ + "# 1.4 Topic model\n", + "Run the topic model using the vectorized data and model from the previous step" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod, mod_dat, topic_n, iter_n = topic_mod_mimic(\n", + " vect_mod, \n", + " vect_data, \n", + " model=model, \n", + " topics=num, \n", + " iters=10, \n", + " no_top_words=15) " + ] + }, + { + "source": [ + "# 1.5 Display \n", + "Display the topic modeling results with an interactive pyPLAvis plot.\n", + "This can be saved as an html and opened in any browser" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pyLDAvis, pyLDAvis.sklearn\n", + "from IPython.display import display \n", + " \n", + "# Setup to run in Jupyter notebooks\n", + "pyLDAvis.enable_notebook()\n", + "\n", + " # Create the visualization\n", + "vis = pyLDAvis.sklearn.prepare(\n", + " mod, \n", + " vect_data,\n", + " vect_mod, \n", + " sort_topics=False, \n", + " mds='mmds')\n", + "\n", + "# view it in the notebook\n", + "display(vis)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def vis_sav( path1, number, vis, model = 'LDA'):\n", + " \"\"\"\n", + " Save the .html output from pyLDAvis\n", + " \"\"\"\n", + " \n", + " pyLDAvis.save_html(\n", + " vis, \n", + " path1 + day + model + str(number) + '_filtered_v1.html'\n", + " ) \n", + " \n", + "vis_sav(path1, num, vis )" + ] + }, + { + "source": [ + "# 1.6 Save\n", + "Save the models and results from previous steps " + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def save_model(path1, number, vec_model, vect_dat, lda_model=0, model_dat=0, model = 'LDA', stops='none'):\n", + " \"\"\"\n", + " Save the models and results from the vectorization and topic modeling.\n", + " \"\"\"\n", + " # save the count vectorizer model\n", + " with open(path1 + day + model + str(number) + '_filtered_v1vect_model.pkl', 'wb') as picklefile:\n", + " pickle.dump(vec_model, picklefile)\n", + "\n", + " # save the vectorized data (sparse matrix)\n", + " with open(path1 + day + model + str(number) + '_filtered_v1vect_data.pkl', 'wb') as picklefile:\n", + " pickle.dump(vect_dat, picklefile)\n", + " \n", + " # save parameters of the topic model as a text file\n", + " with open(path1 + day + model + str(number) + '_filtered_v1_params.txt', \"w\") as parameters_f:\n", + " print('model: '+ str(lda_model) + \\\n", + " '\\n Vectorizer: ' + str(vect_mod) + \\\n", + " '\\n stop words: '+ stops \\\n", + " , file = parameters_f) \n", + "\n", + " # save the actual topic model\n", + " with open(path1 + day + model + str(number) + '_filtered_v1_model.pkl', 'wb') as picklefile:\n", + " pickle.dump(lda_model, picklefile)\n", + " \n", + " #save the output from the topic model\n", + " with open(path1 + day + model + str(number) + '_filtered_v1_data.pkl', 'wb') as picklefile:\n", + " pickle.dump(model_dat, picklefile)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "save_model(\n", + " path1, \n", + " num, \n", + " vect_mod, \n", + " vect_data, \n", + " mod, \n", + " mod_dat, \n", + " stops='vocab', \n", + " model = model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def save_topic_scores(path1, number, mod2, vect_mod, no_top_words, model):\n", + " ''' to save the raw pesudocount for the top xx words/terms from _components of each topic'''\n", + " \n", + " ff = vect_mod.get_feature_names()\n", + " \n", + " topicsdf = pd.DataFrame()\n", + " \n", + " for ix, topic in enumerate(mod2.components_):\n", + " \n", + " topicsdf.loc[:,'topic_' +str(ix) ] = [ff[i] for i in topic.argsort()[:-no_top_words - 1:-1]]\n", + " topicsdf.loc[:,'topic_' +str(ix) + '_scr' ] = [topic[i] for i in topic.argsort()[:-no_top_words - 1:-1]]\n", + "\n", + " topicsdf.to_csv(\n", + " path1 + day + model + str(number) + '_filtered_v1_' + str(no_top_words) + '_words_scores.csv', \n", + " float_format='%.3f'\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "save_topic_scores(\n", + " path1,\n", + " num, \n", + " mod, \n", + " vect_mod, \n", + " 9499, \n", + " model=model\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def save_word_scores(path1, number, mod2, vect_mod, no_top_words, model):\n", + " ''' to save the proba score for the top xx words from _components of each topic'''\n", + " \n", + " ff = vect_mod.get_feature_names()\n", + " \n", + " topicsdf = pd.DataFrame()\n", + " \n", + " proba_scores = mod2.components_ / mod2.components_.sum(axis=1)[:,np.newaxis]\n", + " \n", + " for ix, topic in enumerate(proba_scores):\n", + " \n", + " topicsdf.loc[:,'topic_' +str(ix) ] = [ff[i] for i in topic.argsort()[:-no_top_words - 1:-1]]\n", + " topicsdf.loc[:,'topic_' +str(ix) + '_scr' ] = [topic[i] for i in topic.argsort()[:-no_top_words - 1:-1]]\n", + "\n", + " topicsdf.to_csv(path1 + day + model + str(number) + '_filtered_v1_' + str(no_top_words) + '_words_proba.csv',float_format='%.5f')\n", + " return topicsdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "words = save_word_scores(\n", + " path1,\n", + " num, \n", + " mod, \n", + " vect_mod, \n", + " 9499, \n", + " model=model\n", + " )\n", + "words.head()" + ] + }, + { + "source": [ + "Save all topic scores for each document in a .csv" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scores = pd.DataFrame(mod.components_)\n", + "scorest = scores.T\n", + "scorest.to_csv(\n", + " path1 + day + model + str(num) + '_filtered_v1_all_topic_scores_hadmids.csv')" + ] + }, + { + "source": [ + "Format all scores for easy plotting" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scores_all = mod.components_ / mod.components_.sum(axis=1)[:, np.newaxis]\n", + "scores_all = scores_all.reshape([1,-1])\n", + "scores1=pd.DataFrame(mod.components_ / mod.components_.sum(axis=1)[:, np.newaxis])\n", + "scores1.head()" + ] + }, + { + "source": [ + "# 1.7 Plots\n", + "Plot distributions of the topic scores" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_hist(df, fig_title, save_name, num, log_y=False, bins=100):\n", + " \"\"\"\n", + " Plot and save a histogram of a pandas dataframe\n", + " \"\"\"\n", + " fig = plt.figure(dpi=300) #figsize=(8, 8),\n", + " \n", + " plt.hist(df, density=False, bins=bins, log=log_y)\n", + " plt.title(fig_title, fontsize=10)\n", + " plt.yticks(fontsize=7)\n", + " plt.xticks(fontsize=7)\n", + " plt.ylabel('Frequency', fontsize=7)\n", + " plt.savefig(path1 + day + model + str(num) + '_filtered_v1'+ save_name + '.png')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sscores_df = pd.DataFrame(scores_all)\n", + "plot_hist(sscores_df.iloc[0,:], fig_title='Histogram of All Term Scores ('+ str(num) +' topics) ', \\\n", + " save_name='loghist_term_proba', num=num, log_y=True, bins=100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_hist(sscores_df.iloc[0,:], fig_title='Histogram of All Term Probas ('+ str(num) +' topics) ', \\\n", + " save_name='hist_term_proba', num=num, log_y=False, bins=500)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scores1.iloc[22,:].plot.hist(\n", + " figsize=(20,15), \n", + " logy=True, \n", + " use_index=True, \n", + " bins=(100), \n", + " title='histogram of terms probability score for topic 22 ('+ str(num) + ' topics)'\n", + " );" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scores1.iloc[22,:].plot.hist(\n", + " figsize=(20,15), \n", + " logy=False, \n", + " use_index=True, \n", + " bins=(100), \n", + " title='histogram of all 40k terms probability score for topic 22 ('+ str(num) + ' topics)'\n", + " );" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2.0 Assign the Maximum Topic to each document (admission)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def open_model( path1, number, model='LDA'):\n", + " \"\"\"\n", + " Function to open a saved model and the vectorized output from the model.\n", + " \"\"\"\n", + " # vectorizer\n", + " with open(path1 + '/'+ model + '_' + str(number) + '/' + day + 'vect_model.pkl', 'rb') as picklefile:\n", + " vec_model =pickle.load(picklefile)\n", + " #vectorized data\n", + " #with open(path1 + '/'+ model + '_' + str(number) + '/' + day + 'vect_data.pkl', 'rb') as picklefile:\n", + " # vect_dat=pickle.load(picklefile)\n", + " \n", + " # topic model\n", + " with open(path1 + '/'+ model + '_' + str(number) + '/' + day + model + '_model.pkl', 'rb') as file:\n", + " lda_model=pickle.load(file)\n", + " # topic model results\n", + " with open(path1 + '/'+ model + '_' + str(number) + '/' + day + model + '_data.pkl', 'rb') as picklefile:\n", + " mod_dat = pickle.load(picklefile)\n", + "\n", + " return mod_dat, vec_model, lda_model" + ] + }, + { + "source": [ + "# 2.1 Load the models and results" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num=45\n", + "mod_dat, vect_model, mod = open_model(path1, num, model = 'LDA')\n" + ] + }, + { + "source": [ + "# 2.2 Assign Max Topic\n", + "Get the topic number with the maximum score for each document and save as 'max_topic' column\n", + "Get the proba/score for that maximum topic for each document and save as 'max_topic_val' column" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mimic_notes['max_topic'] = mod_dat.argmax(axis=1)\n", + "mimic_notes['max_topic_val'] = np.amax(mod_dat, axis=1)\n", + "\n", + "# merge these max cols and notes with all the topic scores\n", + "tdf = pd.DataFrame.from_records(mod_dat)\n", + "mm = pd.concat([mimic_notes,tdf],axis=1)\n", + "mm.head()" + ] + }, + { + "source": [ + "# 2.3 Save\n", + "Save the maximum topic for each document/hadm_id" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_hadmids = mm.loc[:,['hadm_id','max_topic','max_topic_val']]\n", + "all_hadmids.to_pickle(path1 + day + model + str(num) + '_filtered_v1_max_topic_all_hadmids.pkl')\n", + "all_hadmids.to_csv(path1 + day + model + str(num) + '_filtered_v1_max_topic_all_hadmids.csv')" + ] + }, + { + "source": [ + "# 2.4 Plot" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "plot_hist(mm.max_topic_val, fig_title='Histogram of Maximum Topic Coef ('+ str(num) +' topics) ', \\\n", + " save_name='_max_topic_all_hadmids', num=num, log_y=False)#, bins=100)" + ] + }, + { + "source": [ + "# 2.5 Select documents for further Review\n", + "Get the documents where the max topic for the document is less than the threshold set (.15). This denotes that the document fit into many different topics which indicates that the patient is complex and/or should be reviewed by an expert or clinician to evaluate for adverse events. \n", + "\n", + "Print the length to see how many documents are captured by the threshold. This threshold may need to be adjusted based on the number of topics, or specific dataset. The idea is to capture the 'left tail' of the maximum topic distribution. The amount of that tail that is captured should be based on how many documents the researcher is looking to review. \n", + "\n", + "This list will also include documents that did not fit into any topic (if present) and will have a max topic score that is at the min topic score value (instead of zero, there is typically a constant that is assigned to each topic that is not present in a document). We recommend reviewing these as well." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "len(mm[mm.max_topic_val<=.15])\n", + "amb_topic = mm[mm.max_topic_val<=.15]\n", + "amb_topic = amb_topic.sort_values('max_topic_val',ascending=True)" + ] + }, + { + "source": [ + "# 2.6 Save \n", + "Save these hadm_ids with a low maximum topic for SME review" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "amb_hadmids = amb_topic.loc[:,['hadm_id','max_topic','max_topic_val']]\n", + "amb_hadmids.to_pickle(path1 + day + model + str(num) + '_filtered_v1_thresh_15_outlier_hadmids.pkl')\n", + "amb_hadmids.to_csv(path1 + day + model + str(num) + '_filtered_v1_thresh_15_outlier_hadmids.csv')" + ] + }, + { + "source": [ + "# 2.7 Plot\n", + "Plot the distribution of the scores for all documents for a single topic (35)" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mm.iloc[35,4:].plot.bar(figsize=(10,10),fontsize=10, title='probability per topic '+ str(num));" + ] + }, + { + "source": [ + "Plot the topic distribution for a single patient (insert an hadm_id where there are xxxxxx)" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "one_pt = amb_topic[amb_topic.hadm_id==xxxxxx]\n", + "one_pt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "one_pt.iloc[0,4:].plot.bar(figsize=(10,10),fontsize=10, title='probability per topic '+ str(num));" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "one_pt1 = amb_topic[amb_topic.hadm_id==xxxxxx]\n", + "one_pt1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "one_pt1.iloc[0,4:].plot.bar(figsize=(10,10),fontsize=10, title='probability per topic '+ str(num));" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file