Commit 269fd94 (parent: 93464ec), showing 8 changed files with 1,453 additions and 1 deletion.
README.md (replacing the former one-line placeholder `# awesome`):
# DCAN

Dilated Convolutional Attention Network (DCAN) integrates dilated convolutions, residual connections, and label attention for medical code assignment. It adopts dilated convolutions to capture complex medical patterns with a receptive field that grows exponentially with the dilation size.
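
As a rough illustration of that claim (this is not the repository's model code), stacking `Conv1d` layers whose dilation doubles at each layer grows the receptive field exponentially with depth while adding parameters only linearly:

```python
import torch
import torch.nn as nn

# Kernel size 3 with dilations 1, 2, 4; padding = dilation keeps the length fixed.
# Receptive field per output position: 1 + 2*(1 + 2 + 4) = 15 input tokens.
convs = nn.Sequential(
    nn.Conv1d(100, 100, kernel_size=3, dilation=1, padding=1),
    nn.Conv1d(100, 100, kernel_size=3, dilation=2, padding=2),
    nn.Conv1d(100, 100, kernel_size=3, dilation=4, padding=4),
)
x = torch.randn(1, 100, 2500)   # (batch, embedding dim, note length)
print(convs(x).shape)           # torch.Size([1, 100, 2500])
```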

## Data
Download the MIMIC-III dataset from [PhysioNet](https://mimic.physionet.org).

Organize your data using the following structure:

```
data
|   D_ICD_DIAGNOSES.csv
|   D_ICD_PROCEDURES.csv
|   ICD9_descriptions
└───mimic3/
|   |   NOTEEVENTS.csv
|   |   DIAGNOSES_ICD.csv
|   |   PROCEDURES_ICD.csv
|   |   *_hadm_ids.csv
```

`ICD9_descriptions` is available [in this repo](https://github.com/jamesmullenbach/caml-mimic/blob/master/mimicdata/ICD9_descriptions), and
the `*_hadm_ids.csv` files are available [here](https://github.com/jamesmullenbach/caml-mimic/tree/master/mimicdata/mimic3).
`MIMIC_RAW_DSUMS` is available [here](https://physionet.org/works/ICD9CodingofDischargeSummaries/), and the remaining MIMIC-II files can be generated with the code provided there.
If you run `consctruct_datasest.py` from `ICD9_Coding_of_Discharge_Summaries` under Python 3 to create the data files, remember to convert the dict objects to lists (lines 82 and 83) and to use `dict.items()` instead of `dict.iteritems()`.
Set the MIMIC data directory via `MIMIC_3_DIR`.
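
For reference, the two Python 3 changes look roughly like this (the variable name below is a hypothetical stand-in; the script's actual names may differ):

```python
# Hypothetical stand-in for the script's dict of discharge summaries:
dsums_dict = {'100001': 'summary text one', '100002': 'summary text two'}

# Python 2 original (illustrative):
#   dsums = dsums_dict.keys()                  # keys() returned a list in Python 2
#   for hadm, text in dsums_dict.iteritems():
# Python 3 replacement:
dsums = list(dsums_dict.keys())                # wrap the dict view in list()
for hadm, text in dsums_dict.items():          # items() replaces iteritems()
    print(hadm, len(text))
```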

## Run
``python3 main.py``

Configuration options are defined in `options.py`.
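
The flags below are a hypothetical sketch based on the attributes the data loading code reads from `args` (`vocab`, `Y`, `data_path`, `version`, `MIMIC_2_DIR`, `MIMIC_3_DIR`); see `options.py` for the authoritative names and defaults:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--data_path', default='data/mimic3/train_full.csv')
parser.add_argument('--vocab', default='data/mimic3/vocab.csv')
parser.add_argument('--Y', default='full', help="'full' or a top-k label count such as 50")
parser.add_argument('--version', default='mimic3', choices=['mimic2', 'mimic3'])
parser.add_argument('--MIMIC_2_DIR', default='data/mimic2')
parser.add_argument('--MIMIC_3_DIR', default='data/mimic3')
args = parser.parse_args()
```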

Requirements:
- Python 3.7
- PyTorch 1.5.0

## Citation
```
@inproceedings{ji2020dilated,
  title={Dilated Convolutional Attention Network for Medical Code Assignment from Clinical Text},
  author={Ji, Shaoxiong and Cambria, Erik and Marttinen, Pekka},
  booktitle={3rd Clinical Natural Language Processing Workshop at EMNLP},
  year={2020}
}
```

## References
- https://github.com/jamesmullenbach/caml-mimic
- https://github.com/foxlf823/Multi-Filter-Residual-Convolutional-Neural-Network
New file added (193 lines): data loading and batching utilities.
import csv
import torch
import numpy as np
from collections import defaultdict
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from elmo import elmo


def load_vocab_dict(args, vocab_file):
    """
    Load vocabulary dictionary from file: vocab_file
    """
    vocab = set()
    with open(vocab_file, 'r') as vocabfile:
        for i, line in enumerate(vocabfile):
            line = line.rstrip()
            if line != '':
                vocab.add(line.strip())
    ind2w = {i + 1: w for i, w in enumerate(sorted(vocab))}
    w2ind = {w: i for i, w in ind2w.items()}
    return ind2w, w2ind
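
# Illustrative example: a vocab file containing the lines "cough" and "fever"
# yields ind2w = {1: 'cough', 2: 'fever'} and w2ind = {'cough': 1, 'fever': 2}.
# Index 0 is deliberately left unused so it can serve as padding (see
# pad_sequence below).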


def load_full_codes(train_path, mimic2_dir, version='mimic3'):
    """
    Load the full set of ICD codes
    """
    if version == 'mimic2':
        codes = set()
        with open(mimic2_dir, 'r') as f:
            r = csv.reader(f)
            next(r)  # skip header
            for row in r:
                codes.update(set(row[-1].split(';')))
        codes = set([c for c in codes if c != ''])
        ind2c = defaultdict(str, {i: c for i, c in enumerate(sorted(codes))})
    else:
        codes = set()
        for split in ['train', 'dev', 'test']:
            with open(train_path.replace('train', split), 'r') as f:
                lr = csv.reader(f)
                next(lr)  # skip header
                for row in lr:
                    for code in row[3].split(';'):
                        codes.add(code)
        codes = set([c for c in codes if c != ''])
        ind2c = defaultdict(str, {i: c for i, c in enumerate(sorted(codes))})
    return ind2c
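
# Illustrative example: the label column holds semicolon-separated ICD codes,
# so a row labeled "401.9;427.31" contributes both codes to the sorted full
# label space returned above.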


def load_lookups(args):
    """
    Load lookup dictionaries: index2word, word2index, index2code, code2index
    """
    ind2w, w2ind = load_vocab_dict(args, args.vocab)
    if args.Y == 'full':
        ind2c = load_full_codes(args.data_path, '%s/proc_dsums.csv' % args.MIMIC_2_DIR, version=args.version)
    else:
        codes = set()
        with open("%s/TOP_%s_CODES.csv" % (args.MIMIC_3_DIR, str(args.Y)), 'r') as labelfile:
            lr = csv.reader(labelfile)
            for i, row in enumerate(lr):
                codes.add(row[0])
        ind2c = {i: c for i, c in enumerate(sorted(codes))}
    c2ind = {c: i for i, c in ind2c.items()}
    dicts = {'ind2w': ind2w, 'w2ind': w2ind, 'ind2c': ind2c, 'c2ind': c2ind}
    return dicts


def prepare_instance(dicts, filename, args, max_length):
    # filename: data/mimic[2/3]/[train/dev/test]_[50/full].csv, e.g., data/mimic3/train_50.csv
    ind2w, w2ind, ind2c, c2ind = dicts['ind2w'], dicts['w2ind'], dicts['ind2c'], dicts['c2ind']
    instances = []
    num_labels = len(dicts['ind2c'])
    with open(filename, 'r') as infile:
        r = csv.reader(infile)
        next(r)  # skip header
        for row in r:
            text = row[2]
            labels_idx = np.zeros(num_labels)
            labelled = False
            for l in row[3].split(';'):
                if l in c2ind:
                    code = int(c2ind[l])
                    labels_idx[code] = 1
                    labelled = True
            if not labelled:
                continue
            tokens_ = text.split()
            tokens = []
            tokens_id = []
            for token in tokens_:
                if token == '[CLS]' or token == '[SEP]':
                    continue
                tokens.append(token)
                # out-of-vocabulary tokens map to the shared index len(w2ind) + 1
                token_id = w2ind[token] if token in w2ind else len(w2ind) + 1
                tokens_id.append(token_id)
            if len(tokens) > max_length:
                tokens = tokens[:max_length]
                tokens_id = tokens_id[:max_length]
            dict_instance = {'label': labels_idx, 'tokens': tokens, 'tokens_id': tokens_id}
            instances.append(dict_instance)
    return instances
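
# Illustrative shape of one returned instance:
#   {'label': np.array([0., 1., 0., ...]),    # multi-hot over num_labels codes
#    'tokens': ['patient', 'admitted', ...],
#    'tokens_id': [812, 47, ...]}             # example ids, not real vocab indices
# Notes whose codes all fall outside c2ind are skipped (labelled stays False).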


def prepare_instance_bert(dicts, filename, args, max_length):
    ind2w, w2ind, ind2c, c2ind = dicts['ind2w'], dicts['w2ind'], dicts['ind2c'], dicts['c2ind']
    instances = []
    num_labels = len(dicts['ind2c'])
    wp_tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    with open(filename, 'r') as infile:
        r = csv.reader(infile)
        next(r)  # skip header
        for row in r:
            text = row[2]
            labels_idx = np.zeros(num_labels)
            labelled = False
            for l in row[3].split(';'):
                if l in c2ind:
                    code = int(c2ind[l])
                    labels_idx[code] = 1
                    labelled = True
            if not labelled:
                continue
            tokens_ = text.split()
            tokens = []
            for token in tokens_:
                if token == '[CLS]' or token == '[SEP]':
                    continue
                wps = wp_tokenizer.tokenize(token)
                tokens.extend(wps)
            tokens_max_len = max_length - 2  # reserve room for [CLS] and [SEP]
            if len(tokens) > tokens_max_len:
                tokens = tokens[:tokens_max_len]
            tokens.insert(0, '[CLS]')
            tokens.append('[SEP]')
            tokens_id = wp_tokenizer.convert_tokens_to_ids(tokens)
            masks = [1] * len(tokens)
            segments = [0] * len(tokens)
            dict_instance = {'label': labels_idx, 'tokens': tokens, 'tokens_id': tokens_id,
                             'segments': segments, 'masks': masks}
            instances.append(dict_instance)
    return instances
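
# Note: each instance above is wordpiece-tokenized (e.g. "hypertension" may
# split into several subword units), truncated to max_length - 2, and wrapped
# in [CLS] ... [SEP] before id conversion, with matching masks and segment ids.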


class MyDataset(Dataset):
    def __init__(self, X):
        self.X = X

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx]


def pad_sequence(x, max_len, dtype=int):
    # Right-pad each sequence in x with zeros up to max_len.
    padded_x = np.zeros((len(x), max_len), dtype=dtype)
    for i, row in enumerate(x):
        padded_x[i][:len(row)] = row
    return padded_x
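
# Illustrative example:
#   pad_sequence([[3, 1, 4], [2]], max_len=4) ->
#   [[3, 1, 4, 0],
#    [2, 0, 0, 0]]
# Right-padding with zeros is why vocabulary indices start at 1 in
# load_vocab_dict.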


def my_collate(x):
    words = [x_['tokens_id'] for x_ in x]
    seq_len = [len(w) for w in words]
    masks = [[1] * len(w) for w in words]
    max_seq_len = max(seq_len)  # pad to the longest sequence in the batch
    # max_seq_len = args.MAX_LENGTH  # TODO for capsule network

    inputs_idx = torch.LongTensor(pad_sequence(words, max_seq_len))
    inputs_mask = torch.LongTensor(pad_sequence(masks, max_seq_len))
    labels = torch.FloatTensor([x_['label'] for x_ in x])
    inputs_text = [x_['tokens'] for x_ in x]
    inputs_text = elmo.batch_to_ids(inputs_text)
    return inputs_idx, labels, inputs_text, inputs_mask
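
# Illustrative batch shapes for B notes whose longest length is T:
#   inputs_idx, inputs_mask: (B, T) LongTensor; labels: (B, num_labels);
#   inputs_text: character ids from elmo.batch_to_ids, shaped (B, T, 50)
#   assuming the standard ELMo character encoding (50 ids per token).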


def my_collate_bert(x):
    words = [x_['tokens_id'] for x_ in x]
    segments = [x_['segments'] for x_ in x]
    masks = [x_['masks'] for x_ in x]
    seq_len = [len(w) for w in words]
    max_seq_len = max(seq_len)  # max of batch

    inputs_idx = torch.LongTensor(pad_sequence(words, max_seq_len))
    segments = torch.LongTensor(pad_sequence(segments, max_seq_len))
    masks = torch.LongTensor(pad_sequence(masks, max_seq_len))
    labels = torch.FloatTensor([x_['label'] for x_ in x])
    return inputs_idx, segments, masks, labels
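

# --- Hypothetical usage sketch (not part of the original module) ---
# The paths and hyperparameters below are placeholders; the args namespace
# simply mirrors the attributes that the functions above read.
if __name__ == '__main__':
    from argparse import Namespace
    from torch.utils.data import DataLoader

    args = Namespace(vocab='data/mimic3/vocab.csv', Y='full', version='mimic3',
                     data_path='data/mimic3/train_full.csv',
                     MIMIC_2_DIR='data/mimic2', MIMIC_3_DIR='data/mimic3')
    dicts = load_lookups(args)
    instances = prepare_instance(dicts, args.data_path, args, max_length=2500)
    loader = DataLoader(MyDataset(instances), batch_size=16, shuffle=True,
                        collate_fn=my_collate)
    inputs_idx, labels, inputs_text, inputs_mask = next(iter(loader))
    print(inputs_idx.shape, labels.shape)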