-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
46 lines (43 loc) · 1008 Bytes
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
# Fixed, ordered tuple of label names; the position of each entry is part of
# the value, so keep the order stable when editing.
LABELS = (
    "Obstetrics", "Paediatrics", "Musculoskeletal", "Gynaecology",
    "CNS", "Misc", "Hepatobiliary", "Oncology",
    "HN", "Breast", "Gastrointestinal", "Cardiac",
    "Chest", "Trauma", "Urogenital", "Vascular",
    "Spine", "Haematology", "Forensic", "Interventional",
)
# ASCII punctuation marks; each is padded with spaces before tokenizing and
# then dropped from the resulting token list.
_PUNCTUATION = frozenset(string.punctuation)

# Translation table mapping every punctuation mark to itself surrounded by
# spaces, so all marks are separated out in a single pass over the text.
_PAD_PUNCTUATION = str.maketrans(
    {mark: f' {mark} ' for mark in string.punctuation}
)

# English stop words, filled in on first use (see _english_stop_words).
_stop_words = None


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first call."""
    global _stop_words
    if _stop_words is None:
        _stop_words = frozenset(stopwords.words("english"))
    return _stop_words


def preprocess(text):
    """Normalize *text* and split it into a list of filtered tokens.

    Steps, in order: lowercase the text, strip zero-width spaces (U+200B),
    pad every ASCII punctuation mark with spaces, tokenize with NLTK's
    ``word_tokenize``, then drop stop words, punctuation marks and tokens
    for which ``str.isnumeric()`` is true.

    Args:
        text: The input string.

    Returns:
        A list of the remaining token strings, in their original order.

    Note:
        Relies on NLTK's ``stopwords`` corpus and ``word_tokenize``; these
        presumably need their NLTK data packages installed -- confirm for
        the deployment environment.
    """
    stop_words = _english_stop_words()

    # make lowercase and remove zero-width spaces
    text = text.lower().replace('\u200b', '')

    # separate out punctuation so each mark becomes its own token
    text = text.translate(_PAD_PUNCTUATION)

    # remove stopwords, punctuation, and standalone numerals
    return [
        token
        for token in word_tokenize(text)
        if token not in stop_words
        and token not in _PUNCTUATION
        and not token.isnumeric()
    ]