-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnltk.py
30 lines (22 loc) · 934 Bytes
/
nltk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# Import the NLTK preprocessing library to convert raw text into a cleaned,
# tokenized form (tokenization, lemmatization, stopword removal).
# NOTE(review): if this script is really saved as nltk.py, it will shadow the
# nltk package itself and `import nltk` will import this file instead --
# rename the script.
import nltk
from nltk.tokenize import sent_tokenize  # NOTE(review): imported but unused below
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# One-time downloads of the NLTK data the steps below rely on:
#   'punkt'     -> tokenizer models used by word_tokenize
#   'wordnet'   -> lexical database backing WordNetLemmatizer
#   'stopwords' -> per-language stopword lists
# (Recent NLTK releases may additionally require 'punkt_tab' for
# word_tokenize -- verify against the installed NLTK version.)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
# Tokenize each title string into a list of word tokens.
# Apply directly to the 'title' Series -- a row-wise apply with axis=1 over
# the whole DataFrame is unnecessary when only one column is used.
data['title'] = data['title'].apply(nltk.word_tokenize)

# Lemmatization model: reduces inflected forms to their dictionary base
# (e.g. "walks" -> "walk").
lemmatizer = WordNetLemmatizer()

def lemma(tokens):
    """Return *tokens* with each word replaced by its lemma.

    Parameter renamed from ``data`` to avoid shadowing the global
    DataFrame of the same name.
    """
    return [lemmatizer.lemmatize(w) for w in tokens]

# Lemmatize every token list in the title column.
data['title'] = data['title'].apply(lemma)

# English stopwords ("it", "was", "for", etc.). Stored as a set so each
# membership test below is O(1) instead of scanning a list per token.
stop = set(stopwords.words('english'))

# Remove stopwords from each tokenized title.
data['title'] = data['title'].apply(lambda toks: [w for w in toks if w not in stop])
data.head()