forked from kalwargupta/nltk
-
Notifications
You must be signed in to change notification settings - Fork 0
/
partOfSpeech.py
58 lines (41 loc) · 1.34 KB
/
partOfSpeech.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 24 10:19:20 2019
@author: jeetu
"""
import nltk
# State of the Union corpus reader; the George W. Bush addresses are loaded from it below
from nltk.corpus import state_union
# PunktSentenceTokenizer is an unsupervised, trainable sentence tokenizer
from nltk.tokenize import PunktSentenceTokenizer
# Raw text of the 2005 address, used only as training data for the tokenizer.
train_text = state_union.raw("2005-GWBush.txt")
# Raw text of the 2006 address: the sample that is actually split and analysed.
sample_text = state_union.raw("2006-GWBush.txt")
# Build the sentence tokenizer from the 2005 training text.
custom_sent_tokenizer=PunktSentenceTokenizer(train_text)
# Sample text tokenized by sentence; process_content() iterates over this.
tokenized = custom_sent_tokenizer.tokenize(sample_text)
def process_content():
    """Run named-entity chunking on every sentence in ``tokenized``.

    Each sentence is split into word tokens, POS-tagged, and passed to
    ``nltk.ne_chunk``; ``draw()`` is then called on the resulting tree.
    The whole loop sits inside one ``try``: the first exception raised
    ends processing and its message is printed instead of propagating.
    """
    try:
        for sentence in tokenized:
            # Word-tokenize and POS-tag in one step; the chunker needs
            # (token, tag) pairs as input.
            pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
            # Named-entity chunking over the tagged tokens.
            entity_tree = nltk.ne_chunk(pos_tagged)
            entity_tree.draw()
    except Exception as err:
        print(str(err))
# Module-level call with no __main__ guard: runs on execution and on import.
process_content()
'''
NE Type Examples
ORGANIZATION Georgia-Pacific Corp., WHO
PERSON Eddy Bonte, President Obama
LOCATION Murray River, Mount Everest
DATE June, 2008-06-29
TIME two fifty a m, 1:30 p.m.
MONEY 175 million Canadian Dollars, GBP 10.40
PERCENT twenty pct, 18.75 %
FACILITY Washington Monument, Stonehenge
GPE South East Asia, Midlothian
'''