-
Notifications
You must be signed in to change notification settings - Fork 10
/
parser.py
executable file
·141 lines (93 loc) · 3.23 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
'''
Sentence Parser
Uses a context-free grammar to parse English sentences and determine
their structure, helping a computer better understand the meaning of
a sentence. The result can be used for purposes such as sentiment
analysis, review analysis, etc.
'''
# Importing Libraries
import nltk
import sys
import re
# nltk.download('punkt')
# Terminal rules of the context-free grammar: each left-hand symbol
# (Adj, Adv, Conj, Det, N, P, V) lists the quoted lowercase words it
# can produce. A symbol may be spread over several lines (N and V are).
TERMINALS = """
Adj -> "country" | "dreadful" | "enigmatical" | "little" | "moist" | "red"
Adv -> "down" | "here" | "never"
Conj -> "and"
Det -> "a" | "an" | "his" | "my" | "the"
N -> "armchair" | "companion" | "day" | "door" | "hand" | "he" | "himself"
N -> "holmes" | "home" | "i" | "mess" | "paint" | "palm" | "pipe" | "she"
N -> "smile" | "thursday" | "walk" | "we" | "word"
P -> "at" | "before" | "in" | "of" | "on" | "to" | "until"
V -> "arrived" | "came" | "chuckled" | "had" | "lit" | "said" | "sat"
V -> "smiled" | "tell" | "were"
"""
# Non-terminal rules of the context-free grammar: how a sentence (S)
# and the phrase symbols AP, NP, PP and VP are composed from each other
# and from the terminal symbols above. Several rules are recursive.
NONTERMINALS = """
S -> NP VP | S Conj S | NP VP Conj VP
AP -> Adj | Adj AP
NP -> N | Det NP | AP NP | NP PP
PP -> P NP | P S
VP -> V | V NP | V NP PP | V PP | VP Adv | Adv VP
"""
# Forming the grammar from both rule sets (non-terminals first) and
# building the chart parser that main() uses on it
grammar = nltk.CFG.fromstring(NONTERMINALS + TERMINALS)
parser = nltk.ChartParser(grammar)
def main():
    """
    Read one sentence, parse it and print the results.
    The sentence is read from the file named on the command line when
    exactly one argument is given; otherwise the user is prompted for
    it. Every parse tree is pretty-printed, followed by the words of
    each of its noun phrase chunks, one chunk per line.
    """
    cli_args = sys.argv

    if len(cli_args) != 2:
        # No single file argument: ask for the sentence interactively
        raw_text = input("Sentence: ")
    else:
        # Exactly one argument: treat it as the path of the input file
        with open(cli_args[1]) as handle:
            raw_text = handle.read()

    # Turn the raw text into the word list handed to the parser
    tokens = preprocess(raw_text)

    # A ValueError raised while parsing is reported and ends the run
    try:
        parse_trees = list(parser.parse(tokens))
    except ValueError as error:
        print(error)
        return

    # An empty result means the grammar yields no tree for the input
    if not parse_trees:
        print("Could not parse sentence.")
        return

    # Show every tree together with its noun phrase chunks
    for parse_tree in parse_trees:
        parse_tree.pretty_print()
        print("Noun Phrase Chunks")
        for chunk in np_chunk(parse_tree):
            print(" ".join(chunk.flatten()))
def preprocess(sentence):
    """
    Convert `sentence` to a list of its words.
    Pre-process sentence by converting all characters to lowercase
    and removing any word that does not contain at least one alphabetic
    character (a letter a-z, checked after lowercasing).
    Returns the remaining words as a list of strings, in their
    original order.
    """
    tokens = nltk.word_tokenize(sentence.lower())
    # re.search scans the whole token. re.match would only test the
    # first character and so discard tokens that contain letters but
    # begin with something else (e.g. "28th"), contradicting the
    # "at least one alphabetic character" rule above.
    return [token for token in tokens if re.search(r"[a-z]", token)]
def np_chunk(tree):
    """
    Return a list of all noun phrase chunks in the sentence tree.
    A noun phrase chunk is defined as any subtree of the sentence
    whose label is "NP" that does not itself contain any other
    noun phrases as subtrees.
    """
    # subtrees() applies is_np_chunk as its filter; list() collects the
    # matching subtrees directly, no pass-through comprehension needed
    return list(tree.subtrees(is_np_chunk))
def is_np_chunk(tree):
    """
    Return True if the given tree is a NP chunk, False otherwise.
    A noun phrase chunk is defined as any subtree of the sentence
    whose label is "NP" that does not itself contain any other
    noun phrases as subtrees.
    """
    # Anything that is not a noun phrase cannot be a chunk
    if tree.label() != "NP":
        return False
    # Look for NP-labelled subtrees other than `tree` itself. The
    # exclusion is by identity: only the very same object must be
    # skipped, so comparing tree contents is unnecessary.
    nested_nps = tree.subtrees(lambda t: t.label() == "NP" and t is not tree)
    # Stop at the first nested NP instead of collecting all of them
    return next(nested_nps, None) is None
# Run the command-line interface only when executed as a script,
# not when this module is imported
if __name__ == "__main__":
    main()