# sypt_ptree.py
from pycorenlp import StanfordCoreNLP
from nltk.tree import ParentedTree
import re
import os
import subprocess
from nltk.tokenize import sent_tokenize, word_tokenize

# Separator used when serializing a production rule, e.g. 'S => NP VP '.
arrow = ' => '
def leaf_to_root(node, ignoreleaf=True):
    """Collect the production rules on the path from `node` up to ROOT.

    With ignoreleaf=True the lexical rule (POS tag => word) is dropped.
    """
    rules = []
    while node.label() != 'ROOT':
        rule = node.label() + arrow
        for s in node:
            if isinstance(s, ParentedTree):
                rule += s.label() + ' '
            else:
                rule += s + ' '
        rules.append(rule)
        node = node.parent()
    if ignoreleaf and len(rules) > 1:
        return rules[1:]
    return rules
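
# Illustration (assumed example, not from the original file): for the VBP node
# in '(ROOT (S (NP (PRP I)) (VP (VBP run))))', leaf_to_root returns
#     ['VBP => run ', 'VP => VBP ', 'S => NP VP ']
# without ignoreleaf, and just ['VP => VBP ', 'S => NP VP '] with it.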
def traverse(parsed_sentence, ignoreleaf=True):
    """Walk a Penn-style parse and return one leaf-to-root rule chain per token."""
    t = ParentedTree.fromstring(parsed_sentence)
    q = [t]
    rules = []
    while len(q) > 0:
        current = q.pop()
        for s in current:
            if isinstance(s, ParentedTree):
                q.append(s)
            else:
                # `current` is a preterminal: its child is a word string.
                rules.append(leaf_to_root(current, ignoreleaf=ignoreleaf))
    return rules
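
# Illustration (same assumed parse as above): traverse yields one chain per
# token, so '(ROOT (S (NP (PRP I)) (VP (VBP run))))' with ignoreleaf=True gives
#     [['VP => VBP ', 'S => NP VP '], ['NP => PRP ', 'S => NP VP ']]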
def get_pt_features_coreNLP(doc, ignoreleaf=True):
    """Extract parse-tree rule features via a CoreNLP server on localhost:9000."""
    doc = doc.encode('utf-8').decode('utf-8')  # normalize to a unicode str
    # Strip braces and parentheses so they cannot corrupt the bracketed parse.
    chars_to_remove = ['{', '}', '(', ')']
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    doc = re.sub(rx, '', doc)
    nlp = StanfordCoreNLP('http://localhost:9000')
    sentences = sent_tokenize(doc)
    ptree_features = list()
    for sentence in sentences:
        try:
            # Skip empty sentences and sentences longer than 80 tokens.
            if sentence != '' and len(word_tokenize(sentence)) <= 80:
                output = nlp.annotate(sentence, properties={
                    'annotators': 'parse',
                    'outputFormat': 'json'
                })
                parsed = output['sentences'][0]['parse']
                rules = traverse(parsed, ignoreleaf=ignoreleaf)
                ptree_features.append(rules)
        except Exception:
            print('Problem in parsing sentence = %s' % sentence)
    return ptree_features
def get_pt_features_standalone(doc, tmp_path='', ignoreleaf=True):
    """Extract parse-tree rule features with the standalone Stanford parser jar."""
    tmp_path = os.path.join(tmp_path, 'tmp_marjan')
    args = ['java', '-mx3000m', '-cp', 'stanford-parser-full-2018-02-27/stanford-parser.jar',
            'edu.stanford.nlp.parser.lexparser.LexicalizedParser',
            '-encoding', 'utf-8',
            '-model', 'stanford-parser-full-2018-02-27/englishPCFG.ser.gz',
            '-maxLength', '50',
            '-sentences', 'newline',
            '-outputFormat', 'penn',
            tmp_path]
    doc = doc.encode('utf-8').decode('utf-8')  # normalize to a unicode str
    # Strip braces and parentheses so they cannot corrupt the bracketed parse.
    chars_to_remove = ['{', '}', '(', ')']
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    doc = re.sub(rx, '', doc)
    sentences = sent_tokenize(doc)
    ptree_features = list()
    # Write one sentence per line for the parser's '-sentences newline' mode.
    with open(tmp_path, 'wt') as fw:
        for sentence in sentences:
            if sentence.strip() != '':
                fw.write(sentence + '\n')
    p = subprocess.Popen(args, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    outs, err = p.communicate()
    # The parser emits '(())' for sentences over -maxLength; drop those.
    outs = outs.decode('utf-8').replace('(())\n', '')
    for parsed in outs.split('\n\n'):
        if parsed != '':
            try:
                rules = traverse(parsed, ignoreleaf=ignoreleaf)
                ptree_features.append(rules)
            except ValueError:
                print('Problem in converting parsed sentence = %s' % parsed)
                raise
    return ptree_features
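
if __name__ == '__main__':
    # Minimal smoke test (an added illustration, not part of the original
    # script). It exercises traverse() on a hand-written parse, so neither a
    # CoreNLP server nor the parser jar is needed; the parse is an assumption.
    sample_parse = '(ROOT (S (NP (PRP I)) (VP (VBP run))))'
    for chain in traverse(sample_parse, ignoreleaf=True):
        print(chain)
    # get_pt_features_coreNLP(doc) additionally requires a CoreNLP server
    # listening on http://localhost:9000, and get_pt_features_standalone(doc)
    # requires stanford-parser-full-2018-02-27/ in the working directory.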