-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathconstruct.py
164 lines (148 loc) · 7.88 KB
/
construct.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import pandas as pd
import re, glob, nltk, sys, os
import nltk
# Discourse connective cuewords, grouped by relation class (as defined in the paper).
CUEWORDS_COMPARISON = {'but', 'however', 'although', 'by contrast'}
CUEWORDS_CONTINGENCY = {'because', 'so', 'thus', 'as a result', 'consequently', 'therefore'}
CUEWORDS_EXPANSION = {'also', 'for example', 'in addition', 'instead', 'indeed', 'moreover', 'for instance', 'in fact', 'furthermore', 'or', 'and'}
CUEWORDS_TEMPORAL = {'then', 'previously', 'earlier', 'later', 'after', 'before'}
# Parallel sequences: CUEWORDS_ALL[i] holds the cue set for RELATION_NAMES[i].
CUEWORDS_ALL = [CUEWORDS_COMPARISON, CUEWORDS_CONTINGENCY, CUEWORDS_EXPANSION, CUEWORDS_TEMPORAL]
RELATION_NAMES = ['comparison', 'contingency', 'expansion', 'temporal']
def identify_pairs(utt_text, utt_text_prev="", show=True):
    """Extract discourse-relation argument pairs from one utterance.

    Scans each sentence of ``utt_text`` for the connective cuewords in
    ``CUEWORDS_ALL`` and, for every occurrence that survives the POS and
    length filters, records the two spans the connective relates.

    Parameters
    ----------
    utt_text : str
        Current turn's utterance; lowercased before matching.
    utt_text_prev : str, optional
        Previous turn's utterance; its last sentence serves as arg1 when a
        connective opens the first sentence of the current turn.
    show : bool, optional
        If True, print matching/debug information while scanning.

    Returns
    -------
    list[dict]
        One dict per accepted pair with keys 'arg1', 'arg2', 'relation',
        'connective', 'type' ('begin' or 'mid'), 'original_utt',
        'original_utt_prev'.
    """
    results = []
    utt_text = utt_text.lower()
    # Split the utterance into sentences and handle each independently.
    text_lst = nltk.sent_tokenize(utt_text)
    # Sentences of the previous turn: the last one supplies arg1 when a
    # connective appears at the very beginning of the current turn.
    text_prev_lst = nltk.sent_tokenize(utt_text_prev)
    for text_count, text in enumerate(text_lst):
        text_pos = nltk.pos_tag(nltk.word_tokenize(text))
        for relation_idx, cuewords_lst in enumerate(CUEWORDS_ALL):
            for cue_this in cuewords_lst:
                # All whole-word occurrences of this connective.
                # `matches` avoids shadowing the builtin `iter`; re.escape
                # guards any future cue containing a regex metacharacter.
                matches = re.finditer(r"\b%s\b" % re.escape(cue_this), text)
                indices = [m.start(0) for m in matches]
                if not indices:
                    continue
                if show:
                    print(text)
                    print(cue_this)
                    print(indices)
                for connective_index in indices:
                    if connective_index == 0:
                        # Connective opens the sentence: arg1 is the
                        # preceding sentence, or the previous turn's last
                        # sentence, or empty if neither exists.
                        if text_count > 0:
                            arg1 = text_lst[text_count - 1]
                        elif len(text_prev_lst) > 0:
                            # BUG FIX: was `> 1`, which silently ignored a
                            # single-sentence previous utterance.
                            arg1 = text_prev_lst[-1]
                        else:
                            arg1 = ''
                        arg2 = text
                        type_this = 'begin'
                    else:
                        # Connective in mid-sentence: split around it.
                        arg1 = text[0:connective_index]
                        arg2 = text[connective_index:]
                        type_this = 'mid'
                    if show:
                        print(type_this)
                    # Begin: rules to filter out non-discourse uses.
                    select_flag = True
                    # Strip the connective itself off the front of arg2.
                    arg2 = (arg2[len(cue_this):]).strip()
                    try:
                        # Tokenize each argument once; the counts feed every
                        # length filter below (was re-tokenized repeatedly).
                        n1 = len(nltk.word_tokenize(arg1))
                        n2 = len(nltk.word_tokenize(arg2))
                        # Filter 1: the connective token must be tagged IN
                        # (preposition/subordinating conj.) or CC
                        # (coordinating conj.).
                        if type_this == 'mid':
                            # arg1's token count indexes the connective
                            # token inside the full-sentence tag list.
                            connective_pos = text_pos[n1][1]
                            if show:
                                print(text_pos[n1])
                        else:
                            connective_pos = text_pos[0][1]
                            if show:
                                print(text_pos[0])
                        if connective_pos not in ['IN', 'CC']:
                            select_flag = False
                        # Filter 2: minimum argument lengths; stricter
                        # thresholds for high-frequency cues that are
                        # rarely discourse connectives.
                        if n1 < 3 or n2 < 4:
                            select_flag = False
                        if cue_this in ('and', 'or', 'after') and (n1 < 12 or n2 < 12):
                            select_flag = False
                        if cue_this == 'and' and type_this == 'begin':
                            select_flag = False
                        if cue_this == 'before' and (n1 < 6 or n2 < 6):
                            select_flag = False
                    except Exception:
                        # Was a bare `except:`; narrowed so KeyboardInterrupt
                        # and SystemExit still propagate.
                        select_flag = False
                    # End of rules
                    if select_flag:
                        results.append({
                            'arg1': arg1,
                            'arg2': arg2,
                            'relation': RELATION_NAMES[relation_idx],
                            'connective': cue_this,
                            'type': type_this,
                            'original_utt': utt_text,
                            'original_utt_prev': utt_text_prev,
                        })
                        if show:
                            print('->', RELATION_NAMES[relation_idx])
    return results
# Get pairs with cuewords
def get_pairs(df, header, output, show=True):
    """Run connective-pair extraction over every conversation in *df*.

    Each row of *df* (starting at row 1) is one conversation whose
    utterances live in columns 'Answer.sentence1', 'Answer.sentence2', ...
    Extracted pairs are accumulated and written to *output* as csv.

    Parameters
    ----------
    df : pandas.DataFrame
        Conversations, one per row; utterance columns named
        'Answer.sentence<k>'.
    header : list[str]
        Output column order; must match the keys of identify_pairs results.
    output : str
        Path of the csv file to write.
    show : bool, optional
        Forwarded to identify_pairs for debug printing.
    """
    data = pd.DataFrame(columns=header)
    list_utt_name = 'Answer.sentence'
    index = 0
    # Upper bound on utterances per conversation; loop breaks at the first
    # missing column.
    list_utt_range = 100
    for i in range(1, len(df.index)):
        # BUG FIX: reset per conversation — previously initialized once
        # outside this loop, so the last utterance of one conversation
        # leaked into the next as "previous turn" context.
        prev_text = ""
        for range_i in range(1, list_utt_range):
            col_name_this = '%s%s' % (list_utt_name, str(range_i))
            if col_name_this not in df.columns:
                # No more utterances in this conversation.
                break
            # BUG FIX: df.ix was removed in pandas 1.0; .loc works here
            # because read_csv yields a default integer index.
            text = str(df.loc[i, col_name_this])
            results = identify_pairs(text, prev_text, show=show)
            # This utterance becomes the previous turn for the next one.
            prev_text = text
            # Save each result dict as one csv row.
            for result in results:
                for key, value in result.items():
                    data.loc[index, key] = value
                index += 1
    data.to_csv(output, columns=header, index=False)
if __name__ == '__main__':
    '''
    Code for original edina corpus
    '''
    # Output csv columns; must match the keys produced by identify_pairs.
    header = ['arg1', 'arg2', 'relation', 'connective', 'type', 'original_utt', 'original_utt_prev']
    # Walk every sub-folder of Edina/ and mirror results under Edina-DR2/Edina-DR/.
    for foldername in os.listdir('Edina/'):
        if not os.path.isdir('Edina/%s' % foldername):
            continue
        print('+++++ %s ...' % foldername)
        for filename in os.listdir('Edina/%s' % foldername):
            pathname = 'Edina/%s/%s' % (foldername, filename)
            conversations = pd.read_csv(pathname)
            output = 'Edina-DR2/Edina-DR/%s/pairs_%s' % (foldername, filename)
            # Create the mirrored output folder on first use.
            if not os.path.exists('Edina-DR2/Edina-DR/%s' % foldername):
                os.makedirs('Edina-DR2/Edina-DR/%s' % foldername)
            get_pairs(conversations, header, output, show=False)
    '''
    Code for debugging
    '''
    # identify_pairs('I love you but do you love me?', show=True)
    # identify_pairs('I love elephant a lot but I go to zoo to see them', show=True)
    # identify_pairs('I love cute small elephant a lot and I go to zoo to see the love ones them', show=True)
    # identify_pairs('This is Tom and Jerry', show=True)
    # identify_pairs("no kidding, right, he's our best scorer. hopefully he'll be back before that game.")