-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract-features-ppl.py
121 lines (91 loc) · 4.49 KB
/
extract-features-ppl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# -*- coding: utf-8 -*-
import os
import random
import sys
import subprocess
# Command-line contract: exactly five positional arguments.
#   1. file_path      - text file, one sentence per line
#   2. file_path_pos  - second text file, read line by line alongside the first
#                       (presumably its POS-tagged version - confirm)
#   3. src            - label used as the suffix of the output file name
#   4. other          - accepted but not used anywhere in the visible code
#   5. train_test     - label used as the prefix of the output file name
#
# An explicit check is used instead of ``assert`` so that the validation is
# not stripped when the interpreter runs with ``-O`` and so that a wrong
# invocation prints a usage line rather than a bare AssertionError traceback.
if len(sys.argv) != 6:
    sys.exit('usage: {} <file_path> <file_path_pos> <src> <other> <train_test>'.format(sys.argv[0]))

file_path = sys.argv[1]
file_path_pos = sys.argv[2]
src = sys.argv[3]
other = sys.argv[4]
train_test = sys.argv[5]
def read_datasets(file_path_local, file_path_local_pos):
    """Read two text files into lists of whitespace-stripped lines.

    Args:
        file_path_local: Path of the first file.
        file_path_local_pos: Path of the second file.

    Returns:
        A tuple ``(dataset, dataset_pos)``. Each element is a list with one
        entry per line of the corresponding file, with leading and trailing
        whitespace removed from every entry.

    Raises:
        AssertionError: If either path is not an existing regular file.
    """
    # Sanity check on both paths before anything is read. It is raised
    # explicitly (not via ``assert``) so it still runs under ``python -O``,
    # and the message names the offending path.
    for path in (file_path_local, file_path_local_pos):
        if not os.path.isfile(path):
            raise AssertionError('not a file: {!r}'.format(path))

    with open(file_path_local, 'r') as f_p:
        dataset = [line.strip() for line in f_p]
    with open(file_path_local_pos, 'r') as f_p:
        dataset_pos = [line.strip() for line in f_p]
    return dataset, dataset_pos
def datasets_to_file(dataset, dataset2, dataset_pos, dataset_pos2, sentence_length, num_prep, ppl, ppl2, ppl_pos, ppl_pos2, header, destination):
    """Write ten parallel columns to ``destination`` as comma-separated text.

    The file starts with ``header`` on its own line, followed by one line per
    index of ``dataset``. Each line holds the values found at that index in
    the ten sequences, in parameter order, joined by commas. Values are
    formatted with ``str.format`` and are not quoted or escaped.

    Args:
        dataset, dataset2, dataset_pos, dataset_pos2, sentence_length,
        num_prep, ppl, ppl2, ppl_pos, ppl_pos2: Indexable sequences; every one
            must have at least ``len(dataset)`` items.
        header: Text written verbatim as the first line.
        destination: Path of the file to create or overwrite.
    """
    columns = (dataset, dataset2, dataset_pos, dataset_pos2, sentence_length,
               num_prep, ppl, ppl2, ppl_pos, ppl_pos2)
    row_template = '{},{},{},{},{},{},{},{},{},{}\n'
    with open(destination, 'w') as out:
        out.write(header + '\n')
        # The row count is driven by ``dataset`` alone; the other columns are
        # indexed rather than zipped so that a column that is too short is
        # reported as an IndexError instead of silently truncating the output.
        for row in range(len(dataset)):
            out.write(row_template.format(*[column[row] for column in columns]))
# These two modules were imported inside the per-sentence loop; they are
# imported once here, ahead of their first use.
import re
import tempfile

# rnnlm executable, resolved relative to the current working directory.
_RNNLM_BINARY = './rnnlm-0.4b/rnnlm'

# A space-separated token counts towards the sentence length when it *starts*
# with a word character (the pattern is applied with ``match``).
_WORD_PATTERN = re.compile(r"\w+", re.U)
# Tags counted for the ``num_prep`` feature (presumably preposition POS tags,
# going by the feature name - confirm against the tagset in use).
_PREP_TAG_PATTERN = re.compile('sp[0C][0M][0S]')


def _rnnlm_scores(text, model_path):
    """Score ``text`` with the rnnlm model stored at ``model_path``.

    ``text`` is written to a temporary file, rnnlm is run on that file in
    ``-test`` mode, and two values are parsed out of its standard output.

    Returns:
        A tuple ``(log_probability, perplexity)`` of strings: the text after
        the first ``:`` on output lines 3 and 5 (0-based). The log probability
        is stripped of surrounding whitespace; the perplexity is returned
        unstripped so the output file stays byte-identical to earlier runs.

    Raises:
        RuntimeError: If rnnlm's output does not have the expected layout
            (for example because the binary or the model could not be run).
    """
    # NamedTemporaryFile is opened in binary mode, so text must be encoded
    # first; byte strings (what Python 2 reads from disk) pass through as-is.
    payload = text if isinstance(text, bytes) else text.encode('utf-8')
    with tempfile.NamedTemporaryFile() as temp:
        # NOTE(review): the sentence is written without a trailing newline,
        # exactly as before - confirm rnnlm still scores the final token.
        temp.write(payload)
        temp.flush()  # make sure rnnlm sees the data when it opens the file
        # Argument list instead of a shell command line: no shell involved,
        # hence no quoting concerns. ``universal_newlines`` makes stdout text
        # on both Python 2 and Python 3.
        proc = subprocess.Popen(
            [_RNNLM_BINARY, '-rnnlm', model_path, '-test', temp.name],
            stdout=subprocess.PIPE,
            universal_newlines=True)
        stdout_value = proc.communicate()[0]

    output_lines = stdout_value.split('\n')
    try:
        log_probability = output_lines[3].split(':')[1].strip()
        perplexity = output_lines[5].split(':')[1]
    except IndexError:
        raise RuntimeError(
            'unexpected rnnlm output for model {!r}: {!r}'.format(model_path, stdout_value))
    return log_probability, perplexity


print("Reading datasets:" + file_path + "," + file_path_pos)
dataset, dataset_pos = read_datasets(file_path, file_path_pos)

# Every sentence needs a matching line in the second file. Fail before the
# (slow) rnnlm runs start instead of part-way through them.
if len(dataset_pos) < len(dataset):
    raise ValueError('{} has {} lines but {} has only {}'.format(
        file_path, len(dataset), file_path_pos, len(dataset_pos)))

print("Extracting features")
# One list per output column, filled in sentence order. Real lists are used
# because a ``range`` object cannot be assigned into on Python 3.
f_wh, f_posh, f_wmt, f_posmt = [], [], [], []
sentence_length, number_of_prep = [], []
f_pplh, f_pplhpos, f_pplmt, f_pplmtpos = [], [], [], []

# ``zip`` stops after ``len(dataset)`` pairs; extra trailing lines in the
# second file are ignored, as they were before.
for sentence, sentence_pos in zip(dataset, dataset_pos):
    sentence_length.append(
        sum(1 for word in sentence.split(' ') if _WORD_PATTERN.match(word)))
    number_of_prep.append(len(_PREP_TAG_PATTERN.findall(sentence_pos)))

    ############### For h ###############
    log_probability, perplexity = _rnnlm_scores(sentence, 'models/model_h')
    f_wh.append(log_probability)
    f_pplh.append(perplexity)

    log_probability, perplexity = _rnnlm_scores(sentence_pos, 'models/model_h_pos')
    f_posh.append(log_probability)
    f_pplhpos.append(perplexity)

    ############### For mt ###############
    log_probability, perplexity = _rnnlm_scores(sentence, 'models/model_mt')
    f_wmt.append(log_probability)
    f_pplmt.append(perplexity)

    log_probability, perplexity = _rnnlm_scores(sentence_pos, 'models/model_mt_pos')
    f_posmt.append(log_probability)
    f_pplmtpos.append(perplexity)

# The ``features_ppl`` directory is expected to exist already.
file_to_write = 'features_ppl/{}_scores_feat_{}'.format(train_test, src)
# Header kept byte-for-byte (including the spaces after some commas) so that
# existing consumers of the file see the same column names.
header = 'f_wh,f_wmt,f_posh,f_posmt,length,num_prep, f_pplh, f_pplmt, f_pplhpos, f_pplmtpos'
datasets_to_file(f_wh, f_wmt, f_posh, f_posmt, sentence_length, number_of_prep, f_pplh, f_pplmt, f_pplhpos, f_pplmtpos, header, file_to_write)