-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsandbox.py
288 lines (275 loc) · 13.5 KB
/
sandbox.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
#!/usr/bin/python
# -*- coding:utf-8 -*-
# import sys
# from bin.txt2tmx import getVecAlign
#
# for sub in ["citi1", "citi2", "cour", "hans", "ilo", "onu", "tao1", "tao2", "tao3", "verne", "xerox"]:
# fr = "/data/rali5/Tmp/alfonsda/workRali/004tradBureau/023bafAlignmentTestCorpora/alignedWithVecalign/{0}.segm_ed.fr".format(sub)
# en = "/data/rali5/Tmp/alfonsda/workRali/004tradBureau/023bafAlignmentTestCorpora/alignedWithVecalign/{0}.segm_ed.en".format(sub)
# with open(fr, "r", encoding="latin1") as frf:
# frcontent = frf.read()
# with open(fr, "w") as frf:
# frf.write(frcontent)
# with open(en, "r", encoding="latin1") as enF:
# encontent = enF.read()
# with open(en, "w") as enF:
# enF.write(encontent)
#
# getVecAlign(fr,
# en,
# ["fr", "en"],
# txtSrcOutputPath="/data/rali5/Tmp/alfonsda/workRali/004tradBureau/023bafAlignmentTestCorpora/alignedWithVecalign/{0}.output.fr".format(sub),
# txtTrgtOutputPath="/data/rali5/Tmp/alfonsda/workRali/004tradBureau/023bafAlignmentTestCorpora/alignedWithVecalign/{0}.output.en".format(sub))
# # #COMPARE YASA TO VECALIGN USING BLEUALIGN (Vecalign eval set)
# import subprocess
# import sys, os
# from collections import defaultdict
# import numpy as np
# from ast import literal_eval
#
#
# def createEmptyFolder(folderPath):
# """ given a non existing folder path, creates the necessary folders so the path exists """
# if not os.path.exists(folderPath):
# os.makedirs(folderPath)
#
#
# def getYasaAlign(srcFilePath, trgtFilePath, outputFolderPath="./tmp/"):
# """
# use YASA to align two parallel files and output the result in a human readable fashion
# :param srcFilePath: path to the source file
# :param trgtFilePath: path to the target file
# :param outputFolderPath:
# :return:
# """
# createEmptyFolder(outputFolderPath)
# # apply the yasa script
# subprocess.call(["./resources/yasa/yasa", "-i", "o", "-o", "a",
# srcFilePath, trgtFilePath, u"{0}yasa.output.arcadeformat".format(outputFolderPath)])
# subprocess.call(["./resources/yasa/yasa", "-i", "o", "-o", "r",
# srcFilePath, trgtFilePath, u"{0}yasa.output.raliformat".format(outputFolderPath)])
# # open the arcade format and get the index of the aligned sentences
# indexInfo = []
# indexAlignList = []
# with open(u"{0}yasa.output.arcadeformat".format(outputFolderPath)) as arcadeFile:
# with open(u"{0}yasa.output.raliformat".format(outputFolderPath)) as raliFile:
# # first line
# arcadeLn = arcadeFile.readline()
# raliLn = raliFile.readline()
# while arcadeLn:
# # split the different sections of the output data
# arcadeSplit = arcadeLn.split(u'"')
# raliSplit = raliLn.split(u" ")
# # get the line indexes and score
# indexSect = arcadeSplit[1].split(";")
# indexSrc = [int(s.replace(u",", "")) - 1 if s != "" else None for s in indexSect[0].split(" ")]
# indexTgrt = [int(s.replace(u",", "")) - 1 if s != "" else None for s in indexSect[1].split(" ")]
# arcadeScore = float(arcadeSplit[3].replace(u",", ""))
# raliScore = float(raliSplit[1].replace(u"\n", "").replace(u",", ""))
# indexInfo.append({u"src": indexSrc, "trgt": indexTgrt, "scores": [arcadeScore, raliScore]})
# indexAlignList.append(([] if None in indexSrc else indexSrc,
# [] if None in indexTgrt else indexTgrt))
# # next line
# arcadeLn = arcadeFile.readline()
# raliLn = raliFile.readline()
# return indexAlignList
#
#
# def read_alignments(fin):
# alignments = []
# with open(fin, 'rt', encoding="utf-8") as infile:
# for line in infile:
# fields = [x.strip() for x in line.split(':') if len(x.strip())]
# if len(fields) < 2:
# raise Exception('Got line "%s", which does not have at least two ":" separated fields' % line.strip())
# try:
# src = literal_eval(fields[0])
# tgt = literal_eval(fields[1])
# except:
# raise Exception('Failed to parse line "%s"' % line.strip())
# alignments.append((src, tgt))
#
# # I know bleualign files have a few entries missing,
# # but I don't fix them, in order to be consistent with previously reported scores
# return alignments
#
#
# def _precision(goldalign, testalign):
# """
# Computes tpstrict, fpstrict, tplax, fplax for gold/test alignments
# """
# tpstrict = 0 # true positive strict counter
# tplax = 0 # true positive lax counter
# fpstrict = 0 # false positive strict counter
# fplax = 0 # false positive lax counter
#
# # convert to sets, remove alignments empty on both sides
# testalign = set([(tuple(x), tuple(y)) for x, y in testalign if len(x) or len(y)])
# goldalign = set([(tuple(x), tuple(y)) for x, y in goldalign if len(x) or len(y)])
#
# # mappings from source test sentence idxs to
# # target gold sentence idxs for which the source test sentence
# # was found in corresponding source gold alignment
# src_id_to_gold_tgt_ids = defaultdict(set)
# for gold_src, gold_tgt in goldalign:
# for gold_src_id in gold_src:
# for gold_tgt_id in gold_tgt:
# src_id_to_gold_tgt_ids[gold_src_id].add(gold_tgt_id)
#
# for (test_src, test_target) in testalign:
# if (test_src, test_target) == ((), ()):
# continue
# if (test_src, test_target) in goldalign:
# # strict match
# tpstrict += 1
# tplax += 1
# else:
# # For anything with partial gold/test overlap on the source,
# # see if there is also partial overlap on the gold/test target
# # If so, its a lax match
# target_ids = set()
# for src_test_id in test_src:
# for tgt_id in src_id_to_gold_tgt_ids[src_test_id]:
# target_ids.add(tgt_id)
# if set(test_target).intersection(target_ids):
# fpstrict += 1
# tplax += 1
# else:
# print(111111111111, test_src, test_target)
# print(222222222222, [t for t in goldalign if t[0] == test_src])
# fpstrict += 1
# fplax += 1
#
# return np.array([tpstrict, fpstrict, tplax, fplax], dtype=np.int32)
#
#
# def score_multiple(gold_list, test_list, value_for_div_by_0=0.0):
# # accumulate counts for all gold/test files
# pcounts = np.array([0, 0, 0, 0], dtype=np.int32)
# rcounts = np.array([0, 0, 0, 0], dtype=np.int32)
# for goldalign, testalign in zip(gold_list, test_list):
# pcounts += _precision(goldalign=goldalign, testalign=testalign)
# # recall is precision with no insertion/deletion and swap args
# test_no_del = [(x, y) for x, y in testalign if len(x) and len(y)]
# gold_no_del = [(x, y) for x, y in goldalign if len(x) and len(y)]
# rcounts += _precision(goldalign=test_no_del, testalign=gold_no_del)
#
# # Compute results
# # pcounts: tpstrict,fpstrict,tplax,fplax
# # rcounts: tpstrict,fnstrict,tplax,fnlax (gold/test are swapped, so unmatched gold links are false negatives)
#
# if pcounts[0] + pcounts[1] == 0:
# pstrict = value_for_div_by_0
# else:
# pstrict = pcounts[0] / float(pcounts[0] + pcounts[1])
#
# if pcounts[2] + pcounts[3] == 0:
# plax = value_for_div_by_0
# else:
# plax = pcounts[2] / float(pcounts[2] + pcounts[3])
#
# if rcounts[0] + rcounts[1] == 0:
# rstrict = value_for_div_by_0
# else:
# rstrict = rcounts[0] / float(rcounts[0] + rcounts[1])
#
# if rcounts[2] + rcounts[3] == 0:
# rlax = value_for_div_by_0
# else:
# rlax = rcounts[2] / float(rcounts[2] + rcounts[3])
#
# if (pstrict + rstrict) == 0:
# fstrict = value_for_div_by_0
# else:
# fstrict = 2 * (pstrict * rstrict) / (pstrict + rstrict)
#
# if (plax + rlax) == 0:
# flax = value_for_div_by_0
# else:
# flax = 2 * (plax * rlax) / (plax + rlax)
#
# result = dict(recall_strict=rstrict,
# recall_lax=rlax,
# precision_strict=pstrict,
# precision_lax=plax,
# f1_strict=fstrict,
# f1_lax=flax)
#
# return result
#
#
# def log_final_scores(res):
# print(' ---------------------------------', file=sys.stderr)
# print('| | Strict | Lax |', file=sys.stderr)
# print('| Precision | {precision_strict:.3f} | {precision_lax:.3f} |'.format(**res), file=sys.stderr)
# print('| Recall | {recall_strict:.3f} | {recall_lax:.3f} |'.format(**res), file=sys.stderr)
# print('| F1 | {f1_strict:.3f} | {f1_lax:.3f} |'.format(**res), file=sys.stderr)
# print(' ---------------------------------', file=sys.stderr)
#
#
# # gold_files = ["./resources/vecalign/bleualign_data/test0.defr", "./resources/vecalign/bleualign_data/test1.defr",
# # "./resources/vecalign/bleualign_data/test2.defr", "./resources/vecalign/bleualign_data/test3.defr",
# # "./resources/vecalign/bleualign_data/test4.defr", "./resources/vecalign/bleualign_data/test5.defr",
# # "./resources/vecalign/bleualign_data/test6.defr"]
# gold_files = ["./resources/vecalign/bleualign_data/test0.defr", "./resources/vecalign/bleualign_data/test1.defr",
# "./resources/vecalign/bleualign_data/test2.defr", "./resources/vecalign/bleualign_data/test3.defr",
# "./resources/vecalign/bleualign_data/test4.defr", "./resources/vecalign/bleualign_data/test5.defr",
# "./resources/vecalign/bleualign_data/test6.defr"]
# gold_list = [read_alignments(x) for x in gold_files]
#
# # test_files = [["./resources/vecalign/bleualign_data/test0.de", "./resources/vecalign/bleualign_data/test0.fr"],
# # ["./resources/vecalign/bleualign_data/test1.de", "./resources/vecalign/bleualign_data/test1.fr"],
# # ["./resources/vecalign/bleualign_data/test2.de", "./resources/vecalign/bleualign_data/test2.fr"],
# # ["./resources/vecalign/bleualign_data/test3.de", "./resources/vecalign/bleualign_data/test3.fr"],
# # ["./resources/vecalign/bleualign_data/test4.de", "./resources/vecalign/bleualign_data/test4.fr"],
# # ["./resources/vecalign/bleualign_data/test5.de", "./resources/vecalign/bleualign_data/test5.fr"],
# # ["./resources/vecalign/bleualign_data/test6.de", "./resources/vecalign/bleualign_data/test6.fr"],
# # ]
# test_files = [["./resources/vecalign/bleualign_data/test0.de", "./resources/vecalign/bleualign_data/test0.fr"],
# ["./resources/vecalign/bleualign_data/test1.de", "./resources/vecalign/bleualign_data/test1.fr"],
# ["./resources/vecalign/bleualign_data/test2.de", "./resources/vecalign/bleualign_data/test2.fr"],
# ["./resources/vecalign/bleualign_data/test3.de", "./resources/vecalign/bleualign_data/test3.fr"],
# ["./resources/vecalign/bleualign_data/test4.de", "./resources/vecalign/bleualign_data/test4.fr"],
# ["./resources/vecalign/bleualign_data/test5.de", "./resources/vecalign/bleualign_data/test5.fr"],
# ["./resources/vecalign/bleualign_data/test6.de", "./resources/vecalign/bleualign_data/test6.fr"],
# ]
# test_list = [getYasaAlign(tfiles[0], tfiles[1], outputFolderPath="./sandboxtmp/") for tfiles in test_files]
# res = score_multiple(gold_list=gold_list, test_list=test_list)
# log_final_scores(res)
### DISTRIBUTION OF ALIGNMENTS
# Tallies how often each alignment shape "<source count>:<target count>" occurs
# in the alignment files of one folder, then prints the tally.
import os
import sys

# Exactly one (path, elInCommon, sep) configuration is active at a time:
#   path       -- folder holding the alignment files; file names are appended to
#                 it verbatim, so it has to end with a path separator
#   elInCommon -- substrings of the wanted file names (only the first one is used)
#   sep        -- [separator between the two sides of a link,
#                  separator between the ids listed on one side]
# path = "/home/d/Documents/programming/workRali/019DemoPipeline/resources/vecalign/bleualign_data/"
# elInCommon = ["test", ".defr"]  # gold bleualign
# sep = [":", ", "]
path = "/home/d/Documents/programming/workRali/023bafAlignmentTestCorpora/baf-1.1/"
elInCommon = [".ind.cesalign"]  # gold baf
sep = [";", " "]
# path = "/home/d/Documents/programming/workRali/023bafAlignmentTestCorpora/alignedWithVecalign/"
# elInCommon = [".tsv"]  # vecalign align for baf corpus
# sep = ["\t", ","]
# path = "/home/d/Documents/programming/workRali/023bafAlignmentTestCorpora/alignedWithYasa/yasa/allArcade/"
# elInCommon = [".arcadeformat"]  # yasa align for baf corpus
# sep = [";", " "]

# cesAlign markup lines that are skipped outright (exact, whole-line matches)
CES_MARKUP_LINES = frozenset([
    """<!DOCTYPE CESALIGN PUBLIC "-//CES//DTD cesAlign//EN" []>\n""",
    """<CESALIGN VERSION="1.14">\n""",
    """<LINKLIST>\n""",
    """<LINKGRP>\n""",
    """</LINKGRP>\n""",
    """</LINKLIST>\n""",
    """</CESALIGN>\n""",
    """</CESALIGN>""",
])


def group_size(ids, id_sep):
    """
    count the ids listed on one side of an alignment link
    :param ids: the text of one side, already stripped of brackets and markup
    :param id_sep: the string separating two ids
    :return: the number of id_sep-separated fields, or 0 if the first field is empty
    """
    fields = ids.split(id_sep)
    return len(fields) if fields[0] != "" else 0


def count_alignment_shapes(lines, sep, counts=None, debug=False):
    """
    tally the "<source count>:<target count>" shape of every alignment link
    :param lines: iterable of the lines of one alignment file
    :param sep: [separator between the two sides, separator between ids of one side]
    :param counts: dict to update in place; a new {"t": 0} is used if None
    :param debug: if True, print the intermediate parsing steps of every link
    :return: the counts dict, one key per shape plus "t" for the total of links
    """
    if counts is None:
        counts = {"t": 0}
    side_sep, id_sep = sep[0], sep[1]
    for ln in lines:
        if ln in CES_MARKUP_LINES:
            continue
        spl = ln.split(side_sep)
        if len(spl) < 2:
            # no side separator: blank line or markup missing from CES_MARKUP_LINES,
            # there is no second side to read so the line cannot be a link
            continue
        if debug:
            print(spl)
        # left side: what follows the last double quote before the separator
        left = spl[0].replace("[", "").replace("]", "").split('"')[-1]
        # right side: what precedes the first double quote after the separator
        right = spl[1].replace("[", "").replace("]", "").replace("\n", "").split('"')[0]
        if debug:
            print(1111, repr(left), repr(right))
        nb_left = group_size(left, id_sep)
        nb_right = group_size(right, id_sep)
        if debug:
            print(22222, nb_left, nb_right)
        key = "{0}:{1}".format(nb_left, nb_right)
        counts[key] = counts.get(key, 0) + 1
        counts["t"] = counts.get("t", 0) + 1
    return counts


adict = {"t": 0}
if os.path.isdir(path):
    # os.listdir returns names in arbitrary order; sorting keeps the output reproducible
    files = ["{0}{1}".format(path, name) for name in sorted(os.listdir(path)) if (elInCommon[0] in name)]
else:
    print("alignment folder not found: {0}".format(path), file=sys.stderr)
    files = []
for filePath in files:
    with open(filePath) as alignFile:
        # debug=True keeps the per-link trace on stdout
        count_alignment_shapes(alignFile, sep, adict, debug=True)
print(adict)