-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsandboxExtractFromData.py
111 lines (94 loc) · 4.11 KB
/
sandboxExtractFromData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import sys, math
sys.path.append(u'../utils')
sys.path.append(u'./utils')
import utilsOs, utilsString, b000path, b003heuristics
import re
from scipy.stats import pearsonr
# import pandas as pd
# import numpy as np
# Start marker for the run; it is handed back to utilsOs.countTime() at the
# end of the script to print the elapsed time.
startTime = utilsOs.countTime()

# Number of source lines processed over all files.
total = 0

# Accumulators that are only referenced by currently commented-out
# experiments (cognate overlap and source/target length differences); kept
# so those experiments can be re-enabled without touching this section.
cogn = {}
lenT = {}
lenC = {}

# Labels of the relative-position buckets (line index / number of lines).
locBuckets = ('0.0-0.19', '0.2-0.39', '0.4-0.59', '0.6-0.79', '0.8-1.0')

# tablMat[category][bucket] -> number of lines of that category found in
# that part of the document. Both rows are built from the same label tuple
# so they cannot drift apart.
tablMat = {category: dict.fromkeys(locBuckets, 0)
           for category in ('small', u'dashOrNb')}

# Per-category counters, incremented for lines whose reference annotation
# is not '1.0'. (The original first bound `count` to 0 and then rebound it
# to this dict; the dead integer assignment has been dropped.)
count = {'small': 0, 'dashOrNb': 0}

# Extra separators passed as second argument to utilsString.nltkTokenizer().
addSeparators = [u'.', u',', u':', u'/', u'-', u'h', u"''", u"'"]

# srcTrgtFiles = utilsOs.goDeepGetFiles(b000path.getBtFolderPath(flagFolder=u'a'), format=u'.tmx')
# Path prefixes of the samples; '.en' / '.fr' and the annotation file name
# are derived from them in the main loop.
srcTrgtFiles = [u'./002manuallyAnnotated/sample',
                u'./003negativeNaiveExtractors/000manualAnnotation/sample']

# Matches a single ASCII digit. NOTE(review): not used by the visible code.
nbrs = re.compile(r'[0-9]')
def _locationBucket(docLoc):
    """Return the tablMat column label for a relative document position.

    docLoc is the line index divided by the number of lines of the file.
    The thresholds and labels are the ones used by the tablMat rows.
    """
    for upperBound, label in ((0.2, '0.0-0.19'), (0.4, '0.2-0.39'),
                              (0.6, '0.4-0.59'), (0.8, '0.6-0.79')):
        if docLoc < upperBound:
            return label
    return '0.8-1.0'


for filePath in srcTrgtFiles:
    # The language pair found in the path decides which file is the source.
    if u'en-fr' in filePath:
        srcFilePath = u'{0}.en'.format(filePath)
        trgtFilePath = u'{0}.fr'.format(filePath)
    else:
        srcFilePath = u'{0}.fr'.format(filePath)
        trgtFilePath = u'{0}.en'.format(filePath)
    refFilePath = filePath.replace('sample', 'sampleAnnotation.tsv')

    # Only the file reads are guarded: a missing sample is reported and
    # skipped instead of being swallowed silently, and an unrelated error
    # raised while processing the lines is no longer hidden by the handler.
    # NOTE(review): files are opened with the platform default encoding.
    try:
        with open(srcFilePath) as srcFile:
            srcLines = srcFile.readlines()
        with open(trgtFilePath) as trgtFile:
            trgtLines = trgtFile.readlines()
        with open(refFilePath) as refFile:
            refLines = refFile.readlines()
    except FileNotFoundError as error:
        print(u'skipping {0}: {1}'.format(filePath, error), file=sys.stderr)
        continue

    # The three files are read as parallel: line i of each belongs together.
    nbSrcLines = len(srcLines)
    for srcLnIndex, srcLn in enumerate(srcLines):
        trgtLn = trgtLines[srcLnIndex]
        # relative position of the line inside its document
        docLoc = srcLnIndex / nbSrcLines
        bucket = _locationBucket(docLoc)
        # True when the reference annotation of this line is not '1.0'
        notAnnotatedOne = refLines[srcLnIndex] != u'1.0\n'

        # sizeSrc = len(srcLn)
        # sizeTrgt = len(trgtLn)
        # if abs(sizeSrc-sizeTrgt) not in lenC:
        #     lenC[abs(sizeSrc-sizeTrgt)] = 0
        # lenC[abs(sizeSrc-sizeTrgt)] += 1

        # Category 'dashOrNb': a number, a dash or a dot within the first
        # three characters of the raw source line.
        srcHead = srcLn[:3]
        if (len(utilsString.extractNumbersFromString(srcHead)) != 0
                or u'-' in srcHead or u'.' in srcHead):
            tablMat['dashOrNb'][bucket] += 1
            if notAnnotatedOne:
                # The original incremented count['small'] here (and
                # count['dashOrNb'] in the short-sentence branch); the keys
                # were crossed with respect to the tablMat rows.
                count[u'dashOrNb'] += 1

        srcLn = utilsString.nltkTokenizer(srcLn, addSeparators)
        trgtLn = utilsString.nltkTokenizer(trgtLn, addSeparators)

        # # compile the cognates of each token for the source and target
        # srcCognates = b003heuristics.getCognates(srcLn, 4)
        # trgtCognates = set(b003heuristics.getCognates(trgtLn, 4))
        # # get intersection of cognates
        # intersection = [cog for cog in srcCognates if cog in trgtCognates]
        # lenin = len(intersection)
        # if lenin not in cogn:
        #     cogn[lenin] = 0
        # cogn[lenin] += 1
        # sizeSrc = len(srcLn)
        # sizeTrgt = len(trgtLn)
        # if abs(sizeSrc-sizeTrgt) not in lenT:
        #     lenT[abs(sizeSrc-sizeTrgt)] = 0
        # lenT[abs(sizeSrc-sizeTrgt)] += 1

        # Category 'small': the tokenized source line has at most 4 items.
        if len(srcLn) <= 4:
            tablMat['small'][bucket] += 1
            if notAnnotatedOne:
                count[u'small'] += 1

        total += 1
# Disabled debug dumps of the experiment accumulators.
# print(111, total, lenC)
# print(222, total, lenT)
# print(333, total, cogn)
# print(444, tablMat)

# refLines is only bound once an annotation file has been read. When every
# input file was missing, report 0 instead of crashing with a NameError.
# The value is the line count of the last annotation file read, not a sum.
try:
    nbRefLines = len(refLines)
except NameError:
    nbRefLines = 0
print(nbRefLines, count)

# print the time the algorithm took to run
print(u'\nTIME IN SECONDS ::', utilsOs.countTime(startTime))