forked from drdhaval2785/samasasplitter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
split.py
264 lines (255 loc) · 8.94 KB
/
split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
# -*- coding: utf-8 -*-
"""
python split.py aDigrahaRa MD
or
python split.py batchprocess/input.txt MW batchprocess/output.txt
"""
import sys, re
import codecs
import string
import datetime
import itertools
from lxml import etree
from io import StringIO, BytesIO
from math import log
import transcoder
# Helper used by the progress/debug messages.
def timestamp():
    """Return the current local date and time as a ``datetime.datetime``."""
    now = datetime.datetime.now()
    return now
def triming(lst):
    """Return a new list with surrounding whitespace stripped from each member.

    ``lst`` itself is left untouched; every member must provide ``strip()``.
    """
    return [entry.strip() for entry in lst]
def preparation(inputfile, translit='deva'):
    """Read a word list from ``inputfile`` and return the words ready for splitting.

    The file is read as UTF-8 and split on whitespace. Each word is passed
    through ``transcoder.transcoder_processString(word, translit, 'slp1')``,
    then every character outside ``A-Za-z`` is removed. Words that end up
    empty are skipped.

    Args:
        inputfile: Path of the UTF-8 text file holding the input words.
        translit: Name of the source transliteration scheme handed to the
            transcoder. Defaults to ``'deva'``, which is what was previously
            hard-coded regardless of this argument.

    Returns:
        List of cleaned words, in file order.
    """
    # ``with`` guarantees the handle is released even if decoding fails.
    with codecs.open(inputfile, 'r', 'utf-8') as infile:
        # split() with no argument already discards surrounding whitespace,
        # so no separate trimming pass is needed.
        inputwords = infile.read().split()
    output = []
    for word in inputwords:
        word = transcoder.transcoder_processString(word, translit, 'slp1')
        word = re.sub('[^A-Za-z]', '', word)
        if word:
            output.append(word)
    return output
def sanhw2():
    """Parse ``../CORRECTIONS/sanhw2/sanhw2.txt`` into headword records.

    Every non-blank line is expected to look like
    ``aMSakalpanA:CAE;4,CCS;4,MD;4,MW;21,PD;50,PW;9`` -- a headword, a colon,
    then comma-separated ``DICT;lnum`` entries.

    Returns:
        List of ``(word, dicts, lnums)`` tuples in file order, where ``dicts``
        is the list of dictionary codes (``['CAE', 'CCS', ...]``) and
        ``lnums`` the parallel list of line numbers, kept as strings
        (``['4', '4', ...]``).

    Raises:
        IndexError / ValueError: if a non-blank line does not follow the
            format above.
    """
    # ``with`` closes the file even when a malformed line raises.
    with codecs.open('../CORRECTIONS/sanhw2/sanhw2.txt', 'r', 'utf-8') as fin:
        lines = fin.readlines()
    output = []
    for line in lines:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) carry no record.
            continue
        fields = line.split(':')  # ['aMSakalpanA', 'CAE;4,CCS;4,...']
        word = fields[0]
        dicts = []
        lnums = []
        for entry in fields[1].split(','):  # 'CAE;4'
            dictcode, lnum = entry.split(';')
            dicts.append(dictcode)
            lnums.append(lnum)
        output.append((word, dicts, lnums))
    return output
def createhwlist(dictname):
    """Write the headwords found in dictionary ``dictname`` to ``dicts/hwsorted.txt``.

    The records returned by ``sanhw2()`` are ordered by the number of
    dictionaries that list the headword, then by headword length, both
    descending. Headwords longer than one character that occur in
    ``dictname`` are written one per line, and their count is printed.

    Args:
        dictname: Dictionary code to filter on (e.g. ``'MW'``).

    The parsed data is kept in a local variable. Rebinding the module-level
    name ``sanhw2`` to the parsed list would replace the loader function and
    make any later call to it (including a second call to this function) fail.
    """
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print("Creating headword data of sanhw2.txt")
    entries = sorted(sanhw2(), key=lambda x: (len(x[1]), len(x[0])), reverse=True)
    hw = [hword for (hword, dicts, lnums) in entries
          if len(hword) > 1 and dictname in dicts]
    # ``with`` flushes and closes the output even if a write fails.
    with codecs.open('dicts/hwsorted.txt', 'w', 'utf-8') as fout:
        for hword in hw:
            fout.write(hword + "\n")
    print(len(hw))
    print("Created headword data of sanhw2.txt")
def readwords(dictionary):
    """Return the whitespace-separated tokens of the file at path ``dictionary``.

    Args:
        dictionary: Path of a text file holding one or more words separated
            by any whitespace.

    Returns:
        List of tokens in file order.
    """
    # ``with`` closes the handle promptly instead of leaving it to the
    # garbage collector.
    with open(dictionary) as fin:
        return fin.read().split()
def startingpatterns(words):
    """Collect the slices ``word[2:x]`` of every word, for ``x`` in ``range(len(word))``.

    For each word this yields one entry per index ``x`` from 0 up to
    ``len(word) - 1``; the entries for ``x <= 2`` are empty strings.
    Results for all words are concatenated in input order.
    """
    return [word[2:stop] for word in words for stop in range(len(word))]
def readmwkey2():
    """Load the known ``word:split`` pairs from ``dicts/mw2.txt``.

    Each non-blank line must contain exactly one colon separating a word
    from its recorded split. Surrounding whitespace on the line is ignored.

    Returns:
        List of ``(word, split)`` tuples in file order.

    Raises:
        ValueError: if a non-blank line does not contain exactly one colon.
    """
    # ``with`` closes the file even when a malformed line raises.
    with codecs.open('dicts/mw2.txt', 'r', 'utf-8') as fin:
        lines = fin.readlines()
    pairs = []
    for line in lines:
        line = line.strip()
        if not line:
            # Blank lines (e.g. at the end of the file) carry no pair.
            continue
        word, split = line.split(':')
        pairs.append((word, split))
    return pairs
def unique(lst):
    """Return the members of ``lst`` with duplicates dropped, first occurrence kept.

    Membership is tested with ``in`` against the result list, so members
    only need to support equality comparison (they need not be hashable).
    """
    kept = []
    for candidate in lst:
        if candidate in kept:
            continue
        kept.append(candidate)
    return kept
# Asked the procedure at http://stackoverflow.com/questions/34108900/optionally-replacing-a-substring-python
def permut(word, lstrep, dictionary):
    """Generate spelling variants of ``word`` by optionally expanding single characters.

    Args:
        word: The string to expand.
        lstrep: Iterable of ``(character, alternatives)`` pairs. ``word`` is
            scanned one character at a time, so only single-character keys
            can ever match.
        dictionary: Known words. Any iterable works; a ``set`` or
            ``frozenset`` is used as-is, which lets a caller build it once
            instead of paying for a copy on every call.

    One alternative is chosen for every replaceable position and all such
    combinations are tried (``itertools.product``), so the work grows
    multiplicatively with the number of replaceable characters. Within a
    combination the alternatives are applied left to right; the alternative
    for position ``j`` is applied only when

    * the first two entries and the last entry of the working copy still
      equal those of ``word``, and
    * ``word[:j]`` followed by the first character of the alternative is
      itself a known word.

    Returns:
        The distinct variants, shortest first; variants of equal length are
        ordered alphabetically so that the result is reproducible from run
        to run.
    """
    if isinstance(dictionary, (set, frozenset)):
        dictset = dictionary
    else:
        dictset = set(dictionary)
    # make substitution list a dict for easy lookup
    lstrep_map = dict(lstrep)
    # a substitution is an index plus a string to substitute. build
    # list of subs [[(index1, sub1), (index1, sub2)], ...] for all
    # characters in lstrep_map.
    subs = [
        [(i, sub) for sub in lstrep_map[c]]
        for i, c in enumerate(word)
        if c in lstrep_map
    ]
    head = word[0:2]
    last = word[-1:]
    # build output by applying each sub recorded
    out = set()
    for combo in itertools.product(*subs):
        # make input a list for easy substitution
        letters = list(word)
        for j, cc in combo:
            if ''.join(letters[0:2]) == head and letters[-1] == last:
                if word[0:j] + cc[0] in dictset:
                    letters[j] = cc
        out.add(''.join(letters))
    return sorted(out, key=lambda variant: (len(variant), variant))
def permut1(word, lstrep, dictionary):
    """Return the spelling variants of ``word``; alias of ``permut``.

    This function used to be a line-for-line copy of ``permut``. It now
    delegates so the two cannot drift apart; arguments and return value are
    exactly those of ``permut(word, lstrep, dictionary)``.
    """
    return permut(word, lstrep, dictionary)
# Ordered (pattern, replacement) pairs applied one after another by
# deduplicate(); later pairs see the result of earlier ones.
replas = [
    ('kk','k'), ('kK','K'), ('gg','g'), ('gG','G'), ('NN','N'),
    ('cc','c'), ('cC','C'), ('jj','j'), ('jJ','J'), ('YY','Y'),
    ('ww','w'), ('wW','W'), ('qq','q'), ('qQ','Q'), ('RR','R'),
    ('tt','t'), ('tT','T'), ('dd','d'), ('dD','D'), ('nn','n'),
    ('pp','p'), ('pP','P'), ('bb','b'), ('bB','B'), ('mm','m'),
    ('yy','y'), ('rr','r'), ('ll','l'), ('vv','v'),
    ('SS','S'), ('zz','z'), ('ss','s'), ('hh','h'),
    ('y','i'), ('y','I'), ('v','u'), ('v','U'),
]
def deduplicate(word):
    """Apply every ``replas`` pair to ``word`` in table order and return the result.

    Each pair replaces all occurrences of its pattern via ``str.replace``
    before the next pair is considered.
    """
    result = word
    for pattern, replacement in replas:
        result = result.replace(pattern, replacement)
    return result
# (ending, replacement) pairs tried by determ(), in this order.
term = [('A','a'),('I','i'),('AH','a'),('AH','as'),('aH','as'),('H',''),('m',''),('M',''),('O',''),('I','a'),('e','a')]
def determ(word):
    """Return every form of ``word`` obtained by swapping a known ending.

    For each ``(ending, replacement)`` pair in ``term``, in table order, a
    word that ends with ``ending`` contributes one candidate in which that
    ending is replaced by ``replacement``.

    No up-front filter on the last character is applied: the per-pair
    suffix check already decides whether a pair applies, and a filter
    listing only ``A H I m M O`` kept the ``('e', 'a')`` pair from ever
    being used.

    Args:
        word: The string to examine.

    Returns:
        List of candidates in table order; empty when no ending matches.
    """
    output = []
    for (ending, replacement) in term:
        if word.endswith(ending):
            output.append(word[:len(word) - len(ending)] + replacement)
    return output
#http://stackoverflow.com/questions/8870261/how-to-split-text-without-spaces-into-list-of-words/11642687#11642687
def infer_spaces(s,dictionary):
    """Use dynamic programming to infer where ``s`` should be split into words.

    The segmentation with the lowest total cost is returned with its parts
    joined by ``+``. Word costs are read from the module-level mapping
    ``wordcost``; a piece that is not in it costs ``9e999`` (infinity).
    Pieces longer than the module-level ``maxword`` are never considered.
    Both names must be set before this function is called. The
    ``dictionary`` argument is accepted but not used.
    """
    def best_match(end):
        # Cheapest way to finish a segmentation of s[:end], as a pair
        # (total_cost, length_of_last_piece). Assumes ``cost`` is already
        # filled in for every shorter prefix. Pieces are tried shortest
        # first, and min() on the pairs prefers the shorter piece on ties.
        earliest = max(0, end - maxword)
        return min(
            (cost[begin] + wordcost.get(s[begin:end], 9e999), end - begin)
            for begin in range(end - 1, earliest - 1, -1)
        )

    # cost[n] is the minimal cost of segmenting the first n characters.
    cost = [0]
    for end in range(1, len(s) + 1):
        best_cost, _ = best_match(end)
        cost.append(best_cost)

    # Walk back from the end, peeling off the last piece of the best
    # segmentation each time.
    pieces = []
    position = len(s)
    while position > 0:
        best_cost, length = best_match(position)
        assert best_cost == cost[position]
        pieces.append(s[position - length:position])
        position -= length
    pieces.reverse()
    return "+".join(pieces)
if __name__=="__main__":
    # Command line driver.
    #   python split.py <word>                      split one word (dicts/md.txt)
    #   python split.py <word> <dict>               split one word (dicts/<dict>.txt)
    #   python split.py <infile> <dict> <outfile>   batch mode: one
    #       "input:result:code" line is written to <outfile> per input word.
    # Result codes: 1 = word is a known pair, 2 = best segmentation is the
    # word itself, 3 = only the unsplit word survived filtering, 4 = nothing
    # survived filtering, 5 = a split was found (the first candidate is written).
    # Single-argument print(...) is used throughout because it produces the
    # same output under Python 2 and Python 3.
    debug = 1
    # Per-character alternatives handed to permut().
    lstrep = [
        ('A',('A','aa','aA','Aa','AA','As')),
        ('I',('I','ii','iI','Ii','II')),
        ('U',('U','uu','uU','Uu','UU')),
        ('F',('F','ff','fx','xf','Fx','xF','FF')),
        ('e',('e','ea','ai','aI','Ai','AI')),
        ('o',('o','oa','au','aU','Au','AU','aH','aHa','as')),
        ('E',('E','ae','Ae','aE','AE')),
        ('O',('O','ao','Ao','aO','AO')),
        ('ar',('af','ar')),
        ('d',('t','d')),
        ('H',('H','s')),
        ('S',('S','s','H')),
        ('M',('m','M')),
        ('y',('y','i','I')),
        ('N',('N','M')),
        ('Y',('Y','M')),
        ('R',('R','M')),
        ('n',('n','M')),
        ('m',('m','M')),
        ('v',('v','u','U')),
        ('r',('r','s','H')),
    ]
    if len(sys.argv) < 2:
        # Without an input there is nothing to split; stop before the
        # dictionaries are loaded instead of failing later on an undefined name.
        sys.exit('usage: python split.py <word|inputfile> [dictionary] [outputfile]')
    dictionary = 'dicts/md.txt'
    if len(sys.argv) > 2:
        dictionary = 'dicts/'+sys.argv[2]+'.txt'
    # Batch mode is selected by exactly three arguments.
    batch = len(sys.argv) == 4
    outfile = codecs.open(sys.argv[3],'w','utf-8') if batch else None
    try:
        if batch:
            inputwords = preparation(sys.argv[1])
        else:
            inputwords = [sys.argv[1]]
        if debug == 1:
            print('Reading knownpairs %s' % timestamp())
        knownpairs = readmwkey2()
        # Set of known words: constant-time lookup per input word.
        knownwords = set(known for (known, _) in knownpairs)
        if debug == 1:
            print('Calculating costs of dictionary headwords %s' % timestamp())
        # words, wordcost and maxword are module-level names read by infer_spaces().
        words = readwords(dictionary)
        logsize = log(len(words))
        wordcost = dict((k, log((i+1)*logsize)) for i,k in enumerate(words))
        if debug == 1:
            print('Calculated costs of dictionary headwords %s' % timestamp())
        maxword = max(len(x) for x in words)
        if debug == 1:
            print('Calculated maxword %s' % timestamp())
        # Built once so permut() does not have to re-scan the word list per word.
        wordset = set(words)
        # Splits with a single-letter member are discarded, except for the
        # letters listed in the character classes.
        lone_inner = re.compile('[+][^AsmMH][+]')
        lone_final = re.compile('[+][^mMsH]{1}$')
        for inputword in inputwords:
            if inputword in knownwords:
                if batch:
                    outfile.write(inputword+':'+inputword+':1\n')
                continue
            if '+' not in infer_spaces(inputword,dictionary):
                if batch:
                    outfile.write(inputword+':'+inputword+':2\n')
                continue
            perm = [inputword]
            perm += permut(inputword,lstrep,wordset)
            print('valid permutations are %d' % len(perm))
            print(timestamp())
            output = [infer_spaces(mem,dictionary) for mem in perm]
            # Fewest parts first; sorted() is stable, so ties keep perm order.
            output = sorted(output,key=lambda x:x.count('+'))
            output = [member for member in output
                      if not lone_inner.search(member) and not lone_final.search(member)]
            output = unique(output)
            if output == [inputword]:
                if batch:
                    outfile.write(inputword+':'+inputword+':3\n')
                print('%s 3' % inputword)
            elif not output:
                if batch:
                    outfile.write(inputword+':'+inputword+':4\n')
                print('%s 4' % inputword)
            else:
                if batch:
                    outfile.write(inputword+':'+output[0]+':5\n')
                print('%s 5' % (output[0:5],))
        if debug == 1:
            print(timestamp())
    finally:
        # Flush and release the result file even if processing fails midway.
        if outfile is not None:
            outfile.close()