harvest_bigram.py
# -*- coding: utf-8 -*-
# (C) 2016-2021 Muthiah Annamalai
#
# This file is part of 'open-tamil' package
# We generate unigram and bi-gram statistics for Tamil texts
#
import tamil
from ngram.LetterModels import Unigram
import codecs
import pprint
import copy
import operator
from functools import cmp_to_key
import sys
import glob
import os
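# Note (assumption): `tamil` and `ngram` are modules shipped with the open-tamil
# project (typically installed via `pip install open-tamil`); this script assumes
# they are importable on the current PYTHONPATH.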


def print_tamil_words_by_frequency(frequency, fp=None):
    # sort words by descending order of occurrence
    if not fp:
        fp = sys.stdout
    fp.write(u"# unique words = %d\n" % (len(frequency)))
    fp.write(u"# sorted in Frequency order\n")
    fp.write(u"freqsort_data = [\n")
    for k, v in sorted(frequency.items(), key=operator.itemgetter(1), reverse=True):
        fp.write(u"[u'%s',%g],\n" % (k, v))
    fp.write(u"]\n")
    fp.write(u"#" * 80 + u"\n")
    fp.write(u"# sorted in Tamil order\n")
    fp.write(u"alphasort_data = [\n")
    for word in sorted(
        frequency.keys(), key=cmp_to_key(tamil.utf8.compare_words_lexicographic)
    ):
        k, v = word, frequency[word]
        fp.write(u"[u'%s',%g],\n" % (k, v))
    fp.write(u"]\n")
    return
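# Note: the file written above is itself valid Python source; it defines two
# lists, freqsort_data and alphasort_data, whose entries have the form
# [u'<letter sequence>', <probability>], e.g. [u'தமி',0.00123] (illustrative
# value only, not taken from any real corpus).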


def get_prob(data):
    # convert raw counts into probabilities, adjusting so that
    # every symbol gets a non-zero probability (simple smoothing)
    delta = 1e9
    data2 = copy.copy(data)
    s = 0.0
    nzeros = 0
    for k, v in data2.items():
        s += float(v)
        if v == 0:
            nzeros += 1
        elif v < delta:  # non-zero and smaller than the current minimum
            delta = v
    # delta now holds the lowest non-zero frequency; zero-count symbols get half of it
    delta = float(delta) / 2.0
    if nzeros > 0:
        s = s + delta * nzeros
        print(u"n-zeros = %d,%g" % (nzeros, delta / s))
    for k, v in data2.items():
        if data2[k] == 0:
            data2[k] = delta
        data2[k] = float(data2[k]) / s
    # fudge: fold any floating-point residue into the last symbol so the
    # probabilities sum to 1.0
    eps = sum(data2.values()) - 1.0
    data2[k] = data2[k] - eps
    return data2
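# Illustrative example with assumed toy counts (not from any corpus): for
# {'அ': 3, 'இ': 1, 'உ': 0} the smallest non-zero count is 1, so delta = 0.5,
# the adjusted total is 4.5, and get_prob() returns roughly 0.667, 0.222 and
# 0.111 respectively, all non-zero and summing to ~1.0.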


def proc_stats(data, filename):
    with codecs.open(filename, "w", "utf-8") as fp:
        data_as_prob = get_prob(data)
        print_tamil_words_by_frequency(data_as_prob, fp)
    return


def get_stats():
    obj = Unigram("out-tamil-words.txt")
    obj.frequency_model()
    with codecs.open("ta_data_freq.txt", "w", "utf-8") as fp:
        pprint.pprint(obj.letter, stream=fp)
    proc_stats(obj.letter, u"ta_data_freq2.txt")
    return


class BigramHash(Unigram):
    def __init__(self, filename):
        Unigram.__init__(self, filename)
        self.bigram = dict()
        self.bigram_filename = filename

    def frequency_model(self):
        """Build a letter bigram frequency model for Tamil letters from a corpus."""
        # walk the corpus letter-by-letter via the generator
        letters = list(self.corpus.next_tamil_letter())
        if len(letters) < 2:
            print("WARNING: file {0} is too small".format(self.bigram_filename))
            return
        prev_letter = letters[0]
        for next_letter in letters[1:]:
            # update the bigram frequency for each adjacent pair of letters
            key = prev_letter + next_letter
            self.bigram[key] = self.bigram.get(key, 0) + 1
            prev_letter = next_letter
        return
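# For instance, given the text "தமிழ்" the letter generator is expected to
# yield த, மி and ழ், so the bigram keys recorded above would be the
# concatenated pairs "தமி" and "மிழ்", each counted once.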


def proc_stats2(data, outputfile):
    # proc_stats() already normalizes the counts via get_prob()
    proc_stats(data, outputfile)


def get_stats2(filename, prior=None):
    # build a bigram model for one file, seeded with the counts accumulated so far
    obj = BigramHash(filename)
    if prior:
        obj.bigram.update(prior.bigram)
    obj.frequency_model()
    return obj


def run(parent, outputfile):
    # accumulate bigram counts across every *.word file under parent,
    # then write the normalized, sorted statistics to outputfile
    x = None
    for filename in glob.glob(os.path.join(parent, "*.word")):
        x = get_stats2(filename, x)
    proc_stats2(x.bigram, outputfile)
    return x


if __name__ == "__main__":
    run('plain_text', 'pm_bigram_sorted_042521.txt')
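# Usage sketch (assuming a plain_text/ directory of *.word corpus files sits
# next to this script): running `python harvest_bigram.py` chains the bigram
# counts of every file through the `prior` argument of get_stats2(), then
# writes the frequency-sorted and Tamil-lexicographically-sorted probabilities
# to pm_bigram_sorted_042521.txt.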