-
Notifications
You must be signed in to change notification settings - Fork 8
/
word_dict.py
63 lines (53 loc) · 1.75 KB
/
word_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""
A Python library for building word n-gram dictionaries.
Written by Ye Kyaw Thu
(Current post: Visiting Professor, NECTEC, Thailand)
Last updated: 1 Sept 2021
Reference: https://www.geeksforgeeks.org/python-handling-recursion-limit/
"""
import os
import sys
import tempfile
import numpy as np
from collections import defaultdict
from pylab import *
import pickle
import math
import functools
# Recursion limit added by Ye: raise the interpreter-wide limit to 10**6
# (see the "Reference" link in the module docstring).
# NOTE(review): this is a global side effect at import time, and none of the
# functions visible in this part of the file recurse -- presumably it is
# needed by code elsewhere; confirm before removing.
sys.setrecursionlimit(10**6)
def count_bigram(file, bigram_dict_txt, bigram_dict_bin, *, encoding="utf-8"):
    """Count adjacent word pairs in a text file and save the counts.

    Every line of ``file`` is split on whitespace and each pair of
    neighbouring words on that line is counted. Pairs never span a line
    break, and lines with fewer than two words contribute nothing.

    Args:
        file: Path of the input text file.
        bigram_dict_txt: Output path for the text dump. One line per bigram,
            formatted as ``str((word1, word2))<TAB>count``, in first-seen
            order.
        bigram_dict_bin: Output path for the pickled dictionary.
        encoding: Text encoding used for the input file and the text dump.
            Defaults to UTF-8 so results do not depend on the platform
            locale.

    Returns:
        The ``defaultdict(int)`` mapping ``(word1, word2)`` tuples to counts
        (the same object that is pickled).

    Raises:
        OSError: If the input cannot be read or an output cannot be written.
    """
    bigram = defaultdict(int)

    # Read the input before touching any output file, so a missing or
    # unreadable input no longer leaves a truncated dictionary behind.
    with open(file, "r", encoding=encoding) as fh:
        for line in fh:
            # split() without arguments already discards the trailing newline.
            words = line.split()
            # zip stops at the shorter sequence, so 0- or 1-word lines yield
            # no pairs.
            for pair in zip(words, words[1:]):
                bigram[pair] += 1

    # Context managers guarantee the handles are closed even if a write fails.
    with open(bigram_dict_txt, "w", encoding=encoding) as out_txt:
        out_txt.writelines(
            f"{key}\t{value}\n" for key, value in bigram.items()
        )

    # Write the binary dictionary.
    with open(bigram_dict_bin, "wb") as out_bin:
        pickle.dump(bigram, out_bin)

    return bigram
def count_unigram(file, unigram_dict_txt, unigram_dict_bin, *, encoding="utf-8"):
    """Count whitespace-separated words in a text file and save the counts.

    Args:
        file: Path of the input text file.
        unigram_dict_txt: Output path for the text dump. One line per word,
            formatted as ``word<TAB>count``, in first-seen order.
        unigram_dict_bin: Output path for the pickled dictionary.
        encoding: Text encoding used for the input file and the text dump.
            Defaults to UTF-8 so results do not depend on the platform
            locale.

    Returns:
        The ``defaultdict(int)`` mapping each word to its count (the same
        object that is pickled).

    Raises:
        OSError: If the input cannot be read or an output cannot be written.
    """
    unigram = defaultdict(int)

    # Read the input before touching any output file, so a missing or
    # unreadable input no longer leaves a truncated dictionary behind.
    with open(file, "r", encoding=encoding) as fh:
        for line in fh:
            # split() without arguments already discards the trailing newline.
            for word in line.split():
                unigram[word] += 1

    # Context managers guarantee the handles are closed even if a write fails.
    with open(unigram_dict_txt, "w", encoding=encoding) as out_txt:
        out_txt.writelines(
            f"{key}\t{value}\n" for key, value in unigram.items()
        )

    # Write the binary dictionary.
    with open(unigram_dict_bin, "wb") as out_bin:
        pickle.dump(unigram, out_bin)

    return unigram