This repository has been archived by the owner on Sep 12, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
input.py
68 lines (51 loc) · 1.74 KB
/
input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import re
from collections import OrderedDict
class input():
    """Character-trigram statistics collected from the words of a text.

    NOTE: the class name shadows the ``input`` builtin; it is kept unchanged
    so existing callers keep working.

    Attributes:
        words: Lower-cased tokens found in the text, sorted, duplicates kept.
        alphabet: Sorted list of the distinct characters occurring in
            ``words``.
        ngrams: ``OrderedDict`` sorted by key. Each key is a tuple of two
            consecutive characters; its value is the list of characters that
            followed that pair, one entry per occurrence.
    """

    def __init__(self, content):
        """Tokenize *content* and build the alphabet and the n-gram table."""
        self.__n = 3  # trigram
        self.words = self.__tokenize(content)
        self.alphabet = self.__characters()
        self.ngrams = self.__model()

    def __tokenize(self, content):
        """Return the sorted, lower-cased words found in *content*.

        A word starts and ends with a word character and may contain
        hyphens in between (e.g. ``well-known``).
        """
        tokens = re.findall(r'\w(?:[-\w]*\w)?', content)
        return sorted(token.lower() for token in tokens)

    def __characters(self):
        """Return the sorted list of distinct characters used in the words."""
        return sorted({char for word in self.words for char in word})

    def __model(self):
        """Map every (n-1)-character prefix to the characters following it.

        Words shorter than n characters contribute nothing because the
        window range below is empty for them.
        """
        bound = self.__n - 1
        ngrams = {}
        for word in self.words:
            for start in range(len(word) - bound):
                key = tuple(word[start:start + bound])
                ngrams.setdefault(key, []).append(word[start + bound])
        return OrderedDict(sorted(ngrams.items(), key=lambda item: item[0]))

    def countStartingWith(self, i):
        """Return 1 plus the number of words whose first character is *i*.

        The count starts at 1 for smoothing.
        """
        # Compare by value: identity (`is`) only matches strings that the
        # interpreter happens to share, which is not guaranteed.
        return 1 + sum(1 for word in self.words if word[:1] == i)

    def countPairs(self, i, j):
        """Return 1 plus how often character *j* followed the prefix *i*.

        *i* is a key of ``ngrams`` (a tuple of characters). The count starts
        at 1 for smoothing, so an unseen prefix yields 1.
        """
        followers = self.ngrams.get(i)
        if followers is None:
            return 1  # unseen prefix: smoothing value only
        # Value comparison, for the same reason as in countStartingWith.
        return 1 + sum(1 for value in followers if value == j)