-
Notifications
You must be signed in to change notification settings - Fork 3
/
find-max-frequent-words-with-trie.py
74 lines (61 loc) · 1.74 KB
/
find-max-frequent-words-with-trie.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
class Trie:
def __init__(self, parent=None):
self.parent = parent
self.children = {}
self.n = 0
def add(self, c):
return self.children.setdefault(c, Trie(self))
def letter(self, child):
for c in self.children:
if self.children[c] == child:
return c
return None
def spell(self):
if self.parent:
return self.parent.spell() + self.parent.letter(self)
return ""
def wrap(text):
"""iterates through text, yielding lowercase letter or single ' '"""
inword = False
for c in text:
if str(c).isalpha():
yield c
inword = True
else:
if inword:
yield " "
inword = False
if inword:
yield " "
def pairmax(text):
"""finds the word pair with maximum occurrence"""
root = word2 = Trie()
word1 = None
best = Trie()
for c in wrap(text):
if c == " ":
if word1:
word2.n += 1
word2 = word1.add(" ")
word1 = root
else:
word1 = root
word2 = word2.add(" ")
else:
word2 = word2.add(c)
if word2.n > best.n:
best = word2
if word1:
word1 = word1.add(c)
print(word2.n)
if word2.n > best.n:
best = word2
return best.spell(), best.n
if __name__ == "__main__":
from urllib.request import urlopen
import sys
urls = ["http://www.rfc-editor.org/rfc/rfc19%02d.txt" %
i for i in range(30, 40)]
for url in urls:
word, n = pairmax(urlopen(url).read())
sys.stdout.write('%s : %dx "%s"\n' % (url, n, word))