-
Notifications
You must be signed in to change notification settings - Fork 38
/
Copy pathwordproblems.py
370 lines (295 loc) · 12.4 KB
/
wordproblems.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
from random import Random
from bisect import bisect_left, bisect_right
# Use binary search to determine if the given word is in the sorted wordlist.
def is_legal_word(word, words):
idx = bisect_left(words, word)
return idx < len(words) and words[idx] == word
# Compute a histogram of individual characters in words.
def histogram(words):
result = {}
for word in words:
for c in word:
result[c] = result.get(c, 0) + 1
return result
# Find all words that are find_palindromes.
def find_palindromes(words):
return [word for word in words if word == word[::-1]]
# Find all words that are a different word when read backwards.
def find_semordnilaps(words):
return [word for word in words if word != word[::-1] and is_legal_word(word[::-1], words)]
# Find all rotodromes, words that become other words when rotated.
def find_rotodromes(words):
# Define a nested helped function to perform the filtering.
def is_rotodrome(word):
for i in range(1, len(word)):
rotated_word = word[i:] + word[:i]
if word != rotated_word and is_legal_word(rotated_word, words):
return True
return False
return [word for word in words if is_rotodrome(word)]
# Find the "almost palindromes", words that become palindromes when
# one letter is tactically removed.
def find_almost_palindromes(words):
def is_almost_palindrome(word):
# Words that are already palindromes don't count.
if word != word[::-1]:
# Loop through the positions to remove a character.
for i in range(len(word)):
removed_word = word[:i] + word[i + 1:]
if removed_word == removed_word[::-1]:
return True
return False
return [word for word in words if len(word) > 2 and is_almost_palindrome(word)]
# Rotate the consonants of the text cyclically, keeping the rest of
# the characters as they are, and maintaining the capitalization of
# the individual characters. For example, "Ilkka" becomes "Iklka".
__cons = "bcdfghjklmnpqrstvwxyz"
__cons += __cons.upper()
def rotate_consonants(text, off=1):
# Find the positions of all consonants in text.
cons_pos = [i for (i, c) in enumerate(text) if c in __cons]
# Process the text one character at the time.
result, pos = '', 0
for (i, c) in enumerate(text):
if c in __cons:
# Location of the next consonant in the consonant list.
succ = (pos + off) % len(cons_pos)
# The consonant that comes into the current position i.
sc = text[cons_pos[succ]]
# Maintain the capitalization.
result += sc.upper() if c.isupper() else sc.lower()
# Next consonant and incoming consonant advance in lockstep.
# pos = (pos + 1) % len(cons_pos)
pos += 1
else:
# Take the character into result as is.
result += c
return result
# Find the words that contain at least three duplicated letters.
def triple_duplicates(words):
result = []
for word in words:
count = 0
prev = '$'
for c in word:
if c == prev:
count += 1
prev = '$'
else:
prev = c
if count > 2:
result.append(word)
return result
# How many words can be spelled out using only given characters?
def limited_alphabet(words, chars):
return [word for word in words if all(c in chars for c in word)]
# From Programming Praxis. Given text string and integer k,
# find and return the longest substring that contains at most
# k different characters inside it.
def longest_substring_with_k_chars(text, k=2):
# The k most recently seen characters mapped to the last
# position index of where they occurred.
last_seen = {}
len_, max_, maxpos = 0, 0, 0
for (i, c) in enumerate(text):
# If no conflict, update the last_seen dictionary.
if len(last_seen) < k or c in last_seen:
last_seen[c] = i
len_ += 1
if len_ > max_:
max_ = len_
maxpos = i - max_ + 1
else:
# Find the least recently seen character.
min_, minc = len(text), '$'
for cc in last_seen:
if last_seen[cc] < min_:
min_ = last_seen[cc]
minc = cc
# Remove it from dictionary...
last_seen.pop(minc)
# ... and bring the current character to its place.
last_seen[c] = i
len_ = i - min_
# Extract the longest found substring as the answer.
return text[maxpos:maxpos + max_]
# Given a sorted list of words and the first word, construct a
# word chain in which each word starts with the suffix of the
# previous word with the first k characters removed, for example
# ['grama', 'ramal', 'amala', 'malar', 'alarm'] for k = 1.
# Since words are sorted, we can use binary search algorithm to
# quickly find the sublist whose words start with the given prefix.
def word_chain(words, first, k=1, len_=3):
# Recursive algorithm to complete the given wordlist.
def backtrack(chain):
# If the wordlist is long enough, return it.
if len(chain) == len_:
return chain
# Extract the suffix of the last word of the wordlist.
suffix = chain[-1][k:]
# Extract the words that start with that suffix.
start = bisect_left(words, suffix)
end = bisect_right(words, suffix + k * 'z')
# Try out those words one at the time.
for idx in range(start, end):
word = words[idx]
if len(word) > len(chain[-1]) - k and word not in chain:
# Extend the wordlist with this word.
chain.append(word)
if backtrack(chain): # Solution found
return chain
# Remove that word and try the next one.
chain.pop()
return None
return backtrack(first)
# What words remain words after removing one character? Create and return
# a list whose i:th element is a dictionary of all such words of length
# i, mapped to the list of words of length i-1 that they can be turned
# into by removing one letter.
def remain_words(words):
result = [[], [x for x in words if len(x) == 1]]
word_length = 2
while True:
next_level, has_words = {}, False
for w in (x for x in words if len(x) == word_length):
shorter = []
for i in range(0, word_length - 1):
ww = w[:i] + w[i + 1:] # word with i:th letter removed
if ww in result[word_length - 1]:
shorter.append(ww)
if len(shorter) > 0:
next_level[w] = shorter
has_words = True
if has_words:
result.append(next_level)
word_length += 1
else:
return result
# Generate a table of all anagrams from the given word list.
# The first 26 prime numbers, one for each letter from a to z.
__primes = {'a': 2, 'b': 3, 'c': 5, 'd': 7, 'e': 11, 'f': 13, 'g': 17,
'h': 19, 'i': 23, 'j': 29, 'k': 31, 'l': 37, 'm': 41, 'n': 43,
'o': 47, 'p': 53, 'q': 59, 'r': 61, 's': 67, 't': 71, 'u': 73,
'v': 79, 'w': 83, 'x': 89, 'y': 97, 'z': 101}
def prime_code(word):
code = 1
for c in word:
code *= __primes.get(c, 1)
return code
def all_anagrams(words):
codes = {}
for word in words:
code = prime_code(word)
# All anagrams have the same prime code, thanks to the
# commutativity of integer multiplication combined with
# the Fundamental Theorem of Arithmetic that says every
# integer has exactly one prime possible factorization.
codes[code] = codes.get(code, []) + [word]
return codes
def __demo():
rng = Random(12345)
with open('words_sorted.txt', encoding="utf-8") as f:
words = [x.strip() for x in f]
print(f"Read in {len(words)} words.")
# Binary search can quickly find all words with given prefix.
for prefix in ['aor', 'jims', 'propo']:
result = []
idx = bisect_left(words, prefix)
while idx < len(words) and words[idx].startswith(prefix):
result.append(words[idx])
idx += 1
result = ", ".join(result)
print(f"\nWords that start with {prefix!r} are {result}.")
# How about finding all words that end with given suffix?
words_r = [word[::-1] for word in words]
words_r.sort()
for suffix in ["itus", "roo", "lua"]:
suffix = suffix[::-1]
result = []
idx = bisect_left(words_r, suffix)
while idx < len(words_r) and words_r[idx].startswith(suffix):
result.append(words_r[idx][::-1])
idx += 1
result.sort()
result = ", ".join(result)
print(f"\nWords that end with {suffix[::-1]!r} are {result}.")
hist = histogram(words).items()
hist = sorted(hist, key=lambda x: x[1], reverse=True)
print("\nHistogram of letters sorted by their frequencies:")
print(hist)
pals = find_palindromes(words)
print(f"\nThere are {len(pals)} palindromes. ", end="")
print("Some of them aresa.:")
print(", ".join(rng.sample(pals, 10)))
sems = find_semordnilaps(words)
print(f"\nThere are {len(sems)} semordnilaps. Some of them are:")
print(", ".join(rng.sample(sems, 10)))
almost = find_almost_palindromes(words)
print(f"\nThere are {len(almost)} almost palindromes. ", end="")
print("Some of them are:")
print(", ".join(rng.sample(almost, 10)))
print("\nLet us next look for some rotodromes.")
for n in range(2, 13):
rotos = find_rotodromes([word for word in words if len(word) == n])
print(f"There are {len(rotos)} rotodromes of length {n}.")
print(f"Some of these rotodromes are:")
print(f"{', '.join(rng.sample(rotos, min(10, len(rotos))))}.")
name = 'Donald Erwin Knuth'
print(f"\nSome consonant rotations of {name!r}.")
for off in range(-5, 6):
print(f"{off:2}: {rotate_consonants(name, off)}")
print("\n\nWords that contain only hexadecimal digits [a-f]:")
for word in limited_alphabet(words, "abcdef"):
print(word, end=' ')
print("\n\nWords that contain only vowels:")
for word in limited_alphabet(words, "aeiouy"):
print(word, end=' ')
print("\n\nWords spelled with upside down calculator:")
for word in limited_alphabet(words, "oieslbg"):
print(word.upper(), end=' ')
text = "ceterumautemcenseocarthaginemessedelendam"
print(f"\n\nThe text is '{text}'.")
print("Let's print out its longest substrings with k letters.")
for k in range(1, 16):
print(f"k = {k:2}: {longest_substring_with_k_chars(text, k)}")
print(f"\nHow about the longest 10-char substring of War and Peace? It is:")
with open('warandpeace.txt', encoding="utf-8") as wap:
text = " ".join(wap)
text.replace("\n", " ")
print(f"{longest_substring_with_k_chars(text, 10)}")
print("\nNext, some word chains of five-letter words.")
words5 = [word for word in words if len(word) == 5]
count, total = 0, 0
while count < 10:
total += 1
first = rng.choice(words5)
best = [first]
while len(best) < 5:
better = word_chain(words5, [first], 1, len(best) + 1)
if better:
best = better
else:
break
if len(best) > 3:
print(f"{first}: {best}")
count += 1
print(f"Found {count} word chains after trying {total} firsts.")
print("\nSome letter eliminations:")
elim_dict_list = remain_words(words)
start_words = list(elim_dict_list[8])
for n in range(10):
word = rng.choice(start_words)
while len(word) > 1:
print(word, end=" -> ")
word = rng.choice(elim_dict_list[len(word)][word])
print(word)
N, M = 7, 8
print(f"\nLet us compute all anagrams for the {N}-letter words.")
anagrams = all_anagrams(word for word in words if len(word) == N)
print(f"The anagram groups with {M} or more members are:\n")
# Note that anagrams is a dictionary that maps each prime codes
# to lists of words that all share that same prime code.
for code in (c for c in anagrams if len(anagrams[c]) >= M):
print(", ".join(anagrams[code]))
if __name__ == "__main__":
__demo()