-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlist-words.py
148 lines (120 loc) · 3.65 KB
/
list-words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# ------------------------------------------------------------------------------
# Python Scripts scriptarium/[list-words.py]
# (c) balarabe@protonmail.com
# ------------------------------------------------------------------------------
# list-words lists the words found in every text file in the current
# folder and its subfolders, leaving out words that appear in the
# dictionary files (if any) named on the command line.
import re
from typing import List
from sys import argv as sys_argv
from constants import text_file_exts
from functions import list_files
def split_words(s: str) -> List[str]:
    """Split an identifier-like string into its component words.

    A new word starts at an uppercase letter that directly follows a
    lowercase letter or a digit (``helloWorld`` -> ``hello``, ``World``).
    Any character that is not uppercase, lowercase or a digit (as judged
    by ``str.isupper``/``str.islower``/``str.isdigit``), such as ``_``,
    ends the current word and is discarded. A run of uppercase letters is
    never split, so ``HTTPServer`` is returned as a single word.

    Args:
        s: The text to split.

    Returns:
        The words in order of appearance; an empty list when ``s``
        contains no uppercase letters, lowercase letters or digits.
    """
    # class of the previous character:
    # 'u' upper, 'l' lower, 'd' digit, '' separator or start of string
    mode = ''
    ar = []
    w = ''
    for c in s:
        if c.isupper():
            if mode in ('l', 'd'):
                # lower/digit -> upper boundary: close the word, start a new one
                ar.append(w)
                w = c
            else:
                w += c
            mode = 'u'
        elif c.islower():
            mode = 'l'
            w += c
        elif c.isdigit():
            mode = 'd'
            w += c
        else:
            # separator: flush the word collected so far (if any)
            mode = ''
            if w != '':
                ar.append(w)
            w = ''
    if w != '':
        ar.append(w)
    return ar
def test_split_words():
    """Self-check for split_words().

    Runs split_words() on a few fixed inputs and compares the result
    with the expected word list. Produces no output when every case
    passes.

    Raises:
        Exception: on the first case whose result differs from the
            expected list; the message shows the input, the actual
            result and the expected result.
    """
    test_cases = [
        ('Hello', ['Hello']),
        ('HelloWorld', ['Hello', 'World']),
        ('helloWorld', ['hello', 'World']),
        ('yard_number', ['yard', 'number']),
    ]
    for text, want in test_cases:
        have = split_words(text)
        if have != want:
            raise Exception(
                f'split_words(\'{text}\') -> {have} expected: {want}'
            )
# run the self-check on every start; it raises if split_words() is broken
test_split_words()
#-------------------------------------------------------------------------------
# blank lines ahead of this run's output
print('\n'*10)
# Known words, loaded from the dictionary files named on the command line.
# Used as a set: every key is a word and every value is True.
dictionary = {}
if len(sys_argv) > 1:
    # sys_argv[0] is the script itself; each later argument is a dictionary file
    for fname in sys_argv[1:]:
        with open(fname, mode='r', encoding='utf-8') as fl:
            for s in fl.read().splitlines():
                # drop everything from the first '#' onward, then trim
                i = s.find('#')
                if i != -1:
                    s = s[:i]
                s = s.strip()
                if s == '':
                    continue
                if s in dictionary:
                    print('duplicate:', s)
                else:
                    dictionary[s] = True
                # also register the all-uppercase form of the word
                # NOTE(review): nesting reconstructed from unindented source;
                # confirm this should also run when `s` was a duplicate
                u = s.upper()
                if u != s:
                    if u in dictionary:
                        print('duplicate:', u)
                    else:
                        dictionary[u] = True
    # NOTE(review): assumed to be printed only when dictionary files were
    # given; confirm against the original layout
    print('loaded dictionary')
# a "word" is a maximal run of ASCII letters, digits and underscores
word_pattern = re.compile(r'[0-9A-Za-z_]+')
for fname in list_files('.'):
    # skip file types not listed in text_file_exts
    if not any(fname.endswith(ext) for ext in text_file_exts):
        continue
    print('\n'*2 + '-'*80 + '\n' + fname)
    with open(fname, mode='r', encoding='utf-8') as fl:
        s = fl.read()
    # distinct words of this file, in sorted order
    words = sorted(set(word_pattern.findall(s)))
    for ln in words:
        # ignore numbers and anything else starting with a digit
        if ln[0].isdigit():
            continue
        # known as written, or with its first letter lowercased
        if ln in dictionary:
            continue
        if ln[:1].lower()+ln[1:] in dictionary:
            continue
        # known if every component word (e.g. of a camelCase or snake_case
        # name) is in the dictionary, as written or with its first letter
        # lowercased; a name with no component words counts as known
        if all(
            (w in dictionary) or (w[:1].lower()+w[1:] in dictionary)
            for w in split_words(ln)
        ):
            continue
        print(ln)
# explicit exit; quit() is provided by the site module for interactive
# use and is not guaranteed to exist when running as a program
raise SystemExit
# end of script body
# end