-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_patterns.py
193 lines (155 loc) · 6.13 KB
/
find_patterns.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
from PyTib.common import open_file, write_file, write_csv, pre_process, clean_string
import os
import re
from collections import defaultdict
import pickle
import regex
def is_punct(string):
    """tells whether every character of string belongs to the known punctuation marks

    an empty string has no offending character, so it yields True as well
    """
    known_marks = ['༄', '༅', '༆', '༇', '༈', '།', '༎', '༏', '༐', '༑', '༔', '_']
    return all(symbol in known_marks for symbol in string)
def punct_view(prepared):
    """keeps all the punctuation and replaces every paragraph by its syllable count framed by dashes

    :param prepared: list mixing punctuation strings and (first_syl, syl_count, last_syl) tuples,
                     the shape preprocess() builds
    :return: list of strings: non-empty punctuation strings are kept unchanged, each tuple becomes
             its count surrounded on both sides by count // 2 dashes (5 gives '--5--');
             empty strings and elements of any other type are dropped
    """
    out = []
    for el in prepared:
        if isinstance(el, str):
            # empty strings carry no punctuation, leave them out
            if el:
                out.append(el)
        elif isinstance(el, tuple):
            # floor division keeps the dash count an int without a float round-trip
            dashes = '-' * (el[1] // 2)
            out.append('{}{}{}'.format(dashes, el[1], dashes))
    return out
def preprocess(string):
    """a list of punctuation and paragraphs: punctuations are kept as strings, paragraphs are summarised

    :param string: the text of a volume
    :return: list whose elements are either a punctuation string or, for a paragraph, the tuple
             (first syllable, number of syllables, last syllable)
    """
    segments = re.split(r'\n+', string)
    # a '-' is appended to every line ending with punctuation before the lines are glued together.
    # an empty segment (empty input, leading or trailing newline) has no last character to test,
    # so it is passed through unchanged instead of raising an IndexError.
    joined = ''.join([seg + '-' if seg and is_punct(seg[-1]) else seg for seg in segments])
    processed = []
    # NOTE(review): pre_process() comes from PyTib.common; it is presumably a tokenizer whose
    # 'words' mode yields the punctuation/paragraph chunks and 'syls' mode the syllables - confirm
    paragraphs = pre_process(joined, mode='words')
    for a in paragraphs:
        if a != '-' and not is_punct(a):
            # a paragraph: only keep its boundaries and its length
            syls = pre_process(a, mode='syls')
            par_len = len(syls)
            first_syl = syls[0]
            last_syl = syls[-1]
            processed.append((first_syl, par_len, last_syl))
        elif a != '-':
            # punctuation only: keep it as it is (the '-' separators themselves are dropped)
            processed.append(a)
    return processed
def create_missing_dir(path):
    """creates the folder designated in path (and its missing parents) if it does not exist

    :param path: folder to create
    :raises FileExistsError: if path already exists but is not a folder
    """
    # exist_ok avoids the gap between checking for the folder and creating it
    os.makedirs(path, exist_ok=True)
def write_output(output, out_path, output_type):
    """
    writes one text file per volume in the folder '<out_path>/<output_type>', creating it if needed
    :param output: dict of volume name -> list of strings; the strings are joined with single spaces
    :param out_path: parent folder of the output folder
    :param output_type: name of the containing folder and output file suffix
    :return: nothing, the files '<vol>_<output_type>.txt' are written to the corresponding folder
    """
    target_dir = '{}/{}'.format(out_path, output_type)
    create_missing_dir(target_dir)
    for vol_name in output:
        file_name = '{}/{}_{}.txt'.format(target_dir, vol_name, output_type)
        content = ' '.join(output[vol_name])
        write_file(file_name, content)
def find_punct_types(collection):
    """counts the overall frequency of each punct type for the whole collection

    :param collection: dict of volume name -> prepared list (punctuation strings and paragraph tuples)
    :return: defaultdict(int) of punctuation string -> number of occurrences over all volumes
    """
    types = defaultdict(int)
    # the volume names are irrelevant for the totals, only the contents are walked
    for prepared in collection.values():
        for el in prepared:
            # paragraphs are tuples; only the strings are punctuation
            if isinstance(el, str):
                types[el] += 1
    return types
def sorted_punct_types(types_dict, reverse=True):
    """returns the (punct, frequency) pairs of types_dict as a list ordered by frequency

    :param types_dict: dict of punctuation -> frequency
    :param reverse: True (default) puts the highest frequencies first, False the lowest
    """
    return sorted(types_dict.items(), key=lambda pair: pair[1], reverse=reverse)
def missing_dirs():
    """makes sure the working folders 'input', 'cache' and 'output' exist"""
    for folder in ('input', 'cache', 'output'):
        create_missing_dir(folder)
def prepare_collection(in_path):
    """
    reads and pre-processes every file found directly in in_path
    warning : the returned punctuation is the one processed by clean_string() and not the raw one.
    :param in_path: folder containing one text file per volume
    :return: dict of volume name (file name without its last extension) -> output of preprocess()
    """
    collection = {}
    for f in os.listdir(in_path):
        file_path = '{}/{}'.format(in_path, f)
        # sub-folders and hidden files (such as '.DS_Store') are not volumes
        if f.startswith('.') or not os.path.isfile(file_path):
            continue
        # only the last extension is removed: 'a.b.txt' stays 'a.b' instead of colliding with 'a.txt',
        # and '<vol_name>.txt' still designates the original file
        vol_name = os.path.splitext(f)[0]
        raw = open_file(file_path)
        raw = clean_string(raw, strip=True, single_spaces=True, tabs2spaces=True, spaces2same=True)
        # pre-processing
        collection[vol_name] = preprocess(raw)
    return collection
def collection_dots(collection):
    """applies punct_view() to every volume and returns the results under the same volume names"""
    return {vol_name: punct_view(content) for vol_name, content in collection.items()}
def open_prepared(in_path):
    """loads the pre-processed collection, from the cache when there is one

    the cache is the pickle file 'cache/<last component of in_path>_pre_processed.p'. if it is
    missing, the collection is built with prepare_collection() and the cache is written; the
    'cache' folder must already exist. an existing cache is never refreshed: delete the file
    to force a new pre-processing.
    :param in_path: folder containing the collection
    :return: what prepare_collection() returned, either now or when the cache was written
    """
    # without the rstrip, a trailing '/' would give an empty name shared by every collection
    cache_name = in_path.rstrip('/').split('/')[-1]
    cache_file = 'cache/{}_pre_processed.p'.format(cache_name)
    if os.path.isfile(cache_file):
        # NOTE: unpickling can execute arbitrary code, only load cache files written by this script
        with open(cache_file, 'rb') as cached:
            prepared_vols = pickle.load(cached)
    else:
        prepared_vols = prepare_collection(in_path)
        with open(cache_file, 'wb') as cached:
            pickle.dump(prepared_vols, cached)
    return prepared_vols
def full_text_conc(in_path, vol_name, punct):
    """searches the first occurrence of punct in the file '<in_path>/<vol_name>.txt'

    :param in_path: folder containing the volumes
    :param vol_name: file name of the volume without the '.txt' extension
    :param punct: punctuation string to look for, taken literally
    :return: the match object of the first occurrence, None if there is none
    """
    full_vol = open_file('{}/{}.txt'.format(in_path, vol_name))
    # punct is plain text and not a pattern: escape it so no character gets a regex meaning
    conc = regex.search(regex.escape(punct), full_vol)
    return conc
def punct_conc(punct, prepared, in_path):
    """collects the context of every occurrence of punct in the prepared volumes

    :param punct: punctuation string to look for
    :param prepared: dict of volume name -> prepared list (punctuation strings and paragraph tuples)
    :param in_path: folder containing the volumes, used for the full text search
    :return: list of (left, punct, right, full text match, volume name) tuples. left and right are
             the neighbouring elements, 'start' / 'end' at the borders of the volume; a neighbouring
             paragraph is shown as '--<count>--<last syl>' on the left and
             '<first syl>--<count>--' on the right
    """
    concs = []
    for vol_name, volume in prepared.items():
        if punct not in volume:
            continue
        # the full text concordance is the same for every occurrence in a volume:
        # read and search the file once per volume instead of once per occurrence
        full_conc = full_text_conc(in_path, vol_name, punct)
        last_index = len(volume) - 1
        for num, el in enumerate(volume):
            if el != punct:
                continue
            # index 0 is a valid left neighbour: only the very first element has none
            left = volume[num - 1] if num > 0 else 'start'
            right = volume[num + 1] if num < last_index else 'end'
            # if there is a tuple, format to have only the syllables around the punct
            if isinstance(left, tuple):
                left = '--{}--{}'.format(left[1], left[2])
            if isinstance(right, tuple):
                right = '{}--{}--'.format(right[0], right[1])
            concs.append((left, el, right, full_conc, vol_name))
    return concs
def concs_by_freq(prepared, in_path, all_puncts, frequency):
    """gathers the concordances of every punctuation type that occurs at most frequency times

    :param prepared: dict of volume name -> prepared list
    :param in_path: folder containing the volumes
    :param all_puncts: dict of punctuation -> frequency
    :param frequency: highest frequency (included) for a type to be kept
    :return: list of (punctuation, output of punct_conc() for it) tuples
    """
    return [
        (mark, punct_conc(mark, prepared, in_path))
        for mark, count in all_puncts.items()
        if count <= frequency
    ]
def main():
    """runs the whole pipeline: loads the collection, counts the punctuation types, writes the outputs"""
    in_path = '../derge-tengyur/derge-tengyur-tags'  # default is 'input'
    out_path = 'output'
    missing_dirs()

    # pre-processing (read from the cache when it exists)
    print('loading the collection')
    prepared_vols = open_prepared(in_path)

    # counting
    print('counting the punctuation types')
    punct_types = find_punct_types(prepared_vols)

    # the csv lists the types by decreasing frequency
    ranked_types = sorted_punct_types(punct_types)
    csv_file = '{}/total_types.csv'.format(out_path)
    write_csv(csv_file, ranked_types, header=['punct', 'frequency', 'to check'])

    # processing
    print('generating "with dots" data')
    dots = collection_dots(prepared_vols)
    write_output(dots, out_path, 'with_dots')

    # the concordances of the rare types are currently disabled:
    # concordances = concs_by_freq(prepared_vols, in_path, punct_types, 1)
    print('ok')


# only run the pipeline when the file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()