-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathredactor.py
97 lines (80 loc) · 3.94 KB
/
redactor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import re
from quotes_and_brackets import get_quotes
def remove_tech_line_breaks(text, glue=' ', line_ends=get_quotes() | set('.!?…:;-–—‒؟؛।。!?:;一፡።፣፤፥፦፧፨፠'), start_line=2):
    """Remove technical line breaks in the middle of sentences appearing as a result
    of converting text formats, often PDF to TXT.

    Every line is stripped of surrounding whitespace first. Blank lines after
    the first `start_line` lines are dropped.

    str `text` - text in which to remove technical line breaks;
    str `glue` - string inserted between two lines when they are joined;
    set/list/tuple/str `line_ends` - set of characters; if a paragraph ends with
        a character from this set, removing line break is not needed here;
    int `start_line` - number of leading lines that are copied as they are
        (only stripped) and are never joined to each other; the last of them
        may still receive the following lines;
    return str - text with technical line breaks removed.
    """
    lines = [line.strip() for line in text.split('\n')]
    # Slicing instead of popping keeps texts shorter than `start_line` from
    # raising IndexError; a negative value is treated as zero.
    start_line = max(start_line, 0)
    merged = lines[:start_line]
    # The quote set does not depend on the line, so fetch it only once.
    quotes = get_quotes()
    for line in lines[start_line:]:
        if not line:
            continue
        previous = merged[-1] if merged else ''
        # A new paragraph starts when there is nothing to continue (no previous
        # line or an empty one), when the previous line is already finished,
        # or when the current line opens with a quote.
        if not previous or previous[-1] in line_ends or line[0] in quotes:
            merged.append(line)
        else:
            merged[-1] = previous + glue + line
    return '\n'.join(merged)
def set_to_str(input_set):
    """Join the elements of `input_set`, taken in sorted order, into one string.

    set `input_set` - collection of strings (typically single characters);
    return str - the elements concatenated in ascending order.
    """
    ordered = list(input_set)
    ordered.sort()
    return ''.join(ordered)
def replace_by_regex(text, regex, old, new):
    """Replace old substring to new one inside the hits found by regular expression.

    Only the parts of the text actually matched by `regex` are changed; equal
    substrings located elsewhere in the text stay untouched.

    str `text` - text to process;
    str `regex` - string with regular expression for searching hits;
    str `old` - string to be replaced inside the hits found by regex;
    str `new` - target replacement;
    return: str - text with replacements in the hits.
    """
    # re.sub rewrites each matched span in place. A findall + str.replace
    # approach would also rewrite identical substrings that the regex did not
    # match, and would break on patterns with capture groups (findall then
    # returns the groups instead of the whole hit), hence group(0) here.
    return re.sub(regex, lambda match: match.group(0).replace(old, new), text)
def recover_apostrophes_by_lang(text, lang, replace_inner=False):
    """Replace typewriter apostrophe "'" to punctuation apostrophe "’" in the text.

    str `text` - text in which to recover apostrophes;
    str `lang` - ISO code of the language of the text; 'en' and 'fr' have
        dedicated rules, for 'be' and 'uk' every apostrophe inside a word is
        recovered, any other code changes nothing unless `replace_inner` is set;
    bool `replace_inner` - specifies if to recover all the apostrophes located inside
        words, i.e. surrounded by letter characters; applied in addition to
        the language rules;
    return str - text with apostrophes recovered.
    """
    if lang == 'en':
        # Whole words that contain an apostrophe.
        words = ("I'm", "o'clock", "ma'am", "Qur'an", "ne'er-do-well", "'twere")
        for word in words:
            for variant in (word, word.upper(), word.capitalize()):
                # Lookarounds instead of `\b`: a word boundary in front of a
                # leading apostrophe ("'twere") would demand a preceding word
                # character, so the word could never match at its start.
                regex = rf"(?<!\w){re.escape(variant)}(?!\w)"
                text = replace_by_regex(text, regex, '\'', '’')
        # Contracted endings attached to the preceding word.
        endings = ("'s", "'ll", "'d", "'ve", "'re", "n't")
        for ending in endings:
            for variant in (ending, ending.upper()):
                text = replace_by_regex(text, rf"\w{variant}\b", '\'', '’')
    elif lang == 'fr':
        # One-element tuple: the trailing comma matters, otherwise the loop
        # would walk over the characters of the string.
        words = ("aujourd'hui",)
        for word in words:
            for variant in (word, word.upper(), word.capitalize()):
                regex = rf"\b{variant}\b"
                text = replace_by_regex(text, regex, '\'', '’')
        # Elided forms standing right before the next word.
        elisions = ('c', 'd', 'j', 'l', 'n', 'qu', 't', 'm', 's',
                    'quelqu', 'quoiqu', 'lorsqu', 'puisqu')
        for elision in elisions:
            for variant in (elision, elision.upper(), elision.capitalize()):
                regex = rf"\b{variant}'\w"
                text = replace_by_regex(text, regex, '\'', '’')
    # A separate `if` (not `elif`) so that `replace_inner` is honoured for
    # English and French texts as well.
    if replace_inner or lang in ('be', 'uk'):
        text = replace_by_regex(text, r"\w+'\w+", '\'', '’')
    return text
def get_number_intervals_by_dash(text, dash):
    """Get number intervals of the form "123–456" with a particular dash.

    At most one space is allowed on each side of the dash.

    str `text` - text where to search the number intervals;
    str `dash` - character of dash; it is matched literally;
    return set - set of strings from the text representing number intervals
        with dashes.
    """
    # Raw f-string keeps the regex escapes valid; re.escape makes sure a dash
    # that happens to be a regex metacharacter is not interpreted as one.
    pattern = rf'\d+ ?{re.escape(dash)} ?\d+'
    return set(re.findall(pattern, text))