-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathtextutil.py
129 lines (104 loc) · 3.59 KB
/
textutil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import re
from itertools import islice
def window(words_seq, n):
    """Yield every run of ``n`` consecutive items from ``words_seq``.

    The runs overlap, each one starting one item after the previous one:

        s -> (s0, s1, ..., s[n-1]), (s1, s2, ..., sn), ...

    Each run is produced as a tuple. For ``n >= 1``, an input holding fewer
    than ``n`` items produces no runs at all.
    """
    stream = iter(words_seq)
    # Prime the first frame with the leading n items.
    frame = tuple(islice(stream, n))
    if len(frame) == n:
        yield frame
    # Slide by one: drop the oldest item, append the next one.
    for item in stream:
        frame = (*frame[1:], item)
        yield frame
def remove_repeating_char(text):
    """Collapse runs of one repeated character down to two copies.

    A character followed by one or more copies of itself is rewritten as
    exactly two of that character; everything else is left alone. ``.`` does
    not match a newline here, so runs of newlines are never shortened.
    """
    repeated_run = r'(.)\1+'
    # r'\1\1' keeps two copies of the character; r'\1' would keep only one.
    keep_two = r'\1\1'
    return re.sub(repeated_run, keep_two, text)
def process_text(text, n=1,
                 remove_vowel_marks=False,
                 remove_repeated_chars=False,
                 ):
    """Clean ``text`` and split it into word n-gram features.

    Args:
        text: Input string; it is tokenized on whitespace.
        n: Gram size. With ``n`` below 2 the plain tokens are returned.
        remove_vowel_marks: When true, pass the text through
            ``remove_diacritics`` before tokenizing.
        remove_repeated_chars: When true, pass the text through
            ``remove_repeating_char`` before tokenizing.

    Returns:
        A list of strings: the tokens themselves, or, for ``n >= 2``, every
        run of ``n`` consecutive tokens joined by single spaces. Only grams
        of exactly size ``n`` are returned (smaller sizes are not included),
        so a text with fewer than ``n`` tokens gives an empty list.
    """
    clean_text = text
    if remove_vowel_marks:
        clean_text = remove_diacritics(clean_text)
    if remove_repeated_chars:
        clean_text = remove_repeating_char(clean_text)

    tokens = clean_text.split()
    if n < 2:
        return tokens
    # Only the size-n grams are part of the result, so build them in a single
    # pass instead of generating (and discarding) every smaller gram size.
    return [' '.join(gram) for gram in window(tokens, n)]
# Matches any single Arabic diacritic mark (or the tatwil elongation
# character). The marks are written as \u escapes rather than typed literally:
# most of them are combining characters that render attached to neighbouring
# text, which makes them hard to see and easy to corrupt when the file is
# edited or copied. With re.VERBOSE, whitespace and "# ..." comments inside
# the pattern are ignored.
arabic_diacritics = re.compile(r"""
    \u0651 |  # Tashdid
    \u064E |  # Fatha
    \u064B |  # Tanwin Fath
    \u064F |  # Damma
    \u064C |  # Tanwin Damm
    \u0650 |  # Kasra
    \u064D |  # Tanwin Kasr
    \u0652 |  # Sukun
    \u0640    # Tatwil/Kashida
""", re.VERBOSE)
def remove_diacritics(text):
    """Return ``text`` with every match of ``arabic_diacritics`` deleted."""
    return arabic_diacritics.sub('', text)
def load_data(data_file, label, n):
    """Read a one-document-per-line text file and featurize every line.

    Prints a short progress message naming ``label`` before reading.

    Args:
        data_file: Path of a UTF-8 encoded text file.
        label: Label attached to every document read from the file.
        n: Gram size forwarded to ``process_text``.

    Returns:
        A ``(data, data_features)`` pair. ``data`` holds one
        ``(features, label)`` tuple per non-blank line; ``data_features`` is
        the concatenation of all those per-line feature lists.
    """
    print('read {} data ...'.format(label))
    data_features = []
    data = []
    # The context manager closes the file even if processing a line fails.
    with open(data_file, encoding='utf-8') as infile:
        for line in infile:
            if not line.strip():
                continue  # skip blank / whitespace-only lines
            text_features = process_text(line, n)
            data_features += text_features
            data.append((text_features, label))
    return data, data_features
def load_tsv(data_file, n):
    """Read a ``label<TAB>text`` file and featurize the text of every row.

    Args:
        data_file: Path of a UTF-8 encoded file with one row per line, the
            label and the text separated by a tab.
        n: Gram size forwarded to ``process_text``.

    Returns:
        A ``(data, data_features)`` pair. ``data`` holds one
        ``(features, label)`` tuple per non-blank row; ``data_features`` is
        the concatenation of all those per-row feature lists.

    Raises:
        ValueError: If a non-blank line contains no tab character.
    """
    data_features = []
    data = []
    # The context manager closes the file even if a row fails to parse.
    with open(data_file, encoding='utf-8') as infile:
        for line in infile:
            if not line.strip():
                continue  # skip blank / whitespace-only lines
            # Split on the first tab only, so tabs inside the text stay part
            # of the text instead of breaking the two-way unpacking.
            label, text = line.split('\t', 1)
            text_features = process_text(text, n)
            data_features += text_features
            data.append((text_features, label))
    return data, data_features
def document_features(document, corpus_features):
    """Map each corpus feature to whether it occurs in ``document``.

    The result has one ``'has(<feature>)'`` key per entry of
    ``corpus_features``; its boolean value tells whether that entry is among
    the items of ``document``.
    """
    present = set(document)  # set membership instead of scanning the document
    return {
        'has({})'.format(feature): feature in present
        for feature in corpus_features
    }
def read_data(data_file, label):
    """Read the non-blank lines of a text file, pairing each with ``label``.

    Args:
        data_file: Path of a UTF-8 encoded text file, one document per line.
        label: Label recorded for every line that is kept.

    Returns:
        A ``(text_data, labels)`` pair of equally long lists. ``text_data``
        holds the kept lines unmodified (line endings included); ``labels``
        holds ``label`` once per kept line.
    """
    # The context manager closes the file once all lines have been read.
    with open(data_file, encoding='utf-8') as infile:
        # Lines that are empty or whitespace-only are dropped.
        text_data = [line for line in infile if line.strip()]
    labels = [label] * len(text_data)
    return text_data, labels
def read_tsv(data_file):
    """Read a ``label<TAB>text`` file into parallel text and label lists.

    Args:
        data_file: Path of a UTF-8 encoded file with one row per line, the
            label and the text separated by a tab.

    Returns:
        A ``(text_data, labels)`` pair of equally long lists, one entry per
        non-blank row. Each text keeps whatever followed the first tab,
        including the line ending.

    Raises:
        ValueError: If a non-blank line contains no tab character.
    """
    text_data = []
    labels = []
    # The context manager closes the file even if a row fails to parse.
    with open(data_file, encoding='utf-8') as infile:
        for line in infile:
            if not line.strip():
                continue  # skip blank / whitespace-only lines
            # Split on the first tab only, so tabs inside the text stay part
            # of the text instead of breaking the two-way unpacking.
            label, text = line.split('\t', 1)
            text_data.append(text)
            labels.append(label)
    return text_data, labels
def remove_diacritics(text):
    """Strip the marks matched by ``arabic_diacritics`` from ``text``."""
    # NOTE: this repeats the remove_diacritics definition found earlier in
    # this module; being the later one, it is the binding left in effect.
    return arabic_diacritics.sub('', text)
def remove_repeating_char(text):
    """Shorten every run of a repeated character to two occurrences."""
    # NOTE: this repeats the remove_repeating_char definition found earlier in
    # this module; being the later one, it is the binding left in effect.
    # Replacing with r'\1' instead of r'\1\1' would keep a single occurrence.
    return re.compile(r'(.)\1+').sub(r'\1\1', text)