"""
extract and normalize from raw data and save the resutls.
Amir Harati, April 2018
issues:
1- chunking is not used (e.g. new york would be new + york)
"""
import re
# keep only alphanumerics, whitespace, periods, and question marks
regex = re.compile(r'[^0-9a-zA-Z\s.?]')
outdir = "./data"
data = []
lines = [line.strip() for line in open("data/anna_karenina.txt")]
for line in lines:
    if len(line) > 0:
        # surround punctuation with spaces so it becomes a separate token
        line = line.replace('?', ' ? ')
        # drop ellipses, then isolate sentence-final periods
        line = line.replace('...', ' ')
        line = line.replace('..', ' ')
        line = line.replace('.', ' . ')
        line = line.lower()
        # strip every remaining non-whitelisted character
        nline = regex.sub('', line)
        data.append(nline.split())
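# example of the normalization above:
#     "Anna? She left..."  ->  "anna ?  she left"  ->  ['anna', '?', 'she', 'left']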
# flatten the per-line token lists into one long stream of tokens
data = [item for sublist in data for item in sublist]
line = ""
pdata = []
words = set()
chars = set()
for w in data:
if w != ".":
words.add(w)
for c in w:
chars.add(c)
line += w + " "
else:
pdata.append(line[0:-1])
line = ""
chars = sorted(chars)
words = sorted(words)
print("#chars:", len(chars))
print("#words:", len(words))
# prepend the special symbols: padding, sentence start, and sentence end
words = ["<PAD>", "<START>", "<EOS>"] + words
# for chars, additionally reserve the space character
chars = ["<PAD>", "<START>", "<EOS>", " "] + chars
words_to_ids = {w: i for i, w in enumerate(words)}
ids_to_words = {i: w for w, i in words_to_ids.items()}
chars_to_ids = {c: i for i, c in enumerate(chars)}
ids_to_chars = {i: c for c, i in chars_to_ids.items()}
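# with this ordering the special symbols always get the same low ids, e.g.
#     words_to_ids["<PAD>"] == 0, words_to_ids["<START>"] == 1, words_to_ids["<EOS>"] == 2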
# save the vocabulary mappings, one "token<TAB>id" pair per line
with open(outdir + "/annakarenina_word2id.txt", "w") as wif:
    for key, val in words_to_ids.items():
        wif.write(key + "\t" + str(val) + "\n")
with open(outdir + "/annakarenina_chars2id.txt", "w") as wif:
    for key, val in chars_to_ids.items():
        wif.write(key + "\t" + str(val) + "\n")
with open(outdir + "/annakarenina_text_data.txt", "w") as f:
for sen in pdata:
f.write(sen + "\n")
with open(outdir + "/annakarenina_charid_data.txt", "w") as f:
for sen in pdata:
ostr = ""
#le = [x for x in tweet]
for c in sen:
ostr = ostr + str(chars_to_ids[c]) + " "
f.write(ostr + "\n")
with open(outdir + "/annakarenina_wordid_data.txt", "w") as f:
for sen in pdata:
ostr = ""
for word in sen.split():
#print(word)
ostr = ostr + str(words_to_ids[word]) + " "
f.write(ostr + "\n")