# convert_all_datasets.py
# Forked from huggingface/torchMoji.
from __future__ import print_function
import json
import math
import pickle
import sys
from io import open
import numpy as np
from os.path import abspath, dirname
sys.path.insert(0, dirname(dirname(abspath(__file__))))
from torchmoji.word_generator import WordGenerator
from torchmoji.create_vocab import VocabBuilder
from torchmoji.sentence_tokenizer import SentenceTokenizer, extend_vocab, coverage
from torchmoji.tokenizer import tokenize
# Python 2/3 compatibility shim: Python 3 has no ``unicode`` builtin, so the
# name is aliased to ``str`` there. It is used further down when decoding the
# dataset texts.
try:
    unicode  # Python 2
except NameError:
    unicode = str  # Python 3

# True when running under Python 2. ``sys.version_info`` is the structured
# version tuple, so no string parsing of ``sys.version`` is needed.
IS_PYTHON2 = sys.version_info[0] == 2
# Names of the benchmark datasets to convert. Each name is used as a
# sub-directory of DIR when the per-dataset paths are built.
DATASETS = [
    'Olympic',
    'PsychExp',
    'SCv1',
    'SCv2-GEN',
    'SE0714',
    # 'SE1604',  # Excluded due to Twitter's ToS
    'SS-Twitter',
    'SS-Youtube',
]

# Root directory of the datasets, relative to the working directory.
DIR = '../data'

# Per-dataset file names: the raw input pickle, followed by the three output
# pickles written by the conversion loop.
FILENAME_RAW = 'raw.pickle'
FILENAME_OWN = 'own_vocab.pickle'
FILENAME_OUR = 'twitter_vocab.pickle'
FILENAME_COMBINED = 'combined_vocab.pickle'
def roundup(x, multiple=10):
    """Round ``x`` up to the nearest multiple of ``multiple``.

    Values that already are an exact multiple are returned unchanged
    (as an ``int``).

    Args:
        x: Number to round up.
        multiple: Step to round up to. Defaults to 10, which is the original
            behaviour of this helper.

    Returns:
        The smallest multiple of ``multiple`` that is >= ``x``.

    Raises:
        ZeroDivisionError: If ``multiple`` is 0.
    """
    # float() forces true division on Python 2 as well; int() is needed there
    # because math.ceil returns a float on Python 2.
    return int(math.ceil(x / float(multiple))) * multiple
def format_pickle(dset, train_texts, val_texts, test_texts, train_labels, val_labels, test_labels):
    """Bundle one dataset's splits into the dict that gets pickled.

    Args:
        dset: Value stored under the ``'dataset'`` key.
        train_texts: Value stored under ``'train_texts'``.
        val_texts: Value stored under ``'val_texts'``.
        test_texts: Value stored under ``'test_texts'``.
        train_labels: Value stored under ``'train_labels'``.
        val_labels: Value stored under ``'val_labels'``.
        test_labels: Value stored under ``'test_labels'``.

    Returns:
        A new dict with exactly the seven keys listed above. The arguments
        are stored as-is (no copies are made).
    """
    return {
        'dataset': dset,
        'train_texts': train_texts,
        'val_texts': val_texts,
        'test_texts': test_texts,
        'train_labels': train_labels,
        'val_labels': val_labels,
        'test_labels': test_labels,
    }
def convert_dataset(filepath, extend_with, vocab):
    """Tokenize the current dataset and pickle the result to ``filepath``.

    Besides its arguments, this function reads the module-level names
    ``maxlen``, ``texts``, ``labels``, ``data`` and ``dset``. The conversion
    loop at the bottom of this script assigns all of them before each call,
    so the function must not be called before they exist.

    Args:
        filepath: Destination path of the output pickle.
        extend_with: Forwarded unchanged as the ``extend_with`` keyword of
            ``SentenceTokenizer.split_train_val_test``.
            NOTE(review): presumably the number of dataset-specific tokens
            added to the vocabulary -- confirm against torchmoji.
        vocab: Vocabulary passed to ``SentenceTokenizer``.

    Returns:
        None. The result is written to ``filepath`` and progress is printed.
    """
    print('-- Generating {} '.format(filepath))
    sys.stdout.flush()

    st = SentenceTokenizer(vocab, maxlen)
    # The split is driven by the index lists stored in the raw pickle.
    tokenized, dicts, _ = st.split_train_val_test(
        texts,
        labels,
        [data['train_ind'], data['val_ind'], data['test_ind']],
        extend_with=extend_with)

    pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2],
                         dicts[0], dicts[1], dicts[2])

    # pickle.dump writes bytes, so the file has to be opened in binary mode.
    # Text mode ('w') fails with io.open on both Python 2 and Python 3.
    with open(filepath, 'wb') as f:
        pickle.dump(pick, f)

    # Coverage is reported for the test split only.
    cover = coverage(tokenized[2])
    print(' done. Coverage: {}'.format(cover))
# Load the vocabulary that is passed to the 'twitter_vocab' and
# 'combined_vocab' conversions below.
with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

# Convert every dataset. The names assigned in this loop (dset, data, texts,
# labels, maxlen) are module-level and are read by convert_dataset().
for dset in DATASETS:
    print('Converting {}'.format(dset))

    PATH_RAW = '{}/{}/{}'.format(DIR, dset, FILENAME_RAW)
    PATH_OWN = '{}/{}/{}'.format(DIR, dset, FILENAME_OWN)
    PATH_OUR = '{}/{}/{}'.format(DIR, dset, FILENAME_OUR)
    PATH_COMBINED = '{}/{}/{}'.format(DIR, dset, FILENAME_COMBINED)

    # SECURITY NOTE: unpickling can execute arbitrary code. Only run this
    # script on raw.pickle files obtained from a trusted source.
    with open(PATH_RAW, 'rb') as dataset:
        if IS_PYTHON2:
            data = pickle.load(dataset)
        else:
            data = pickle.load(dataset, fix_imports=True)

    # Decode data
    # NOTE(review): on Python 3 ``unicode`` is ``str``; if the pickle yields
    # bytes objects, str(x) does not raise UnicodeDecodeError, so the
    # fallback below would not be taken -- confirm the type of data['texts'].
    try:
        texts = [unicode(x) for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    # NOTE(review): ``wg`` and ``vb`` are not referenced again in this script.
    wg = WordGenerator(texts)
    vb = VocabBuilder(wg)
    vb.count_all_words()

    # Calculate max length of sequences considered
    # Adjust batch_size accordingly to prevent GPU overflow
    # (80th percentile of the token counts, rounded up by roundup()).
    lengths = [len(tokenize(t)) for t in texts]
    maxlen = roundup(np.percentile(lengths, 80.0))

    # Extract labels
    labels = [x['label'] for x in data['info']]

    # Three outputs per dataset: an empty starting vocabulary extended with up
    # to 50000 entries, the loaded vocabulary without extension, and the
    # loaded vocabulary extended with up to 10000 entries.
    # NOTE(review): the same ``vocab`` object is reused for every dataset; if
    # the tokenizer extends it in place, later datasets would see tokens added
    # for earlier ones -- confirm against SentenceTokenizer.
    convert_dataset(PATH_OWN, 50000, {})
    convert_dataset(PATH_OUR, 0, vocab)
    convert_dataset(PATH_COMBINED, 10000, vocab)