utils.py
import os
import json
import shutil
from collections import Counter
import torch
import numpy as np
from datasets import load_dataset

def log_gradient_norm(model, writer, step, mode, norm_type=2):
    """Writes the total norm of the model parameters' gradients to TensorBoard."""
    total_norm = 0.0
    for p in model.parameters():
        if p.requires_grad and p.grad is not None:
            param_norm = p.grad.data.norm(norm_type)
            total_norm += param_norm.item() ** norm_type
    total_norm = total_norm ** (1.0 / norm_type)
    writer.add_scalar(f"Gradient/{mode}", total_norm, step)
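
# Illustrative usage sketch (not part of the original training loop): the norm is
# typically logged after loss.backward() and before optimizer.step(). The names
# `model`, `writer` (a torch.utils.tensorboard SummaryWriter), `loss`, `optimizer`
# and `global_step` are assumed to exist in the caller's training code.
#
#     loss.backward()
#     log_gradient_norm(model, writer, step=global_step, mode="train")
#     optimizer.step()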


def save_checkpoint(model, start_time, epoch):
    """Saves the specified model checkpoint."""
    target_dir = os.path.join("checkpoints", str(start_time))
    os.makedirs(target_dir, exist_ok=True)
    # Save model weights
    save_path = os.path.join(target_dir, f"model_{epoch}.pth")
    torch.save(model.state_dict(), save_path)
    print("Model saved to:", save_path)
    # Save the model configuration and source files alongside the weights (only once per run)
    if not os.path.exists(os.path.join(target_dir, "config.json")):
        shutil.copy("config.json", os.path.join(target_dir, "config.json"))
        shutil.copy("classifier.py", os.path.join(target_dir, "classifier.py"))
        shutil.copy("transformer.py", os.path.join(target_dir, "transformer.py"))
        shutil.copy("utils.py", os.path.join(target_dir, "utils.py"))


def process_subset(subset, separator="\t"):
    """Processes the given subset.

    Extracts the input tokens (words) and labels for each sequence in the subset.
    Forms a representation string for each sample in the following format:
        SEQ_LEN [SEPARATOR] INPUT_TOKENS [SEPARATOR] LABELS
    where:
        SEQ_LEN - Number of tokens in that particular sequence
        INPUT_TOKENS - Input tokens separated by the @separator
        LABELS - Integer labels separated by the @separator
    """
    processed_subset = []
    max_len = 0
    for sample in subset:
        # Load and process tokens
        tokens = sample["tokens"]
        tokens = [token.strip() for token in tokens]
        # Load and process NER tags
        ner_tags = sample["ner_tags"]
        ner_tags = [str(tag) for tag in ner_tags]
        sample_size = len(tokens)
        max_len = max(max_len, sample_size)
        processed_sample = f"{sample_size}{separator}"
        processed_sample += separator.join(tokens + ner_tags) + "\n"
        processed_subset.append(processed_sample)
    return processed_subset
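
# For example, a (hypothetical) three-token sample such as
#     {"tokens": ["EU", "rejects", "call"], "ner_tags": [3, 0, 0]}
# is serialized with the default tab separator into the single line:
#     3\tEU\trejects\tcall\t3\t0\t0\n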


def save_subset(subset, dataset_dir, subset_name):
    """Saves a processed subset to the desired dataset directory.

    Arguments:
        subset: Subset to save
        dataset_dir (str): Dataset directory to which the subset is saved
        subset_name (str): Name of the subset
    """
    if subset_name not in ["train", "validation", "test"]:
        raise ValueError(
            "Subset name invalid! Expected: train, validation or test but received {}".format(subset_name)
        )
    save_path = os.path.join(dataset_dir, "{}.txt".format(subset_name))
    with open(save_path, "w", encoding="utf8") as f:
        f.writelines(subset)


def download_dataset(dataset_dir):
    """Downloads the CoNLL2003 dataset from the HuggingFace Hub
    and saves the separate subsets into the dataset directory.

    Arguments:
        dataset_dir (str): Directory to which the dataset subsets are saved
    """
    # Download the dataset from the HuggingFace Hub
    DATASET_NAME = "conll2003"
    dataset_group = load_dataset(DATASET_NAME)
    os.makedirs(dataset_dir, exist_ok=True)
    # Extract subsets of data
    train_set = dataset_group["train"]
    valid_set = dataset_group["validation"]
    test_set = dataset_group["test"]
    # Perform subset processing
    train_set_processed = process_subset(train_set)
    valid_set_processed = process_subset(valid_set)
    test_set_processed = process_subset(test_set)
    # Save processed subsets
    save_subset(train_set_processed, dataset_dir, "train")
    save_subset(valid_set_processed, dataset_dir, "validation")
    save_subset(test_set_processed, dataset_dir, "test")
    print("\nDataset downloaded and processed.")
    return train_set, valid_set, test_set
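
# After download_dataset(dataset_dir) runs, dataset_dir contains train.txt,
# validation.txt and test.txt with one processed sample per line, while the raw
# HuggingFace subsets are returned for further processing (e.g. vocabulary creation).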


def create_vocabulary(train_set, vocab_size):
    """Creates a vocabulary out of the training set tokens.

    Arguments:
        train_set: CoNLL2003 train_set from HuggingFace
        vocab_size (int): Maximum number of tokens in the vocab
    Returns:
        vocab (dict): Vocabulary of the most frequent tokens in the training set
            key: token
            value: ordinal number of the token in the vocabulary
    """
    all_tokens = []
    for token_subseq in train_set["tokens"]:
        all_tokens += token_subseq
    # Perform some pre-processing of the tokens
    all_tokens_lower = list(map(str.lower, all_tokens))
    all_tokens_strip = list(map(str.strip, all_tokens_lower))
    # Count the occurrence of every word
    counter = Counter(all_tokens_strip)
    # Extract vocab_size - 2 tokens since we also define special tokens for padding
    # and for words which aren't present in the training set
    most_frequent = counter.most_common(vocab_size - 2)
    # Initialize the vocabulary with the special tokens
    vocab = {
        "UNK": 0,
        "PADD": 1
    }
    ind = len(vocab)
    # Populate the vocab
    for token, _ in most_frequent:
        vocab[token] = ind
        ind += 1
    print("\nCreated vocabulary of {} tokens.".format(ind))
    return vocab
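
# For instance, with vocab_size=5 the returned mapping could look like
# (the token frequencies here are hypothetical):
#     {"UNK": 0, "PADD": 1, "the": 2, ".": 3, ",": 4}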


def extract_embeddings(config, vocab):
    """Extracts GloVe word embeddings for the words in the vocab.

    Arguments:
        config (dict): Contains dataset & pipeline configuration info
        vocab (dict): word - ordinal number mapping
    """
    embeddings_config = config["embeddings"]
    save_path_emb = embeddings_config["path"]
    embedding_dim = embeddings_config["size"]
    save_path_map = config["word2idx_path"]
    # Used for finding the embedding vector for each token
    word_to_idx = {"<unk>": 0, "<pad>": 1}
    vectors = []
    idx = 0
    vocab_bias = len(word_to_idx)
    embedding_file_name = "glove.6B.{}d.txt".format(embedding_dim)
    embeddings_path = os.path.join(config["glove_dir"], embedding_file_name)
    with open(embeddings_path, "rb") as f:
        for line in f:
            line = line.decode().split()
            # Extract and pre-process the token
            word = line[0]
            word = word.strip().lower()
            # Remember the embedding vector if the word is in the vocab
            if word in vocab:
                word_to_idx[word] = idx + vocab_bias
                embedding_vec = np.array(line[1:], dtype="float")
                vectors.append(embedding_vec)
                idx += 1
    vectors = np.array(vectors)
    # Embedding vector for tokens used for padding the input sequence
    pad_embedding = np.zeros((embedding_dim,))
    # Embedding vector for tokens not present in the training set
    unk_embedding = vectors.mean(axis=0)
    vectors = np.vstack([unk_embedding, pad_embedding, vectors])
    # Save extracted embeddings
    np.savetxt(save_path_emb, vectors)
    # Save token:index mapping
    with open(save_path_map, "w", encoding="utf8") as f:
        json.dump(word_to_idx, f)
    print("\nExtracted GloVe embeddings for all tokens in the training set.")
    print("Number of tokens:", vectors.shape[0], "Embedding vectors size:", embedding_dim)