# Source:
# https://stackoverflow.com/questions/64156202/add-dense-layer-on-top-of-huggingface-bert-model
# https://towardsdatascience.com/identifying-hate-speech-with-bert-and-cnn-b7aa2cddd60d
import torch
from torch import nn
import torch.nn.functional as F
from transformers import AutoTokenizer, BertModel


class CustomBERTModel(nn.Module):
    def __init__(self):
        super(CustomBERTModel, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # custom layers (3x linear) on top of bert:
        self.linear1 = nn.Linear(768, 256)  # TODO: modify dimension of all 3 linear layers
        self.linear2 = nn.Linear(256, 128)
        self.linear3 = nn.Linear(128, 2)

    def forward(self, ids, attention_mask, labels=None):
        # return_dict=False makes self.bert return a plain tuple instead of a ModelOutput,
        # so the two outputs can be unpacked directly
        sequence_output, pooled_output = self.bert(ids, attention_mask=attention_mask, return_dict=False)
        # shape of sequence_output: (batch_size, sequence_length, 768)
        linear1_output = self.linear1(sequence_output[:, 0, :].view(-1, 768))  # extract the first ([CLS]) token's embeddings
        linear2_output = self.linear2(linear1_output)
        linear3_output = self.linear3(linear2_output)
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(linear3_output.view(-1, 2), labels.view(-1))
        else:
            loss = None
        return (loss, linear3_output)
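
# Hedged example (not in the original file): a single forward pass through
# CustomBERTModel, kept in a comment so importing this module has no side
# effects. The sentence and label below are invented placeholders.
#
#   tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#   model = CustomBERTModel()
#   enc = tokenizer(["an example sentence"], return_tensors="pt")
#   loss, logits = model(enc["input_ids"], attention_mask=enc["attention_mask"],
#                        labels=torch.tensor([0]))
#   # logits: shape (1, 2); loss: scalar cross-entropy against the label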


def train(data_loader, epochs):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = CustomBERTModel()
    # fall back to CPU when no GPU is available (the original hardcoded "cuda:0")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # log_softmax is applied to the logits below, so pair it with NLLLoss rather
    # than CrossEntropyLoss (which would apply log_softmax a second time)
    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
    for epoch in range(epochs):
        for batch in data_loader:
            data, targets = batch  # assuming the data loader returns a tuple of data and its targets
            optimizer.zero_grad()
            # tokenize the batch before the forward pass, not after
            encoding = tokenizer.batch_encode_plus(data, return_tensors="pt", padding=True, truncation=True, max_length=64, add_special_tokens=True)
            input_ids = encoding["input_ids"].to(device)
            attention_mask = encoding["attention_mask"].to(device)
            _, logits = model(input_ids, attention_mask=attention_mask)  # forward returns (loss, logits)
            outputs = F.log_softmax(logits, dim=1)
            loss = criterion(outputs, targets.to(device))
            loss.backward()
            optimizer.step()
    return model
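

# Hedged usage sketch (not in the original file): builds a toy DataLoader of
# (sentence, label) pairs and runs train() on it for one epoch. The example
# sentences and labels are invented placeholders for illustration only.
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    samples = [
        ("an example of a benign sentence", 0),
        ("an example of a sentence from the other class", 1),
    ]

    def collate(batch):
        # keep texts as a list of strings so the tokenizer can batch-encode them
        texts = [text for text, _ in batch]
        labels = torch.tensor([label for _, label in batch])
        return texts, labels  # matches the (data, targets) tuple train() unpacks

    loader = DataLoader(samples, batch_size=2, collate_fn=collate)
    trained_model = train(loader, epochs=1)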