model.py
import math

import torch
import torch.nn as nn


class IngredientParser(nn.Module):
    """Token-level classifier: word and POS-tag embeddings are averaged and passed through stacked self-attention layers."""

    def __init__(self, word_embedding, config):
        super(IngredientParser, self).__init__()
        self.hidden_size = config["hidden_size"]
        self.head_size = config["head_size"]
        self.dropout = config["dropout"]
        self.layers = config["layers"]
        self.output_size = config["output_size"]
        self.freeze_embeddings = config["freeze_embeddings"]
        self.self_attn = nn.ModuleList(
            [SelfattLayer(self.hidden_size, self.head_size, self.dropout) for _ in range(self.layers)]
        )
        self.classifier = nn.Linear(self.hidden_size, self.output_size)
        # 39 POS-tag ids, embedded in the same 300-d space as the word vectors so the two can be averaged.
        self.tag_embedding = nn.Embedding(39, 300)
        self.word_embedding = word_embedding
        if self.freeze_embeddings:
            self.word_embedding.weight.requires_grad = False

    def forward(self, input_ids, tag_ids):
        word_repr = self.word_embedding(input_ids)      # [batch, seq_len, hidden]
        tag_repr = self.tag_embedding(tag_ids)          # [batch, seq_len, hidden]
        word_tag_repr = (word_repr + tag_repr) / 2      # average word and tag representations
        for layer_id in range(self.layers):
            word_tag_repr = self.self_attn[layer_id](word_tag_repr)
        logits = self.classifier(word_tag_repr)         # [batch, seq_len, output_size]
        return logits
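
# The constructor takes an already-built word embedding module. Below is a minimal sketch of how
# one could be created from pretrained 300-d vectors; the vector source and vocabulary size are
# illustrative assumptions, not part of this repository:
#
#     import numpy as np
#     pretrained = np.random.randn(20000, 300).astype("float32")   # stand-in for e.g. GloVe vectors
#     word_embedding = nn.Embedding.from_pretrained(
#         torch.from_numpy(pretrained),
#         freeze=False,  # IngredientParser freezes the weights itself when config["freeze_embeddings"] is True
#     )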


class SelfattLayer(nn.Module):
    """One transformer-style block: multi-head self-attention followed by a residual dense/LayerNorm sublayer."""

    def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob=0.1, hidden_dropout_prob=0.1):
        super(SelfattLayer, self).__init__()
        self.self_attn = Attention(hidden_size, num_attention_heads, attention_probs_dropout_prob)
        self.ffn = FeedForwardLayer(hidden_size, hidden_dropout_prob)

    def forward(self, input_tensor, attention_mask=None):
        self_output = self.self_attn(input_tensor, input_tensor, attention_mask)
        attention_output = self.ffn(self_output, input_tensor)  # residual connection back to the block input
        return attention_output


class Attention(nn.Module):
    """Multi-head scaled dot-product attention; queries come from hidden_states, keys/values from context."""

    def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob=0.2, ctx_dim=None):
        super().__init__()
        if hidden_size % num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (hidden_size, num_attention_heads))
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = hidden_size // num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Keys and values may come from a context of a different width than the queries.
        if ctx_dim is None:
            ctx_dim = hidden_size
        self.query = nn.Linear(self.hidden_size, self.all_head_size)
        self.key = nn.Linear(ctx_dim, self.all_head_size)
        self.value = nn.Linear(ctx_dim, self.all_head_size)

        self.dropout = nn.Dropout(attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        # [batch, seq_len, all_head_size] -> [batch, num_heads, seq_len, head_size]
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, context, attention_mask=None):
        """
        hidden_states: [batch_size, N, h1], N=1089 or max_length
        context: [batch_size, M, h2]
        """
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(context)
        mixed_value_layer = self.value(context)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Scaled dot-product attention scores: [batch, num_heads, N, M]
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # The mask is additive: 0 for positions to keep, a large negative value for positions to ignore.
            attention_scores = attention_scores + attention_mask
        attention_probs = nn.Softmax(dim=-1)(attention_scores)
        attention_probs = self.dropout(attention_probs)

        # Weighted sum of values, then merge the heads back into a single hidden dimension.
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        return context_layer
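
# Attention.forward expects an additive mask, not a boolean one. A minimal sketch of how such a
# mask could be derived from padding (the padding convention below is an assumption, not taken
# from this repository's data pipeline):
#
#     # pad_mask: [batch, seq_len] with 1 for real tokens and 0 for padding
#     attention_mask = (1.0 - pad_mask.float()) * -10000.0   # 0 where attended, -10000 where masked
#     attention_mask = attention_mask[:, None, None, :]      # broadcastable to [batch, heads, N, M]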


class FeedForwardLayer(nn.Module):
    def __init__(self, hidden_size, hidden_dropout_prob):
        super(FeedForwardLayer, self).__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """
        hidden_states: output of attention module
        input_tensor: input of attention module
        """
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
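

if __name__ == "__main__":
    # Smoke test with a made-up configuration; the key names match what IngredientParser reads,
    # but the values and vocabulary size are illustrative assumptions, not the project's settings.
    config = {
        "hidden_size": 300,   # must match the word/tag embedding dimension
        "head_size": 6,       # number of attention heads; must divide hidden_size
        "dropout": 0.1,
        "layers": 2,
        "output_size": 9,     # number of token-level labels
        "freeze_embeddings": True,
    }
    word_embedding = nn.Embedding(20000, 300)  # hypothetical vocabulary of 20k words
    model = IngredientParser(word_embedding, config)

    input_ids = torch.randint(0, 20000, (4, 32))  # [batch, seq_len]
    tag_ids = torch.randint(0, 39, (4, 32))       # POS-tag ids, matching nn.Embedding(39, 300)
    logits = model(input_ids, tag_ids)
    print(logits.shape)  # expected: torch.Size([4, 32, 9])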