cache value to avoid tensor conversion in torch (#654)
Co-authored-by: Haifeng Jin <haifeng-jin@users.noreply.github.com>
haifeng-jin authored Aug 1, 2023
1 parent c466112 commit 489237f
Showing 1 changed file with 7 additions and 1 deletion: keras_core/layers/attention/multi_head_attention.py
@@ -4,6 +4,7 @@

import numpy as np

+from keras_core import backend
from keras_core import constraints
from keras_core import initializers
from keras_core import ops
@@ -115,6 +116,8 @@ def __init__(
        self.supports_masking = True
        self._num_heads = num_heads
        self._key_dim = key_dim
+        # Cache 1.0 / math.sqrt(self._key_dim).
+        self._inverse_sqrt_key_dim = None
        self._value_dim = value_dim if value_dim else key_dim
        self._dropout = dropout
        self._use_bias = use_bias
@@ -311,6 +314,9 @@ def _build_attention(self, rank):
        )
        self._softmax = Softmax(axis=norm_axes)
        self._dropout_layer = Dropout(rate=self._dropout)
+        self._inverse_sqrt_key_dim = backend.convert_to_tensor(
+            1.0 / math.sqrt(float(self._key_dim))
+        )

    def _masked_softmax(self, attention_scores, attention_mask=None):
        # Normalize the attention scores to probabilities.
@@ -355,7 +361,7 @@ def _compute_attention(
        # Note: Applying scalar multiply at the smaller end of einsum improves
        # XLA performance, but may introduce slight numeric differences in
        # the Transformer attention head.
-        query = ops.multiply(query, 1.0 / math.sqrt(float(self._key_dim)))
+        query = ops.multiply(query, self._inverse_sqrt_key_dim)

        # Take the dot product between "query" and "key" to get the raw
        # attention scores.
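Why this helps, per the commit title: with the torch backend, passing the Python float 1.0 / math.sqrt(key_dim) to ops.multiply apparently triggers a float-to-tensor conversion on every forward call, so the change computes the scale once in _build_attention and reuses the cached tensor in _compute_attention. Below is a minimal, standalone sketch of that caching pattern under stated assumptions: SimpleScaledAttention is a hypothetical class invented here, and np.asarray stands in for backend.convert_to_tensor; it is an illustration, not the keras_core implementation.

# Minimal sketch of the caching pattern from the diff above.
# "SimpleScaledAttention" is hypothetical; np.asarray stands in for
# keras_core's backend.convert_to_tensor.
import math

import numpy as np


class SimpleScaledAttention:
    def __init__(self, key_dim):
        self._key_dim = key_dim
        # Cache slot for 1.0 / math.sqrt(self._key_dim); filled in by build().
        self._inverse_sqrt_key_dim = None

    def build(self):
        # Convert the scale to an array once, instead of on every call.
        self._inverse_sqrt_key_dim = np.asarray(
            1.0 / math.sqrt(float(self._key_dim)), dtype="float32"
        )

    def scale_query(self, query):
        # Reuse the cached scale; no per-call float-to-tensor conversion.
        return query * self._inverse_sqrt_key_dim


attn = SimpleScaledAttention(key_dim=64)
attn.build()
query = np.ones((2, 4, 64), dtype="float32")
print(attn.scale_query(query).shape)  # (2, 4, 64)

The shape of the sketch mirrors the diff: initialize the cache attribute to None in __init__, fill it once in the build step, and reuse it on every call.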
