Cast to bfloat16 if possible
Skylion007 committed Jan 4, 2024
1 parent 6a8d85b commit 4cbdec1
Showing 1 changed file with 18 additions and 6 deletions.
examples/benchmarks/bert/src/bert_layers.py: 24 changes (18 additions, 6 deletions)
@@ -39,6 +39,7 @@
 import os
 import sys
 import warnings
+from functools import lru_cache
 from typing import List, Optional, Tuple, Union
 
 # Add folder root to path to allow us to use relative imports regardless of what directory the script is run from
@@ -79,6 +80,13 @@
 logger = logging.getLogger(__name__)
 
 
+@lru_cache
+def _get_half_dtype() -> torch.dtype:
+    if torch.cuda.is_bf16_supported():
+        return torch.bfloat16
+    return torch.float16
+
+
 class BertEmbeddings(nn.Module):
     """Construct the embeddings for words, ignoring position.
@@ -250,13 +258,15 @@ def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor,
             assert slopes.shape[
                 -1] == self.num_attention_heads, f'{slopes=}'
 
-            convert_dtype = qkv.dtype not in [torch.float16, torch.bfloat16]
+            convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
             if convert_dtype:
                 # FA2 implementation only supports fp16 and bf16
+                # If FA2 is supported, bfloat16 must be supported
+                # as of FA2 2.4.2. (Turing GPUs not supported)
                 orig_dtype = qkv.dtype
-                qkv = qkv.to(torch.float16)
+                qkv = qkv.to(torch.bfloat16)
                 bias_dtype = bias.dtype
-                bias = bias.to(torch.float16)
+                bias = bias.to(torch.bfloat16)
 
                 attention = flash_attn_qkvpacked_func(
                     qkv, dropout_p=self.p_dropout, alibi_slopes=slopes)
@@ -267,13 +277,15 @@ def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor,
                     qkv, dropout_p=self.p_dropout, alibi_slopes=slopes)
         else:
             # Triton implementation only supports 0 attention dropout
-            convert_dtype = qkv.dtype not in [torch.float16, torch.bfloat16]
+            convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
             if convert_dtype:
+                half = _get_half_dtype()
+
                 # Triton implementation only supports fp16 and bf16
                 orig_dtype = qkv.dtype
-                qkv = qkv.to(torch.float16)
+                qkv = qkv.to(half)
                 bias_dtype = bias.dtype
-                bias = bias.to(torch.float16)
+                bias = bias.to(half)
                 attention = flash_attn_qkvpacked_func(qkv, bias)
                 attention = attention.to(orig_dtype)
                 bias = bias.to(bias_dtype)
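Both attention paths follow the same cast-and-restore pattern: inputs that are not already fp16/bf16 are cast down for the kernel call, and the output is cast back to the caller's dtype. Below is a rough sketch of that pattern under stated assumptions; the run_in_half wrapper and the element-wise stand-in kernel are illustrative only and do not appear in the commit.

import torch


def run_in_half(kernel, qkv: torch.Tensor, bias: torch.Tensor,
                half: torch.dtype = torch.bfloat16) -> torch.Tensor:
    # Illustrative wrapper: kernels such as flash_attn_qkvpacked_func accept
    # only fp16/bf16 inputs, so fp32 tensors are cast down for the call and
    # the result is cast back afterwards. The Triton path in the diff picks
    # `half` via _get_half_dtype() instead of hard-coding it.
    orig_dtype = qkv.dtype
    if orig_dtype not in (torch.float16, torch.bfloat16):
        qkv = qkv.to(half)
        bias = bias.to(half)
    out = kernel(qkv, bias)
    return out.to(orig_dtype)


# Example with an element-wise stand-in instead of flash attention:
qkv = torch.randn(2, 3)                      # float32 inputs
bias = torch.zeros(2, 3)
out = run_in_half(lambda q, b: q + b, qkv, bias)
print(out.dtype)                             # torch.float32 again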
