diff --git a/.gitignore b/.gitignore
index 30c0a9c94..b1f8a9715 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,4 +34,6 @@ csrc/flash_attn_ck
 core.*
 *.csv
 *.png
-*.html
\ No newline at end of file
+*.html
+*.json
+*.txt
diff --git a/flash_attn/flash_attn_triton_amd/bwd_prefill.py b/flash_attn/flash_attn_triton_amd/bwd_prefill.py
index 6b53eca2f..5da5634fb 100644
--- a/flash_attn/flash_attn_triton_amd/bwd_prefill.py
+++ b/flash_attn/flash_attn_triton_amd/bwd_prefill.py
@@ -1,32 +1,204 @@
 import torch
 import triton
 import triton.language as tl
-from .utils import get_shape_from_layout, get_strides_from_layout, DEBUG, PERF
+from .utils import get_shape_from_layout, get_strides_from_layout, DEBUG, write_dropout_mask
 
 @triton.jit
-def cdiv_fn(x, y):
-    return (x + y - 1) // y
+def _bwd_preprocess_use_p(
+    Q,
+    K,
+    V,
+    sm_scale,
+    DO,
+    L,
+    Delta,
+    stride_dq_all,
+    stride_qz,
+    stride_qh,
+    stride_qm,
+    stride_qk,
+    stride_kz,
+    stride_kh,
+    stride_kn,
+    stride_kk,
+    stride_vz,
+    stride_vh,
+    stride_vn,
+    stride_vk,
+    stride_deltaz, 
+    stride_deltah, 
+    stride_deltam,
+    Z,
+    HQ,
+    HK,
+    num_block_m,
+    num_block_n,
+    cu_seqlens_q,  
+    cu_seqlens_k,
+    max_seqlen_q,
+    max_seqlen_k,
+    dropout_p, 
+    philox_seed, 
+    philox_offset_base,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    ACTUAL_BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    SEQUENCE_PARALLEL: tl.constexpr,
+    CAUSAL: tl.constexpr,
+    DROPOUT: tl.constexpr,
+    USE_EXP2: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    # program ids
+    off_zh = tl.program_id(0)
+    start_m = tl.program_id(1)
+    off_z = off_zh // HQ
+    off_hq = off_zh % HQ
 
-@triton.jit
-def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride):
-    # tl.device_print('bwd_philox_offset:', philox_offset)
-    ms = tl.arange(0, m)
-    ns = tl.arange(0, n)
-    return philox_offset + ms[:, None] * stride + ns[None, :]
+    GROUP_SIZE = HQ // HK
+    if GROUP_SIZE != 1:
+        off_hk = off_hq // GROUP_SIZE
+    else:
+        off_hk = off_hq
 
+    if IS_VARLEN:
+        # Compute sequence lengths for the current batch
+        q_start = tl.load(cu_seqlens_q + off_z)
+        q_end = tl.load(cu_seqlens_q + off_z + 1)
+        k_start = tl.load(cu_seqlens_k + off_z)
+        k_end = tl.load(cu_seqlens_k + off_z + 1)
 
-@triton.jit
-def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride):
-    rng_offsets = dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride).to(tl.uint32)
-    # TODO: use tl.randint for better performance
-    return tl.rand(philox_seed, rng_offsets)
+        # Compute actual sequence lengths
+        N_CTX_Q = q_end - q_start
+        N_CTX_K = k_end - k_start
+    else:
+        q_start = 0
+        k_start = 0
+        N_CTX_Q = max_seqlen_q
+        N_CTX_K = max_seqlen_k
 
+    if DROPOUT:
+        stride_sz = HQ * max_seqlen_q * max_seqlen_k
+        stride_sh = max_seqlen_q * max_seqlen_k
+        stride_sm = max_seqlen_k
+        batch_philox_offset = philox_offset_base + off_z * stride_sz + off_hq * stride_sh + q_start * stride_sm
+    else:
+        batch_philox_offset = 0
 
-@triton.jit
-def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride):
-    rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride)
-    rng_keep = rng_output > dropout_p
-    return rng_keep
+    # input tensor offsets
+    q_offset = Q + off_z * stride_qz + off_hq * stride_qh + q_start * stride_qm
+    k_offset = K + off_z * stride_kz + off_hk * stride_kh + k_start * stride_kn
+    v_offset = V + off_z * stride_vz + off_hk * stride_vh + k_start * stride_vn
+    do_offset = DO + off_z * stride_qz + off_hq * stride_qh + q_start * stride_qm
+    l_offset = L + off_z * stride_deltaz + off_hq * stride_deltah + q_start * stride_deltam
+    delta_offset = Delta + off_z * stride_deltaz + off_hq * stride_deltah + q_start * stride_deltam
+
+    if CAUSAL:
+        # TODO: Causal can skip more blocks with something like lo = start_m * BLOCK_M
+        lo = 0
+    else:
+        lo = 0
+
+    # initialize head offsets
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+
+    # masks
+    mask_d = offs_d < ACTUAL_BLOCK_DMODEL
+
+    # loop over rows
+    offs_m = start_m* BLOCK_M + tl.arange(0, BLOCK_M)
+    # offs_m = start_m + tl.arange(0, BLOCK_M)
+    q_ptrs = q_offset + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
+    do_ptrs = do_offset + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
+    
+    # update mask as row block changes
+    mask_m = offs_m < N_CTX_Q
+    q_mask = mask_m[:, None] & mask_d[None, :]
+
+    # load q, k, v, do on-chip
+    q = tl.load(q_ptrs, mask=q_mask, other=0.0).to(tl.float32)
+    do = tl.load(do_ptrs, mask=q_mask, other=0.0).to(tl.float32)
+
+    # delta
+    delta_ptrs = delta_offset + offs_m * stride_deltam
+    delta_partial = tl.zeros([BLOCK_M], dtype=tl.float32)
+    
+    for start_n in range(lo, num_block_n):
+        # print("start_n:", start_n)
+        # offs_n = start_n + tl.arange(0, BLOCK_N)
+        offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
+        mask_n = offs_n < N_CTX_K
+        kv_mask = mask_n[:, None] & mask_d[None, :]
+
+        # load k and v once per column block
+        k_ptrs = k_offset + offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kk
+        v_ptrs = v_offset + offs_n[:, None] * stride_vn + offs_d[None, :] * stride_vk
+        k = tl.load(k_ptrs, mask=kv_mask, other=0.0).to(tl.float32)
+        v = tl.load(v_ptrs, mask=kv_mask, other=0.0).to(tl.float32)
+
+        # recompute p = softmax(qk, dim=-1).T
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        # print("q:", q)
+        # print("k:", k)
+        qk += tl.dot(q, tl.trans(k))
+
+        if CAUSAL:
+            col_offset = N_CTX_Q - N_CTX_K
+            causal_mask = offs_m[:, None] >= (col_offset + offs_n[None, :])
+            qk = tl.where(causal_mask, qk, float("-inf"))
+
+        l_ptrs = l_offset + offs_m * stride_deltam
+        l_i = tl.load(l_ptrs, mask=mask_m)
+
+        # compute p
+        if USE_EXP2:
+            RCP_LN2: tl.constexpr = 1.4426950408889634
+            qk *= sm_scale * RCP_LN2
+            l_i *= RCP_LN2
+            p = tl.math.exp2(qk - l_i[:, None])
+        else:
+            qk *= sm_scale
+            p = tl.math.exp(qk - l_i[:, None])
+
+        # mask block in the cases where the data is smaller the block size
+        p_mask = mask_m[:, None] & mask_n[None, :]
+        p = tl.where(p_mask, p, 0.0)
+        # print("p:", p)
+        
+        # NOTE: must create a new var p_drop to prevent p (which is used later to compute ds) from changing
+        if DROPOUT:
+            stride_sm = N_CTX_K
+            stride_sn = 1
+            philox_offset = batch_philox_offset + offs_m[:, None] * stride_sm + offs_n[None, :] * stride_sn
+            # print("philox_seed:", philox_seed)
+            # print("philox_offset:", philox_offset)
+            rand_vals = tl.rand(philox_seed, philox_offset)
+            dropout_mask = rand_vals > dropout_p
+            dropout_scale = 1/ (1 - dropout_p)
+            p_drop = tl.where(dropout_mask, p, 0.0)
+            p_drop_scaled = p_drop * dropout_scale
+
+            # compute dp
+            dp_drop_scaled = tl.dot(do, tl.trans(v))
+            dp = tl.where(dropout_mask, dp_drop_scaled, 0.0) * dropout_scale
+            # dp = tl.where(p_mask, dp, 0.0)
+
+            # print("dp:", dp)
+
+            # compute delta
+            delta = tl.sum(p * dp, axis=1)
+        else:
+            # compute dp
+            dp = tl.dot(do, tl.trans(v))
+
+            # compute delta
+            delta = tl.sum(p * dp, axis=1)
+        # print("delta:", delta)
+
+        delta_partial += delta
+
+    tl.store(delta_ptrs, delta_partial, mask=mask_m)
 
 @triton.jit
 def _bwd_preprocess_use_o(
@@ -48,8 +220,8 @@ def _bwd_preprocess_use_o(
     H: tl.constexpr,
     IS_VARLEN: tl.constexpr
 ):
-    pid_m = tl.program_id(0)
-    pid_bh = tl.program_id(1)
+    pid_bh = tl.program_id(0)
+    pid_m = tl.program_id(1)
 
     # Compute batch and head indices
     off_z = pid_bh // H
@@ -119,8 +291,9 @@ def _bwd_kernel_one_col_block(
     dq_offset,
     dk_offset,
     dv_offset,
-    d_offset,
     l_offset,
+    delta_offset,
+    dropout_offset,
     stride_dq_all,
     stride_qz,
     stride_qh,
@@ -137,12 +310,15 @@ def _bwd_kernel_one_col_block(
     stride_deltaz, 
     stride_deltah, 
     stride_deltam,
+    stride_dropoutz, stride_dropouth, stride_dropoutm, stride_dropoutn,
     N_CTX_Q,
     N_CTX_K,
     start_n,
     num_block_m,
     num_block_n,
-    dropout_p, philox_seed, philox_offset_base,
+    dropout_p,
+    philox_seed,
+    batch_philox_offset,
     BLOCK_M: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
     ACTUAL_BLOCK_DMODEL: tl.constexpr,
@@ -153,6 +329,9 @@ def _bwd_kernel_one_col_block(
     USE_EXP2: tl.constexpr,
     GROUP_SIZE: tl.constexpr,
 ):
+    DEBUG_DROPOUT = False
+
+    # causal
     if CAUSAL:
         # TODO: Causal can skip more blocks with something like lo = start_m * BLOCK_M
         lo = 0
@@ -179,9 +358,12 @@ def _bwd_kernel_one_col_block(
     k = tl.load(k_ptrs, mask=kv_mask, other=0.0)
     v = tl.load(v_ptrs, mask=kv_mask, other=0.0)
 
+    if DROPOUT:
+        dropout_scale = 1/ (1 - dropout_p)
+
     # loop over rows
-    for start_m in range(lo, num_block_m * BLOCK_M, BLOCK_M):
-        offs_m = start_m + tl.arange(0, BLOCK_M)
+    for start_m in range(lo, num_block_m):
+        offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
         q_ptrs = q_offset + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
         dq_ptrs = dq_offset + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
         do_ptrs = do_offset + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
@@ -223,46 +405,70 @@ def _bwd_kernel_one_col_block(
         
         # NOTE: must create a new var p_drop to prevent p (which is used later to compute ds) from changing
         if DROPOUT:
-            philox_offset = philox_offset_base + start_m * N_CTX_K + start_n * BLOCK_N
-            keep = dropout_mask(philox_seed, philox_offset, dropout_p, BLOCK_M, BLOCK_N, N_CTX_K)
-            p_drop = tl.where(keep, p, 0.0)
-
-            p_drop = p_drop / (1 - dropout_p)
-            p_drop = p_drop.to(Q.dtype.element_ty)
-        else:
-            p_drop = p
-        
-        # compute dv
-        dv += tl.dot(tl.trans(p), do)
-
-        # compute dp
-        dp = tl.dot(do, tl.trans(v))
-
-        if DROPOUT:
-            philox_offset = philox_offset_base + start_m * N_CTX_K + start_n * BLOCK_N
-            keep = dropout_mask(philox_seed, philox_offset, dropout_p, BLOCK_M, BLOCK_N, N_CTX_K)
-            dp = tl.where(keep, dp, 0.0)
-
-            dp = dp / (1 - dropout_p)
-            dp = dp.to(Q.dtype.element_ty)
-
-        # compute ds , ds = p * (dp - delta[:, None])
-        d_ptrs = d_offset + offs_m * stride_deltam
-        Di = tl.load(d_ptrs, mask=mask_m)
-        ds = (p * (dp - Di[:, None])) * sm_scale
-        ds = tl.where(p_mask, ds, 0.0)
-        ds = ds.to(tl.float16)
-        
-        # compute dk = dot(ds.T, q)
-        dk += tl.dot(tl.trans(ds), q)
-
-        # compute dq
-        if SEQUENCE_PARALLEL:
-            dq = tl.dot(ds, k)
+            philox_offset = batch_philox_offset + offs_m[:, None] * stride_dropoutm + offs_n[None, :] * stride_dropoutn
+            # print("philox_seed:", philox_seed)
+            # print("philox_offset:", philox_offset)
+            rand_vals = tl.rand(philox_seed, philox_offset)
+            dropout_mask = rand_vals > dropout_p
+
+            if DEBUG_DROPOUT:
+                dropout_ptrs = dropout_offset + offs_m[:, None] * stride_dropoutm + offs_n[None, :] * stride_dropoutn
+                tl.store(dropout_ptrs, dropout_mask, mask=p_mask)
+            
+            # apply dropout mask
+            p_drop = tl.where(dropout_mask, p, 0.0)
+            p_drop_scaled = p_drop * dropout_scale
+
+            # compute dv
+            dv += tl.dot(tl.trans(p_drop_scaled), do) # dropout scale is applied at the end
+
+            # compute dp
+            dp_drop_scaled = tl.dot(do, tl.trans(v))
+            dp = tl.where(dropout_mask, dp_drop_scaled, 0.0) * dropout_scale
+
+            # compute ds
+            delta_ptrs = delta_offset + offs_m * stride_deltam
+            delta_i = tl.load(delta_ptrs, mask=mask_m)
+            dscores_scaled = (p * (dp - delta_i[:, None]))
+            ds = dscores_scaled * sm_scale
+            ds = tl.where(p_mask, ds, 0.0)
+            ds = ds.to(tl.float16)
+            
+            # compute dk
+            dk += tl.dot(tl.trans(ds), q)
+
+            # compute dq
+            if SEQUENCE_PARALLEL:
+                dq = tl.dot(ds, k)
+            else:
+                dq = tl.load(dq_ptrs, mask=q_mask, other=0.0)
+                dq += tl.dot(ds, k)
+            tl.store(dq_ptrs, dq.to(Q.dtype.element_ty), mask=q_mask)
         else:
-            dq = tl.load(dq_ptrs, mask=q_mask, other=0.0)
-            dq += tl.dot(ds, k)
-        tl.store(dq_ptrs, dq.to(Q.dtype.element_ty), mask=q_mask)
+            # compute dv
+            dv += tl.dot(tl.trans(p), do)
+
+            # compute dp
+            dp = tl.dot(do, tl.trans(v))
+
+            # compute ds
+            delta_ptrs = delta_offset + offs_m * stride_deltam
+            delta_i = tl.load(delta_ptrs, mask=mask_m)
+            dscores_scaled = (p * (dp - delta_i[:, None]))
+            ds = dscores_scaled * sm_scale
+            ds = tl.where(p_mask, ds, 0.0)
+            ds = ds.to(tl.float16)
+            
+            # compute dk
+            dk += tl.dot(tl.trans(ds), q)
+
+            # compute dq
+            if SEQUENCE_PARALLEL:
+                dq = tl.dot(ds, k)
+            else:
+                dq = tl.load(dq_ptrs, mask=q_mask, other=0.0)
+                dq += tl.dot(ds, k)
+            tl.store(dq_ptrs, dq.to(Q.dtype.element_ty), mask=q_mask)
 
     # write-back dv and dk
     dk_ptrs = dk_offset + offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kk
@@ -289,7 +495,8 @@ def _bwd_kernel(
     DK,
     DV,
     L,
-    D,
+    Delta,
+    Dropout_mask,
     stride_dq_all,
     stride_qz,
     stride_qh,
@@ -306,6 +513,7 @@ def _bwd_kernel(
     stride_deltaz, 
     stride_deltah, 
     stride_deltam,
+    stride_dropoutz, stride_dropouth, stride_dropoutm, stride_dropoutn,
     Z,
     HQ,
     HK,
@@ -315,7 +523,9 @@ def _bwd_kernel(
     cu_seqlens_k,
     max_seqlen_q,
     max_seqlen_k,
-    dropout_p, philox_seed, philox_offset,
+    dropout_p, 
+    philox_seed, 
+    philox_offset_base,
     BLOCK_M: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
     ACTUAL_BLOCK_DMODEL: tl.constexpr,
@@ -339,11 +549,6 @@ def _bwd_kernel(
     else:
         off_hk = off_hq
 
-    if DROPOUT:
-        batch_philox_offset = philox_offset + off_hq * max_seqlen_q * max_seqlen_k
-    else:
-        batch_philox_offset = 0
-
     if IS_VARLEN:
         # Compute sequence lengths for the current batch
         q_start = tl.load(cu_seqlens_q + off_z)
@@ -359,7 +564,6 @@ def _bwd_kernel(
         k_start = 0
         N_CTX_Q = max_seqlen_q
         N_CTX_K = max_seqlen_k
-    
 
     # input tensor offsets
     q_offset = Q + off_z * stride_qz + off_hq * stride_qh + q_start * stride_qm
@@ -367,7 +571,15 @@ def _bwd_kernel(
     v_offset = V + off_z * stride_vz + off_hk * stride_vh + k_start * stride_vn
     do_offset = DO + off_z * stride_qz + off_hq * stride_qh + q_start * stride_qm
     l_offset = L + off_z * stride_deltaz + off_hq * stride_deltah + q_start * stride_deltam
-    d_offset = D + off_z * stride_deltaz + off_hq * stride_deltah + q_start * stride_deltam
+    delta_offset = Delta + off_z * stride_deltaz + off_hq * stride_deltah + q_start * stride_deltam
+
+    if DROPOUT:
+        batch_philox_offset = philox_offset_base + off_z * stride_dropoutz + off_hq * stride_dropouth #+ q_start * stride_dropoutm
+        dropout_offset = Dropout_mask + off_z * stride_dropoutz + off_hq * stride_dropouth #+ q_start * stride_dropoutm
+    else:
+        batch_philox_offset = 0
+        dropout_offset = 0
+    
 
     # output tensor offsets
     dk_offset = DK + off_z * stride_kz + off_hk * stride_kh + k_start * stride_kn
@@ -390,7 +602,7 @@ def _bwd_kernel(
             DK,
             DV,
             L,
-            D,
+            Delta,
             q_offset,
             k_offset,
             v_offset,
@@ -398,8 +610,9 @@ def _bwd_kernel(
             dq_offset,
             dk_offset,
             dv_offset,
-            d_offset,
             l_offset,
+            delta_offset,
+            dropout_offset,
             stride_dq_all,
             stride_qz,
             stride_qh,
@@ -413,9 +626,10 @@ def _bwd_kernel(
             stride_vh,
             stride_vn,
             stride_vk,
-            stride_deltaz, 
-            stride_deltah, 
+            stride_deltaz,
+            stride_deltah,
             stride_deltam,
+            stride_dropoutz, stride_dropouth, stride_dropoutm, stride_dropoutn,
             N_CTX_Q,
             N_CTX_K,
             start_n,
@@ -445,7 +659,7 @@ def _bwd_kernel(
                 DK,
                 DV,
                 L,
-                D,
+                Delta,
                 q_offset,
                 k_offset,
                 v_offset,
@@ -453,8 +667,9 @@ def _bwd_kernel(
                 dq_offset,
                 dk_offset,
                 dv_offset,
-                d_offset,
                 l_offset,
+                delta_offset,
+                dropout_offset,
                 stride_dq_all,
                 stride_qz,
                 stride_qh,
@@ -468,9 +683,10 @@ def _bwd_kernel(
                 stride_vh,
                 stride_vn,
                 stride_vk,
-                stride_deltaz, 
-                stride_deltah, 
+                stride_deltaz,
+                stride_deltah,
                 stride_deltam,
+                stride_dropoutz, stride_dropouth, stride_dropoutm, stride_dropoutn,
                 N_CTX_Q,
                 N_CTX_K,
                 start_n,
@@ -503,14 +719,15 @@ def attention_prefill_backward_triton_impl(
     sm_scale: float,
     alibi_slopes,
     causal,
-    dropout_p,
     layout: str,
     cu_seqlens_q,
     cu_seqlens_k,
     max_seqlen_q: int,
     max_seqlen_k: int,
+    dropout_p, 
+    philox_seed, 
+    philox_offset,
     use_exp2: bool,
-    rng_state: torch.Tensor,
     sequence_parallel = True,
 ):
     if DEBUG:
@@ -533,8 +750,10 @@ def attention_prefill_backward_triton_impl(
         print("cu_seqlens_k:", cu_seqlens_k)
         print("max_seqlen_q:", max_seqlen_q)
         print("max_seqlen_k:", max_seqlen_k)
+        print("dropout_p:", dropout_p)
+        print("philox_seed:", philox_seed)
+        print("philox_offset:", philox_offset)
         print("use_exp2:", use_exp2)
-        print("rng_state", rng_state)
         print("sequence_parallel:", sequence_parallel)
 
     # make contigious
@@ -551,13 +770,7 @@ def attention_prefill_backward_triton_impl(
     stride_vz, stride_vh, stride_vn, stride_vk = v_strides
     stride_oz, stride_oh, stride_om, stride_ok = o_strides
     is_varlen = layout == "thd"
-    
-
-    # get dropout metadata
-    if dropout_p > 0.0:
-        philox_seed, philox_offset = rng_state[0].item(), rng_state[1].item()
-    else:
-        philox_seed, philox_offset = None, None
+    use_dropout = (dropout_p > 0.0)
 
     # FIXME: some configs lead to oom for some reason when using 64 x 64 blocks
     if max_seqlen_q <= 32 or max_seqlen_k <= 32:
@@ -566,6 +779,10 @@ def attention_prefill_backward_triton_impl(
     else:
         BLOCK_M = 64 
         BLOCK_N = 64
+    if DEBUG:
+        print("BLOCK_M:", BLOCK_M)
+        print("BLOCK_N:", BLOCK_N)
+
     num_warps = 4 # NOTE: originial is 8. changing it to 1 caused issues be careful
     num_stages = 1
     waves_per_eu = 1
@@ -646,27 +863,74 @@ def attention_prefill_backward_triton_impl(
     else:
         stride_deltaz, stride_deltah, stride_deltam = delta.stride()
 
-    _bwd_preprocess_use_o[(num_blocks_m, batch * nheads_q)](
-        o,
-        do,
-        delta,
-        stride_oz, stride_oh, stride_om, stride_ok,
-        stride_oz, stride_oh, stride_om, stride_ok,
-        stride_deltaz, stride_deltah, stride_deltam,
-        cu_seqlens_q,
-        cu_seqlens_k,
-        max_seqlen_q,
-        max_seqlen_k,
-        BLOCK_M=BLOCK_M,
-        BLOCK_DMODEL=BLOCK_DMODEL,
-        ACTUAL_BLOCK_DMODEL=ACTUAL_BLOCK_DMODEL,
-        N_CTX_Q=max_seqlen_q,
-        Z=batch,
-        H=nheads_q,
-        IS_VARLEN=is_varlen
-    )
+    # dropout mask tensor for debugging. We dump the dropout mask created in the kernel for testing
+    if use_dropout:
+        dropout_mask = torch.zeros((batch, nheads_q, max_seqlen_q, max_seqlen_k), device=q.device,
+                                        dtype=torch.float32)
+        stride_dropoutz, stride_dropouth, stride_dropoutm, stride_dropoutn = (dropout_mask.stride(0), dropout_mask.stride(1), dropout_mask.stride(2), dropout_mask.stride(3))
+    else:
+        dropout_mask = None
+        stride_dropoutz, stride_dropouth, stride_dropoutm, stride_dropoutn = (0, 0 , 0 , 0)
+
+    if False: #dropout_p > 0.0:
+        _bwd_preprocess_use_p[(batch * nheads_q, num_blocks_m)](
+            q,
+            k,
+            v,
+            sm_scale,
+            do,
+            softmax_lse,
+            delta,
+            stride_dq_all,
+            stride_qz, stride_qh, stride_qm, stride_qk,
+            stride_kz, stride_kh, stride_kn, stride_kk,
+            stride_vz, stride_vh, stride_vn, stride_vk,
+            stride_deltaz, stride_deltah, stride_deltam,
+            batch,
+            nheads_q,
+            nheads_k,
+            num_blocks_m,
+            num_blocks_n,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            max_seqlen_q,
+            max_seqlen_k,
+            dropout_p, philox_seed, philox_offset,
+            BLOCK_M=BLOCK_M,
+            BLOCK_N=BLOCK_N,
+            BLOCK_DMODEL=BLOCK_DMODEL,
+            ACTUAL_BLOCK_DMODEL=ACTUAL_BLOCK_DMODEL,
+            SEQUENCE_PARALLEL=sequence_parallel,
+            CAUSAL=causal,
+            DROPOUT=use_dropout,
+            USE_EXP2=use_exp2,
+            num_warps=num_warps,
+            num_stages=num_stages,
+            waves_per_eu = waves_per_eu,
+            IS_VARLEN=is_varlen
+            )
+    else:
+        _bwd_preprocess_use_o[(batch * nheads_q, num_blocks_m)](
+            o,
+            do,
+            delta,
+            stride_oz, stride_oh, stride_om, stride_ok,
+            stride_oz, stride_oh, stride_om, stride_ok,
+            stride_deltaz, stride_deltah, stride_deltam,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            max_seqlen_q,
+            max_seqlen_k,
+            BLOCK_M=BLOCK_M,
+            BLOCK_DMODEL=BLOCK_DMODEL,
+            ACTUAL_BLOCK_DMODEL=ACTUAL_BLOCK_DMODEL,
+            N_CTX_Q=max_seqlen_q,
+            Z=batch,
+            H=nheads_q,
+            IS_VARLEN=is_varlen
+        )
 
-    if DEBUG:
+    if False:
         print("_bwd_kernel inputs")
         print("do:", do, do.shape)
         print("q:", q, q.shape)
@@ -695,6 +959,7 @@ def attention_prefill_backward_triton_impl(
         print("ACTUAL_BLOCK_DMODEL:",ACTUAL_BLOCK_DMODEL)
         print("SEQUENCE_PARALLEL:",sequence_parallel)
         print("CAUSAL:",causal)
+        print("DROPOUT:", use_dropout)
         print("num_warps:",num_warps)
         print("num_stages:", num_stages)
         print("USE_EXP2:", use_exp2)
@@ -713,11 +978,13 @@ def attention_prefill_backward_triton_impl(
         dv,
         softmax_lse,
         delta,
+        dropout_mask,
         stride_dq_all,
         stride_qz, stride_qh, stride_qm, stride_qk,
         stride_kz, stride_kh, stride_kn, stride_kk,
         stride_vz, stride_vh, stride_vn, stride_vk,
         stride_deltaz, stride_deltah, stride_deltam,
+        stride_dropoutz, stride_dropouth, stride_dropoutm, stride_dropoutn,
         batch,
         nheads_q,
         nheads_k,
@@ -734,7 +1001,7 @@ def attention_prefill_backward_triton_impl(
         ACTUAL_BLOCK_DMODEL=ACTUAL_BLOCK_DMODEL,
         SEQUENCE_PARALLEL=sequence_parallel,
         CAUSAL=causal,
-        DROPOUT=dropout_p>0.0,
+        DROPOUT=use_dropout,
         USE_EXP2=use_exp2,
         num_warps=num_warps,
         num_stages=num_stages,
@@ -747,11 +1014,15 @@ def attention_prefill_backward_triton_impl(
 
     if DEBUG:
         print("attention_prefill_backward_triton_impl outputs")
-        print("dq:", dq, dq.shape)
-        print("dk:", dk, dk.shape)
-        print("dv:", dv, dv.shape)
         print("delta:", delta, delta.shape)
+        print("dv:", dv, dv.shape)
+        print("dk:", dk, dk.shape)
+        print("dq:", dq, dq.shape)
         print("copy_back:", copy_back)
+        if use_dropout:
+            print("dropout_mask:", dropout_mask, dropout_mask.shape if dropout_mask is not None else None)
+            print("dropout_fraction bwd:", 1.0 - (dropout_mask.sum()/ dropout_mask.numel()).item())
+            write_dropout_mask(dropout_mask, "dropout_mask_bwd")
 
     if copy_back["dq"]:
         dq_og.copy_(dq)
diff --git a/flash_attn/flash_attn_triton_amd/bwd_ref.py b/flash_attn/flash_attn_triton_amd/bwd_ref.py
index 5d1856521..cf491730b 100644
--- a/flash_attn/flash_attn_triton_amd/bwd_ref.py
+++ b/flash_attn/flash_attn_triton_amd/bwd_ref.py
@@ -2,10 +2,10 @@
 import math
 from .utils import DEBUG
 
-DEBUG_CORE = DEBUG and False
+DEBUG_CORE = False
 
 def attention_backward_core_ref_impl(
-    do, q, k, v, o, softmax_lse, sm_scale, causal, use_exp2
+    do, q, k, v, o, softmax_lse, sm_scale, causal, dropout_p, philox_seed, philox_offset, use_exp2
 ):
     if DEBUG_CORE:
         print()
@@ -18,6 +18,9 @@ def attention_backward_core_ref_impl(
         print("softmax_lse:", softmax_lse, softmax_lse.shape)
         print("sm_scale:", sm_scale)
         print("causal:", causal)
+        print("dropout_p:", dropout_p)
+        print("philox_seed:", philox_seed)
+        print("philox_offset:", philox_offset)
         print("use_exp2:", use_exp2)
     
     # cast to float32
@@ -30,7 +33,7 @@ def attention_backward_core_ref_impl(
 
 
     # recompute attention_scores. Make sure it matches the forward impl. i.e. It use float32
-    attention_scores = torch.matmul(q.to(torch.float32), k.transpose(-2, -1).to(torch.float32))
+    attention_scores = torch.matmul(q, k.transpose(-2, -1))
     if DEBUG_CORE:
         print("attention_scores:", attention_scores, attention_scores.shape)
 
@@ -65,58 +68,95 @@ def attention_backward_core_ref_impl(
     else:
         softmax_lse_3d =  softmax_lse.unsqueeze(-1)
         p = torch.exp(attention_scaled_scores - softmax_lse_3d)
-
     if DEBUG_CORE:
         print("softmax_lse_3d:", softmax_lse_3d, softmax_lse_3d.shape)
         print("p:", p, p.shape)
-    # compute gradient wrt v
-    dv = torch.matmul(p.transpose(-2, -1), do.to(torch.float32))
-    if DEBUG_CORE:
-        print("dv:", dv, dv.shape)
 
-    # compute dp
-    dp = torch.matmul(do, v.transpose(-2, -1))
-    if DEBUG_CORE:
-        print("dp:", dp, dp.shape)
+    if dropout_p > 0.0:
+        rand_vals = torch.rand(p.shape, generator=torch.Generator(device=p.device).manual_seed(philox_seed), device=p.device, dtype=p.dtype)
+        dropout_mask, dropout_scale = rand_vals > dropout_p,  (1.0 / (1 - dropout_p))
+        if DEBUG:
+            print("dropout_scale:", dropout_scale)
+            print("dropout_mask:", dropout_mask)
+            
+        p_drop = torch.where(dropout_mask, p, torch.zeros_like(p))
+        p_drop_scaled =  p_drop * dropout_scale
+        if DEBUG_CORE:
+            print("dropout_scale:", dropout_scale)
+            print("p_drop:", p_drop, p_drop.shape)
+            print("p_drop_scaled:", p_drop_scaled, p_drop_scaled.shape)
+        
+        # compute gradient wrt v
+        dv = torch.matmul(p_drop_scaled.transpose(-2, -1), do)
+        if DEBUG_CORE:
+            print("dv:", dv, dv.shape)
+
+        # compute dp
+        dp_dropout = torch.matmul(do, v.transpose(-2, -1))
+        dp = torch.where(dropout_mask, dp_dropout , torch.zeros_like(dp_dropout)) * dropout_scale
+        if DEBUG_CORE:
+            print("dp_dropout:", dp_dropout, dp_dropout.shape)
+            print("dp:", dp, dp.shape)
 
-    # calculate ds using dp
-    if True:
-        delta = torch.sum(o * do, axis=-1).to(torch.float32)  # what OAI kernel uses
-        delta_3d = delta.unsqueeze(-1)
+        # calculate ds
+        if False:
+            delta = torch.sum(o * do, axis=-1).unsqueeze(-1)
+        else:
+            delta = torch.sum(p * dp, axis=-1).unsqueeze(-1)
+        dscores_scaled = p * (dp - delta)
+        ds = dscores_scaled * sm_scale
+        if DEBUG_CORE:
+            print("delta:", delta, delta.shape)
+            print("dscores_scaled:", dscores_scaled, dscores_scaled.shape)
+            print("ds:", ds, ds.shape)
+
+        # compute gradient wrt k & q
+        dk = torch.matmul(ds.transpose(-2, -1), q)
+        dq = torch.matmul(ds, k)
+        if DEBUG_CORE:
+            print("dk:", dk, dk.shape)
+            print("dq:", dq, dq.shape)
     else:
-        delta = torch.sum(p * dp, axis=-1) # what the math says you should use
-        delta_3d = delta.unsqueeze(-1)
-    if DEBUG_CORE:
-        print("delta_3d:", delta_3d, delta_3d.shape)
-    ds = (p * (dp - delta_3d)) * sm_scale
-    if DEBUG_CORE:
-        print("ds:", ds, ds.shape)
-   
+        # compute gradient wrt v
+        dv = torch.matmul(p.transpose(-2, -1), do)
+        if DEBUG_CORE:
+            print("dv:", dv, dv.shape)
 
-    # compute gradient wrt k
-    dk = torch.matmul(ds.transpose(-2, -1), q.to(torch.float32))
-    if DEBUG_CORE:
-        print("dk:", dk, dk.shape)
+        # compute dp
+        dp = torch.matmul(do, v.transpose(-2, -1))
+        if DEBUG_CORE:
+            print("dp:", dp, dp.shape)
 
-    # compute gradient wrt q
-    dq = torch.matmul(ds, k.to(torch.float32))
-    if DEBUG_CORE:
-        print("dq:", dq, dq.shape)
+        # calculate ds
+        delta = torch.sum(o * do, axis=-1).unsqueeze(-1)
+        dscores_scaled = p * (dp - delta)
+        ds = dscores_scaled * sm_scale
+        if DEBUG_CORE:
+            print("delta:", delta, delta.shape)
+            print("dscores_scaled:", dscores_scaled, dscores_scaled.shape)
+            print("ds:", ds, ds.shape)
+    
+
+        # compute gradient wrt k & q
+        dk = torch.matmul(ds.transpose(-2, -1), q)
+        dq = torch.matmul(ds, k)
+        if DEBUG_CORE:
+            print("dk:", dk, dk.shape)
+            print("dq:", dq, dq.shape)
 
     # cast back to original dtype
     dq = dq.to(torch.float16)
     dk = dk.to(torch.float16)
     dv = dv.to(torch.float16)
-
     # remove d dim with size 1
-    delta = delta_3d.squeeze(-1)
+    delta = delta.squeeze(-1)
 
     if DEBUG_CORE:
         print("attention_backward_core_ref_impl output")
-        print("dq:", dq, dq.shape)
-        print("dk:", dk, dk.shape)
-        print("dv:", dv, dv.shape)
         print("delta:", delta, delta.shape)
+        print("dv:", dv, dv.shape)
+        print("dk:", dk, dk.shape)
+        print("dq:", dq, dq.shape)
 
     return dq, dk, dv, delta
 
@@ -134,6 +174,9 @@ def attention_varlen_backward_pytorch_ref_impl(
     cu_seqlens_k,
     max_seqlen_q,
     max_seqlen_k,
+    dropout_p, 
+    philox_seed, 
+    philox_offset,
     use_exp2,
 ):
     # Ensure the layout is 'thd'
@@ -208,6 +251,9 @@ def attention_varlen_backward_pytorch_ref_impl(
             softmax_lse_i,
             sm_scale,
             causal,
+            dropout_p, 
+            philox_seed, 
+            philox_offset,
             use_exp2
         )
 
@@ -251,6 +297,9 @@ def attention_vanilla_backward_pytorch_ref_impl(
     sm_scale,
     causal,
     layout,
+    dropout_p, 
+    philox_seed, 
+    philox_offset,
     use_exp2,
 ):
     if layout == "bshd":
@@ -312,6 +361,9 @@ def attention_vanilla_backward_pytorch_ref_impl(
         softmax_lse,
         sm_scale,
         causal,
+        dropout_p, 
+        philox_seed, 
+        philox_offset,
         use_exp2
     )
 
@@ -359,14 +411,15 @@ def attention_backward_pytorch_ref_impl(
     softmax_lse,
     sm_scale,
     causal,
-    dropout_p,
     layout,
     cu_seqlens_q,
     cu_seqlens_k,
     max_seqlen_q,
     max_seqlen_k,
-    use_exp2,
-    rng_state
+    dropout_p, 
+    philox_seed, 
+    philox_offset,
+    use_exp2
 ):
 
     if DEBUG:
@@ -385,6 +438,9 @@ def attention_backward_pytorch_ref_impl(
         print("cu_seqlens_k:", cu_seqlens_k)
         print("max_seqlen_q:", max_seqlen_q)
         print("max_seqlen_k:", max_seqlen_k)
+        print("dropout_p:", dropout_p)
+        print("philox_seed:", philox_seed)
+        print("philox_offset:", philox_offset)
         print("use_exp2:", use_exp2)
 
 
@@ -403,6 +459,9 @@ def attention_backward_pytorch_ref_impl(
             cu_seqlens_k,
             max_seqlen_q,
             max_seqlen_k,
+            dropout_p, 
+            philox_seed, 
+            philox_offset,
             use_exp2,
         )
     else:
@@ -416,6 +475,9 @@ def attention_backward_pytorch_ref_impl(
             sm_scale,
             causal,
             layout,
+            dropout_p, 
+            philox_seed, 
+            philox_offset,
             use_exp2,
         )
         
@@ -423,9 +485,9 @@ def attention_backward_pytorch_ref_impl(
     if DEBUG:
         print()
         print("attention_backward_pytorch_ref_impl outputs")
-        print("dq:", dq, dq.shape)
-        print("dk:", dk, dk.shape)
-        print("dv:", dv, dv.shape)
         print("delta:", delta, delta.shape)
+        print("dv:", dv, dv.shape)
+        print("dk:", dk, dk.shape)
+        print("dq:", dq, dq.shape)
 
     return dq, dk, dv, delta
diff --git a/flash_attn/flash_attn_triton_amd/common.py b/flash_attn/flash_attn_triton_amd/common.py
new file mode 100755
index 000000000..bc1fe4727
--- /dev/null
+++ b/flash_attn/flash_attn_triton_amd/common.py
@@ -0,0 +1,7 @@
+import torch
+
+def compute_alibi_tensor_ref(alibi_slopes, seqlen_q, seqlen_k):
+    q_idx = torch.arange(seqlen_q, dtype=torch.int32, device="cuda").unsqueeze(-1)  # (N_CTX_Q, 1)
+    k_idx = torch.arange(seqlen_k, dtype=torch.int32, device="cuda").unsqueeze(0)  # (1, N_CTX_K)
+    relative_pos = torch.abs(q_idx + seqlen_k - seqlen_q - k_idx)  # (N_CTX_Q, N_CTX_K)
+    return -1 * alibi_slopes.unsqueeze(-1).unsqueeze(-1) * relative_pos  # (Z, H, N_CTX_Q, N_CTX_K)
\ No newline at end of file
diff --git a/flash_attn/flash_attn_triton_amd/compare.py b/flash_attn/flash_attn_triton_amd/compare.py
deleted file mode 100644
index d80361171..000000000
--- a/flash_attn/flash_attn_triton_amd/compare.py
+++ /dev/null
@@ -1,767 +0,0 @@
-import torch
-import triton
-import triton.language as tl
-from .utils import get_shape_from_layout, get_strides_from_layout, DEBUG, PERF
-
-
-@triton.jit
-def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride):
-    ms = tl.arange(0, m)
-    ns = tl.arange(0, n)
-    return philox_offset + ms[:, None] * stride + ns[None, :]
-
-
-@triton.jit
-def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride):
-    rng_offsets = dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride).to(tl.uint32)
-    # TODO: use tl.randint for better performance
-    return tl.rand(philox_seed, rng_offsets)
-
-
-@triton.jit
-def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride):
-    rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride)
-    rng_keep = rng_output > dropout_p
-    return rng_keep
-
-@triton.jit
-def store_dropout_mask(X, philox_seed, philox_offset, dropout_p: tl.constexpr, m: tl.constexpr, n: tl.constexpr, stride: tl.constexpr):
-    x = tl.zeros((m, n), tl.float32)
-    # import pdb; pdb.set_trace()
-    x = dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride)
-    x_block = (tl.arange(0, m)[:, None]*n + tl.arange(0, n)[None, :])
-    tl.store(X+x_block, x, mask=((tl.arange(0, m)[:, None] < m) & (tl.arange(0, n)[None, :] < n)))
-
-
-@triton.jit
-def _bwd_preprocess_use_o(
-    Out,
-    DO,
-    Delta,
-    stride_oz, stride_oh, stride_om, stride_ok,
-    stride_doz, stride_doh, stride_dom, stride_dok,
-    stride_deltaz, stride_deltah, stride_deltam,
-    cu_seqlens_q,
-    cu_seqlens_k,
-    max_seqlen_q,
-    max_seqlen_k,
-    BLOCK_M: tl.constexpr,
-    BLOCK_DMODEL: tl.constexpr,
-    ACTUAL_BLOCK_DMODEL: tl.constexpr,
-    N_CTX_Q: tl.constexpr,
-    Z: tl.constexpr,
-    H: tl.constexpr,
-    IS_VARLEN: tl.constexpr
-):
-    pid_m = tl.program_id(0)
-    pid_bh = tl.program_id(1)
-
-    # Compute batch and head indices
-    off_z = pid_bh // H
-    off_h = pid_bh % H
-
-    if IS_VARLEN:
-        # Compute sequence lengths for the current batch
-        q_start = tl.load(cu_seqlens_q + off_z)
-        q_end = tl.load(cu_seqlens_q + off_z + 1)
-        k_start = tl.load(cu_seqlens_k + off_z)
-        k_end = tl.load(cu_seqlens_k + off_z + 1)
-
-        # Compute actual sequence lengths
-        N_CTX_Q = q_end - q_start
-        N_CTX_K = k_end - k_start
-    else:
-        q_start = 0
-        k_start = 0
-        N_CTX_Q = max_seqlen_q
-        N_CTX_K = max_seqlen_k
-
-    off_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-    off_d = tl.arange(0, BLOCK_DMODEL)
-
-    # create masks
-    mask_m = off_m < N_CTX_Q
-    mask_d = off_d < ACTUAL_BLOCK_DMODEL
-
-    # compute offsets
-    o_offset = Out + off_z * stride_oz + off_h * stride_oh + q_start * stride_om
-    do_offset = DO + off_z * stride_oz + off_h * stride_oh + q_start * stride_om
-
-    # compute pointers
-    out_ptrs = o_offset + off_m[:, None] * stride_om + off_d[None, :] * stride_ok
-    do_ptrs = do_offset + off_m[:, None] * stride_dom + off_d[None, :] * stride_dok
-
-    # load
-    o = tl.load(out_ptrs, mask=mask_m[:, None] & mask_d[None, :], other=0.0).to(tl.float32)
-    do = tl.load(do_ptrs, mask=mask_m[:, None] & mask_d[None, :], other=0.0).to(tl.float32)
-
-    # compute delta
-    delta = tl.sum(o * do, axis=1)
-
-    # write-back delta
-    delta_offset = Delta + off_z * stride_deltaz + off_h * stride_deltah + q_start * stride_deltam
-    delta_ptrs = delta_offset + off_m * stride_deltam
-    tl.store(delta_ptrs, delta, mask=mask_m)
-
-
-@triton.jit
-def _bwd_kernel_one_col_block(
-        Q,
-        K,
-        V,
-        sm_scale,
-        Out,
-        DO,
-        DQ,
-        DK,
-        DV,
-        L,
-        D,
-        q_offset,
-        k_offset,
-        v_offset,
-        do_offset,
-        dq_offset,
-        dk_offset,
-        dv_offset,
-        d_offset,
-        l_offset,
-        stride_dq_all,
-        stride_qz,
-        stride_qh,
-        stride_qm,
-        stride_qk,
-        stride_kz,
-        stride_kh,
-        stride_kn,
-        stride_kk,
-        stride_vz,
-        stride_vh,
-        stride_vn,
-        stride_vk,
-        stride_deltaz, 
-        stride_deltah, 
-        stride_deltam,
-        Z,
-        H,
-        N_CTX_Q,
-        N_CTX_K,
-        off_h,
-        off_z,
-        off_hz,
-        start_n,
-        num_block_m,
-        num_block_n,
-        dropout_p, philox_seed, philox_offset_base,
-        BLOCK_M: tl.constexpr,
-        BLOCK_DMODEL: tl.constexpr,
-        ACTUAL_BLOCK_DMODEL: tl.constexpr,
-        BLOCK_N: tl.constexpr,
-        SEQUENCE_PARALLEL: tl.constexpr,
-        CAUSAL: tl.constexpr,
-        USE_EXP2: tl.constexpr,
-        ENABLE_DROPOUT: tl.constexpr,          
-):
-    if CAUSAL:
-        # TODO: Causal can skip more blocks with something like lo = start_m * BLOCK_M
-        lo = 0
-    else:
-        lo = 0
-
-    # initialize col and head offsets
-    offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
-    offs_d = tl.arange(0, BLOCK_DMODEL)
-
-    # masks
-    mask_n = offs_n < N_CTX_K
-    mask_d = offs_d < ACTUAL_BLOCK_DMODEL
-    kv_mask = mask_n[:, None] & mask_d[None, :]
-    
-
-    # initialize grad accumulators
-    dv = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
-    dk = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
-
-    # load k and v once per column block
-    k_ptrs = k_offset + offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kk
-    v_ptrs = v_offset + offs_n[:, None] * stride_vn + offs_d[None, :] * stride_vk
-    k = tl.load(k_ptrs, mask=kv_mask, other=0.0)
-    v = tl.load(v_ptrs, mask=kv_mask, other=0.0)
-
-    # loop over rows
-    for start_m in range(lo, num_block_m * BLOCK_M, BLOCK_M):
-        offs_m = start_m + tl.arange(0, BLOCK_M)
-        q_ptrs = q_offset + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
-        dq_ptrs = dq_offset + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
-        do_ptrs = do_offset + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
-        
-        # update mask as row block changes
-        mask_m = offs_m < N_CTX_Q
-        q_mask = mask_m[:, None] & mask_d[None, :]
-
-        # load q, k, v, do on-chip
-        q = tl.load(q_ptrs, mask=q_mask, other=0.0)
-        do = tl.load(do_ptrs, mask=q_mask, other=0.0)
-
-        # recompute p = softmax(qk, dim=-1).T
-        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk += tl.dot(q, tl.trans(k))
-
-        if CAUSAL:
-            col_offset = N_CTX_Q - N_CTX_K
-            causal_mask = offs_m[:, None] >= (col_offset + offs_n[None, :])
-            qk = tl.where(causal_mask, qk, float("-inf"))
-
-        l_ptrs = l_offset + offs_m * stride_deltam
-        l_i = tl.load(l_ptrs, mask=mask_m)
-
-        # compute p
-        if USE_EXP2:
-            RCP_LN2: tl.constexpr = 1.4426950408889634
-            qk *= sm_scale * RCP_LN2
-            l_i *= RCP_LN2
-            p = tl.math.exp2(qk - l_i[:, None])
-        else:
-            qk *= sm_scale
-            p = tl.math.exp(qk - l_i[:, None])
-
-        # mask block in the cases where the data is smaller the block size
-        p_mask = mask_m[:, None] & mask_n[None, :]
-        p = tl.where(p_mask, p, 0.0)
-
-        # NOTE: must create a new var p_drop to prevent p (which is used later to compute ds) from changing
-        if ENABLE_DROPOUT:
-            philox_offset = philox_offset_base + start_m * N_CTX_K + start_n * BLOCK_N
-            keep = dropout_mask(philox_seed, philox_offset, dropout_p, BLOCK_M, BLOCK_N, N_CTX_K)
-            p_drop = tl.where(keep, p, 0.0)
-
-            p_drop = p_drop / (1 - dropout_p)
-            p_drop = p_drop.to(Q.dtype.element_ty)
-        
-        # compute dv
-        dv += tl.dot(tl.trans(p_drop.to(Q.dtype.element_ty)), do)
-
-        # compute dp
-        dp = tl.dot(do, tl.trans(v))
-
-        # if dropout enabled, mask the scores and scale proportionally
-        if ENABLE_DROPOUT:
-            philox_offset = philox_offset_base + start_m * N_CTX_K + start_n * BLOCK_N
-            # import pdb; pdb.set_trace()
-            keep = dropout_mask(philox_seed, philox_offset, dropout_p, BLOCK_M, BLOCK_N, N_CTX_K)
-            dp = tl.where(keep, dp, 0.0)
-
-            dp = dp / (1 - dropout_p) # scale ds based on dropout_p
-            dp = dp.to(Q.dtype.element_ty)
-
-        # compute ds , ds = p * (dp - delta[:, None])
-        d_ptrs = d_offset + offs_m * stride_deltam
-        Di = tl.load(d_ptrs, mask=mask_m)
-        ds = (p * (dp - Di[:, None])) * sm_scale
-        ds = tl.where(p_mask, ds, 0.0).to(Q.dtype.element_ty)
-        
-        
-        # print('ds_after_triton\n', ds)
-
-        # compute dk = dot(ds.T, q)
-        dk += tl.dot(tl.trans(ds), q)
-
-        # compute dq
-        if SEQUENCE_PARALLEL:
-            dq = tl.dot(ds, k)
-        else:
-            dq = tl.load(dq_ptrs, mask=q_mask, other=0.0)
-            dq += tl.dot(ds, k)
-        tl.store(dq_ptrs, dq.to(Q.dtype.element_ty), mask=q_mask)
-
-    # write-back dv and dk
-    dk_ptrs = dk_offset + offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kk
-    dv_ptrs = dv_offset + offs_n[:, None] * stride_vn + offs_d[None, :] * stride_vk
-    
-    # write-back
-    tl.store(dk_ptrs, dk.to(K.dtype.element_ty), mask=kv_mask)
-    tl.store(dv_ptrs, dv.to(V.dtype.element_ty), mask=kv_mask)
-
-@triton.jit
-def _bwd_kernel(
-    Q,
-    K,
-    V,
-    sm_scale,
-    Out,
-    DO,
-    DQ,
-    DK,
-    DV,
-    L,
-    D,
-    stride_dq_all,
-    stride_qz,
-    stride_qh,
-    stride_qm,
-    stride_qk,
-    stride_kz,
-    stride_kh,
-    stride_kn,
-    stride_kk,
-    stride_vz,
-    stride_vh,
-    stride_vn,
-    stride_vk,
-    stride_deltaz, 
-    stride_deltah, 
-    stride_deltam,
-    Z,
-    H,
-    dropout_p, philox_seed, philox_offset_base,
-    num_block_m,
-    num_block_n,
-    cu_seqlens_q,  
-    cu_seqlens_k,
-    max_seqlen_q,
-    max_seqlen_k,
-    BLOCK_M: tl.constexpr,
-    BLOCK_DMODEL: tl.constexpr,
-    ACTUAL_BLOCK_DMODEL: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    SEQUENCE_PARALLEL: tl.constexpr,
-    CAUSAL: tl.constexpr,
-    USE_EXP2: tl.constexpr,
-    IS_VARLEN: tl.constexpr,
-    ENABLE_DROPOUT: tl.constexpr,
-):
-    # program ids
-    off_hz = tl.program_id(0)
-    if SEQUENCE_PARALLEL:
-        start_n = tl.program_id(1)
-    off_z = off_hz // H
-    off_h = off_hz % H
-
-    if ENABLE_DROPOUT:
-        off_hz = off_z * H + off_h
-        batch_philox_offset = philox_offset_base + off_hz * max_seqlen_q * max_seqlen_k
-    else:
-        batch_philox_offset = 0
-
-    if IS_VARLEN:
-        # Compute sequence lengths for the current batch
-        q_start = tl.load(cu_seqlens_q + off_z)
-        q_end = tl.load(cu_seqlens_q + off_z + 1)
-        k_start = tl.load(cu_seqlens_k + off_z)
-        k_end = tl.load(cu_seqlens_k + off_z + 1)
-
-        # Compute actual sequence lengths
-        N_CTX_Q = q_end - q_start
-        N_CTX_K = k_end - k_start
-    else:
-        q_start = 0
-        k_start = 0
-        N_CTX_Q = max_seqlen_q
-        N_CTX_K = max_seqlen_k
-    
-
-    # input tensor offsets
-    q_offset = Q + off_z * stride_qz + off_h * stride_qh + q_start * stride_qm
-    k_offset = K + off_z * stride_kz + off_h * stride_kh + k_start * stride_kn
-    v_offset = V + off_z * stride_vz + off_h * stride_vh + k_start * stride_vn
-    do_offset = DO + off_z * stride_qz + off_h * stride_qh + q_start * stride_qm
-    l_offset = L + off_z * stride_deltaz + off_h * stride_deltah + q_start * stride_deltam
-    d_offset = D + off_z * stride_deltaz + off_h * stride_deltah + q_start * stride_deltam
-
-    # output tensor offsets
-    dk_offset = DK + off_z * stride_kz + off_h * stride_kh + k_start * stride_kn
-    dv_offset = DV + off_z * stride_vz + off_h * stride_vh + k_start * stride_vn
-    if SEQUENCE_PARALLEL:
-        dq_offset = DQ + start_n * stride_dq_all + off_z * stride_qz + off_h * stride_qh + q_start * stride_qm
-    else:
-        dq_offset = DQ + off_z * stride_qz + off_h * stride_qh + q_start * stride_qm
-
-    # inner loop
-    if SEQUENCE_PARALLEL:
-        _bwd_kernel_one_col_block(
-            Q,
-            K,
-            V,
-            sm_scale,
-            Out,
-            DO,
-            DQ,
-            DK,
-            DV,
-            L,
-            D,
-            q_offset,
-            k_offset,
-            v_offset,
-            do_offset,
-            dq_offset,
-            dk_offset,
-            dv_offset,
-            d_offset,
-            l_offset,
-            stride_dq_all,
-            stride_qz,
-            stride_qh,
-            stride_qm,
-            stride_qk,
-            stride_kz,
-            stride_kh,
-            stride_kn,
-            stride_kk,
-            stride_vz,
-            stride_vh,
-            stride_vn,
-            stride_vk,
-            stride_deltaz, 
-            stride_deltah, 
-            stride_deltam,
-            Z,
-            H,
-            N_CTX_Q,
-            N_CTX_K,
-            off_h,
-            off_z,
-            off_hz,
-            start_n,
-            num_block_m,
-            num_block_n,
-            dropout_p, philox_seed, batch_philox_offset,
-            BLOCK_M=BLOCK_M,
-            BLOCK_DMODEL=BLOCK_DMODEL,
-            ACTUAL_BLOCK_DMODEL=ACTUAL_BLOCK_DMODEL,
-            BLOCK_N=BLOCK_N,
-            SEQUENCE_PARALLEL=SEQUENCE_PARALLEL,
-            CAUSAL=CAUSAL,
-            USE_EXP2=USE_EXP2,
-            ENABLE_DROPOUT=ENABLE_DROPOUT,          
-        )
-    else:
-        for start_n in range(0, num_block_n):
-            _bwd_kernel_one_col_block(
-                Q,
-                K,
-                V,
-                sm_scale,
-                Out,
-                DO,
-                DQ,
-                DK,
-                DV,
-                L,
-                D,
-                q_offset,
-                k_offset,
-                v_offset,
-                do_offset,
-                dq_offset,
-                dk_offset,
-                dv_offset,
-                d_offset,
-                l_offset,
-                stride_dq_all,
-                stride_qz,
-                stride_qh,
-                stride_qm,
-                stride_qk,
-                stride_kz,
-                stride_kh,
-                stride_kn,
-                stride_kk,
-                stride_vz,
-                stride_vh,
-                stride_vn,
-                stride_vk,
-                stride_deltaz, 
-                stride_deltah, 
-                stride_deltam,
-                Z,
-                H,
-                N_CTX_Q,
-                N_CTX_K,
-                off_h,
-                off_z,
-                off_hz,
-                start_n,
-                num_block_m,
-                num_block_n,
-                dropout_p, philox_seed, batch_philox_offset,
-                BLOCK_M=BLOCK_M,
-                BLOCK_DMODEL=BLOCK_DMODEL,
-                ACTUAL_BLOCK_DMODEL=ACTUAL_BLOCK_DMODEL,
-                BLOCK_N=BLOCK_N,
-                SEQUENCE_PARALLEL=SEQUENCE_PARALLEL,
-                CAUSAL=CAUSAL,
-                USE_EXP2=USE_EXP2,
-                ENABLE_DROPOUT=ENABLE_DROPOUT,                
-            )
-
-
-# NOTE: smaller blocks have lower accuracy. more accumlation error probably 128 * 128 seems good but leads to oom. 64 * 64 has accumlation errors but no oom.
-def attention_prefill_backward_triton_impl(
-    do,
-    q,
-    k,
-    v,
-    o,
-    softmax_lse,
-    dq,
-    dk,
-    dv,
-    sm_scale: float,
-    alibi_slopes,
-    causal,
-    layout: str,
-    cu_seqlens_q,
-    cu_seqlens_k,
-    max_seqlen_q: int,
-    max_seqlen_k: int,
-    dropout_p,
-    dropout_philox_seed,
-    dropout_philox_offset,
-    use_exp2: bool,
-    sequence_parallel = True,
-):
-    if DEBUG:
-        print()
-        print("attention_prefill_backward_triton_new_impl")
-        print("do:", do, do.shape)
-        print("q:", q, q.shape)
-        print("k:", k, k.shape)
-        print("v:", v, v.shape)
-        print("o:", o, o.shape)
-        print("softmax_lse:", softmax_lse, softmax_lse.shape)
-        print("dq:", dq, dq.shape if dq is not None else None)
-        print("dk:", dk, dk.shape if dk is not None else None)
-        print("dv:", dv, dv.shape if dv is not None else None)
-        print("sm_scale:", sm_scale)
-        print("alibi_slopes:", alibi_slopes)
-        print("causal:", causal)
-        print("layout:", layout)
-        print("cu_seqlens_q:", cu_seqlens_q)
-        print("cu_seqlens_k:", cu_seqlens_k)
-        print("max_seqlen_q:", max_seqlen_q)
-        print("max_seqlen_k:", max_seqlen_k)
-        print("dropout_p:", dropout_p)
-        print("dropout_philox_seed:", dropout_philox_seed)
-        print("dropout_philox_offset:", dropout_philox_offset)
-        print("use_exp2:", use_exp2)
-        print("sequence_parallel:", sequence_parallel)
-
-    # make contigious
-    q = q.contiguous()
-    k = k.contiguous()
-    v = v.contiguous()
-    softmax_lse = softmax_lse.contiguous()
-
-    # get strides and shape
-    batch, nheads_q, nheads_k, head_size, max_seqlen_q, max_seqlen_k = get_shape_from_layout(q, k, layout, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k)
-    q_strides, k_strides, v_strides, o_strides = get_strides_from_layout(q, k, v, o, layout)
-    stride_qz, stride_qh, stride_qm, stride_qk =  q_strides
-    stride_kz, stride_kh, stride_kn, stride_kk = k_strides
-    stride_vz, stride_vh, stride_vn, stride_vk = v_strides
-    stride_oz, stride_oh, stride_om, stride_ok = o_strides
-    batch_headsize = batch * nheads_q
-    is_varlen = layout == "thd"
-
-    # FIXME: some configs lead to oom for some reason when using 64 x 64 blocks
-    if max_seqlen_q <= 32 or max_seqlen_k <= 32:
-        BLOCK_M = 32 
-        BLOCK_N = 32
-    else:
-        BLOCK_M = 64 
-        BLOCK_N = 64
-    num_warps = 4 # NOTE: originial is 8. changing it to 1 caused issues be careful
-    num_stages = 1
-    waves_per_eu = 1
-
-    # divide up the problem
-    num_blocks_m = triton.cdiv(max_seqlen_q, BLOCK_M)
-    num_blocks_n = triton.cdiv(max_seqlen_k, BLOCK_N)
-
-    # get closest power of 2 over or equal to 32.
-    padded_d_model = 1 << (head_size - 1).bit_length()
-    padded_d_model = max(padded_d_model, 16)
-    BLOCK_DMODEL = padded_d_model
-    ACTUAL_BLOCK_DMODEL = head_size
-
-    do = do.contiguous()
-    # NOTE: we might need to copy the output tensor if they are not continuous or have other issues
-    copy_back = {"dq": False, "dk": False, "dv": False}
-
-    # deal with dq
-    if dq is None:
-        if sequence_parallel:
-            dq = torch.zeros((num_blocks_n,) + q.shape, device=q.device, dtype=q.dtype)
-        else:
-            dq = torch.zeros(q.shape, device=q.device, dtype=q.dtype)
-    else:
-        dq_og = dq
-        if (not dq.is_contiguous()):
-            dq = dq.contiguous()
-            copy_back["dq"] = True
-
-        if sequence_parallel:
-            dq = torch.zeros((num_blocks_n,) + q.shape, device=q.device, dtype=q.dtype)
-            copy_back["dq"] = True
-        else:
-            # NOTE: the kernel does inplace accumlation so dq has to be zeros. This avoids the case where we are passed empty dq and it is not all zeros
-            dq.zero_()
-    stride_dq_all = dq.stride()[0]
-
-    # deal with dk, dv
-    if (dk is None) or (dv is None):
-        dk = torch.empty_like(k)
-        dv = torch.empty_like(v)
-    else:
-        if (not dk.is_contiguous()):
-            dk_og = dk
-            dk = dk.contiguous()
-            copy_back["dk"] = True
-
-        if (not dv.is_contiguous()):
-            dv_og = dv
-            dv = dv.contiguous()
-            copy_back["dv"] = True
-
-    if DEBUG:
-        print("copy_back:", copy_back)
-
-    # assert contigious
-    assert do.is_contiguous()
-    assert q.is_contiguous()
-    assert k.is_contiguous()
-    assert v.is_contiguous()
-    assert o.is_contiguous()
-    assert softmax_lse.is_contiguous()
-
-    # init delta
-    delta = torch.empty_like(softmax_lse)
-    if is_varlen:
-        stride_deltam, stride_deltah = delta.stride()
-        stride_deltaz = 0
-    else:
-        stride_deltaz, stride_deltah, stride_deltam = delta.stride()
-
-    _bwd_preprocess_use_o[(num_blocks_m, batch_headsize)](
-        o,
-        do,
-        delta,
-        stride_oz, stride_oh, stride_om, stride_ok,
-        stride_oz, stride_oh, stride_om, stride_ok,
-        stride_deltaz, stride_deltah, stride_deltam,
-        cu_seqlens_q,
-        cu_seqlens_k,
-        max_seqlen_q,
-        max_seqlen_k,
-        BLOCK_M=BLOCK_M,
-        BLOCK_DMODEL=BLOCK_DMODEL,
-        ACTUAL_BLOCK_DMODEL=ACTUAL_BLOCK_DMODEL,
-        N_CTX_Q=max_seqlen_q,
-        Z=batch,
-        H=nheads_q,
-        IS_VARLEN=is_varlen
-    )
-
-    if DEBUG:
-        print("_bwd_kernel inputs")
-        print("do:", do, do.shape)
-        print("q:", q, q.shape)
-        print("k:", k, k.shape)
-        print("v:", v, v.shape)
-        print("sm_scale", sm_scale)
-        print("o:", o, o.shape)
-        print("dq:", dq, dq.shape)
-        print("dk:", dk, dk.shape)
-        print("dv:", dv, dv.shape)
-        print("L:", softmax_lse, softmax_lse.shape)
-        print("delta:", delta, delta.shape)
-        print("stride_qz, stride_qh, stride_qm, stride_qk:",  stride_qz, stride_qh, stride_qm, stride_qk)
-        print("stride_kz, stride_kh, stride_kn, stride_kk:",  stride_kz, stride_kh, stride_kn, stride_kk)
-        print("stride_vz, stride_vh, stride_vn, stride_vk:",  stride_vz, stride_vh, stride_vn, stride_vk)
-        print("batch_q:", batch)
-        print("heads_q:",nheads_q)
-        print("max_seqlen_q:",max_seqlen_q)
-        print("max_seqlen_k:",max_seqlen_k)
-        print("BLOCK_M:",BLOCK_M)
-        print("BLOCK_N:",BLOCK_M)
-        print("BLOCK_DMODEL:",BLOCK_DMODEL)
-        print("ACTUAL_BLOCK_DMODEL:",ACTUAL_BLOCK_DMODEL)
-        print("SEQUENCE_PARALLEL:",sequence_parallel)
-        print("CAUSAL:",causal)
-        print("num_warps:",num_warps)
-        print("num_stages:", num_stages)
-        print("USE_EXP2:", use_exp2)
-        print("num_blocks_m:", num_blocks_m)
-        print("num_blocks_n:", num_blocks_n)
-
-    _bwd_kernel[(batch_headsize, num_blocks_n if sequence_parallel else 1)](
-        q,
-        k,
-        v,
-        sm_scale,
-        o,
-        do,
-        dq,
-        dk,
-        dv,
-        softmax_lse,
-        delta,
-        stride_dq_all,
-        stride_qz, stride_qh, stride_qm, stride_qk,
-        stride_kz, stride_kh, stride_kn, stride_kk,
-        stride_vz, stride_vh, stride_vn, stride_vk,
-        stride_deltaz, stride_deltah, stride_deltam,
-        batch,
-        nheads_q,
-        dropout_p,
-        dropout_philox_seed,
-        dropout_philox_offset,
-        num_blocks_m,
-        num_blocks_n,
-        cu_seqlens_q,
-        cu_seqlens_k,
-        max_seqlen_q,
-        max_seqlen_k,
-        BLOCK_M=BLOCK_M,
-        BLOCK_N=BLOCK_N,
-        BLOCK_DMODEL=BLOCK_DMODEL,
-        ACTUAL_BLOCK_DMODEL=ACTUAL_BLOCK_DMODEL,
-        SEQUENCE_PARALLEL=sequence_parallel,
-        CAUSAL=causal,
-        USE_EXP2=use_exp2,
-        num_warps=num_warps,
-        num_stages=num_stages,
-        waves_per_eu = waves_per_eu,
-        IS_VARLEN=is_varlen,
-        ENABLE_DROPOUT=dropout_p >= 0.0,
-    )
-
-    if DEBUG:
-        print("_bwd_kernel outputs")
-        print("dq:", dq, dq.shape)
-        print("dk:", dk, dk.shape)
-        print("dv:", dv, dv.shape)
-        print("delta:", delta, delta.shape)
-
-    if sequence_parallel:
-        dq = dq.sum(dim=0)
-
-    if DEBUG:
-        print("attention_prefill_backward_triton_new_impl outputs")
-        print("dq:", dq, dq.shape)
-        print("dk:", dk, dk.shape)
-        print("dv:", dv, dv.shape)
-        print("delta:", delta, delta.shape)
-        print("copy_back:", copy_back)
-
-    if copy_back["dq"]:
-        dq_og.copy_(dq)
-        dq = dq_og
-    if copy_back["dk"]:
-        dk_og.copy_(dk)
-        dk = dk_og
-    if copy_back["dv"]:
-        dv_og.copy_(dv)
-        dv = dv_og
-
-    return dq, dk, dv, delta, None, None
diff --git a/flash_attn/flash_attn_triton_amd/fwd_prefill.py b/flash_attn/flash_attn_triton_amd/fwd_prefill.py
index 72e9479de..a95904320 100644
--- a/flash_attn/flash_attn_triton_amd/fwd_prefill.py
+++ b/flash_attn/flash_attn_triton_amd/fwd_prefill.py
@@ -1,33 +1,7 @@
 import torch
 import triton
 import triton.language as tl
-from .utils import get_shape_from_layout, get_strides_from_layout, is_cdna, is_rdna, DEBUG, AUTOTUNE
-
-@triton.jit
-def cdiv_fn(x, y):
-    return (x + y - 1) // y
-
-@triton.jit
-def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride):
-    # tl.device_print('fwd_philox_offset:', philox_offset)
-    ms = tl.arange(0, m)
-    ns = tl.arange(0, n)
-    return philox_offset + ms[:, None] * stride + ns[None, :]
-
-
-@triton.jit
-def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride):
-    rng_offsets = dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride).to(tl.uint32)
-    # TODO: use tl.randint for better performance
-    return tl.rand(philox_seed, rng_offsets)
-
-
-@triton.jit
-def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride):
-    rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride)
-    rng_keep = rng_output > dropout_p
-    return rng_keep
-
+from .utils import get_shape_from_layout, get_strides_from_layout, is_cdna, is_rdna, DEBUG, AUTOTUNE, write_dropout_mask
 
 # Convenience function to load with optional boundary checks.
 # "First" is the major dim, "second" is the minor dim.
@@ -82,14 +56,15 @@ def compute_alibi_block(alibi_slope, seqlen_q, seqlen_k, offs_m, offs_n, transpo
 
 
 @triton.jit
-def _attn_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, bias_ptrs, stride_kn, stride_vk, stride_bn, start_m,
-                    actual_seqlen_k, actual_seqlen_q, dropout_p, philox_seed, batch_philox_offset, exp_scores_ptrs,
-                    block_min, block_max, offs_n_causal, masked_blocks, n_extra_tokens, alibi_slope, score_ptrs, scores_scaled_shifted_ptrs,
+def _attn_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, bias_ptrs, stride_kn, stride_vk, stride_bn, stride_sn, start_m,
+                    actual_seqlen_k, actual_seqlen_q, dropout_p, philox_seed, philox_ptrs, sd_mask_ptrs, dropout_mask_ptrs,
+                    block_min, block_max, offs_n_causal, masked_blocks, n_extra_tokens, alibi_slope,
                     IS_CAUSAL: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr,
                     OFFS_M: tl.constexpr, OFFS_N: tl.constexpr, PRE_LOAD_V: tl.constexpr, MASK_STEPS: tl.constexpr,
                     ENABLE_DROPOUT: tl.constexpr, PADDED_HEAD: tl.constexpr,
                     ACTUAL_BLOCK_DMODEL: tl.constexpr, SM_SCALE: tl.constexpr, USE_EXP2: tl.constexpr,
                     RETURN_SCORES: tl.constexpr):
+    DEBUG_DROPOUT = False
     if USE_EXP2:
         RCP_LN2: tl.constexpr = 1.4426950408889634
     
@@ -125,9 +100,6 @@ def _attn_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, bias_ptrs, stride_kn, stri
         # -- compute qk ----
         qk += tl.dot(q, k)
         qk_scaled =  qk * SM_SCALE
-        if RETURN_SCORES:
-            score_mask = (OFFS_M[:, None] < actual_seqlen_q) & ((start_n + tl.arange(0, BLOCK_N))[None, :] < actual_seqlen_k)
-            tl.store(score_ptrs, qk_scaled, mask=score_mask)
 
         if IS_CAUSAL:
             causal_boundary = start_n + offs_n_causal
@@ -150,10 +122,6 @@ def _attn_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, bias_ptrs, stride_kn, stri
 
         # scale and subtract max
         q_shifted = qk_scaled - m_ij[:, None]
-        if RETURN_SCORES: 
-            # NOTE: the returned score is not the same as the reference because we need to adjust as we find new maxes per block. We are not doing that
-            scores_scaled_shifted_mask = (OFFS_M[:, None] < actual_seqlen_q) & ((start_n + tl.arange(0, BLOCK_N))[None, :] < actual_seqlen_k)
-            tl.store(scores_scaled_shifted_ptrs, q_shifted, mask=scores_scaled_shifted_mask)
         
         # Compute scaled QK and softmax probabilities
         if USE_EXP2:
@@ -164,17 +132,19 @@ def _attn_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, bias_ptrs, stride_kn, stri
         # CAVEAT: Must update l_ij before applying dropout
         l_ij = tl.sum(p, 1)
         if ENABLE_DROPOUT:
-            philox_offset = batch_philox_offset + start_m * BLOCK_M * actual_seqlen_k + start_n
-            keep = dropout_mask(philox_seed, philox_offset, dropout_p, BLOCK_M, BLOCK_N, actual_seqlen_k)
+            rng_output = tl.rand(philox_seed, philox_ptrs)  # TODO: use tl.randint for better performance
+            dropout_mask = rng_output > dropout_p
             if RETURN_SCORES:
                 # NOTE: the returned score is not the same as the reference because we need to adjust as we find new maxes per block. We are not doing that
-                exp_score_mask = (OFFS_M[:, None] < actual_seqlen_q) & ((start_n + tl.arange(0, BLOCK_N))[None, :] < actual_seqlen_k)
-                tl.store(exp_scores_ptrs, tl.where(keep, p, -p), mask=exp_score_mask)
-            p = tl.where(keep, p, 0.0)
+                p_mask = (OFFS_M[:, None] < actual_seqlen_q) & ((start_n + tl.arange(0, BLOCK_N))[None, :] < actual_seqlen_k)
+                tl.store(sd_mask_ptrs, tl.where(dropout_mask, p, -p), mask=p_mask)
+                if DEBUG_DROPOUT:
+                    tl.store(dropout_mask_ptrs, dropout_mask, mask=p_mask)
+            p = tl.where(dropout_mask, p, 0.0)
         elif RETURN_SCORES:
             # NOTE: the returned score is not the same as the reference because we need to adjust as we find new maxes per block. We are not doing that
-            exp_score_mask = (OFFS_M[:, None] < actual_seqlen_q) & ((start_n + tl.arange(0, BLOCK_N))[None, :] < actual_seqlen_k)
-            tl.store(exp_scores_ptrs, p, mask=exp_score_mask)
+            p_mask = (OFFS_M[:, None] < actual_seqlen_q) & ((start_n + tl.arange(0, BLOCK_N))[None, :] < actual_seqlen_k)
+            tl.store(sd_mask_ptrs, p, mask=p_mask)
         
         # -- update output accumulator --
         # alpha is an adjustment factor for acc and li as we loop and find new maxes
@@ -197,9 +167,11 @@ def _attn_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, bias_ptrs, stride_kn, stri
         if bias_ptrs is not None:
             bias_ptrs += BLOCK_N * stride_bn
         if RETURN_SCORES:
-            score_ptrs += BLOCK_N
-            scores_scaled_shifted_ptrs += BLOCK_N
-            exp_scores_ptrs += BLOCK_N
+            sd_mask_ptrs += BLOCK_N * stride_sn
+        
+        if ENABLE_DROPOUT:
+            dropout_mask_ptrs += BLOCK_N * stride_sn
+            philox_ptrs += BLOCK_N * stride_sn
     return acc, l_i, m_i
 
 
@@ -282,7 +254,7 @@ def attn_fwd(Q, K, V, bias, SM_SCALE: tl.constexpr, LSE, Out, stride_qz, stride_
              stride_kz, stride_kh, stride_kn, stride_kk, stride_vz, stride_vh, stride_vk, stride_vn, 
              stride_oz, stride_oh, stride_om, stride_on, stride_bz, stride_bh, stride_bm, stride_bn, stride_az, stride_ah,
              stride_sz, stride_sh, stride_sm, stride_sn, stride_lse_z, stride_lse_h, stride_lse_m, cu_seqlens_q, cu_seqlens_k,
-             dropout_p, philox_seed, philox_offset_base, scores, scores_scaled_shifted, exp_scores, alibi_slopes,  HQ: tl.constexpr,
+             dropout_p, philox_seed, philox_offset_base, sd_mask, dropout_mask, alibi_slopes,  HQ: tl.constexpr,
              HK: tl.constexpr, ACTUAL_BLOCK_DMODEL: tl.constexpr, MAX_SEQLENS_Q: tl.constexpr,
              MAX_SEQLENS_K: tl.constexpr, VARLEN: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_M: tl.constexpr,
              BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, PRE_LOAD_V: tl.constexpr, USE_BIAS: tl.constexpr,
@@ -318,14 +290,14 @@ def attn_fwd(Q, K, V, bias, SM_SCALE: tl.constexpr, LSE, Out, stride_qz, stride_
     # inf written to LSE. We don't need to do any GEMMs in this case.
     # This block of code determines what N is, and if this WG is operating
     # on those M rows.
-    n_blocks = cdiv_fn(seqlen_k, BLOCK_N)
+    n_blocks = tl.cdiv(seqlen_k, BLOCK_N)
     if (IS_CAUSAL):
         # If seqlen_q == seqlen_k, the attn scores are a square matrix.
         # If seqlen_q != seqlen_k, attn scores are rectangular which means
         # the causal mask boundary is bottom right aligned, and ends at either
         # the top edge (seqlen_q < seqlen_k) or left edge.
         # This captures the decrease in n_blocks if we have a rectangular attn matrix
-        n_blocks_seqlen = cdiv_fn((start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N)
+        n_blocks_seqlen = tl.cdiv((start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N)
         # This is what adjusts the block_max for the current WG, only
         # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks
         n_blocks = min(n_blocks, n_blocks_seqlen)
@@ -392,24 +364,19 @@ def attn_fwd(Q, K, V, bias, SM_SCALE: tl.constexpr, LSE, Out, stride_qz, stride_
         alibi_slope = None
 
     if RETURN_SCORES:
-        scores_offset = scores + off_z * stride_sz + off_h_q * stride_sh # + cu_seqlens_q_start * stride_sm
-        score_ptrs = scores_offset + offs_m[:, None] * stride_sm + offs_n[None, :] * stride_sn
-
-        scores_scaled_shifted_offset = scores_scaled_shifted + off_z * stride_sz + off_h_q * stride_sh # + cu_seqlens_q_start * stride_sm
-        scores_scaled_shifted_ptrs = scores_scaled_shifted_offset + offs_m[:, None] * stride_sm + offs_n[None, :] * stride_sn
-    
-        exp_scores_offset = exp_scores + off_z * stride_sz + off_h_q * stride_sh # + cu_seqlens_q_start * stride_sm
-        exp_scores_ptrs = exp_scores_offset + offs_m[:, None] * stride_sm + offs_n[None, :] * stride_sn
+        sd_mask_offset = sd_mask + off_z * stride_sz + off_h_q * stride_sh #+ cu_seqlens_q_start * stride_sm
+        sd_mask_ptrs = sd_mask_offset + offs_m[:, None] * stride_sm + offs_n[None, :] * stride_sn
     else:
-        score_ptrs = None
-        scores_scaled_shifted_ptrs = None
-        exp_scores_ptrs = None
+        sd_mask_ptrs = None
 
     if ENABLE_DROPOUT:
-        off_hz = off_z * HQ + off_h_q
-        batch_philox_offset = philox_offset_base + off_hz * MAX_SEQLENS_Q * MAX_SEQLENS_K
+        dropout_mask_offset = dropout_mask + off_z * stride_sz + off_h_q * stride_sh #+ cu_seqlens_q_start * stride_sm
+        dropout_mask_ptrs = dropout_mask_offset + offs_m[:, None] * stride_sm + offs_n[None, :] * stride_sn
+        batch_philox_offset = philox_offset_base + off_z * stride_sz + off_h_q * stride_sh #+ cu_seqlens_q_start * stride_sm
+        philox_ptrs = batch_philox_offset + offs_m[:, None] * stride_sm + offs_n[None, :] * stride_sn
     else:
-        batch_philox_offset = 0
+        dropout_mask_ptrs = None
+        philox_ptrs = 0
     # initialize pointer to m and l
     m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
     l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32)
@@ -440,11 +407,11 @@ def attn_fwd(Q, K, V, bias, SM_SCALE: tl.constexpr, LSE, Out, stride_qz, stride_
     # value because there is no masking. Similarly we do not need padding.
     if n_full_blocks > 0:
         block_max = (n_blocks - masked_blocks) * BLOCK_N
-        acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, bias_ptrs, stride_kn, stride_vk, stride_bn,
-                                        start_m, seqlen_k, seqlen_q, dropout_p, philox_seed, batch_philox_offset,
-                                        exp_scores_ptrs,
+        acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, bias_ptrs, stride_kn, stride_vk, stride_bn, stride_sn,
+                                        start_m, seqlen_k, seqlen_q, dropout_p, philox_seed, philox_ptrs,
+                                        sd_mask_ptrs, dropout_mask_ptrs,
                                         # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _
-                                        block_min, block_max, 0, 0, 0, alibi_slope, score_ptrs, scores_scaled_shifted_ptrs,
+                                        block_min, block_max, 0, 0, 0, alibi_slope,
                                         # IS_CAUSAL, ....
                                         False, BLOCK_M, BLOCK_DMODEL, BLOCK_N, offs_m, offs_n,
                                         # _, MASK_STEPS, ...
@@ -465,13 +432,14 @@ def attn_fwd(Q, K, V, bias, SM_SCALE: tl.constexpr, LSE, Out, stride_qz, stride_
         if USE_BIAS:
             bias_ptrs += n_full_blocks * BLOCK_N * stride_bn
         if RETURN_SCORES:
-            score_ptrs += n_full_blocks * BLOCK_N
-            scores_scaled_shifted_ptrs += n_full_blocks * BLOCK_N
-            exp_scores_ptrs += n_full_blocks * BLOCK_N
-        acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, bias_ptrs, stride_kn, stride_vk, stride_bn,
-                                        start_m, seqlen_k, seqlen_q, dropout_p, philox_seed, batch_philox_offset,
-                                        exp_scores_ptrs, block_min, block_max, offs_n_causal, masked_blocks,
-                                        n_extra_tokens, alibi_slope, score_ptrs, scores_scaled_shifted_ptrs,
+            sd_mask_ptrs += n_full_blocks * BLOCK_N * stride_sn
+        if ENABLE_DROPOUT:
+            dropout_mask_ptrs += n_full_blocks * BLOCK_N * stride_sn
+            philox_ptrs += n_full_blocks * BLOCK_N * stride_sn
+        acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, bias_ptrs, stride_kn, stride_vk, stride_bn, stride_sn,
+                                        start_m, seqlen_k, seqlen_q, dropout_p, philox_seed, philox_ptrs,
+                                        sd_mask_ptrs, dropout_mask_ptrs, block_min, block_max, offs_n_causal, masked_blocks,
+                                        n_extra_tokens, alibi_slope,
                                         IS_CAUSAL, BLOCK_M, BLOCK_DMODEL, BLOCK_N, offs_m, offs_n,
                                         # _, MASK_STEPS, ...
                                         PRE_LOAD_V, True, ENABLE_DROPOUT, PADDED_HEAD,
@@ -481,7 +449,8 @@ def attn_fwd(Q, K, V, bias, SM_SCALE: tl.constexpr, LSE, Out, stride_qz, stride_
     l_recip = 1 / l_i[:, None]
     acc = acc * l_recip
     if ENABLE_DROPOUT:
-        acc = acc / (1 - dropout_p)
+        dropout_scale = 1 / (1 - dropout_p)
+        acc = acc * dropout_scale
     # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M,
     # then we have one block with a row of all NaNs which come from computing
     # softmax over a row of all -infs (-inf - inf = NaN). We check for that here
@@ -547,13 +516,18 @@ def attention_prefill_forward_triton_impl(
                                         alibi_slopes,
                                         causal,
                                         bias,
-                                        dropout_p,
                                         layout,
+                                        # varlen
                                         cu_seqlens_q, 
                                         cu_seqlens_k,
                                         max_seqlens_q, 
                                         max_seqlens_k, 
-                                        return_scores, 
+                                        # dropout
+                                        dropout_p,
+                                        philox_seed,
+                                        philox_offset,
+                                        # misc
+                                        return_softmax,
                                         use_exp2):
 
     if DEBUG:
@@ -567,13 +541,15 @@ def attention_prefill_forward_triton_impl(
         print("alibi_slopes:", alibi_slopes)
         print("causal:", causal)
         print("bias:", bias)
-        print("dropout_p:", dropout_p)
         print("layout:", layout)
         print("cu_seqlens_q:", cu_seqlens_q)
         print("cu_seqlens_k:", cu_seqlens_k)
         print("max_seqlens_q:", max_seqlens_q)
         print("max_seqlens_k:", max_seqlens_k)
-        print("return_scores:", return_scores)
+        print("dropout_p:", dropout_p)
+        print("philox_seed:", philox_seed)
+        print("philox_offset:", philox_offset)
+        print("return_scores:", return_softmax)
         print("use_exp2:", use_exp2)
 
     # check if varlen
@@ -586,7 +562,6 @@ def attention_prefill_forward_triton_impl(
     batch, nheads_q, nheads_k, head_size, seqlen_q, seqlen_k = get_shape_from_layout(q, k, layout, cu_seqlens_q, cu_seqlens_k, max_seqlens_q, max_seqlens_k)
     q_strides, k_strides, v_strides, o_strides = get_strides_from_layout(q, k, v, o, layout)
 
-
     # Get closest power of 2 over or equal to 32.
     padded_d_model = 1 << (head_size - 1).bit_length()
     # Smallest head_dim supported is 16. If smaller, the tile in the
@@ -595,26 +570,21 @@ def attention_prefill_forward_triton_impl(
 
     grid = lambda META: (triton.cdiv(max_seqlens_q, META['BLOCK_M']), nheads_q, batch)
 
-    if return_scores:
-        scores = torch.zeros((batch, nheads_q, max_seqlens_q, max_seqlens_k), device=q.device,
-                                        dtype=torch.float32)
-        scores_scaled_shifted = torch.zeros((batch, nheads_q, max_seqlens_q, max_seqlens_k), device=q.device,
-                                        dtype=torch.float32)
-        scores_strides = (scores.stride(0), scores.stride(1), scores.stride(2), scores.stride(3))
-    else:
-        scores = None
-        scores_scaled_shifted = None
-        scores_strides = (0, 0 , 0 , 0)
-
-    # exp_scores is used to validate dropout behavior vs the PyTorch SDPA math backend reference.  We zero this out
+    # sd_mask is used to validate dropout behavior vs the PyTorch SDPA math backend reference.  We zero this out
     # to give a consistent starting point and then populate it with the output of softmax with the sign bit set according
     # to the dropout mask. The resulting return allows this mask to be fed into the reference implementation for testing
-    # only.  This return holds no useful output aside from debugging.
-    if return_scores:
-        exp_scores = torch.zeros((batch, nheads_q, max_seqlens_q, max_seqlens_k), device=q.device,
+    # only. This return holds no useful output aside from debugging.
+    use_dropout = (dropout_p > 0.0)
+    if use_dropout or return_softmax:
+        sd_mask = torch.zeros((batch, nheads_q, max_seqlens_q, max_seqlens_k), device=q.device,
+                                        dtype=torch.float32)
+        dropout_mask = torch.zeros((batch, nheads_q, max_seqlens_q, max_seqlens_k), device=q.device,
                                         dtype=torch.float32)
+        scores_strides = (sd_mask.stride(0), sd_mask.stride(1), sd_mask.stride(2), sd_mask.stride(3))
     else:
-        exp_scores = None
+        sd_mask = None
+        dropout_mask = None
+        scores_strides = (0, 0, 0, 0)
 
     # stores LSE the log of the normalization constant / sum of expoential score(unnormalzied probablities)
     if is_varlen:
@@ -625,10 +595,6 @@ def attention_prefill_forward_triton_impl(
         softmax_lse = torch.empty((batch, nheads_q, max_seqlens_q), device=q.device, dtype=torch.float32)
         stride_lse_z, stride_lse_h, stride_lse_m = softmax_lse.stride()
 
-    # Seed the RNG so we get reproducible results for testing.
-    philox_seed = 0x1BF58
-    philox_offset = 0x1D4B49
-
     if bias is not None:
         bias_strides = (bias.stride(0), bias.stride(1),bias.stride(2),
                         bias.stride(3))
@@ -643,19 +609,22 @@ def attention_prefill_forward_triton_impl(
 
     attn_fwd[grid](q, k, v, bias, sm_scale, softmax_lse, o, *q_strides, *k_strides, *v_strides, *o_strides,
                     *bias_strides, *alibi_strides, *scores_strides, stride_lse_z, stride_lse_h, stride_lse_m, cu_seqlens_q, cu_seqlens_k,
-                    dropout_p=dropout_p, philox_seed=philox_seed, philox_offset_base=philox_offset, scores=scores, 
-                    scores_scaled_shifted=scores_scaled_shifted, exp_scores=exp_scores, alibi_slopes=alibi_slopes, 
+                    dropout_p=dropout_p, philox_seed=philox_seed, philox_offset_base=philox_offset, sd_mask=sd_mask, dropout_mask=dropout_mask, alibi_slopes=alibi_slopes, 
                     HQ=nheads_q, HK=nheads_k, ACTUAL_BLOCK_DMODEL=head_size, MAX_SEQLENS_Q=max_seqlens_q,
                     MAX_SEQLENS_K=max_seqlens_k, IS_CAUSAL=causal, VARLEN=is_varlen,
                     BLOCK_DMODEL=padded_d_model, USE_BIAS=False if bias is None else True,
                     USE_ALIBI=False if alibi_slopes is None else True, ENABLE_DROPOUT=dropout_p
-                    > 0.0, USE_EXP2=use_exp2, RETURN_SCORES=return_scores)
+                    > 0.0, USE_EXP2=use_exp2, RETURN_SCORES=return_softmax)
 
     if DEBUG:
         print()
         print("attention_prefill_forward_triton_impl outputs")
         print("o:", o, o.shape)
         print("softmax_lse:", softmax_lse, softmax_lse.shape)
-        print("exp_scores:", exp_scores, exp_scores.shape if exp_scores is not None else None)
+        print("sd_mask:", sd_mask, sd_mask.shape if sd_mask is not None else None)
+        if use_dropout:
+            print("dropout_mask:", dropout_mask, dropout_mask.shape if dropout_mask is not None else None)
+            print("dropout_fraction fwd:", 1.0 - (dropout_mask.sum()/ dropout_mask.numel()).item())
+            write_dropout_mask(dropout_mask, "dropout_mask_fwd")
 
-    return o, softmax_lse, exp_scores, grid, head_size, philox_seed, philox_offset, scores, scores_scaled_shifted
+    return o, softmax_lse, sd_mask.to(o.dtype) if return_softmax else None 
diff --git a/flash_attn/flash_attn_triton_amd/fwd_ref.py b/flash_attn/flash_attn_triton_amd/fwd_ref.py
index 9d860d7da..909996654 100644
--- a/flash_attn/flash_attn_triton_amd/fwd_ref.py
+++ b/flash_attn/flash_attn_triton_amd/fwd_ref.py
@@ -2,9 +2,9 @@
 import math
 from .utils import DEBUG
 
-DEBUG_CORE = DEBUG and False
+DEBUG_CORE = False
 
-def attention_forward_core_ref_impl(q, k, v, sm_scale, causal, use_exp2):
+def attention_forward_core_ref_impl(q, k, v, sm_scale, causal, dropout_p, philox_seed, philox_offset, use_exp2):
     if DEBUG_CORE:
         print()
         print("attention_forward_core_ref_impl")
@@ -13,10 +13,18 @@ def attention_forward_core_ref_impl(q, k, v, sm_scale, causal, use_exp2):
         print("v:", v, v.shape)
         print("sm_scale:", sm_scale)
         print("causal:", causal)
+        print("dropout_p:", dropout_p)
+        print("philox_seed:", philox_seed)
+        print("philox_offset:", philox_offset)
         print("use_exp2:", use_exp2)
+
+    # cast to float32
+    q = q.to(torch.float32)
+    k = k.to(torch.float32)
+    v = v.to(torch.float32)
     
     # Compute attention scores
-    attention_scores = torch.matmul(q.to(torch.float32), k.transpose(-2, -1).to(torch.float32))
+    attention_scores = torch.matmul(q, k.transpose(-2, -1))
     if DEBUG_CORE:
         print("attention_scores:", attention_scores, attention_scores.shape)
 
@@ -32,16 +40,15 @@ def attention_forward_core_ref_impl(q, k, v, sm_scale, causal, use_exp2):
         col_idx = torch.arange(L_k, device=q.device).unsqueeze(0)
         col_offset = L_q-L_k
         causal_mask = row_idx >= (col_offset + col_idx)
-        if DEBUG:
+        if DEBUG_CORE:
             print("causal_mask:", causal_mask)
         # set -inf to places the causal mask is false
         attention_scaled_scores = attention_scaled_scores.masked_fill(
              torch.logical_not(causal_mask.unsqueeze(0)), float('-inf')
         )
-        if DEBUG:
+        if DEBUG_CORE:
             print("attention_scaled_scores after causal:", attention_scaled_scores, attention_scaled_scores.shape)
 
-
     # Compute max for numerical stability
     max_scores = torch.max(attention_scaled_scores, dim=-1, keepdim=True)[0]
     if DEBUG_CORE:
@@ -84,11 +91,28 @@ def attention_forward_core_ref_impl(q, k, v, sm_scale, causal, use_exp2):
         print("sum_exp_scores:", sum_exp_scores, sum_exp_scores.shape)
 
     # Compute softmax probabilities
-    softmax = exp_scores / sum_exp_scores
+    p = exp_scores / sum_exp_scores
 
     if DEBUG_CORE:
-        print("softmax:", softmax, softmax.shape)
-
+        print("softmax:", p, p.shape)
+        
+    # apply dropout if specified
+    if dropout_p > 0.0:
+        rand_vals = torch.rand(p.shape, generator=torch.Generator(device=p.device).manual_seed(philox_seed), device=p.device, dtype=p.dtype)
+        dropout_mask, dropout_scale = rand_vals > dropout_p,  (1.0 / (1 - dropout_p))
+        if DEBUG_CORE:
+            print("dropout_scale:", dropout_scale)
+            print("dropout_mask:", dropout_mask)
+        # Apply dropout mask and scale
+        # Set -1 for dropped positions and 1 for kept positions in exp_scores 
+        sd_mask = torch.where(dropout_mask, exp_scores, -exp_scores)
+        p = torch.where(dropout_mask, p , torch.zeros_like(p)) * dropout_scale
+        if DEBUG_CORE:
+            print("softmax after dropout:", p)
+            print("sd_mask:", sd_mask)
+    else:
+        sd_mask = exp_scores
+    
     # Compute log-sum-exp
     if use_exp2:
         LN2 = math.log(2)
@@ -105,13 +129,18 @@ def attention_forward_core_ref_impl(q, k, v, sm_scale, causal, use_exp2):
         print("softmax_lse:", softmax_lse, softmax_lse.shape)
 
     # Compute output
-    o = torch.matmul(softmax, v.to(torch.float32)).to(torch.float16)
+    o = torch.matmul(p, v)
     if DEBUG_CORE:
         print("o:", o, o.shape)
 
-    return o, softmax_lse, exp_scores, softmax, attention_shifted_scaled_scores, attention_scaled_scores, attention_scores
+    # cast back to original dtype
+    o = o.to(torch.float16)
+    # softmax_lse = softmax_lse.to(torch.float16) # NOTE: if you cast lse to fp16 it cause accuracy issues. keep fp32
+    sd_mask = sd_mask.to(torch.float16)
 
-def attention_vanilla_forward_pytorch_ref_impl(q, k, v, sm_scale, causal, layout, use_exp2):
+    return o, softmax_lse, sd_mask
+
+def attention_vanilla_forward_pytorch_ref_impl(q, k, v, sm_scale, causal, layout, dropout_p, philox_seed, philox_offset, use_exp2):
     """Compute reference output and softmax_lse using PyTorch's built-in function"""
 
     # Ensure the layout is 'bhsd'
@@ -146,8 +175,8 @@ def attention_vanilla_forward_pytorch_ref_impl(q, k, v, sm_scale, causal, layout
         v = v.reshape(batch_size * nheads_k, seq_len_k, head_dim)
 
     # Call the core attention function
-    o, softmax_lse, exp_scores, softmax, attention_shifted_scaled_scores, attention_scaled_scores, attention_scores = attention_forward_core_ref_impl(
-        q, k, v, sm_scale, causal, use_exp2
+    o, softmax_lse, sd_mask = attention_forward_core_ref_impl(
+        q, k, v, sm_scale, causal, dropout_p, philox_seed, philox_offset, use_exp2
     )
 
     if group_size != 1:
@@ -156,27 +185,19 @@ def attention_vanilla_forward_pytorch_ref_impl(q, k, v, sm_scale, causal, layout
         o = o.reshape(batch_size, nheads_q, seq_len_q, head_dim)
         softmax_lse = softmax_lse.reshape(batch_size, nheads_k, group_size, seq_len_q)
         softmax_lse = softmax_lse.reshape(batch_size, nheads_q, seq_len_q)
-        exp_scores = exp_scores.reshape(batch_size, nheads_k, group_size, seq_len_q, seq_len_k)
-        exp_scores = exp_scores.reshape(batch_size, nheads_q, seq_len_q, seq_len_k)
-        softmax = softmax.reshape(batch_size, nheads_k, group_size, seq_len_q, seq_len_k)
-        softmax = softmax.reshape(batch_size, nheads_q, seq_len_q, seq_len_k)
-        attention_scaled_scores = attention_scaled_scores.reshape(batch_size, nheads_k, group_size, seq_len_q, seq_len_k)
-        attention_scaled_scores = attention_scaled_scores.reshape(batch_size, nheads_q, seq_len_q, seq_len_k)
+        sd_mask = sd_mask.reshape(batch_size, nheads_k, group_size, seq_len_q, seq_len_k)
+        sd_mask = sd_mask.reshape(batch_size, nheads_q, seq_len_q, seq_len_k)
     else:
         # Standard case
         o = o.reshape(batch_size, nheads_q, seq_len_q, head_dim)
         softmax_lse = softmax_lse.reshape(batch_size, nheads_q, seq_len_q)
-        exp_scores = exp_scores.reshape(batch_size, nheads_q, seq_len_q, seq_len_k)
-        softmax = softmax.reshape(batch_size, nheads_q, seq_len_q, seq_len_k)
-        attention_shifted_scaled_scores = attention_shifted_scaled_scores.reshape(batch_size, nheads_q, seq_len_q, seq_len_k)
-        attention_scaled_scores = attention_scaled_scores.reshape(batch_size, nheads_q, seq_len_q, seq_len_k)
-        attention_scores = attention_scores.reshape(batch_size, nheads_q, seq_len_q, seq_len_k)
+        sd_mask = sd_mask.reshape(batch_size, nheads_q, seq_len_q, seq_len_k)
 
     # Restore original layout if necessary
     if layout == "bshd":
         o = o.transpose(1, 2)
 
-    return o, softmax_lse, exp_scores, softmax, attention_shifted_scaled_scores, attention_scaled_scores, attention_scores
+    return o, softmax_lse, sd_mask
 
 
 def attention_varlen_forward_pytorch_ref_impl(
@@ -190,6 +211,9 @@ def attention_varlen_forward_pytorch_ref_impl(
     cu_seqlens_k,
     max_seqlen_q,
     max_seqlen_k,
+    dropout_p, 
+    philox_seed, 
+    philox_offset,
     use_exp2
 ):
     # Ensure the layout is 'thd'
@@ -202,9 +226,11 @@ def attention_varlen_forward_pytorch_ref_impl(
 
     # Pre-allocate outputs
     total_L_q = q.shape[0]
+    total_L_k = k.shape[0]
 
     o = torch.empty((total_L_q, nheads_q, head_dim), dtype=q.dtype, device=q.device)
     softmax_lse = torch.empty((total_L_q, nheads_q), dtype=torch.float32, device=q.device)
+    sd_mask = torch.zeros((batch_size, nheads_q, max_seqlen_q, max_seqlen_k), dtype=torch.float32, device=q.device)
 
     # Compute group_size for MQA/GQA handling
     group_size = nheads_q // nheads_k
@@ -252,15 +278,7 @@ def attention_varlen_forward_pytorch_ref_impl(
             v_i = v_i.reshape(nheads_k, seqlen_k, head_dim)
 
         # Call the core attention function for this sequence
-        (
-            o_i,
-            softmax_lse_i,
-            exp_scores_i,
-            softmax_i,
-            attention_shifted_scaled_scores_i,
-            attention_scaled_scores_i,
-            attention_scores_i,
-        ) = attention_forward_core_ref_impl(q_i, k_i, v_i, sm_scale, causal, use_exp2)
+        o_i, softmax_lse_i, sd_mask_i = attention_forward_core_ref_impl(q_i, k_i, v_i, sm_scale, causal, dropout_p, philox_seed, philox_offset, use_exp2)
 
         # Reshape outputs back to original dimensions
         if group_size != 1:
@@ -275,23 +293,17 @@ def attention_varlen_forward_pytorch_ref_impl(
             # Outputs are already in the correct shape
             pass
 
-        # Convert back to 'thd' layout and float16
-        o_i = o_i.permute(1, 0, 2).to(torch.float16)  # [L_q_i, nheads_q, head_dim]
+        # Convert back to 'thd' layout
+        o_i = o_i.permute(1, 0, 2)  # [L_q_i, nheads_q, head_dim]
         softmax_lse_i = softmax_lse_i.permute(1, 0)  # [L_q_i, nheads_q]
+        sd_mask_i = sd_mask_i # [nheads_q, L_q_i, L_k_i]
 
         # Place outputs in pre-allocated tensors
         o[start_q:end_q, :, :] = o_i
         softmax_lse[start_q:end_q, :] = softmax_lse_i
+        sd_mask[i, :, :seqlen_q, :seqlen_k] = sd_mask_i
 
-    return (
-        o,
-        softmax_lse,
-        None,
-        None,
-        None,
-        None,
-        None,
-    )
+    return o, softmax_lse, sd_mask
 
 
 
@@ -301,12 +313,14 @@ def attention_forward_pytorch_ref_impl(
     v,
     sm_scale,
     causal,
-    dropout_p,
     layout,
     cu_seqlens_q,
     cu_seqlens_k,
     max_seqlen_q,
     max_seqlen_k,
+    dropout_p,
+    philox_seed,
+    philox_offset,
     use_exp2
     ):
     if DEBUG:
@@ -322,64 +336,46 @@ def attention_forward_pytorch_ref_impl(
         print("cu_seqlens_k:", cu_seqlens_k)
         print("max_seqlen_q:", max_seqlen_q)
         print("max_seqlen_k:", max_seqlen_k)
+        print("dropout_p:", dropout_p)
+        print("philox_seed:", philox_seed)
+        print("philox_offset:", philox_offset)
         print("use_exp2:", use_exp2)
 
      # compute reference
     if layout == "thd":
-        (
-            o_ref,
-            softmax_lse_ref,
-            exp_scores_ref,
-            softmax_ref,
-            attention_shifted_scaled_scores_ref,
-            attention_scaled_scores_ref,
-            attention_scores_ref,
-        ) = attention_varlen_forward_pytorch_ref_impl(
+        o_ref, softmax_lse_ref, sd_mask_ref = attention_varlen_forward_pytorch_ref_impl(
             q.clone(), 
             k.clone(), 
             v.clone(), 
             sm_scale, 
-            causal, 
+            causal,
             layout,
             cu_seqlens_q,
             cu_seqlens_k,
             max_seqlen_q,
             max_seqlen_k,
+            dropout_p,
+            philox_seed,
+            philox_offset,
             use_exp2,
         )
     else:
-        (
-            o_ref,
-            softmax_lse_ref,
-            exp_scores_ref,
-            softmax_ref,
-            attention_shifted_scaled_scores_ref,
-            attention_scaled_scores_ref,
-            attention_scores_ref,
-        ) = attention_vanilla_forward_pytorch_ref_impl(
-            q.clone(), k.clone(), v.clone(), sm_scale, causal, layout, use_exp2
-        )
+        o_ref, softmax_lse_ref, sd_mask_ref = attention_vanilla_forward_pytorch_ref_impl(q.clone(),
+                                                       k.clone(),
+                                                       v.clone(),
+                                                       sm_scale,
+                                                       causal,
+                                                       layout,
+                                                       dropout_p,
+                                                       philox_seed,
+                                                       philox_offset,
+                                                       use_exp2)
 
     if DEBUG:
         print()
         print("attention_forward_pytorch_ref_impl outputs")
-        print("o_ref:", o_ref, o_ref.shape)
-        print("softmax_lse_ref:", softmax_lse_ref, softmax_lse_ref.shape)
-        print("exp_scores_ref:", exp_scores_ref, exp_scores_ref.shape if exp_scores_ref is not None else None)
-
-    return (
-            o_ref,
-            softmax_lse_ref,
-            exp_scores_ref,
-            softmax_ref,
-            attention_shifted_scaled_scores_ref,
-            attention_scaled_scores_ref,
-            attention_scores_ref,
-    )
-
+        print("o:", o_ref, o_ref.shape)
+        print("softmax_lse:", softmax_lse_ref, softmax_lse_ref.shape)
+        print("sd_mask:", sd_mask_ref, sd_mask_ref.shape if sd_mask_ref is not None else None)
 
-def compute_alibi_tensor_ref(alibi_slopes, seqlen_q, seqlen_k):
-    q_idx = torch.arange(seqlen_q, dtype=torch.int32, device="cuda").unsqueeze(-1)  # (N_CTX_Q, 1)
-    k_idx = torch.arange(seqlen_k, dtype=torch.int32, device="cuda").unsqueeze(0)  # (1, N_CTX_K)
-    relative_pos = torch.abs(q_idx + seqlen_k - seqlen_q - k_idx)  # (N_CTX_Q, N_CTX_K)
-    return -1 * alibi_slopes.unsqueeze(-1).unsqueeze(-1) * relative_pos  # (Z, H, N_CTX_Q, N_CTX_K)
\ No newline at end of file
+    return o_ref, softmax_lse_ref, sd_mask_ref
diff --git a/flash_attn/flash_attn_triton_amd/interface_fa.py b/flash_attn/flash_attn_triton_amd/interface_fa.py
index 5d2bf1d2d..51037f236 100644
--- a/flash_attn/flash_attn_triton_amd/interface_fa.py
+++ b/flash_attn/flash_attn_triton_amd/interface_fa.py
@@ -39,7 +39,6 @@ def fwd(q,
         print("window_size_left:", window_size_left)
         print("window_size_right:", window_size_right)
         print("softcap:", softcap)
-        print("softcap:", softcap)
         print("return_softmax:", return_softmax)
 
 
@@ -63,48 +62,38 @@ def fwd(q,
         metadata.need_alibi(alibi_slopes, batch, nheads_q)
     
     if dropout_p > 0.0:
-        metadata.need_dropout(dropout_p, return_softmax)
+        metadata.need_dropout(dropout_p)
+        rng_state = torch.as_tensor([metadata.philox_seed, metadata.philox_offset]) # as_tensors uses the underlying data and doesnot cast
+    else:
+        rng_state = None
     
-    # Check arguments
+    # check arguments
     metadata.check_args(q, k, v, o)
 
-    rng_state = None
-
+    # call implementation 
     if USE_REF:
         if DEBUG:
             print("Using reference implementation")
-        (output, 
-        softmax_lse, 
-        exp_scores, 
-        _, 
-        _,
-        _, 
-        _) = attention_forward_pytorch_ref_impl(
+        output, softmax_lse, sd_mask = attention_forward_pytorch_ref_impl(
                                                 q, 
                                                 k, 
                                                 v,
                                                 metadata.sm_scale, 
                                                 metadata.causal,
                                                 metadata.layout,
-                                                dropout_p,
                                                 metadata.cu_seqlens_q, 
                                                 metadata.cu_seqlens_k,
                                                 metadata.max_seqlens_q, 
                                                 metadata.max_seqlens_k,
+                                                metadata.dropout_p, 
+                                                metadata.philox_seed,
+                                                metadata.philox_offset,
                                                 metadata.use_exp2)
         o.copy_(output)
     else:
         if DEBUG:
             print("Using Triton implementation")
-        (_, 
-        softmax_lse, 
-        exp_scores, 
-        _, 
-        _, 
-        philox_seed, 
-        philox_offset, 
-        _, 
-        _) = attention_prefill_forward_triton_impl(
+        output, softmax_lse, sd_mask = attention_prefill_forward_triton_impl(
                                                 q, 
                                                 k, 
                                                 v, 
@@ -112,26 +101,25 @@ def fwd(q,
                                                 metadata.sm_scale, 
                                                 metadata.alibi_slopes, 
                                                 metadata.causal, 
-                                                metadata.bias, 
-                                                metadata.dropout_p, 
+                                                metadata.bias,
                                                 metadata.layout, 
                                                 metadata.cu_seqlens_q, 
                                                 metadata.cu_seqlens_k,
                                                 metadata.max_seqlens_q, 
-                                                metadata.max_seqlens_k, 
-                                                metadata.return_scores, 
+                                                metadata.max_seqlens_k,
+                                                metadata.dropout_p, 
+                                                metadata.philox_seed,
+                                                metadata.philox_offset,
+                                                metadata.return_scores,
                                                 metadata.use_exp2)
-        
-        # Init rng_state if dropout is enabled
-        rng_state = torch.Tensor([philox_seed, philox_offset]) if dropout_p > 0.0 else None
 
     if DEBUG:
         print("fwd outputs")
         print("o:", o, o.shape)
         print("softmax_lse:", softmax_lse, softmax_lse.shape)
-        print("exp_scores:", exp_scores, exp_scores.shape if exp_scores is not None else None )
+        print("exp_scores:", sd_mask, sd_mask.shape if sd_mask is not None else None )
 
-    return o, softmax_lse, exp_scores, rng_state
+    return o, softmax_lse, sd_mask, rng_state
 
 def bwd(
     dout,
@@ -154,6 +142,11 @@ def bwd(
     gen_,
     rng_state,
 ):
+    # NOTE: this might have perf costs
+    dq.zero_()
+    dk.zero_()
+    dv.zero_()
+
     if DEBUG:
         print()
         print("flash_attn_triton_amd.py::bwd")
@@ -177,6 +170,12 @@ def bwd(
         print("gen_:", gen_)
         print("rng_state:", rng_state)
 
+    if dropout_p > 0.0:
+        philox_seed, philox_offset = rng_state[0].item(), rng_state[1].item()
+    else:
+        philox_seed, philox_offset = None, None
+
+    # call implementation
     if USE_REF:
         if DEBUG:
             print("Using reference implementation")
@@ -190,14 +189,15 @@ def bwd(
             softmax_lse,
             softmax_scale,
             causal,
-            dropout_p,
             "bshd",
             None,
             None,
             None,
             None,
+            dropout_p, 
+            philox_seed, 
+            philox_offset,
             False,
-            rng_state
         )
         dq.copy_(dq_ref)
         dk.copy_(dk_ref)
@@ -219,14 +219,15 @@ def bwd(
             softmax_scale,
             alibi_slopes,
             causal,
-            dropout_p,
             "bshd",
             None,
             None,
             None,
             None,
+            dropout_p, 
+            philox_seed, 
+            philox_offset,
             False,
-            rng_state
         )
         delta = delta_triton
 
@@ -277,7 +278,7 @@ def varlen_fwd(
         print("window_size_left:", window_size_left)
         print("window_size_right:", window_size_right)
         print("gen_:", gen_)
-    
+
     if o is None:
         o = torch.empty_like(q)
 
@@ -297,48 +298,40 @@ def varlen_fwd(
         metadata.need_alibi(alibi_slopes, batch, nheads_q)
     
     if dropout_p > 0.0:
-        metadata.need_dropout(dropout_p, return_softmax)
+        metadata.need_dropout(dropout_p)
+        rng_state = torch.as_tensor([metadata.philox_seed, metadata.philox_offset]) # as_tensors uses the underlying data and doesnot cast
+    else:
+        rng_state = None
     
     # Check arguments
     metadata.check_args(q, k, v, o)
     if o is None:
         o = torch.empty_like(q, dtype=v.dtype)
 
+    # call implementation
     if USE_REF:
         if DEBUG:
             print("Using reference implementation")
-        (output, 
-        softmax_lse, 
-        exp_scores, 
-        _, 
-        _,
-        _, 
-        _) = attention_forward_pytorch_ref_impl(
+        output, softmax_lse, sd_mask = attention_forward_pytorch_ref_impl(
                                                 q, 
                                                 k, 
                                                 v,
                                                 metadata.sm_scale, 
                                                 metadata.causal,
-                                                dropout_p,
                                                 metadata.layout, 
                                                 metadata.cu_seqlens_q, 
                                                 metadata.cu_seqlens_k,
                                                 metadata.max_seqlens_q, 
                                                 metadata.max_seqlens_k,
+                                                metadata.dropout_p, 
+                                                metadata.philox_seed,
+                                                metadata.philox_offset,
                                                 metadata.use_exp2)
         o.copy_(output)
     else:
         if DEBUG:
             print("Using Triton implementation")
-        (_, 
-        softmax_lse, 
-        exp_scores, 
-        _, 
-        _, 
-        philox_seed, 
-        philox_offset, 
-        _, 
-        _) = attention_prefill_forward_triton_impl(
+        output, softmax_lse, sd_mask = attention_prefill_forward_triton_impl(
                                                             q, 
                                                             k, 
                                                             v, 
@@ -346,24 +339,25 @@ def varlen_fwd(
                                                             metadata.sm_scale, 
                                                             metadata.alibi_slopes, 
                                                             metadata.causal, 
-                                                            metadata.bias, 
-                                                            metadata.dropout_p, 
+                                                            metadata.bias,
                                                             metadata.layout, 
                                                             metadata.cu_seqlens_q, 
                                                             metadata.cu_seqlens_k,
                                                             metadata.max_seqlens_q, 
-                                                            metadata.max_seqlens_k, 
+                                                            metadata.max_seqlens_k,
+                                                            metadata.dropout_p, 
+                                                            metadata.philox_seed,
+                                                            metadata.philox_offset, 
                                                             metadata.return_scores, 
                                                             metadata.use_exp2)
-        # Init rng_state if dropout is enabled
-        rng_state = torch.Tensor([philox_seed, philox_offset]) if dropout_p > 0.0 else None
     if DEBUG:
         print("varlen_fwd outputs")
         print("o:", o, o.shape)
         print("softmax_lse:", softmax_lse, softmax_lse.shape)
-        print("exp_scores:", exp_scores, exp_scores.shape if exp_scores is not None else None )
+        print("sd_mask:", sd_mask, sd_mask.shape if sd_mask is not None else None )
+
 
-    return o, softmax_lse, exp_scores, rng_state
+    return o, softmax_lse, sd_mask, rng_state
 
 def varlen_bwd(
     dout,
@@ -417,6 +411,12 @@ def varlen_bwd(
         print("gen_:", gen_)
         print("rng_state:", rng_state)
 
+    if dropout_p > 0.0:
+        philox_seed, philox_offset = rng_state[0].item(), rng_state[1].item()
+    else:
+        philox_seed, philox_offset = None, None
+    
+    # call implementation
     if USE_REF:
         if DEBUG:
             print("Using reference implementation")
@@ -429,14 +429,15 @@ def varlen_bwd(
             softmax_lse,
             softmax_scale,
             causal,
-            dropout_p,
             "thd",
             cu_seqlens_q,
             cu_seqlens_k,
             max_seqlen_q,
             max_seqlen_k,
+            dropout_p, 
+            philox_seed,
+            philox_offset, 
             False,
-            rng_state
         )
         dq.copy_(dq_ref)
         dk.copy_(dk_ref)
@@ -458,14 +459,15 @@ def varlen_bwd(
             softmax_scale,
             alibi_slopes,
             causal,
-            dropout_p,
             "thd",
             cu_seqlens_q,
             cu_seqlens_k,
             max_seqlen_q,
             max_seqlen_k,
+            dropout_p, 
+            philox_seed,
+            philox_offset,
             False,
-            rng_state
         )
         delta = delta_triton
 
diff --git a/flash_attn/flash_attn_triton_amd/interface_torch.py b/flash_attn/flash_attn_triton_amd/interface_torch.py
index 983b68b67..d4906606e 100644
--- a/flash_attn/flash_attn_triton_amd/interface_torch.py
+++ b/flash_attn/flash_attn_triton_amd/interface_torch.py
@@ -46,7 +46,6 @@ def forward(ctx, q, k, v, o, metadata):
         ctx.return_scores = metadata.return_scores
         ctx.layout = metadata.layout
         ctx.use_exp2 = metadata.use_exp2
-        ctx.rng_state = (philox_seed, philox_offset)
         return output, softmax_lse, exp_scores
 
     @staticmethod
@@ -70,8 +69,7 @@ def backward(ctx, do, *args):
             None,
             None,
             None,
-            ctx.use_exp2,
-            ctx.rng_state
+            ctx.use_exp2
         )
 
 attention_prefill = _attention_prefill.apply
diff --git a/flash_attn/flash_attn_triton_amd/test.py b/flash_attn/flash_attn_triton_amd/test.py
index c22e33ba6..c0db2824c 100644
--- a/flash_attn/flash_attn_triton_amd/test.py
+++ b/flash_attn/flash_attn_triton_amd/test.py
@@ -2,8 +2,9 @@
 import pytest
 
 from .utils import MetaData, get_input_shapes, input_helper, varlen_input_helper, DEBUG
+from .common import compute_alibi_tensor_ref
 from .interface_torch import attention_prefill, attention_decode
-from .fwd_ref import attention_forward_pytorch_ref_impl, compute_alibi_tensor_ref
+from .fwd_ref import attention_forward_pytorch_ref_impl
 from .fwd_prefill import attention_prefill_forward_triton_impl
 from .bwd_prefill import attention_prefill_backward_triton_impl
 from .bwd_ref import attention_backward_pytorch_ref_impl
@@ -353,6 +354,9 @@ def test_op_bwd(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, torch_sdpa_test, use_ali
         (1, 2, 2, 4, 4, 16),
         (2, 1, 1, 4, 4, 16),
         (2, 2, 2, 4, 4, 16),
+        (1, 1, 1, 8, 8, 16),
+        (1, 1, 1, 16, 16, 16),
+        (1, 1, 1, 64, 64, 16),
         (1, 1, 1, 128, 64, 16),
         (2, 2, 2, 2, 128, 1),
         (2, 3, 3, 2, 128, 16),
@@ -377,15 +381,14 @@ def test_op_bwd(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, torch_sdpa_test, use_ali
     ],
 )
 @pytest.mark.parametrize('causal', [True, False])
-@pytest.mark.parametrize('return_scores', [False])
+@pytest.mark.parametrize('dropout_p', [0.0])
 @pytest.mark.parametrize('layout', ["bhsd", "bshd", "thd"])
 @pytest.mark.parametrize('use_exp2', [True, False]) # works when use_exp2 is false
 @pytest.mark.parametrize('DEBUG_INPUT', [False]) # NOTE: debug input can overflow when the tensors are large. Just use to figure out issues
-def test_op_prefill_fwd_impl(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, return_scores, layout, use_exp2, DEBUG_INPUT):
+def test_op_prefill_fwd_impl(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, dropout_p, layout, use_exp2, DEBUG_INPUT):
     dtype = torch.float16
     torch.manual_seed(0)
     alibi_slopes = None
-    dropout_p = 0.0
     device = "cuda"
 
     if layout == "thd":
@@ -409,19 +412,12 @@ def test_op_prefill_fwd_impl(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, return
         metadata.need_causal()
 
     # NOTE: the returned score is not the same as the reference because we need to adjust as we find new maxes per block. We are not doing that
-    if return_scores:
-        metadata.return_scores = True
+    if dropout_p > 0.0:
+        metadata.need_dropout(dropout_p)
+
 
     # call Triton's forward implementation directly
-    ( output_triton, 
-        softmax_lse_triton, 
-        exp_scores_triton, 
-        _, 
-        _, 
-        _, 
-        _, 
-        _, 
-        _) = attention_prefill_forward_triton_impl(
+    output_triton, softmax_lse_triton, sd_mask_triton = attention_prefill_forward_triton_impl(
                                                 q, 
                                                 k, 
                                                 v, 
@@ -430,52 +426,49 @@ def test_op_prefill_fwd_impl(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, return
                                                 metadata.alibi_slopes, 
                                                 metadata.causal, 
                                                 metadata.bias, 
-                                                metadata.dropout_p, 
                                                 metadata.layout, 
                                                 metadata.cu_seqlens_q, 
                                                 metadata.cu_seqlens_k,
                                                 metadata.max_seqlens_q, 
-                                                metadata.max_seqlens_k, 
+                                                metadata.max_seqlens_k,
+                                                metadata.dropout_p,
+                                                metadata.philox_seed, 
+                                                metadata.philox_offset, 
                                                 metadata.return_scores, 
                                                 metadata.use_exp2)
 
-    (
-        output_ref,
-        softmax_lse_ref,
-        exp_scores_ref,
-        softmax_ref,
-        attention_shifted_scaled_scores_ref,
-        attention_scaled_scores_ref,
-        attention_scores_ref,
-    ) = attention_forward_pytorch_ref_impl(
+    output_ref, softmax_lse_ref, sd_mask_ref  = attention_forward_pytorch_ref_impl(
         q.clone(), 
         k.clone(), 
         v.clone(), 
         metadata.sm_scale, 
-        causal,
-        dropout_p,
+        causal, 
         layout,
         metadata.cu_seqlens_q,
         metadata.cu_seqlens_k,
         metadata.max_seqlens_q,
         metadata.max_seqlens_k,
+        metadata.dropout_p,
+        metadata.philox_seed, 
+        metadata.philox_offset, 
         use_exp2
     )
 
+    if DEBUG:
+        print()
+        print("Compare Triton Impl with refernce Pytorch Impl")
+
+    # this can be set to true manually or when using dropout
+    if metadata.return_scores:
+        if DEBUG:
+            print("sd_mask_triton:", sd_mask_triton, sd_mask_triton.shape)
+            print("sd_mask_ref:", sd_mask_ref, sd_mask_ref.shape)
+        torch.testing.assert_close(sd_mask_triton, sd_mask_ref, atol=ATOL, rtol=RTOL)
+
     if DEBUG:
         print("softmax_lse_triton:", softmax_lse_triton, softmax_lse_triton.shape)
         print("softmax_lse_ref:", softmax_lse_ref, softmax_lse_ref.shape)
     torch.testing.assert_close(softmax_lse_triton, softmax_lse_ref, atol=ATOL, rtol=RTOL)
-
-    if layout != "thd":
-        # use trick with lse to get the softmax. you need the scores but is it
-        softmax_triton = torch.exp(attention_scaled_scores_ref - softmax_lse_triton.unsqueeze(-1))
-        if DEBUG:
-            print("attention_scaled_scores_ref:", attention_scaled_scores_ref, attention_scaled_scores_ref.shape)
-            print("softmax_lse_triton:", softmax_lse_triton, softmax_lse_triton.shape)
-            print("softmax_triton:", softmax_triton, softmax_triton.shape)
-            print("softmax_ref:", softmax_ref, softmax_ref.shape)
-        torch.testing.assert_close(softmax_triton, softmax_ref, atol=ATOL, rtol=RTOL)
     
     if DEBUG:
         print("output_triton:", output_triton, output_triton.shape)
@@ -502,7 +495,9 @@ def test_op_prefill_fwd_impl(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, return
     (1, 1, 1, 16, 16, 16),
     (1, 1, 1, 32, 32, 16),
     (1, 1, 1, 64, 64, 16),
-    (1, 1, 1, 64, 64, 64),
+    (1, 1, 1, 64, 64, 16),
+    (1, 1, 1, 64, 128, 16),
+    (1, 1, 1, 64, 64, 32),
     (1, 1, 1, 64, 128, 32),
     (1, 1, 1, 128, 128, 64),
     (1, 1, 1, 128, 256, 45),
@@ -528,11 +523,12 @@ def test_op_prefill_fwd_impl(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, return
     (1, 16, 16, 1024, 1024, 128),
 ])
 @pytest.mark.parametrize('causal', [True, False])
+@pytest.mark.parametrize('dropout_p', [0.0])
 @pytest.mark.parametrize('use_exp2', [False]) # FIXME: using exp2 causes issue when used with causal
 @pytest.mark.parametrize('layout', ["bhsd", "bshd", "thd"])
 @pytest.mark.parametrize('sequence_parallel', [True, False])
 @pytest.mark.parametrize('DEBUG_INPUT', [False]) # debug output causes nans on larger tensors
-def test_op_prefill_bwd_impl(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_exp2, layout, sequence_parallel, DEBUG_INPUT):
+def test_op_prefill_bwd_impl(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, dropout_p, use_exp2, layout, sequence_parallel, DEBUG_INPUT):
     dtype = torch.float16
     torch.manual_seed(20) # seed from test_op_bwd
 
@@ -546,30 +542,28 @@ def test_op_prefill_bwd_impl(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_ex
     else:
         do = torch.randn_like(q)
 
+    # NOTE: the returned score is not the same as the reference because we need to adjust as we find new maxes per block. We are not doing that
+    if dropout_p > 0.0:
+        metadata.need_dropout(dropout_p)
+
     # =============================================== Reference ==============================================================
     q_ref = q.clone() 
     k_ref = k.clone()
     v_ref = v.clone()    
-    (
-        o_ref,
-        softmax_lse_ref,
-        _,
-        _,
-        _,
-        _,
-        _,
-    ) = attention_forward_pytorch_ref_impl(
+    output_ref, softmax_lse_ref, sd_mask_ref  = attention_forward_pytorch_ref_impl(
         q_ref,
         k_ref, 
         v_ref,
         metadata.sm_scale, 
-        causal,
-        dropout_p,
+        causal, 
         layout,
         metadata.cu_seqlens_q,
         metadata.cu_seqlens_k,
         metadata.max_seqlens_q,
         metadata.max_seqlens_k,
+        metadata.dropout_p,
+        metadata.philox_seed, 
+        metadata.philox_offset, 
         use_exp2
     )
 
@@ -594,22 +588,23 @@ def test_op_prefill_bwd_impl(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_ex
         q_ref,
         k_ref,
         v_ref,
-        o_ref,
+        output_ref,
         softmax_lse_ref,
         metadata.sm_scale,
         causal,
-        dropout_p,
         layout,
         metadata.cu_seqlens_q,
         metadata.cu_seqlens_k,
         metadata.max_seqlens_q,
         metadata.max_seqlens_k,
-        use_exp2,
-        rng_state
+        metadata.dropout_p,
+        metadata.philox_seed, 
+        metadata.philox_offset, 
+        use_exp2
     )
 
     # =============================================== Triton ==============================================================
-    o = o_ref.clone().contiguous()
+    o = output_ref.clone().contiguous()
     softmax_lse = softmax_lse_ref.clone().contiguous()
     dq_triton, dk_triton, dv_triton, delta_triton, _, _ = attention_prefill_backward_triton_impl(
         do,
@@ -629,6 +624,9 @@ def test_op_prefill_bwd_impl(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_ex
         metadata.cu_seqlens_k,
         metadata.max_seqlens_q,
         metadata.max_seqlens_k,
+        metadata.dropout_p,
+        metadata.philox_seed, 
+        metadata.philox_offset, 
         use_exp2,
         sequence_parallel=sequence_parallel
     )
diff --git a/flash_attn/flash_attn_triton_amd/utils.py b/flash_attn/flash_attn_triton_amd/utils.py
index e68787e64..60586494f 100644
--- a/flash_attn/flash_attn_triton_amd/utils.py
+++ b/flash_attn/flash_attn_triton_amd/utils.py
@@ -1,4 +1,7 @@
 
+import csv
+import json
+import math
 import torch
 import os
 import triton
@@ -24,7 +27,9 @@ class MetaData():
     seqlen_new = None
     k_new = None
     v_new = None
-    dropout_p, return_scores= 0.0, False
+    return_scores= False
+    dropout_p= 0.0
+    philox_seed, philox_offset = None, None # if dropout_p > 0.0 seed the RNG so we get reproducible results for testing.
     # NOTE: scale sm_scale by log_2(e) and use 2^x in the loop as we do not have native e^x support in HW.
     use_exp2 = False
     rotary_sin = None
@@ -95,9 +100,10 @@ def need_rotary(self, sin, cos, rotary_interleaved, rotary_conjunction=False):
         self.rotary_interleaved = rotary_interleaved
         self.rotary_conjunction = rotary_conjunction
 
-    def need_dropout(self, dropout_p, return_scores):
+    def need_dropout(self, dropout_p):
         self.dropout_p = dropout_p
-        self.return_scores = return_scores
+        self.return_scores = True
+        self.philox_seed, self.philox_offset = 0x1BF58, 0x1D4B49
 
     def check_args(self, q, k, v, o):
         assert q.dim() == k.dim() and q.dim() == v.dim()
@@ -254,6 +260,40 @@ def get_padded_headsize(size):
     padded_d_model = max(padded_d_model, 16)
     return padded_d_model
 
+def write_dropout_mask(x, tensor_name = "tensor"):
+    batch, head, seqlen_m, seqlen_n = x.shape
+    x = x.tolist()
+
+    with open(f'{tensor_name}.csv', 'w') as f:
+        writer = csv.writer(f)
+        for b in range(batch):
+            for h in range(head):
+                dropout_mask = x[b][h]
+                if True:
+                    BLOCK_M = 64
+                    BLOCK_N = 64
+                
+                    # Calculate number of blocks in each dimension
+                    m_blocks = math.ceil(seqlen_m / BLOCK_M)
+                    n_blocks = math.ceil(seqlen_n / BLOCK_N)
+                    
+                    # Process each block
+                    for m_block in range(m_blocks):
+                        # Calculate row range for current block
+                        row_start = m_block * BLOCK_M
+                        row_end = min(row_start + BLOCK_M, seqlen_m)
+                        
+                        for n_block in range(n_blocks):
+                            # Calculate column range for current block
+                            col_start = n_block * BLOCK_N
+                            col_end = min(col_start + BLOCK_N, seqlen_n)
+                            
+                            # Extract and write the current block
+                            for row_idx in range(row_start, row_end):
+                                row_data = dropout_mask[row_idx][col_start:col_end]
+                                writer.writerow(row_data)
+                else:
+                    writer.writerows(dropout_mask)
 
 def _strides(x: torch.Tensor, *stride_names: str):
     if x is None:
@@ -278,4 +318,4 @@ def is_cdna():
 
 def is_rdna():
     return is_hip() and triton.runtime.driver.active.get_current_target().arch in ("gfx1030", "gfx1100", "gfx1101",
-                                                                                   "gfx1102", "gfx1200", "gfx1201")
+                                                                                   "gfx1102", "gfx1200", "gfx1201")
\ No newline at end of file
diff --git a/tests/test_flash_attn_triton_amd.py b/tests/test_flash_attn_triton_amd.py
index 4e60a4a22..2faa63114 100755
--- a/tests/test_flash_attn_triton_amd.py
+++ b/tests/test_flash_attn_triton_amd.py
@@ -589,13 +589,10 @@ def get_dropout_fraction(
 # @pytest.mark.parametrize('seqlen', [128, 256, 384, 512, 768, 1024, 2048])
 @pytest.mark.parametrize("seqlen", [97, 128, 200, 384, 768, 1024, 1025, 2048])
 # @pytest.mark.parametrize("seqlen", [128])
-# @pytest.mark.parametrize("dropout_p", [0.0, 0.17])
-@pytest.mark.parametrize("dropout_p", [0.0])
+@pytest.mark.parametrize("dropout_p", [0.0, 0.17])
+# @pytest.mark.parametrize("dropout_p", [0.0])
 def test_flash_attn_qkvpacked(seqlen, d, dropout_p, causal, local, alibi, deterministic, dtype):
     if USE_TRITON_ROCM:
-        if dropout_p != 0.0:
-            pytest.skip("Dropout not supported in AMD's Triton Backend yet")
-
         if local == True:
             pytest.skip("local sliding window attention not supported on AMD's Triton Backend yet")
 
@@ -604,8 +601,8 @@ def test_flash_attn_qkvpacked(seqlen, d, dropout_p, causal, local, alibi, determ
     device = "cuda"
     # set seed
     torch.random.manual_seed(0)
-    batch_size = 4
-    nheads = 9
+    batch_size = 1
+    nheads = 1
     window_size = (-1, -1) if not local else torch.randint(0, seqlen, (2,))
     qkv = torch.randn(
         batch_size, seqlen, 3, nheads, d, device=device, dtype=dtype, requires_grad=True
@@ -716,10 +713,13 @@ def test_flash_attn_qkvpacked(seqlen, d, dropout_p, causal, local, alibi, determ
 
     # Check that FlashAttention's numerical error is at most twice the numerical error
     # of a Pytorch implementation.
+    if DEBUG:
+        print("out:", out, out.shape)
+        print("out_ref:", out_ref, out_ref.shape)
     assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item()
 
     if dropout_p > 0.0:
-        assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item()
+        # assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item()
         # With alibi, many of the prob values are 0.0 & -0.0 so dropout_fraction isn't accurate
         if not alibi:
             assert abs(dropout_fraction - dropout_p) <= (0.01 if not local else 0.025)
@@ -747,15 +747,12 @@ def test_flash_attn_qkvpacked(seqlen, d, dropout_p, causal, local, alibi, determ
 # @pytest.mark.parametrize('d', [32])
 @pytest.mark.parametrize("seqlen", [97, 128, 200, 257, 384, 512, 768, 1025, 2048])
 # @pytest.mark.parametrize('seqlen', [128])
-# @pytest.mark.parametrize("dropout_p", [0.0, 0.17])
-@pytest.mark.parametrize('dropout_p', [0.0])
+@pytest.mark.parametrize("dropout_p", [0.0, 0.17])
+# @pytest.mark.parametrize('dropout_p', [0.0])
 def test_flash_attn_varlen_qkvpacked(
     seqlen, d, dropout_p, causal, local, alibi, deterministic, dtype
 ):
     if USE_TRITON_ROCM:
-        if dropout_p != 0.0:
-            pytest.skip("Dropout not supported in AMD's Triton Backend yet")
-
         if local == True:
             pytest.skip("local sliding window attention not supported on AMD's Triton Backend yet")
     if seqlen >= 2048 and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30:
@@ -874,10 +871,13 @@ def test_flash_attn_varlen_qkvpacked(
 
     # Check that FlashAttention's numerical error is at most twice the numerical error
     # of a Pytorch implementation.
+    if DEBUG:
+        print("out:", out, out.shape)
+        print("out_ref:", out_ref, out_ref.shape)
     assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item()
 
     if dropout_p > 0.0:
-        assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item()
+        # assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item()
         # With alibi, many of the prob values are 0.0 & -0.0 so dropout_fraction isn't accurate
         if not alibi:
             assert abs(dropout_fraction - dropout_p) <= (0.01 if not local else 0.025)
@@ -924,8 +924,8 @@ def test_flash_attn_varlen_qkvpacked(
     ],
 )
 # @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)])
-# @pytest.mark.parametrize("dropout_p", [0.0, 0.17])
-@pytest.mark.parametrize("dropout_p", [0.17])
+@pytest.mark.parametrize("dropout_p", [0.0, 0.17])
+# @pytest.mark.parametrize("dropout_p", [0.0])
 # @pytest.mark.parametrize("softcap", [0.0, 50.0])
 @pytest.mark.parametrize("softcap", [0.0])
 def test_flash_attn_output(
@@ -948,12 +948,12 @@ def test_flash_attn_output(
     device = "cuda"
     # set seed
     torch.random.manual_seed(0)
-    batch_size = 1
-    nheads = 1 if softcap == 0.0 else 4  # softcap reference impl takes more memory
+    batch_size = 4
+    nheads = 6 if softcap == 0.0 else 4  # softcap reference impl takes more memory
     nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 2)
     assert nheads % nheads_k == 0
     window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,))
-    q = torch.ones(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True)
+    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True)
     if softcap > 0:
         # Ensure the values of qk are at least within softcap range.
         q = q * softcap
@@ -962,10 +962,10 @@ def test_flash_attn_output(
             batch_size, seqlen_k, 2, nheads_k, d, device=device, dtype=dtype, requires_grad=True
         )
     else:
-        k = torch.ones(
+        k = torch.randn(
             batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True
         )
-        v = torch.ones(
+        v = torch.randn(
             batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True
         )
     if alibi:
@@ -1002,6 +1002,7 @@ def test_flash_attn_output(
     if DEBUG:
         print("out:", out, out.shape)
         print("lse:", lse, lse.shape)
+        print("S_dmask:", S_dmask, S_dmask.shape if S_dmask is not None else None)
 
     if dropout_p > 0.0:
         S_dmask_converted = convert_flash_attn_S_to_softmax(
@@ -1107,7 +1108,7 @@ def test_flash_attn_output(
         print(f"Attention max diff: {(attn - attn_ref).abs().max().item()}")
         print(f"Attention Pytorch max diff: {(attn_pt - attn_ref).abs().max().item()}")
 
-    g = torch.ones_like(out)
+    g = torch.randn_like(out)
     do_o = (g.float() * out.float()).sum(-1)
     if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90):
         if kvpacked:
@@ -1155,26 +1156,23 @@ def test_flash_attn_output(
         print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}")
         print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}")
 
-    # NOTE: often is the case the the pytorch max diff is 0. This results in the test almost always 
-    # failing since the triton kernel must have 0 error to pass. To overcome this I've created a constant that is added
-    # to the error. If it is within these bounds it will pass. 
-    # VERY IMPORTANT NOTE: 
-    # if there is an issue with the dropout mask created in the bwd pass, the max error will be on the order of magnitude of
-    # 10^0. Thus I have set MIN_ERROR = 10^-2. This is large enough that it will pass every test regardless of precision error,
-    # but will definitely fail if there is an issue with the reconstructed mask.
-    MIN_ERROR = 1e-2
-
     # Check that FlashAttention's numerical error is at most twice the numerical error
     # of a Pytorch implementation.
     if DEBUG:
         print("out:", out, out.shape)
         print("out_ref:", out_ref, out_ref.shape)
-    assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + MIN_ERROR
+    assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item()
 
     if dropout_p > 0.0:
+        if DEBUG:
+            print("attn:", attn, attn.shape)
+            print("attn_ref:", attn_ref, attn_ref.shape)
         # assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item()
         # With alibi, many of the prob values are 0.0 & -0.0 so dropout_fraction isn't accurate
         if not alibi:
+            if DEBUG:
+                print("dropout_fraction:", dropout_fraction)
+                print("dropout_p:", dropout_p)
             assert abs(dropout_fraction - dropout_p) <= (0.01 if not local else 0.025)
 
     if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90):
@@ -1182,19 +1180,19 @@ def test_flash_attn_output(
             print("dv:", dv, dv.shape)
             print("dv_ref:", dv_ref, dv_ref.shape)
             print("dv_pt:", dv_pt, dv_pt.shape)
-        assert (dv - dv_ref).abs().max().item() <= 3 * (dv_pt - dv_ref).abs().max().item() + MIN_ERROR
+        assert (dv - dv_ref).abs().max().item() <= 3 * (dv_pt - dv_ref).abs().max().item()
         
         if DEBUG:
             print("dk:", dk, dk.shape)
             print("dk_ref:", dk_ref, dk_ref.shape)
             print("dk_pt:", dk_pt, dk_pt.shape)
-        assert (dk - dk_ref).abs().max().item() <= 3 * (dk_pt - dk_ref).abs().max().item() + MIN_ERROR
+        assert (dk - dk_ref).abs().max().item() <= 3 * (dk_pt - dk_ref).abs().max().item()
 
         if DEBUG:
             print("dq:", dq, dq.shape)
             print("dq_ref:", dq_ref, dq_ref.shape)
             print("dq_pt:", dq_pt, dq_pt.shape)
-        assert (dq - dq_ref).abs().max().item() <= 3 * (dq_pt - dq_ref).abs().max().item() + MIN_ERROR
+        assert (dq - dq_ref).abs().max().item() <= 3 * (dq_pt - dq_ref).abs().max().item()
         
 
 
@@ -1218,31 +1216,29 @@ def test_flash_attn_output(
 @pytest.mark.parametrize(
     "seqlen_q,seqlen_k",
     [
-        # (5, 5),
-        # (1, 147),
-        # (113, 203),
-        # (128, 217),
-        # (113, 211),
-        # (108, 256),
+        # (32, 32),
+        (1, 147),
+        (113, 203),
+        (128, 217),
+        (113, 211),
+        (108, 256),
         (256, 512),
-        # (512, 256),
-        # (1024, 1024),
-        # (1023, 1024),
-        # (1024, 1023),
-        # (2048, 2048),
-        # (790, 790)
+        (512, 256),
+        (1024, 1024),
+        (1023, 1024),
+        (1024, 1023),
+        (2048, 2048),
     ],
 )
 # @pytest.mark.parametrize('seqlen_q,seqlen_k', [(128, 128)])
-# @pytest.mark.parametrize("dropout_p", [0.0, 0.17])
-@pytest.mark.parametrize('dropout_p', [0.17])
+@pytest.mark.parametrize("dropout_p", [0.0, 0.17])
+# @pytest.mark.parametrize('dropout_p', [0.0])
 # @pytest.mark.parametrize("softcap", [0.0, 50.0])
 @pytest.mark.parametrize("softcap", [0.0])
 def test_flash_attn_varlen_output(
     seqlen_q, seqlen_k, d, dropout_p, causal, local, alibi, deterministic, mha_type, dtype, kvpacked, softcap
 ):
     if USE_TRITON_ROCM:
-
         if local == True:
             pytest.skip("local sliding window attention not supported on AMD's Triton Backend yet")
         
@@ -1283,9 +1279,6 @@ def test_flash_attn_varlen_output(
 
     query_padding_mask = generate_random_padding_mask(seqlen_q, batch_size, device, mode="random")
     key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode="random")
-
-    # query_padding_mask, key_padding_mask = None, key_padding_mask
-
     # key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode='full')
     if alibi:
         alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3
@@ -1522,7 +1515,7 @@ def test_flash_attn_varlen_output(
 
     # Check that FlashAttention's numerical error is at most twice the numerical error
     # of a Pytorch implementation.
-    assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + MIN_ERROR
+    assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item()
 
     if dropout_p > 0.0:
         # assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item()
@@ -1535,19 +1528,19 @@ def test_flash_attn_varlen_output(
             print("dv:", dv, dv.shape)
             print("dv_ref:", dv_ref, dv_ref.shape)
             print("dv_pt:", dv_pt, dv_pt.shape)
-        assert (dv - dv_ref).abs().max().item() <= 3 * (dv_pt - dv_ref).abs().max().item() + MIN_ERROR
+        assert (dv - dv_ref).abs().max().item() <= 3 * (dv_pt - dv_ref).abs().max().item()
         
         if DEBUG:
             print("dk:", dk, dk.shape)
             print("dk_ref:", dk_ref, dk_ref.shape)
             print("dk_pt:", dk_pt, dk_pt.shape)
-        assert (dk - dk_ref).abs().max().item() <= 3 * (dk_pt - dk_ref).abs().max().item() + MIN_ERROR
+        assert (dk - dk_ref).abs().max().item() <= 3 * (dk_pt - dk_ref).abs().max().item()
 
         if DEBUG:
             print("dq:", dq, dq.shape)
             print("dq_ref:", dq_ref, dq_ref.shape)
             print("dq_pt:", dq_pt, dq_pt.shape)
-        assert (dq - dq_ref).abs().max().item() <= 3 * (dq_pt - dq_ref).abs().max().item() + MIN_ERROR
+        assert (dq - dq_ref).abs().max().item() <= 3 * (dq_pt - dq_ref).abs().max().item()
 
 
 # @pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16]))