From a010f13a40db6b2cc1fe40d460d48b9029177370 Mon Sep 17 00:00:00 2001
From: John St John <jstjohn@nvidia.com>
Date: Fri, 8 Nov 2024 23:06:10 +0000
Subject: [PATCH 01/12] Undo change to position ids to debug loss curve
 increase

---
 .../bionemo-esm2/src/bionemo/esm2/model/model.py      |  5 -----
 .../src/bionemo/llm/model/biobert/model.py            | 11 +++--------
 2 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py
index 7e0464a4e..b5b53b036 100644
--- a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py
+++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py
@@ -132,11 +132,6 @@ def __init__(
 
         # Embeddings.
         if self.pre_process:
-            self.register_buffer(
-                "bert_position_id_tensor",
-                torch.arange(max_sequence_length, dtype=torch.long, requires_grad=False).unsqueeze(0),
-                persistent=False,
-            )
             # ESM2 Customization: ESM2Embedding instead of LanguageModelEmbedding
             # TODO: call super, overwrite the self.embedding, and setup_embeddings_and_output_layer in constructor.
             # Note: need to avoid calling setup twice: skip with super (super(skip_setup=True))
diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py b/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
index 68eac670d..dc02a798c 100644
--- a/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
+++ b/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
@@ -201,11 +201,6 @@ def __init__(  # noqa: D107
         self.model_type = ModelType.encoder_or_decoder
         # Embeddings.
         if self.pre_process:
-            self.register_buffer(
-                "bert_position_id_tensor",
-                torch.arange(max_sequence_length, dtype=torch.long, requires_grad=False).unsqueeze(0),
-                persistent=False,
-            )
             self.embedding = LanguageModelEmbedding(
                 config=self.config,
                 vocab_size=self.vocab_size,
@@ -317,9 +312,9 @@ def bert_extended_attention_mask(self, attention_mask: Tensor) -> Tensor:
     def bert_position_ids(self, token_ids):  # noqa: D102
         # Create position ids
         seq_length = token_ids.size(1)
-        if seq_length != self.max_sequence_length:
-            return self.bert_position_id_tensor[:, :seq_length]
-        return self.bert_position_id_tensor  # No need to subset so skip the slice op
+        position_ids = torch.arange(seq_length, dtype=torch.long, device=token_ids.device)
+        position_ids = position_ids.unsqueeze(0).expand_as(token_ids)
+        return position_ids
 
     def embedding_forward(
         self,

From 916aaae56c1dbd9a611199e1dd65f54ba8ce2c36 Mon Sep 17 00:00:00 2001
From: John St John <jstjohn@nvidia.com>
Date: Sat, 9 Nov 2024 00:25:38 +0000
Subject: [PATCH 02/12] remove unused defer wgrad compute just incase

---
 .../geneformer/scripts/train_geneformer.py       |  1 -
 .../src/bionemo/llm/model/biobert/model.py       | 16 ----------------
 2 files changed, 17 deletions(-)

diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py
index 2d606b934..5f698ec0f 100644
--- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py
+++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py
@@ -283,7 +283,6 @@ def main(
         seq_length=seq_length,
         bias_dropout_fusion=True,  # TODO fix the recompilation issue, but for now it's faster even with recompilations
         bias_activation_fusion=True,  # TODO same note as above. Set these to False to see recompilation go away
-        defer_embedding_wgrad_compute=pipeline_model_parallel_size > 1,
         params_dtype=get_autocast_dtype(precision),
         pipeline_dtype=get_autocast_dtype(precision),
         autocast_dtype=get_autocast_dtype(precision),  # setting this speeds things up a lot
diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py b/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
index dc02a798c..269554574 100644
--- a/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
+++ b/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
@@ -229,20 +229,6 @@ def __init__(  # noqa: D107
         # Output
         if post_process:
             # TODO: Make sure you are passing in the mpu_vocab_size properly
-            if self.config.defer_embedding_wgrad_compute:
-                # The embedding activation buffer preserves a reference to the input activations
-                # of the final embedding projection layer GEMM. It will hold the activations for
-                # all the micro-batches of a global batch for the last pipeline stage. Once we are
-                # done with all the back props for all the microbatches for the last pipeline stage,
-                # it will be in the pipeline flush stage. During this pipeline flush we use the
-                # input activations stored in embedding activation buffer and gradient outputs
-                # stored in gradient buffer to calculate the weight gradients for the embedding
-                # final linear layer.
-                self.embedding_activation_buffer = []
-                self.grad_output_buffer = []
-            else:
-                self.embedding_activation_buffer = None
-                self.grad_output_buffer = None
 
             self.lm_head = BertLMHead(
                 config.hidden_size,
@@ -259,8 +245,6 @@ def __init__(  # noqa: D107
                 skip_bias_add=False,
                 gather_output=not self.parallel_output,
                 skip_weight_param_allocation=pre_process and share_embeddings_and_output_weights,
-                embedding_activation_buffer=self.embedding_activation_buffer,
-                grad_output_buffer=self.grad_output_buffer,
             )
 
             self.binary_head = None

From 1d9d80717426613739d518b7c22d38ce3e60df4b Mon Sep 17 00:00:00 2001
From: John St John <jstjohn@nvidia.com>
Date: Sat, 9 Nov 2024 00:26:34 +0000
Subject: [PATCH 03/12] remove unused args

---
 .../src/bionemo/geneformer/scripts/train_geneformer.py           | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py
index 5f698ec0f..c5dfcf894 100644
--- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py
+++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py
@@ -189,7 +189,6 @@ def main(
         progress_interval=log_every_n_steps,
         find_unused_parameters=True,
         ckpt_include_optimizer=True,
-        gradient_as_bucket_view=True,
         # FIXME there are intermittent errors with async checkpoint saving.
         #  see https://wandb.ai/clara-discovery/geneformer_bionemo2_goodslurm/runs/uAFi7DzI/logs
         # ckpt_async_save=True,

From 152b38d7816de0af1aecd23257255c43ad4142f8 Mon Sep 17 00:00:00 2001
From: John St John <jstjohn@nvidia.com>
Date: Sat, 9 Nov 2024 00:30:56 +0000
Subject: [PATCH 04/12] add one more change back

---
 sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py b/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
index 269554574..a3abbd3a7 100644
--- a/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
+++ b/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
@@ -337,7 +337,6 @@ def forward(
         tokentype_ids: Optional[Tensor] = None,
         lm_labels: Optional[Tensor] = None,
         inference_params: Any | None = None,
-        runtime_gather_output: Optional[bool] = None,
     ) -> BioBertOutput | Tensor:
         """Forward function of BERT model
 
@@ -416,7 +415,6 @@ def forward(
 
         hidden_states_after_lm_head = self.lm_head(hidden_states=hidden_states)
         if not self.skip_logits:
-            # TODO add , runtime_gather_output=runtime_gather_output once supported in ColumnParallelLinear
             logits, _ = self.output_layer(hidden_states_after_lm_head, weight=output_weight)
         else:
             logits = None

From cc972759fc8d93833ae40d9a0fddc4f196f1d012 Mon Sep 17 00:00:00 2001
From: John St John <jstjohn@nvidia.com>
Date: Sat, 9 Nov 2024 01:28:17 +0000
Subject: [PATCH 05/12] Untoggle bias fusions

---
 .../bionemo/geneformer/scripts/train_geneformer.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py
index c5dfcf894..87baedf35 100644
--- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py
+++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py
@@ -93,6 +93,7 @@ def main(
     gc_interval: int = 0,
     aligned_megatron_ddp: bool = False,
     recompilation_check: bool = False,
+    bias_fusions: bool = False,
     # TODO add datamodule class, and ability to change data step to get full support for pretraining workflows
 ) -> None:
     """Train a Geneformer model on single cell data.
@@ -147,6 +148,8 @@ def main(
             good for clusters. This will likely slow down single node runs though.
         recompilation_check (bool): enable a recompilation check (only do on a small run) to verify that fused gpu
             kernels are not being regularly recompiled, which is very expensive, with a particular model/settings.
+        bias_fusions (bool): enable two bias fusions (dropout and activation) which improve performance but should be
+            evaluated for impacting training stability. At the very least they trigger recompilations.
     """
     # Create the result directory if it does not exist.
     if wandb_tags is None:
@@ -280,8 +283,8 @@ def main(
         ffn_hidden_size=512,
         num_attention_heads=4,
         seq_length=seq_length,
-        bias_dropout_fusion=True,  # TODO fix the recompilation issue, but for now it's faster even with recompilations
-        bias_activation_fusion=True,  # TODO same note as above. Set these to False to see recompilation go away
+        bias_dropout_fusion=bias_fusions,  # TODO fix the recompilation issue, but for now it's faster even with recompilations
+        bias_activation_fusion=bias_fusions,  # TODO same note as above. Set these to False to see recompilation go away
         params_dtype=get_autocast_dtype(precision),
         pipeline_dtype=get_autocast_dtype(precision),
         autocast_dtype=get_autocast_dtype(precision),  # setting this speeds things up a lot
@@ -620,6 +623,12 @@ def config_class_type(desc: str) -> Type[BioBertConfig]:
         help="Activate this and make sure a small training loop runs, this tells you that your settings are not "
         "triggering regular recompilations which can be very expensive for fused gpu kernels.",
     )
+    parser.add_argument(
+        "--bias-fusions",
+        action="store_true",
+        default=False,
+        help="Activate bias fusions which seem to reduce precision but are slightly faster.",
+    )
 
     return parser
 
@@ -670,6 +679,7 @@ def entrypoint():
         gc_interval=args.gc_interval,
         aligned_megatron_ddp=args.aligned_megatron_ddp,
         recompilation_check=args.recompilation_check,
+        bias_fusions=args.bias_fusions,
     )
 
 

From 3be5b56e4aa507e3202726df93cfc9ce6d4531d9 Mon Sep 17 00:00:00 2001
From: John St John <jstjohn@nvidia.com>
Date: Sat, 9 Nov 2024 03:00:31 +0000
Subject: [PATCH 06/12] Make the faster option for bias fusions the default

---
 .../bionemo/geneformer/scripts/train_geneformer.py  | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py
index 87baedf35..3d80028aa 100644
--- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py
+++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py
@@ -93,7 +93,7 @@ def main(
     gc_interval: int = 0,
     aligned_megatron_ddp: bool = False,
     recompilation_check: bool = False,
-    bias_fusions: bool = False,
+    skip_bias_fusions: bool = False,
     # TODO add datamodule class, and ability to change data step to get full support for pretraining workflows
 ) -> None:
     """Train a Geneformer model on single cell data.
@@ -148,10 +148,11 @@ def main(
             good for clusters. This will likely slow down single node runs though.
         recompilation_check (bool): enable a recompilation check (only do on a small run) to verify that fused gpu
             kernels are not being regularly recompiled, which is very expensive, with a particular model/settings.
-        bias_fusions (bool): enable two bias fusions (dropout and activation) which improve performance but should be
-            evaluated for impacting training stability. At the very least they trigger recompilations.
+        skip_bias_fusions (bool): Disable the two bias fusions (dropout and activation) which improve performance but
+            cause recompilations. In testing they still seem to result in higher performance despite the recompilations.
     """
     # Create the result directory if it does not exist.
+    bias_fusions = not skip_bias_fusions
     if wandb_tags is None:
         wandb_tags = []
     result_dir.mkdir(parents=True, exist_ok=True)
@@ -624,10 +625,10 @@ def config_class_type(desc: str) -> Type[BioBertConfig]:
         "triggering regular recompilations which can be very expensive for fused gpu kernels.",
     )
     parser.add_argument(
-        "--bias-fusions",
+        "--skip-bias-fusions",
         action="store_true",
         default=False,
-        help="Activate bias fusions which seem to reduce precision but are slightly faster.",
+        help="Deactivate bias fusions which seem to reduce precision but are slightly faster.",
     )
 
     return parser
@@ -679,7 +680,7 @@ def entrypoint():
         gc_interval=args.gc_interval,
         aligned_megatron_ddp=args.aligned_megatron_ddp,
         recompilation_check=args.recompilation_check,
-        bias_fusions=args.bias_fusions,
+        skip_bias_fusions=args.skip_bias_fusions,
     )
 
 

From 57dc0df0a2ca54925ac972c734b68d509c74b62a Mon Sep 17 00:00:00 2001
From: John St John <jstjohn@nvidia.com>
Date: Mon, 11 Nov 2024 04:42:15 +0000
Subject: [PATCH 07/12] reset unrelated problems to main

---
 .../src/bionemo/esm2/model/model.py           |  5 ++++
 .../geneformer/scripts/train_geneformer.py    | 17 +++--------
 .../src/bionemo/llm/model/biobert/model.py    | 29 +++++++++++++++++--
 3 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py
index b5b53b036..7e0464a4e 100644
--- a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py
+++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py
@@ -132,6 +132,11 @@ def __init__(
 
         # Embeddings.
         if self.pre_process:
+            self.register_buffer(
+                "bert_position_id_tensor",
+                torch.arange(max_sequence_length, dtype=torch.long, requires_grad=False).unsqueeze(0),
+                persistent=False,
+            )
             # ESM2 Customization: ESM2Embedding instead of LanguageModelEmbedding
             # TODO: call super, overwrite the self.embedding, and setup_embeddings_and_output_layer in constructor.
             # Note: need to avoid calling setup twice: skip with super (super(skip_setup=True))
diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py
index 3d80028aa..2d606b934 100644
--- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py
+++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/train_geneformer.py
@@ -93,7 +93,6 @@ def main(
     gc_interval: int = 0,
     aligned_megatron_ddp: bool = False,
     recompilation_check: bool = False,
-    skip_bias_fusions: bool = False,
     # TODO add datamodule class, and ability to change data step to get full support for pretraining workflows
 ) -> None:
     """Train a Geneformer model on single cell data.
@@ -148,11 +147,8 @@ def main(
             good for clusters. This will likely slow down single node runs though.
         recompilation_check (bool): enable a recompilation check (only do on a small run) to verify that fused gpu
             kernels are not being regularly recompiled, which is very expensive, with a particular model/settings.
-        skip_bias_fusions (bool): Disable the two bias fusions (dropout and activation) which improve performance but
-            cause recompilations. In testing they still seem to result in higher performance despite the recompilations.
     """
     # Create the result directory if it does not exist.
-    bias_fusions = not skip_bias_fusions
     if wandb_tags is None:
         wandb_tags = []
     result_dir.mkdir(parents=True, exist_ok=True)
@@ -193,6 +189,7 @@ def main(
         progress_interval=log_every_n_steps,
         find_unused_parameters=True,
         ckpt_include_optimizer=True,
+        gradient_as_bucket_view=True,
         # FIXME there are intermittent errors with async checkpoint saving.
         #  see https://wandb.ai/clara-discovery/geneformer_bionemo2_goodslurm/runs/uAFi7DzI/logs
         # ckpt_async_save=True,
@@ -284,8 +281,9 @@ def main(
         ffn_hidden_size=512,
         num_attention_heads=4,
         seq_length=seq_length,
-        bias_dropout_fusion=bias_fusions,  # TODO fix the recompilation issue, but for now it's faster even with recompilations
-        bias_activation_fusion=bias_fusions,  # TODO same note as above. Set these to False to see recompilation go away
+        bias_dropout_fusion=True,  # TODO fix the recompilation issue, but for now it's faster even with recompilations
+        bias_activation_fusion=True,  # TODO same note as above. Set these to False to see recompilation go away
+        defer_embedding_wgrad_compute=pipeline_model_parallel_size > 1,
         params_dtype=get_autocast_dtype(precision),
         pipeline_dtype=get_autocast_dtype(precision),
         autocast_dtype=get_autocast_dtype(precision),  # setting this speeds things up a lot
@@ -624,12 +622,6 @@ def config_class_type(desc: str) -> Type[BioBertConfig]:
         help="Activate this and make sure a small training loop runs, this tells you that your settings are not "
         "triggering regular recompilations which can be very expensive for fused gpu kernels.",
     )
-    parser.add_argument(
-        "--skip-bias-fusions",
-        action="store_true",
-        default=False,
-        help="Deactivate bias fusions which seem to reduce precision but are slightly faster.",
-    )
 
     return parser
 
@@ -680,7 +672,6 @@ def entrypoint():
         gc_interval=args.gc_interval,
         aligned_megatron_ddp=args.aligned_megatron_ddp,
         recompilation_check=args.recompilation_check,
-        skip_bias_fusions=args.skip_bias_fusions,
     )
 
 
diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py b/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
index a3abbd3a7..68eac670d 100644
--- a/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
+++ b/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
@@ -201,6 +201,11 @@ def __init__(  # noqa: D107
         self.model_type = ModelType.encoder_or_decoder
         # Embeddings.
         if self.pre_process:
+            self.register_buffer(
+                "bert_position_id_tensor",
+                torch.arange(max_sequence_length, dtype=torch.long, requires_grad=False).unsqueeze(0),
+                persistent=False,
+            )
             self.embedding = LanguageModelEmbedding(
                 config=self.config,
                 vocab_size=self.vocab_size,
@@ -229,6 +234,20 @@ def __init__(  # noqa: D107
         # Output
         if post_process:
             # TODO: Make sure you are passing in the mpu_vocab_size properly
+            if self.config.defer_embedding_wgrad_compute:
+                # The embedding activation buffer preserves a reference to the input activations
+                # of the final embedding projection layer GEMM. It will hold the activations for
+                # all the micro-batches of a global batch for the last pipeline stage. Once we are
+                # done with all the back props for all the microbatches for the last pipeline stage,
+                # it will be in the pipeline flush stage. During this pipeline flush we use the
+                # input activations stored in embedding activation buffer and gradient outputs
+                # stored in gradient buffer to calculate the weight gradients for the embedding
+                # final linear layer.
+                self.embedding_activation_buffer = []
+                self.grad_output_buffer = []
+            else:
+                self.embedding_activation_buffer = None
+                self.grad_output_buffer = None
 
             self.lm_head = BertLMHead(
                 config.hidden_size,
@@ -245,6 +264,8 @@ def __init__(  # noqa: D107
                 skip_bias_add=False,
                 gather_output=not self.parallel_output,
                 skip_weight_param_allocation=pre_process and share_embeddings_and_output_weights,
+                embedding_activation_buffer=self.embedding_activation_buffer,
+                grad_output_buffer=self.grad_output_buffer,
             )
 
             self.binary_head = None
@@ -296,9 +317,9 @@ def bert_extended_attention_mask(self, attention_mask: Tensor) -> Tensor:
     def bert_position_ids(self, token_ids):  # noqa: D102
         # Create position ids
         seq_length = token_ids.size(1)
-        position_ids = torch.arange(seq_length, dtype=torch.long, device=token_ids.device)
-        position_ids = position_ids.unsqueeze(0).expand_as(token_ids)
-        return position_ids
+        if seq_length != self.max_sequence_length:
+            return self.bert_position_id_tensor[:, :seq_length]
+        return self.bert_position_id_tensor  # No need to subset so skip the slice op
 
     def embedding_forward(
         self,
@@ -337,6 +358,7 @@ def forward(
         tokentype_ids: Optional[Tensor] = None,
         lm_labels: Optional[Tensor] = None,
         inference_params: Any | None = None,
+        runtime_gather_output: Optional[bool] = None,
     ) -> BioBertOutput | Tensor:
         """Forward function of BERT model
 
@@ -415,6 +437,7 @@ def forward(
 
         hidden_states_after_lm_head = self.lm_head(hidden_states=hidden_states)
         if not self.skip_logits:
+            # TODO add , runtime_gather_output=runtime_gather_output once supported in ColumnParallelLinear
             logits, _ = self.output_layer(hidden_states_after_lm_head, weight=output_weight)
         else:
             logits = None

From 73a2d215edd9110d7749123e228ea4d2c875a380 Mon Sep 17 00:00:00 2001
From: John St John <jstjohn@nvidia.com>
Date: Mon, 11 Nov 2024 04:44:15 +0000
Subject: [PATCH 08/12] Unset NVTE

---
 Dockerfile | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 2a74224dd..e05417f04 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -166,7 +166,9 @@ RUN <<EOF
 EOF
 
 # Transformer engine attention defaults
-ENV NVTE_FUSED_ATTN=1 NVTE_FLASH_ATTN=0
+# FIXME the following result in unstable training curves even if they are faster
+#  see https://github.com/NVIDIA/bionemo-framework/pull/421
+#ENV NVTE_FUSED_ATTN=1 NVTE_FLASH_ATTN=0
 
 FROM dev AS development
 
@@ -207,4 +209,6 @@ RUN chmod 777 -R /workspace/bionemo2/
 
 # Transformer engine attention defaults
 # We have to declare this again because the devcontainer splits from the release image's base.
-ENV NVTE_FUSED_ATTN=1 NVTE_FLASH_ATTN=0
+# FIXME the following results in unstable training curves even if faster. 
+#  See https://github.com/NVIDIA/bionemo-framework/pull/421
+#ENV NVTE_FUSED_ATTN=1 NVTE_FLASH_ATTN=0

From 4ff86f0424cf05ce2373992a4c7c249b73bc0631 Mon Sep 17 00:00:00 2001
From: John St John <jstjohn@nvidia.com>
Date: Mon, 11 Nov 2024 05:25:06 +0000
Subject: [PATCH 09/12] Fix format

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index e05417f04..6f900f9ca 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -209,6 +209,6 @@ RUN chmod 777 -R /workspace/bionemo2/
 
 # Transformer engine attention defaults
 # We have to declare this again because the devcontainer splits from the release image's base.
-# FIXME the following results in unstable training curves even if faster. 
+# FIXME the following results in unstable training curves even if faster.
 #  See https://github.com/NVIDIA/bionemo-framework/pull/421
 #ENV NVTE_FUSED_ATTN=1 NVTE_FLASH_ATTN=0

From 2f05f9c6504343db3c015ffa1eb98a1d001a9b4d Mon Sep 17 00:00:00 2001
From: John St John <jstjohn@nvidia.com>
Date: Tue, 12 Nov 2024 17:11:35 +0000
Subject: [PATCH 10/12] remove mention of NVTE from tests/docs/etc

---
 README.md                                                  | 6 ------
 docs/docs/user-guide/examples/bionemo-esm2/pretrain.md     | 3 ---
 scripts/protein/esm2/test_esm2_pretrain.py                 | 7 -------
 .../bionemo-llm/src/bionemo/llm/model/biobert/model.py     | 3 +--
 4 files changed, 1 insertion(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 762ca1259..5b3b64f80 100644
--- a/README.md
+++ b/README.md
@@ -186,9 +186,6 @@ export MY_DATA_SOURCE="pbss"
 
 ```bash
 # The fastest transformer engine environment variables in testing were the following two
-export NVTE_FUSED_ATTN=1
-export NVTE_FLASH_ATTN=0
-
 TEST_DATA_DIR=$(download_bionemo_data esm2/testdata_esm2_pretrain:2.0 --source $MY_DATA_SOURCE); \
 ESM2_650M_CKPT=$(download_bionemo_data esm2/650m:2.0 --source $MY_DATA_SOURCE); \
 python  \
@@ -248,9 +245,6 @@ and DataModule types.
 > ⚠️ **Warning:** This setup does NO configuration of Weights and Biases. Edit your config JSON and populate it with your WandB details.
 
 ```
-export NVTE_FUSED_ATTN=1
-export NVTE_FLASH_ATTN=0
-
 bionemo-esm2-train \
 --data-config-t bionemo.esm2.run.config_models.ESM2DataConfig \
 --model-config-t bionemo.esm2.run.config_models.ExposedESM2PretrainConfig \
diff --git a/docs/docs/user-guide/examples/bionemo-esm2/pretrain.md b/docs/docs/user-guide/examples/bionemo-esm2/pretrain.md
index 684578596..101d67d2f 100644
--- a/docs/docs/user-guide/examples/bionemo-esm2/pretrain.md
+++ b/docs/docs/user-guide/examples/bionemo-esm2/pretrain.md
@@ -280,9 +280,6 @@ llm.train(
 Or simply call `esm2_pretrain.py` directly.
 ```bash
 # Enable fused attention in transformer engine for speed-up
-export NVTE_FUSED_ATTN=1
-export NVTE_FLASH_ATTN=0
-
 DATA_DIR=$(download_bionemo_data esm2/testdata_esm2_pretrain:2.0 --source ngc)
 
 python scripts/protein/esm2/esm2_pretrain.py \
diff --git a/scripts/protein/esm2/test_esm2_pretrain.py b/scripts/protein/esm2/test_esm2_pretrain.py
index c2253d46d..57f821c96 100644
--- a/scripts/protein/esm2/test_esm2_pretrain.py
+++ b/scripts/protein/esm2/test_esm2_pretrain.py
@@ -90,8 +90,6 @@ def test_main_runs(monkeypatch, tmpdir, dummy_protein_dataset, dummy_parquet_tra
     result_dir = Path(tmpdir.mkdir("results"))
 
     with megatron_parallel_state_utils.distributed_model_parallel_state():
-        monkeypatch.setenv("NVTE_FUSED_ATTN", "1")
-        monkeypatch.setenv("NVTE_FLASH_ATTN", "0")
         main(
             train_cluster_path=train_cluster_path,
             train_database_path=dummy_protein_dataset,
@@ -159,8 +157,6 @@ def test_val_dataloader_in_main_runs_with_limit_val_batches(
     result_dir = Path(tmpdir.mkdir("results"))
 
     with megatron_parallel_state_utils.distributed_model_parallel_state():
-        monkeypatch.setenv("NVTE_FUSED_ATTN", "1")
-        monkeypatch.setenv("NVTE_FLASH_ATTN", "0")
         main(
             train_cluster_path=train_cluster_path,
             train_database_path=dummy_protein_dataset,
@@ -239,9 +235,6 @@ def test_pretrain_cli(tmpdir, dummy_protein_dataset, dummy_parquet_train_val_inp
     # a local copy of the environment
     env = dict(**os.environ)
     env["MASTER_PORT"] = str(open_port)
-    env["NVTE_FUSED_ATTN"] = "1"
-    env["NVTE_FLASH_ATTN"] = "0"
-
     cmd = shlex.split(cmd_str)
     result = subprocess.run(
         cmd,
diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py b/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
index 68eac670d..872b62d45 100644
--- a/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
+++ b/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
@@ -525,8 +525,7 @@ def configure_model(self, tokenizer: AutoTokenizer) -> MegatronBioBertModelType:
                 self.num_layers // p_size
             ) % vp_size == 0, "Make sure the number of model chunks is the same across all pipeline stages."
 
-        # The local specs all require the standard full attention mask. For transformer engine only the NVTE_FLASH_ATTN=0
-        #  option requires this full attention mask.
+        # The local specs all require the standard full attention mask.
         use_full_attention_mask: bool = "transformer_engine" not in self.biobert_spec_option
         do_next_sentence = False
         if self.model_cls is None:

From dcc75ecfe5e9d5466284d7cf08d86cd4c24a6544 Mon Sep 17 00:00:00 2001
From: John St John <jstjohn@nvidia.com>
Date: Tue, 12 Nov 2024 17:34:08 +0000
Subject: [PATCH 11/12] reduce atol check for geneformer

---
 .../tests/bionemo/geneformer/test_stop_and_go.py            | 1 +
 .../src/bionemo/testing/harnesses/stop_and_go.py            | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py
index 382084413..a33a6ff1a 100644
--- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py
+++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py
@@ -97,6 +97,7 @@ class TestGeneformerStopAndGo(stop_and_go.StopAndGoHarness):
     limit_val_batches: int = 2
     lr: float = 1e-4
     precision: Literal["16-mixed", "bf16-mixed", "32"] = MODEL_PRECISION
+    train_output_atol: float = 2e-2
 
     @override
     @classmethod
diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/harnesses/stop_and_go.py b/sub-packages/bionemo-testing/src/bionemo/testing/harnesses/stop_and_go.py
index d811c997c..afc12a648 100644
--- a/sub-packages/bionemo-testing/src/bionemo/testing/harnesses/stop_and_go.py
+++ b/sub-packages/bionemo-testing/src/bionemo/testing/harnesses/stop_and_go.py
@@ -106,6 +106,8 @@ class StopAndGoHarness(ABC):
     limit_val_batches: int
     lr: float = 1e-4
     precision: Literal["16-mixed", "bf16-mixed", "32"]
+    train_output_atol: float = 1e-3
+    other_output_atol: float = 1e-4
 
     # class variables that will be setup in setUpClass
     tempdir: tempfile.TemporaryDirectory
@@ -336,9 +338,9 @@ def test_stop_and_go_consistency(self, callback_type):
         assert interrupted_callback.data, f"No data found for {callback_type}"
 
         if callback_type == testing_callbacks.TrainOutputCallback:
-            atol = 1e-3
+            atol = self.train_output_atol
         else:
-            atol = 1e-4
+            atol = self.other_output_atol
 
         recursive_assert_approx_equal(interrupted_callback.data, continuous_callback.data, atol=atol)
 

From 49ad7cdcb6276e07e0e9ae5bdac1c1718128597e Mon Sep 17 00:00:00 2001
From: John St John <jstjohn@nvidia.com>
Date: Tue, 12 Nov 2024 18:24:58 +0000
Subject: [PATCH 12/12] update another failing ci test for geneformer

---
 .../tests/bionemo/geneformer/test_stop_and_go.py          | 2 +-
 .../src/bionemo/testing/harnesses/stop_and_go.py          | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py
index a33a6ff1a..4c4a984b7 100644
--- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py
+++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py
@@ -97,7 +97,7 @@ class TestGeneformerStopAndGo(stop_and_go.StopAndGoHarness):
     limit_val_batches: int = 2
     lr: float = 1e-4
     precision: Literal["16-mixed", "bf16-mixed", "32"] = MODEL_PRECISION
-    train_output_atol: float = 2e-2
+    train_val_output_atol: float = 2e-2
 
     @override
     @classmethod
diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/harnesses/stop_and_go.py b/sub-packages/bionemo-testing/src/bionemo/testing/harnesses/stop_and_go.py
index afc12a648..7b9a82c43 100644
--- a/sub-packages/bionemo-testing/src/bionemo/testing/harnesses/stop_and_go.py
+++ b/sub-packages/bionemo-testing/src/bionemo/testing/harnesses/stop_and_go.py
@@ -106,7 +106,7 @@ class StopAndGoHarness(ABC):
     limit_val_batches: int
     lr: float = 1e-4
     precision: Literal["16-mixed", "bf16-mixed", "32"]
-    train_output_atol: float = 1e-3
+    train_val_output_atol: float = 1e-3
     other_output_atol: float = 1e-4
 
     # class variables that will be setup in setUpClass
@@ -338,7 +338,7 @@ def test_stop_and_go_consistency(self, callback_type):
         assert interrupted_callback.data, f"No data found for {callback_type}"
 
         if callback_type == testing_callbacks.TrainOutputCallback:
-            atol = self.train_output_atol
+            atol = self.train_val_output_atol
         else:
             atol = self.other_output_atol
 
@@ -390,8 +390,8 @@ def test_stop_and_go_consistency_with_uneven_validation_sizes(self, callback_typ
         interrupted_data = interrupted_callback.data[-len(continuous_callback.data) :]
 
         if callback_type == testing_callbacks.ValidOutputCallback:
-            atol = 1e-3
+            atol = self.train_val_output_atol
         else:
-            atol = 1e-4
+            atol = self.other_output_atol
 
         recursive_assert_approx_equal(interrupted_data, continuous_callback.data, atol=atol)