diff --git a/parse_sweep.py b/parse_sweep.py new file mode 100644 index 00000000..9039b5d1 --- /dev/null +++ b/parse_sweep.py @@ -0,0 +1,57 @@ +""" +Input: a subdirectory containing the logs from various experiments +Output: a csv file with loss values, peak memory usage, throughout from each experiment +""" + +import csv +import os +import re + +import fire + +OUTPUT_FOLDER = '/home/vasiliy/local/tmp/torchtitan_outputs' + +# example: +# [rank0]:[INFO | root ]: step: 10 loss: 7.8774 memory: 0.44GiB(0.47%) tps: 997,458 mfu: 1.50% +# note that number of spaces between terms can vary +regex = r"- step:[ ]+([\d]+).*loss:[ ]+([\d\.]+).*memory:[ ]+([\d\.]+)GiB.*tps: ([\d\,]+).*mfu.*" + +def log_to_maybe_data(line): + res = re.search(regex, line) + if res is not None: + step, loss, memory_gib, wps = res.group(1), res.group(2), res.group(3), res.group(4) + return int(step), float(loss), float(memory_gib), int(wps.replace(',', '')) + else: + return None + +def run( + subfolder_prefix: str, + results_filename: str, +): + subfolder_prefix = str(subfolder_prefix) + + results = [['experiment', 'step', 'loss', 'memory_gib', 'tps']] + + for entry in os.scandir(OUTPUT_FOLDER): + if entry.is_dir() and subfolder_prefix in entry.path: + print(entry) + log_fname = f"{entry.path}/logs.txt" + short_path = entry.path.replace(f"{OUTPUT_FOLDER}/", '') + + with open(log_fname, 'r') as f: + lines = f.readlines() + for l in lines: + res = log_to_maybe_data(l) + if res is not None: + print(l.strip('\n')) + print(res) + results.append([short_path, *res]) + + with open(results_filename, 'w') as f: + writer = csv.writer(f) + writer.writerows(results) + + print('done') + +if __name__ == '__main__': + fire.Fire(run) diff --git a/test/test_te.py b/test/test_te.py new file mode 100644 index 00000000..f91170c1 --- /dev/null +++ b/test/test_te.py @@ -0,0 +1,159 @@ +import copy + +import torch +import torch.nn as nn + +# path hack, TODO remove +import sys +sys.path.insert(0, '/home/vasiliy/local/torchtitan/') +import torchtitan.te_utils as te_utils +from torchtitan.models.norms import build_norm +from torchtitan.models.llama.model import FeedForward, Attention, ModelArgs, precompute_freqs_cis + +import transformer_engine.pytorch as te +from transformer_engine.common.recipe import Format, DelayedScaling + +# torch.use_deterministic_algorithms(True) +torch.manual_seed(0) + +fp8_format = Format.HYBRID +fp8_recipe = DelayedScaling(fp8_format=fp8_format, amax_history_len=16, amax_compute_algo="max") +maybe_te_float8_ctx = te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe) + +def test_linear_module_swap(): + x = torch.randn(32, 32, device='cuda') + + m = nn.Sequential(nn.Linear(32, 32)).cuda() + te_utils.swap_linear_to_te_linear(m) + print(m) + m = torch.compile(m) + + with maybe_te_float8_ctx: + y = m(x) + y.sum().backward() + + print('done') + +# Subsection of TransformerBlock with only the ffn norm and the ffn +class NormFFNBlock(nn.Module): + def __init__(self, dim, hidden_dim, multiple_of): + super().__init__() + self.ffn_norm = build_norm("rmsnorm", dim, eps=1e-12) + self.feed_forward = FeedForward(dim, hidden_dim, multiple_of, None) + + def forward(self, h): + out = h + self.feed_forward(self.ffn_norm(h)) + return out + +class NormAttnBlock(nn.Module): + def __init__(self, model_args): + super().__init__() + self.attention_norm = build_norm("rmsnorm", model_args.dim, eps=1e-12) + self.attention = Attention(model_args) + self.model_args = model_args + self.freqs_cis = precompute_freqs_cis( + self.model_args.dim // 
self.model_args.n_heads,
+            # Need to compute until at least the max token limit for generation
+            # TODO: explain in docs/composability.md why we removed the 2x
+            # relaxing in our CP enablement PR
+            self.model_args.max_seq_len,
+            self.model_args.rope_theta,
+        ).cuda()
+
+    def forward(self, x):
+        x = self.attention_norm(x)
+        x = self.attention(x, self.freqs_cis)
+        return x
+
+def SQNR(x, y):
+    return 20 * torch.log10(
+        torch.linalg.norm(x) / torch.linalg.norm(x - y)
+    )
+
+def test_norm_attn_rewrite():
+    dim = 256
+    model_args = ModelArgs()
+    m = NormAttnBlock(model_args).cuda().bfloat16()
+    m_copy = copy.deepcopy(m)
+    te_utils.swap_norm_attn_to_te_friendly_norm_attn(m_copy)
+    print(m)
+
+    x = torch.randn(1, 128, model_args.dim).cuda().bfloat16()
+    x_copy = copy.deepcopy(x)
+
+    y = m(x)
+
+    y_copy = m_copy(x_copy)
+
+    print(torch.allclose(y, y_copy))
+    print(SQNR(y, y_copy))
+
+    te_utils.swap_te_friendly_norm_ffn_to_te_layernorm_linear(m_copy)
+    print(m)
+    y_copy2 = m_copy(x_copy)
+    print(torch.allclose(y_copy, y_copy2))
+    print(SQNR(y_copy, y_copy2))
+
+
+
+def test_norm_ffn_rewrite():
+    dim = 256
+    hidden_dim = 512
+    multiple_of = 1
+
+    x = torch.randn(1, 128, 256).cuda().bfloat16()
+    x_copy = copy.deepcopy(x)
+
+    m = NormFFNBlock(dim, hidden_dim, multiple_of).cuda().bfloat16()
+    m_copy = copy.deepcopy(m)
+    print(m)
+
+    y = m(x)
+    y.sum().backward()
+
+    te_utils.swap_norm_ffn_to_te_friendly_norm_ffn(m_copy)
+    print(m_copy)
+
+    y_copy = m_copy(x_copy)
+    y_copy.sum().backward()
+
+    # TODO: debug why not an exact match
+    print(torch.allclose(y, y_copy))
+    print(SQNR(y, y_copy))
+
+    # TODO test w13
+    # assert torch.allclose(m.ffn.w2.grad, m_copy.ffn.w2.grad, atol=0, rtol=0)
+
+    te_utils.swap_te_friendly_norm_ffn_to_te_layernorm_linear(m_copy)
+    print(m_copy)
+
+    y_copy2 = m_copy(x_copy)
+    print(torch.allclose(y_copy, y_copy2))
+    print(SQNR(y_copy, y_copy2))
+
+# works, so a bug in the swap above?
+def test_split_linear():
+    M, K, N = 32, 64, 128
+    # M, K, N = 4, 6, 8
+
+    x = torch.randn(M, K)
+
+    fc1 = nn.Linear(K, N, bias=False)
+    fc2 = nn.Linear(K, N, bias=False)
+
+    fc3 = nn.Linear(K, N * 2, bias=False)
+    fc3.weight = torch.nn.Parameter(
+        torch.cat([copy.deepcopy(fc1.weight), copy.deepcopy(fc2.weight)], dim=0)
+    )
+
+    y1 = fc1(x)
+    y2 = fc2(x)
+    y3 = fc3(x)
+    y3_1, y3_2 = torch.split(y3, fc3.out_features // 2, dim=-1)
+
+    assert torch.allclose(y1, y3_1)
+    assert torch.allclose(y2, y3_2)
+
+
+if __name__ == '__main__':
+    test_linear_module_swap()
+    test_norm_attn_rewrite()
+    test_norm_ffn_rewrite()
+    test_split_linear()
diff --git a/torchtitan/config_manager.py b/torchtitan/config_manager.py
index 814bd80f..bca6cd92 100644
--- a/torchtitan/config_manager.py
+++ b/torchtitan/config_manager.py
@@ -373,6 +373,58 @@ def __init__(self):
             action="store_true",
             help="Whether to compile the model",
         )
+        self.parser.add_argument(
+            "--training.compile_ln_linear",
+            action="store_true",
+            help="Whether to compile only the LNLinear blocks",
+        )
+        self.parser.add_argument(
+            "--training.compile_linear",
+            action="store_true",
+            help="Whether to compile only the Linear layers",
+        )
+        self.parser.add_argument(
+            "--training.horizontally_fuse_fcs",
+            action="store_true",
+            help="""
+            If true, fuses ffn.w1 and ffn.w3 into a single linear (NormFeedForward.norm_w13).
+            Note that this is required to use te.LayerNormLinear for FFNs.
+            TODO also implement this for attention. 
+ """, + ) + self.parser.add_argument( + "--training.te_swap_linear", + action="store_true", + help=""" + If true, swaps torch.nn.Linear with te.Linear + (not for land) + + Note: + * requires training.te_float8_autocast to use float8 + """, + ) + self.parser.add_argument( + "--training.te_swap_ln_linear", + action="store_true", + help=""" + If true, swaps NormFeedForward.norm_w13 from + nn.Sequential(RMSNorm, nn.Linear) to te.LayerNormLinear + (not for land) + + Note: + * requires training.horizontally_fuse_fcs to enable this swap + * this swap happens strictly before `training.te_swap_linear` if both are enabled + * requires training.te_float8_autocast to use float8 + """, + ) + self.parser.add_argument( + "--training.te_float8_autocast", + action="store_true", + help=""" + If true, enables TE's float8 autocast context manager + (not for land) + """, + ) self.parser.add_argument( "--training.gc_freq", type=int, diff --git a/torchtitan/float8.py b/torchtitan/float8.py index 1dd0d0bb..64b6615b 100644 --- a/torchtitan/float8.py +++ b/torchtitan/float8.py @@ -31,6 +31,7 @@ def _is_sm89_or_later(): class Float8Handler: def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims): self.enabled = False + self.job_config = job_config float8_config = job_config.float8 if not float8_config.enable_float8_linear: @@ -92,16 +93,25 @@ def convert_to_float8_training(self, model: nn.Module): from torchao.float8 import convert_to_float8_training + if self.job_config.training.compile_ln_linear: + # only convert compiled regions to float8 + module_filter_fn=lambda mod, fqn: (fqn != "output" and "norm_" in fqn) + else: + module_filter_fn=lambda mod, fqn: fqn != "output" + # Mutates the model inplace replacing instances of nn.Linear with Float8Linear convert_to_float8_training( model, config=self.config, - module_filter_fn=lambda mod, fqn: fqn != "output", + module_filter_fn=module_filter_fn, + # module_filter_fn=lambda mod, fqn: fqn != "output", + # module_filter_fn=lambda mod, fqn: fqn != "output" and "norm_w13" in fqn, ) logger.info( "Swapped to Float8Linear layers with enable_fsdp_float8_all_gather=" f"{self.config.enable_fsdp_float8_all_gather}" ) + print(model) def precompute_float8_dynamic_scale_for_fsdp( self, model: Union[nn.Module, List[nn.Module]] diff --git a/torchtitan/models/llama/__init__.py b/torchtitan/models/llama/__init__.py index 887a96cd..61080843 100644 --- a/torchtitan/models/llama/__init__.py +++ b/torchtitan/models/llama/__init__.py @@ -29,10 +29,12 @@ } llama3_configs = { - "debugmodel": ModelArgs(dim=256, n_layers=8, n_heads=16, rope_theta=500000), + # "debugmodel": ModelArgs(dim=256, n_layers=8, n_heads=16, rope_theta=500000), + "debugmodel": ModelArgs(dim=256, n_layers=1, n_heads=16, rope_theta=500000), "8B": ModelArgs( dim=4096, n_layers=32, + # n_layers=1, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, diff --git a/torchtitan/parallelisms/parallelize_llama.py b/torchtitan/parallelisms/parallelize_llama.py index 4d4c60bc..6b38a9d1 100644 --- a/torchtitan/parallelisms/parallelize_llama.py +++ b/torchtitan/parallelisms/parallelize_llama.py @@ -22,6 +22,7 @@ from torch.distributed._tensor import Replicate, Shard from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( checkpoint_wrapper as ptd_checkpoint_wrapper, + apply_activation_checkpointing, ) from torch.distributed.tensor.parallel import ( ColwiseParallel, @@ -75,7 +76,7 @@ def parallelize_llama( "fused_rmsnorm is not compatible with torch.compile yet. " "Please use rmsnorm or layernorm." 
) - apply_compile(model) + apply_compile(model, job_config) if ( parallel_dims.dp_shard_enabled @@ -243,8 +244,20 @@ def apply_tp( } +import transformer_engine.pytorch as te +rng_seed = 1234 +torch.manual_seed(rng_seed) +torch.cuda.manual_seed(rng_seed) +CUDA_RNG_STATES_TRACKER = te.distributed.CudaRNGStatesTracker() +CUDA_RNG_STATES_TRACKER.add("model-parallel-rng", rng_seed) + + +def get_cuda_rng_tracker(): + return CUDA_RNG_STATES_TRACKER + + def _apply_ac_to_transformer_block(module: nn.Module, ac_config): - valid_ac_modes = ("full", "selective") + valid_ac_modes = ("full", "selective", "full_te") if ac_config.mode not in valid_ac_modes: raise ValueError( f"Invalid AC mode: {ac_config.mode}. Valid modes: {valid_ac_modes}" @@ -252,6 +265,23 @@ def _apply_ac_to_transformer_block(module: nn.Module, ac_config): if ac_config.mode == "full": return ptd_checkpoint_wrapper(module, preserve_rng_state=False) + elif ac_config.mode == "full_te": + # copy-paste from https://github.com/NVIDIA/TransformerEngine/blob/64126aa8c469b2a97ace01f925f3d5786d5fd1bb/examples/pytorch/fsdp/fsdp.py, apply_fsdp_checkpointing + # note: + # LLaMa 3 8B on 8 H100s with this option: + # 42.27 GiB, 4880 tps, strictly worse than PT-D's full AC. Have not done debugging + # on the cause yet. + + wrapper = lambda m: ptd_checkpoint_wrapper( + m, + checkpoint_fn=te.distributed.checkpoint, + use_reentrant=False, + get_rng_state_tracker=get_cuda_rng_tracker, + ) + def check_fn(submodule): + return True + apply_activation_checkpointing(module, checkpoint_wrapper_fn=wrapper, check_fn=check_fn) + return module assert ac_config.mode == "selective", f"{ac_config.mode}" use_op_sac = ac_config.selective_ac_option == "op" @@ -314,16 +344,42 @@ def apply_ac(model: nn.Module, ac_config): logger.info(f"Applied {ac_config.mode} activation checkpointing to the model") -def apply_compile(model: nn.Module): +def apply_compile(model: nn.Module, job_config): """ Apply torch.compile to each TransformerBlock, which makes compilation efficient due to repeated structure. Alternatively one can compile the whole model (after applying DP). 
""" - for layer_id, transformer_block in model.layers.named_children(): - transformer_block = torch.compile(transformer_block, fullgraph=True) - model.layers.register_module(layer_id, transformer_block) + if job_config.training.compile_ln_linear: + def _apply_compile(mod): + for name, child in mod.named_children(): + # hacky check, but good enough for this use case + if isinstance(child, torch.nn.Sequential) and len(child) == 2: + new_child = torch.compile(child) + setattr(mod, name, new_child) + else: + _apply_compile(child) + + logger.info("Compiling each LNLinear with torch.compile") + _apply_compile(model) + elif job_config.training.compile_linear: + def _apply_compile(mod): + for name, child in mod.named_children(): + # hacky check, but good enough for this use case + if isinstance(child, torch.nn.Linear): + new_child = torch.compile(child) + setattr(mod, name, new_child) + else: + _apply_compile(child) + + logger.info("Compiling each Linear with torch.compile") + _apply_compile(model) + else: + for layer_id, transformer_block in model.layers.named_children(): + # transformer_block = torch.compile(transformer_block, fullgraph=True) + transformer_block = torch.compile(transformer_block, fullgraph=False) + model.layers.register_module(layer_id, transformer_block) - logger.info("Compiling each TransformerBlock with torch.compile") + logger.info("Compiling each TransformerBlock with torch.compile") def apply_fsdp( diff --git a/torchtitan/te_utils.py b/torchtitan/te_utils.py new file mode 100644 index 00000000..dbfd3803 --- /dev/null +++ b/torchtitan/te_utils.py @@ -0,0 +1,297 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Utilities for testing TransformerEngine + +Note: I attempted to hack in DTensor-based TP/SP to te.Linear in the +link below, and gave up for now as it seemed to be a lot of remaining work. +We can power through that if needed later. +* https://gist.github.com/vkuzo/64d5362b63dd6c76410464e020d9a35f + +Note: I looked into using te.LayerNormLinear, and that would require changing +how Attention and FFN are defined in torchtitan to use a single gemm for +attn.kqv and ffn.w1_w3. Punting for now but we can do this later if needed. 
+""" + +import contextlib +import os + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from torchtitan.models.llama.model import apply_rotary_emb, repeat_kv + +# import transformer_engine as te +import transformer_engine.pytorch as te + +from transformer_engine.common.recipe import Format, DelayedScaling +te_fp8_format = Format.HYBRID +te_fp8_recipe = DelayedScaling(fp8_format=te_fp8_format, amax_history_len=16, amax_compute_algo="max") + +def swap_linear_to_te_linear(model, fqn=''): + for name, child in model.named_children(): + new_fqn = f"{fqn}.{name}" + # if isinstance(child, torch.nn.Linear) and new_fqn != 'output' and 'norm_' in new_fqn: + if isinstance(child, torch.nn.Linear) and name != 'output': + te_linear = te.Linear(child.in_features, child.out_features, bias=child.bias is not None) + te_linear.weight = child.weight + te_linear.bias = child.bias + setattr(model, name, te_linear) + else: + swap_linear_to_te_linear(child, new_fqn) + +class ResettableIdentity(nn.Identity): + def reset_parameters(self): + pass + + +class NormFeedForward(torch.nn.Module): + """ + A replacement for ffn_norm -> ffn which is TE swap friendly + """ + + def __init__(self, ffn_norm, ffn): + super().__init__() + # self.ffn_norm = ffn_norm + + # fuse w1 and w3, TE assumes this optimization is applied + w13_in_feat = ffn.w1.in_features + w13_out_feat = ffn.w1.out_features * 2 + with torch.device("meta"): + w13 = nn.Linear(w13_in_feat, w13_out_feat, bias=False) + w13.weight = torch.nn.Parameter( + torch.cat([ffn.w1.weight, ffn.w3.weight], dim=0).contiguous() + ) + + # wrapped in a sequential for easy swap to either te.LayerNorm or + # torch.compiling just this wrapper + self.norm_w13 = nn.Sequential(ffn_norm, w13) + + self.w2 = ffn.w2 + self.split_dim = getattr(self.norm_w13, "1").out_features // 2 + + def forward(self, x): + # x = self.ffn_norm(x) + # x = self.w13(x) + x = self.norm_w13(x) + w1_out, w3_out = torch.split( + x, + self.split_dim, + dim=-1, + ) + out = self.w2(F.silu(w1_out) * w3_out) + return out + + def init_weights(self, init_std: float): + if isinstance(self.norm_w13, te.LayerNormLinear): + torch.nn.init.ones_(self.norm_w13.layer_norm_weight) + + # slight difference from llama/model.py - init every weight to init_std + for linear in (self.w2, self.norm_w13): + nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std) + + else: + getattr(self.norm_w13, "0").reset_parameters() + + # slight difference from llama/model.py - init every weight to init_std + for linear in (self.w2, getattr(self.norm_w13, "1")): + nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std) + + +class NormAttention(torch.nn.Module): + """ + A replacement for attn_norm -> attn which is TE swap friendly + """ + def __init__(self, attn_norm, attn): + super().__init__() + + # fuse attn.qkv, TE assumes this optimization is applied + self.split_dim = attn.wq.out_features + with torch.device("meta"): + wqkv = nn.Linear(attn.wq.in_features, attn.wq.out_features * 3, bias=False) + wqkv.weight = torch.nn.Parameter( + torch.cat([attn.wq.weight, attn.wk.weight, attn.wv.weight], dim=0).contiguous() + ) + + self.norm_wqkv = nn.Sequential(attn_norm, wqkv) + self.wo = attn.wo + + self.n_heads = attn.n_heads + self.n_kv_heads = attn.n_kv_heads + self.n_rep = attn.n_rep + self.head_dim = attn.head_dim + + def forward(self, x, freqs_cis): + bs, seqlen, _ = x.shape + # xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) + x = self.norm_wqkv(x) + xq, xk, xv = torch.split( + x, + [ + self.n_heads * self.head_dim, 
+ self.n_kv_heads * self.head_dim, + self.n_kv_heads * self.head_dim, + ], + dim=-1, + ) + + # Use -1 instead of `n_heads` (or `n_kv_heads`) to infer the actual + # local heads from sizes of xq, xk, and xv as TP may have sharded them + # after the above linear ops. + xq = xq.view(bs, seqlen, -1, self.head_dim) + xk = xk.view(bs, seqlen, -1, self.head_dim) + xv = xv.view(bs, seqlen, -1, self.head_dim) + + xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) + + # repeat k/v heads if n_kv_heads < n_heads + keys = repeat_kv(xk, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) + values = repeat_kv(xv, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) + + xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + + # we use casual mask for training + output = F.scaled_dot_product_attention(xq, xk, xv, is_causal=True) + output = output.transpose( + 1, 2 + ).contiguous() # (bs, seqlen, n_local_heads, head_dim) + output = output.view(bs, seqlen, -1) + return self.wo(output) + + def init_weights(self, init_std: float): + if isinstance(self.norm_wqkv, te.LayerNormLinear): + torch.nn.init.ones_(self.norm_wqkv.layer_norm_weight) + + # slight difference from llama/model.py - init every weight to init_std + for linear in (self.wo, self.norm_wqkv): + nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std) + + else: + getattr(self.norm_wqkv, "0").reset_parameters() + + # slight difference from llama/model.py - init every weight to init_std + for linear in (self.wo, getattr(self.norm_wqkv, "1")): + nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std) + + + +def swap_norm_ffn_to_te_friendly_norm_ffn(parent_module) -> None: + """ + `parent_module` is a module with the following structure: + + parent_module + ffn_norm: LayerNorm|RMSNorm + ffn: FeedForward + w1 + w2 + w3 + + this function will rewrite the graph without changing numerics to the following structure + + parent_module + ffn_norm: ResettableIdentity + feed_forward: NormFeedForward + norm_w13: Sequential + 0: LayerNorm|RMSNorm + 1: Linear (fused w1 and w3) + w2: Linear + + this is done to then make it easier to then swap to te.LayerNormLinear + """ + if hasattr(parent_module, "ffn_norm") and hasattr(parent_module, "feed_forward"): + parent_module.feed_forward = NormFeedForward( + parent_module.ffn_norm, + parent_module.feed_forward, + ) + parent_module.ffn_norm = ResettableIdentity() + else: + for name, child in parent_module.named_children(): + swap_norm_ffn_to_te_friendly_norm_ffn(child) + +def swap_norm_attn_to_te_friendly_norm_attn(parent_module): + """ + `parent_module` is a module with the following structure: + + parent_module + attention_norm: LayerNorm|RMSNorm + attention: Attention + wq + wk + wv + wo + + this function will rewrite the graph without changing numerics to the following structure + + parent_module + attention_norm: ResettableIdentity + attention: NormAttention + norm_wqkv: Sequential + 0: LayerNorm|RMSNorm + 1: Linear (fused wq, wk, wv) + wo: Linear + + this is done to then make it easier to then swap to te.LayerNormLinear + """ + if hasattr(parent_module, "attention_norm") and hasattr(parent_module, "attention"): + parent_module.attention = NormAttention( + parent_module.attention_norm, + parent_module.attention, + ) + parent_module.attention_norm = ResettableIdentity() + else: + for name, child in parent_module.named_children(): + 
swap_norm_attn_to_te_friendly_norm_attn(child) + +def swap_te_friendly_norm_ffn_to_te_layernorm_linear(parent_module): + """ + In `NormFeedForward`, swaps `norm_w13` with `te.LayerNormLinear` + In `NormAttention`, swaps `norm_wqkv` with `te.LayerNormLinear` + """ + + if isinstance(parent_module, NormFeedForward): + + te_ln_linear = te.LayerNormLinear( + parent_module.norm_w13[1].in_features, + parent_module.norm_w13[1].out_features, + bias=False, + normalization='RMSNorm', + ) + + te_ln_linear.layer_norm_weight = parent_module.norm_w13[0].weight + te_ln_linear.weight = parent_module.norm_w13[1].weight + parent_module.norm_w13 = te_ln_linear + + elif isinstance(parent_module, NormAttention): + + te_ln_linear = te.LayerNormLinear( + parent_module.norm_wqkv[1].in_features, + parent_module.norm_wqkv[1].out_features, + bias=False, + normalization='RMSNorm', + ) + te_ln_linear.layer_norm_weight = parent_module.norm_wqkv[0].weight + te_ln_linear.weight = parent_module.norm_wqkv[1].weight + parent_module.norm_wqkv = te_ln_linear + + else: + for name, child in parent_module.named_children(): + swap_te_friendly_norm_ffn_to_te_layernorm_linear(child) + + +def get_maybe_fp8_autocast(job_config): + # not for land - set up TransformerEngine fp8 autocast + # Note: te.fp8_autocast has to be created at every training iteration. + # If we try to create it once and reuse, we get this error: + # https://gist.github.com/vkuzo/d9840328c8bdc2901b8d04aa570ecb5b + maybe_te_float8_ctx = contextlib.nullcontext() + if job_config.training.te_float8_autocast: + assert job_config.training.te_swap_linear or job_config.training.te_swap_ln_linear + maybe_te_float8_ctx = te.fp8_autocast(enabled=True, fp8_recipe=te_fp8_recipe) + return maybe_te_float8_ctx diff --git a/train.py b/train.py index 58c2bcba..09ff2135 100644 --- a/train.py +++ b/train.py @@ -19,6 +19,7 @@ from torchtitan.float8 import Float8Handler from torchtitan.logging import init_logger, logger from torchtitan.metrics import build_device_memory_monitor, build_metric_logger +import torchtitan.te_utils as te_utils from torchtitan.models import model_name_to_cls, model_name_to_tokenizer, models_config from torchtitan.optimizer import build_lr_schedulers, build_optimizers from torchtitan.parallelisms import ( @@ -113,11 +114,22 @@ def main(job_config: JobConfig): with torch.device("meta"): model = model_cls.from_model_args(model_config) + if job_config.training.horizontally_fuse_fcs: + # note: this is required for te.LayerNormLinear + te_utils.swap_norm_ffn_to_te_friendly_norm_ffn(model) + te_utils.swap_norm_attn_to_te_friendly_norm_attn(model) + # a no-op hander if float8 is not enabled float8_handler = Float8Handler(job_config, parallel_dims) # swap to Float8Linear based on float8 configs float8_handler.convert_to_float8_training(model) + # not for land - set up TransformerEngine + if job_config.training.te_swap_ln_linear: + te_utils.swap_te_friendly_norm_ffn_to_te_layernorm_linear(model) + if job_config.training.te_swap_linear: + te_utils.swap_linear_to_te_linear(model) + # log model size model_param_count = utils.get_num_params(model) num_flop_per_token = utils.get_num_flop_per_token( @@ -244,6 +256,8 @@ def loss_fn(pred, labels): checkpoint.reset() + print(model) + # train loop logger.info( f"Training starts at step {train_state.step + 1}, " @@ -285,7 +299,11 @@ def loss_fn(pred, labels): else None ) + # not for land - set up TransformerEngine fp8 autocast + maybe_te_float8_ctx = te_utils.get_maybe_fp8_autocast(job_config) + if parallel_dims.pp_enabled: + 
assert not job_config.training.use_te, "unsupported" # Pipeline Parallel forward / backward inside step() call is_last_stage = pp_mesh.get_local_rank() == pp_mesh.size() - 1 @@ -307,12 +325,13 @@ def loss_fn(pred, labels): else: # Non-PP forward / backward with train_context(optional_context_parallel_ctx): - pred = model(input_ids) - loss = loss_fn(pred, labels) - # pred.shape=(bs, seq_len, vocab_size) - # need to free to before bwd to avoid peaking memory - del pred - loss.backward() + with maybe_te_float8_ctx: + pred = model(input_ids) + loss = loss_fn(pred, labels) + # pred.shape=(bs, seq_len, vocab_size) + # need to free to before bwd to avoid peaking memory + del pred + loss.backward() # clip gradients utils.clip_grad_norm_( diff --git a/vasiliy_sweep.sh b/vasiliy_sweep.sh new file mode 100755 index 00000000..f94de0df --- /dev/null +++ b/vasiliy_sweep.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# terminate on first error +set -e + +# sweep over various important torchtitan + TE experiments + +OUTPUT_FOLDER=/home/vasiliy/local/tmp/torchtitan_outputs +OUTPUT_LOGFILE=logs.txt + +# need to loop over: +# 1. AC (none, full, selective with op) +# 2. experiment branches (TE and PT) + +for AC_SETTING in none selective full +do + + for NAME in baseline te_linear_f8 te_ln_linear_f8 pt_f8 pt_f8_fsdp_f8 + do + + if [ $NAME == "baseline" ]; then + EXTRA_ARGS="" + elif [ $NAME == "te_linear_f8" ]; then + EXTRA_ARGS="--training.te_swap_linear --training.te_float8_autocast" + elif [ $NAME == "te_ln_linear_f8" ]; then + EXTRA_ARGS="--training.te_swap_linear --training.te_swap_ln_linear --training.te_float8_autocast" + elif [ $NAME == "pt_f8" ]; then + EXTRA_ARGS="--float8.enable_float8_linear" + elif [ $NAME == "pt_f8_fsdp_f8" ]; then + EXTRA_ARGS="--float8.enable_float8_linear --float8.enable_fsdp_float8_all_gather --float8.precompute_float8_dynamic_scale_for_fsdp" + else + # should not get here + exit 1 + fi + + OUTPUT_SUBFOLDER="20241204_v2_llama3_8b_name_${NAME}_ac_${AC_SETTING}" + + # create the subdir if does not exist, `tee` needs this + mkdir -p $OUTPUT_FOLDER/$OUTPUT_SUBFOLDER + + CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh $EXTRA_ARGS \ + --job.dump_folder $OUTPUT_FOLDER/$OUTPUT_SUBFOLDER \ + --training.compile \ + --training.horizontally_fuse_fcs \ + --activation_checkpoint.mode $AC_SETTING \ + --activation_checkpoint.selective_ac_option 2 \ + --training.steps 200 \ + --profiling.profile_freq 100 2>&1 | tee $OUTPUT_FOLDER/$OUTPUT_SUBFOLDER/$OUTPUT_LOGFILE + + done + +done diff --git a/vasiliy_sweep_regional.sh b/vasiliy_sweep_regional.sh new file mode 100755 index 00000000..5e7c1c74 --- /dev/null +++ b/vasiliy_sweep_regional.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# terminate on first error +set -e + +# sweep over various important torchtitan + TE experiments + +OUTPUT_FOLDER=/home/vasiliy/local/tmp/torchtitan_outputs +OUTPUT_LOGFILE=logs.txt + +# need to loop over: +# 1. AC (none, full, selective with op) +# 2. 
experiment branches (TE and PT) + +for AC_SETTING in none full +do + + for NAME in baseline te_ln_linear_f8 pt_f8 pt_f8_fsdp_f8 + do + + if [ $NAME == "baseline" ]; then + EXTRA_ARGS="--training.compile --training.compile_ln_linear" + elif [ $NAME == "te_ln_linear_f8" ]; then + EXTRA_ARGS="--training.te_swap_ln_linear --training.te_float8_autocast" + elif [ $NAME == "pt_f8" ]; then + EXTRA_ARGS="--training.compile --training.compile_ln_linear --float8.enable_float8_linear" + elif [ $NAME == "pt_f8_fsdp_f8" ]; then + EXTRA_ARGS="--training.compile --training.compile_ln_linear --float8.enable_float8_linear --float8.enable_fsdp_float8_all_gather --float8.precompute_float8_dynamic_scale_for_fsdp" + else + # should not get here + exit 1 + fi + + # OUTPUT_SUBFOLDER="20241204_v3_regional_llama3_8b_name_${NAME}_ac_${AC_SETTING}" + # fixed - only enable compile for non-TE + OUTPUT_SUBFOLDER="20241204_v5_regional_llama3_8b_name_${NAME}_ac_${AC_SETTING}" + + # create the subdir if does not exist, `tee` needs this + mkdir -p $OUTPUT_FOLDER/$OUTPUT_SUBFOLDER + + CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh $EXTRA_ARGS \ + --job.dump_folder $OUTPUT_FOLDER/$OUTPUT_SUBFOLDER \ + --training.horizontally_fuse_fcs \ + --activation_checkpoint.mode $AC_SETTING \ + --activation_checkpoint.selective_ac_option 2 \ + --training.steps 200 \ + --profiling.profile_freq 100 2>&1 | tee $OUTPUT_FOLDER/$OUTPUT_SUBFOLDER/$OUTPUT_LOGFILE + + done + +done diff --git a/vasiliy_sweep_regional_linear.sh b/vasiliy_sweep_regional_linear.sh new file mode 100755 index 00000000..8fa38718 --- /dev/null +++ b/vasiliy_sweep_regional_linear.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# terminate on first error +set -e + +# sweep over various important torchtitan + TE experiments + +OUTPUT_FOLDER=/home/vasiliy/local/tmp/torchtitan_outputs +OUTPUT_LOGFILE=logs.txt + +# need to loop over: +# 1. AC (none, full, selective with op) +# 2. experiment branches (TE and PT) + +for AC_SETTING in none full +do + + for NAME in baseline te_ln_linear_f8 pt_f8 pt_f8_fsdp_f8 + do + + if [ $NAME == "baseline" ]; then + EXTRA_ARGS="--training.compile --training.compile_linear" + elif [ $NAME == "te_ln_linear_f8" ]; then + EXTRA_ARGS="--training.te_swap_linear --training.te_float8_autocast" + elif [ $NAME == "pt_f8" ]; then + EXTRA_ARGS="--training.compile --training.compile_linear --float8.enable_float8_linear" + elif [ $NAME == "pt_f8_fsdp_f8" ]; then + EXTRA_ARGS="--training.compile --training.compile_linear --float8.enable_float8_linear --float8.enable_fsdp_float8_all_gather --float8.precompute_float8_dynamic_scale_for_fsdp" + else + # should not get here + exit 1 + fi + + # v6 contained an error + OUTPUT_SUBFOLDER="20241204_v7_regional_linear_llama3_8b_name_${NAME}_ac_${AC_SETTING}" + + # create the subdir if does not exist, `tee` needs this + mkdir -p $OUTPUT_FOLDER/$OUTPUT_SUBFOLDER + + CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh $EXTRA_ARGS \ + --job.dump_folder $OUTPUT_FOLDER/$OUTPUT_SUBFOLDER \ + --training.horizontally_fuse_fcs \ + --activation_checkpoint.mode $AC_SETTING \ + --activation_checkpoint.selective_ac_option 2 \ + --training.steps 200 \ + --profiling.profile_freq 100 2>&1 | tee $OUTPUT_FOLDER/$OUTPUT_SUBFOLDER/$OUTPUT_LOGFILE + + done + +done
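
Not for land - the sweep scripts above dump one logs.txt per experiment and parse_sweep.py turns those into a single csv with columns experiment, step, loss, memory_gib, tps. Below is a small stdlib-only sketch of how that csv could then be collapsed into one summary row per experiment; the script name, the warmup-step cutoff, and the median/max aggregation choices are illustrative assumptions, not part of the sweep tooling above.

# summarize_sweep.py (hypothetical helper, not part of this PR)
"""
Input: the csv file written by parse_sweep.py
Output: one summary line per experiment (max memory, median tps, final loss)
"""

import csv
import statistics
from collections import defaultdict

import fire


# skip the first few steps so compile / warmup time does not skew throughput;
# the cutoff value is illustrative, tune as needed
WARMUP_STEPS = 20


def run(results_filename: str):
    # group rows by the experiment (subfolder) name
    rows_by_exp = defaultdict(list)
    with open(results_filename, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            rows_by_exp[row['experiment']].append(row)

    for exp, rows in sorted(rows_by_exp.items()):
        rows = sorted(rows, key=lambda r: int(r['step']))
        # only use steady-state steps for throughput, fall back to all rows
        steady = [r for r in rows if int(r['step']) >= WARMUP_STEPS] or rows
        max_mem = max(float(r['memory_gib']) for r in rows)
        med_tps = statistics.median(int(r['tps']) for r in steady)
        final_loss = float(rows[-1]['loss'])
        print(
            f"{exp}: max_memory_gib={max_mem:.2f} "
            f"median_tps={med_tps:,.0f} final_loss={final_loss:.4f}"
        )


if __name__ == '__main__':
    fire.Fire(run)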