Move to cuda unconditionally so pp-only run works
wconstab committed Feb 9, 2024
1 parent 7a8a9ec commit 449c824
Showing 1 changed file with 3 additions and 0 deletions.
3 changes: 3 additions & 0 deletions torchtrain/parallelisms/parallelize_llama.py
@@ -73,6 +73,9 @@ def parallelize_llama(model, world_mesh, parallel_dims, args):
     # wrap the rest layers with FSDP
     model = wrap(model.cuda())

+    # redundant if FSDP is used, but ensures the model is on device consistently regardless of which parallelisms were used
+    model.cuda()
+
     rank0_log("Applied parallelisms to the model...")

     return model
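The rationale behind the change can be sketched without torch: when the FSDP branch is taken, the model is already moved to the GPU as part of wrapping, but in a pipeline-parallel-only run that branch is skipped and the model stays on CPU. An unconditional `.cuda()` at the end is a no-op in the first case and required in the second. The sketch below uses a hypothetical `DummyModel` stand-in (not part of torchtrain) to illustrate the pattern:

```python
class DummyModel:
    """Stand-in for an nn.Module; tracks its device (illustrative assumption)."""
    def __init__(self):
        self.device = "cpu"

    def cuda(self):
        self.device = "cuda"
        return self


def parallelize(model, use_fsdp):
    if use_fsdp:
        # FSDP wrapping path already moves the model to GPU
        model = model.cuda()
    # Unconditional move: redundant after FSDP, but guarantees device
    # placement when only pipeline parallelism is enabled
    model.cuda()
    return model


assert parallelize(DummyModel(), use_fsdp=True).device == "cuda"
assert parallelize(DummyModel(), use_fsdp=False).device == "cuda"
```

Since `Module.cuda()` moves parameters in place, the extra call costs nothing when the model is already on the GPU, which is why it is safe to apply unconditionally.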
