Merge pull request #57 from krasserm/wip-fix-activation-checkpoint
Fix activation checkpointing

- fixes #56
krasserm authored Jan 2, 2024
2 parents 4ac9b2c + 192b80b commit b08d160
Showing 3 changed files with 34 additions and 6 deletions.
3 changes: 1 addition & 2 deletions examples/training/img_clf/train.py
@@ -47,15 +47,14 @@ def configure_optimizers(self):
     num_latent_channels=128,
 )
 
 
 if __name__ == "__main__":
     lit_model = LitImageClassifier.create(config)
 
     trainer = pl.Trainer(
         accelerator="gpu",
         devices=2,
         max_epochs=30,
-        strategy=DDPStrategy(find_unused_parameters=False),
+        strategy=DDPStrategy(find_unused_parameters=False, static_graph=True),
         logger=TensorBoardLogger(save_dir="logs", name="img_clf"),
     )
 
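For context (not part of the diff): activation checkpointing re-runs checkpointed forward segments during the backward pass, which can trip up DDP's gradient-ready bookkeeping unless the graph is declared static; static_graph=True (mirrored by the ddp_static_graph strategy name in train.sh below) helps avoid errors about variables being marked ready more than once. A minimal sketch of the same option on a raw DistributedDataParallel wrapper, assuming an already-initialized process group and using a hypothetical wrap_for_ddp helper name:

# Sketch only, not part of the commit: the same static-graph option on a plain DDP wrapper.
import torch
from torch.nn.parallel import DistributedDataParallel

def wrap_for_ddp(model: torch.nn.Module, device_id: int) -> DistributedDataParallel:
    # static_graph=True promises DDP that the set of used parameters and their
    # reduction order do not change across iterations, which keeps its gradient
    # hooks consistent when checkpointed segments are recomputed in backward.
    return DistributedDataParallel(model, device_ids=[device_id], static_graph=True)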
1 change: 1 addition & 0 deletions examples/training/img_clf/train.sh
mode change 100644 → 100755 (the script is now executable)
@@ -19,6 +19,7 @@ python -m perceiver.scripts.vision.image_classifier fit \
 --trainer.accelerator=gpu \
 --trainer.devices=2 \
 --trainer.max_epochs=30 \
+--trainer.strategy=ddp_static_graph \
 --trainer.logger=TensorBoardLogger \
 --trainer.logger.save_dir=logs \
 --trainer.logger.name=img_clf
36 changes: 32 additions & 4 deletions perceiver/model/core/modules.py
@@ -406,7 +406,7 @@ def __init__(
         ]
 
         if activation_checkpointing:
-            layers = [checkpoint_wrapper(layer, offload_to_cpu=activation_offloading) for layer in layers]
+            layers = [activation_checkpoint_wrapper(layer, offload_to_cpu=activation_offloading) for layer in layers]
 
         self.num_rotary_layers = num_rotary_layers
         super().__init__(*layers)
@@ -543,7 +543,8 @@ def cross_attn():
                 residual_dropout=residual_dropout,
             )
             return (
-                checkpoint_wrapper(layer, offload_to_cpu=activation_offloading) if activation_checkpointing else layer
+                activation_checkpoint_wrapper(layer, offload_to_cpu=activation_offloading)
+                if activation_checkpointing else layer
             )
 
         def self_attn():
@@ -659,7 +660,7 @@ def __init__(
         )
 
         if activation_checkpointing:
-            cross_attn = checkpoint_wrapper(cross_attn, offload_to_cpu=activation_offloading)
+            cross_attn = activation_checkpoint_wrapper(cross_attn, offload_to_cpu=activation_offloading)
 
         self.cross_attn = cross_attn
         self._init_parameters(init_scale)
@@ -738,7 +739,8 @@ def cross_attn():
                 mlp_bias=False,
             )
             return (
-                checkpoint_wrapper(layer, offload_to_cpu=activation_offloading) if activation_checkpointing else layer
+                activation_checkpoint_wrapper(layer, offload_to_cpu=activation_offloading)
+                if activation_checkpointing else layer
             )
 
         def self_attn():
@@ -926,3 +928,29 @@ def forward(

         output.logits = self.output_adapter(output.last_hidden_state, txt_embedding=self.input_adapter.txt_embedding)
         return output
+
+
+def activation_checkpoint_wrapper(module: AbstractAttentionLayer, offload_to_cpu: bool = False):
+    abstract_attention_layer_original_forward = AbstractAttentionLayer.forward
+
+    module._activation_checkpointing_enabled = True
+
+    def _abstract_attention_layer_patched_forward(self, *args, **kwargs):
+        output = abstract_attention_layer_original_forward(self, *args, **kwargs)
+        if hasattr(self, "_activation_checkpointing_enabled") and self.training and isinstance(output, ModuleOutput):
+            return output.last_hidden_state
+        return output
+
+    AbstractAttentionLayer.forward = _abstract_attention_layer_patched_forward
+
+    module = checkpoint_wrapper(module, offload_to_cpu=offload_to_cpu)
+    module_original_forward = module.forward
+
+    def _module_patched_forward(*args, **kwargs):
+        output = module_original_forward(*args, **kwargs)
+        if isinstance(output, ModuleOutput):
+            return output
+        return ModuleOutput(last_hidden_state=output, kv_cache=None)
+
+    module.forward = _module_patched_forward
+    return module
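A note on the new helper (not part of the diff): the patched forwards let checkpoint_wrapper and the layers' structured outputs coexist. When checkpointing is enabled and the module is training, AbstractAttentionLayer.forward hands the plain last_hidden_state tensor to the checkpointing machinery, and the outer patched forward re-wraps the result into a ModuleOutput (with kv_cache=None) so callers keep the usual interface. A minimal, self-contained sketch of that tensor-in / structured-output-out pattern, assuming checkpoint_wrapper comes from fairscale.nn and using hypothetical ToyOutput/ToyLayer/ToyBlock names rather than the repository's classes:

# Sketch only, not the repository's implementation.
from dataclasses import dataclass

import torch
import torch.nn as nn
from fairscale.nn import checkpoint_wrapper  # assumed to be the wrapper used above


@dataclass
class ToyOutput:
    last_hidden_state: torch.Tensor


class ToyLayer(nn.Module):
    def __init__(self, dim: int = 16):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Exchange a plain tensor with the checkpointing machinery so this
        # forward can be recomputed during the backward pass.
        return self.proj(x)


class ToyBlock(nn.Module):
    def __init__(self, dim: int = 16, offload_to_cpu: bool = False):
        super().__init__()
        self.layer = checkpoint_wrapper(ToyLayer(dim), offload_to_cpu=offload_to_cpu)

    def forward(self, x: torch.Tensor) -> ToyOutput:
        # Rebuild the structured output outside the checkpointed segment.
        return ToyOutput(last_hidden_state=self.layer(x))

Usage, for example: out = ToyBlock()(torch.randn(2, 16)); out.last_hidden_state has the input's shape, and gradients flow through the checkpointed layer as usual.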
