Commit c19fc43
mlp w/ glu style gating
thayeral committed Jan 9, 2025
1 parent f7085ab commit c19fc43
Showing 6 changed files with 44 additions and 11 deletions.
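
The commit replaces the plain two-layer MLP inside the transformer blocks with a GLU-style gated MLP (timm's SwiGLU, with nn.SiLU as the activation) and threads act_layer / mlp_layer arguments through the encoder, masked encoder, masked predictor, masked autoencoder, and ViT so the choice stays configurable. As a rough illustration of what "GLU-style gating" means, here is a minimal PyTorch sketch; the class and attribute names are made up for the example, and this is not the timm implementation:

import torch
import torch.nn as nn

class PlainMlp(nn.Module):
    # Standard transformer MLP: project up, apply the activation, project back down.
    def __init__(self, dim, hidden, act=nn.GELU):
        super().__init__()
        self.fc1 = nn.Linear(dim, hidden)
        self.act = act()
        self.fc2 = nn.Linear(hidden, dim)

    def forward(self, x):
        return self.fc2(self.act(self.fc1(x)))

class GatedMlp(nn.Module):
    # GLU-style MLP: an activated branch is multiplied elementwise by a second,
    # purely linear projection of the same input. With act=nn.SiLU this is the
    # SwiGLU variant.
    def __init__(self, dim, hidden, act=nn.SiLU):
        super().__init__()
        self.fc_gate = nn.Linear(dim, hidden)   # branch that goes through the activation
        self.fc_value = nn.Linear(dim, hidden)  # linear branch used for gating
        self.act = act()
        self.fc_out = nn.Linear(hidden, dim)

    def forward(self, x):
        return self.fc_out(self.act(self.fc_gate(x)) * self.fc_value(x))

x = torch.randn(2, 16, 64)          # (batch, tokens, dim)
print(PlainMlp(64, 256)(x).shape)   # torch.Size([2, 16, 64])
print(GatedMlp(64, 256)(x).shape)   # torch.Size([2, 16, 64])
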
10 changes: 8 additions & 2 deletions src/encoder.py
@@ -5,7 +5,7 @@
import numpy as np
import torch
import torch.nn as nn
- from timm.layers import trunc_normal_, RmsNorm
+ from timm.layers import trunc_normal_, RmsNorm, SwiGLU

from transformer import Transformer

@@ -30,6 +30,8 @@ def __init__(
init_std=0.02,
fixed_dropout_depth=False,
norm_layer: nn.Module = RmsNorm,
+ act_layer: nn.Module = nn.SiLU,
+ mlp_layer: nn.Module = SwiGLU,
**kwargs,
):
super().__init__()
@@ -48,6 +50,8 @@ def __init__(
dpr = np.linspace(0, self.drop_path_rate, self.depth)

self.norm_layer = norm_layer
+ self.act_layer = act_layer
+ self.mlp_layer = mlp_layer

self.transformer_blocks = nn.ModuleList([
Transformer(
@@ -57,7 +61,9 @@ def __init__(
proj_drop=self.proj_drop_rate,
att_drop=self.att_drop_rate,
drop_path=self.drop_path_rate if fixed_dropout_depth and self.drop_path_rate > 0.0 else dpr[i],
- norm_layer=self.norm_layer
+ norm_layer=self.norm_layer,
+ act_layer=self.act_layer,
+ mlp_layer=self.mlp_layer,
)
for i in range(self.depth)
])
10 changes: 9 additions & 1 deletion src/maskedautoencoder.py
@@ -4,7 +4,7 @@

import torch
import torch.nn as nn
- from timm.layers import RmsNorm
+ from timm.layers import RmsNorm, SwiGLU

from maskedencoder import MaskedEncoder
from maskedpredictor import MaskedPredictor
@@ -114,6 +114,8 @@ def __init__(
init_std=0.02,
fixed_dropout_depth=False,
norm_layer: nn.Module = RmsNorm,
+ act_layer: nn.Module = nn.SiLU,
+ mlp_layer: nn.Module = SwiGLU,
use_conv_proj=False,
mask_ratio=.9,
window_mask_shape=None,
@@ -156,6 +158,8 @@ def __init__(

self.init_std = init_std
self.norm_layer = norm_layer
+ self.act_layer = act_layer
+ self.mlp_layer = mlp_layer

self.masked_encoder = MaskedEncoder(
input_fmt="BZYXC",
@@ -172,6 +176,8 @@ def __init__(
drop_path_rate=self.drop_path_rate,
fixed_dropout_depth=self.fixed_dropout_depth,
norm_layer=self.norm_layer,
+ act_layer=self.act_layer,
+ mlp_layer=self.mlp_layer,
init_std=self.init_std,
use_conv_proj=use_conv_proj,
cls_token=False,
@@ -194,6 +200,8 @@ def __init__(
drop_path_rate=self.drop_path_rate,
fixed_dropout_depth=self.fixed_dropout_depth,
norm_layer=self.norm_layer,
+ act_layer=self.act_layer,
+ mlp_layer=self.mlp_layer,
init_std=self.init_std,
cls_token=False,
)
8 changes: 7 additions & 1 deletion src/maskedencoder.py
@@ -4,7 +4,7 @@

import torch
import torch.nn as nn
- from timm.layers import RmsNorm
+ from timm.layers import RmsNorm, SwiGLU

from encoder import Encoder
from masking import apply_masks
@@ -90,6 +90,8 @@ def __init__(
init_std=0.02,
fixed_dropout_depth=False,
norm_layer: nn.Module = RmsNorm,
+ act_layer: nn.Module = nn.SiLU,
+ mlp_layer: nn.Module = SwiGLU,
use_conv_proj=False,
**kwargs,
):
@@ -122,6 +124,8 @@ def __init__(

self.init_std = init_std
self.norm_layer = norm_layer
+ self.act_layer = act_layer
+ self.mlp_layer = mlp_layer
self.norm = norm_layer(self.embed_dim) if norm_layer is not None else nn.Identity()

if use_conv_proj:
@@ -161,6 +165,8 @@ def __init__(
drop_path_rate=self.drop_path_rate,
fixed_dropout_depth=self.fixed_dropout_depth,
norm_layer=self.norm_layer,
+ act_layer=self.act_layer,
+ mlp_layer=self.mlp_layer,
init_std=self.init_std
)

8 changes: 7 additions & 1 deletion src/maskedpredictor.py
@@ -4,7 +4,7 @@

import torch
import torch.nn as nn
- from timm.layers import RmsNorm
+ from timm.layers import RmsNorm, SwiGLU

from encoder import Encoder
from patch_embeddings import PosEmbedding
@@ -91,6 +91,8 @@ def __init__(
init_std=0.02,
fixed_dropout_depth=False,
norm_layer: nn.Module = RmsNorm,
+ act_layer: nn.Module = nn.SiLU,
+ mlp_layer: nn.Module = SwiGLU,
**kwargs,
):
super().__init__()
@@ -124,6 +126,8 @@ def __init__(

self.init_std = init_std
self.norm_layer = norm_layer
+ self.act_layer = act_layer
+ self.mlp_layer = mlp_layer
self.norm = norm_layer(self.embed_dim) if norm_layer is not None else nn.Identity()

self.token_param = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
@@ -157,6 +161,8 @@ def __init__(
drop_path_rate=self.drop_path_rate,
fixed_dropout_depth=self.fixed_dropout_depth,
norm_layer=self.norm_layer,
+ act_layer=self.act_layer,
+ mlp_layer=self.mlp_layer,
init_std=self.init_std
)

9 changes: 5 additions & 4 deletions src/transformer.py
@@ -5,7 +5,7 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
- from timm.layers import Mlp, DropPath
+ from timm.layers import SwiGLU, DropPath

logging.basicConfig(
stream=sys.stdout,
@@ -84,6 +84,8 @@ def __init__(
att_drop: float = 0.,
drop_path: float = 0.,
norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6),
+ act_layer: nn.Module = nn.SiLU,
+ mlp_layer: nn.Module = SwiGLU,
) -> None:
super().__init__()
self.norm1 = norm_layer(dim)
@@ -99,12 +101,11 @@ def __init__(
self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

self.norm2 = norm_layer(dim)
- self.mlp = Mlp(
+ self.mlp = mlp_layer(
in_features=dim,
hidden_features=int(dim * mlp_ratio),
drop=proj_drop,
- act_layer=nn.GELU,
- use_conv=False,
+ act_layer=act_layer,
)
self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

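
For context on the swap above: timm's Mlp and the SwiGLU layer used here share the in_features / hidden_features / drop / act_layer constructor arguments and both map (batch, tokens, dim) back to the same shape, which is what makes mlp_layer a drop-in argument. The gated version does carry an extra input projection, so at the same hidden width it has roughly 1.5x the parameters. A quick comparison, assuming a recent timm release where SwiGLU accepts these keyword arguments:

import torch
import torch.nn as nn
from timm.layers import Mlp, SwiGLU

dim, mlp_ratio = 768, 4.0
hidden = int(dim * mlp_ratio)   # same expression as in the block above

mlp = Mlp(in_features=dim, hidden_features=hidden, act_layer=nn.GELU, drop=0.0)
glu = SwiGLU(in_features=dim, hidden_features=hidden, act_layer=nn.SiLU, drop=0.0)

def n_params(m):
    return sum(p.numel() for p in m.parameters())

print(n_params(mlp))  # roughly 2 * dim * hidden (plus biases): two projections
print(n_params(glu))  # roughly 3 * dim * hidden: the extra gate projection adds ~50%

x = torch.randn(2, 16, dim)
assert mlp(x).shape == glu(x).shape == x.shape  # interchangeable input/output shapes

Since the diff keeps hidden_features=int(dim * mlp_ratio), each block's MLP gets larger than before; a common convention from the GLU-variants literature, when parameter parity with the plain MLP is wanted, is to shrink the gated hidden width to about two thirds, e.g. int(dim * mlp_ratio * 2 / 3).
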
10 changes: 8 additions & 2 deletions src/vit.py
@@ -5,7 +5,7 @@

import torch
import torch.nn as nn
- from timm.layers import AttentionPoolLatent
+ from timm.layers import AttentionPoolLatent, Mlp
from timm.models.vision_transformer import global_pool_nlc

from encoder import Encoder
@@ -93,6 +93,8 @@ def __init__(
fixed_dropout_depth=False,
global_pool: Literal['', 'avg', 'avgmax', 'max', 'token', 'map'] = 'avgmax',
norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6),
+ act_layer = nn.GELU,
+ mlp_layer = Mlp,
use_conv_proj=False,
**kwargs,
):
@@ -126,6 +128,8 @@ def __init__(
self.init_std = init_std
self.global_pool = global_pool
self.norm_layer = norm_layer
+ self.act_layer = act_layer
+ self.mlp_layer = mlp_layer
self.norm = norm_layer(self.embed_dim) if norm_layer is not None else nn.Identity()

if use_conv_proj:
@@ -165,7 +169,9 @@ def __init__(
drop_path_rate=self.drop_path_rate,
fixed_dropout_depth=self.fixed_dropout_depth,
norm_layer=self.norm_layer,
- init_std=self.init_std
+ act_layer=self.act_layer,
+ mlp_layer=self.mlp_layer,
+ init_std=self.init_std,
)

self.global_pool = global_pool
