Adding linear initialization for all networks
jloveric committed Dec 15, 2023
1 parent ad53b61 commit d8047d5
Showing 3 changed files with 21 additions and 15 deletions.
config/high_order_interpolation.yaml (5 additions & 0 deletions)

@@ -39,6 +39,11 @@ data:
   # each book is processed by a single process!
   pre_process_workers: 0
 
+initialize:
+  type: kaiming #linear
+  max_offset: 0.1
+  max_slope: 0.5
+
 max_epochs: 100
 accelerator: cuda
 batch_size: 256
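The new initialize block defaults to kaiming, so existing configs behave as before; setting the type to linear turns on the network-wide polynomial initialization that select_network (below) now performs. A minimal sketch of how such a block is consumed, assuming OmegaConf (which the repo already uses via DictConfig); the inline cfg literal is illustrative, not part of the commit:

# Sketch only: reading the new initialize block. The gate mirrors
# select_network() in networks.py below; values are from the YAML above.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {"initialize": {"type": "linear", "max_offset": 0.1, "max_slope": 0.5}}
)

if cfg.initialize.type == "linear":
    # Presumably max_slope / max_offset bound the slope and offset of the
    # linear functions the polynomial layers are initialized to represent.
    print(cfg.initialize.max_slope, cfg.initialize.max_offset)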
language_interpolation/networks.py (8 additions & 14 deletions)

@@ -9,11 +9,10 @@
     HighOrderMLP,
     HighOrderFullyConvolutionalNetwork,
     HighOrderTailFocusNetwork,
-    initialize_network_polynomial_layers,
-    initialize_polynomial_layer,
 )
 from high_order_layers_torch.positional_embeddings import ClassicSinusoidalEmbedding
 from high_order_layers_torch.layers import MaxAbsNormalizationLast, high_order_fc_layers
+from high_order_layers_torch.networks import initialize_network_polynomial_layers
 from torchmetrics import Accuracy
 from torch import Tensor
 import torch.nn.functional as F
@@ -280,11 +279,6 @@ def high_order_attention_block(
         device=device,
     )
 
-    initialize_polynomial_layer(query, max_slope=1.0, max_offset=0.0)
-    initialize_polynomial_layer(key, max_slope=1.0, max_offset=0.0)
-    initialize_polynomial_layer(value, max_slope=1.0, max_offset=0.0)
-    initialize_polynomial_layer(output, max_slope=1.0, max_offset=0.0)
-
     layer = HighOrderAttention(
         embed_dim=embed_dim,
         out_dim=out_dim,
@@ -366,10 +360,6 @@ def __init__(
             normalization=mlp_normalization,
         )
 
-        # initialize_network_polynomial_layers(
-        #     self._embedding_layer, max_slope=1.0, max_offset=0.0
-        # )
-
         for index, element in enumerate(layers[1:]):
             input_scale = 2.0
             # if index == 0:
@@ -407,9 +397,7 @@ def __init__(
             normalization=mlp_normalization,
         )
 
-        initialize_network_polynomial_layers(
-            self._output_layer, max_slope=1.0, max_offset=0.0
-        )
+
 
         # Make the positions 0 to max_context-1
         self.positional_embedding = (
@@ -581,6 +569,12 @@ def select_network(cfg: DictConfig, device: str = None):
f"Unrecognized model_type {cfg.model_type} should be high_order, high_order_input or high_order_conv!"
)

if cfg.initialize.type == 'linear':
logger.info('Performing linear initialization')
initialize_network_polynomial_layers(
model, max_slope=cfg.initialize.max_slope, max_offset=cfg.initialize.max_offset
)

return model


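Net effect in networks.py: the hard-coded initialize_polynomial_layer calls in high_order_attention_block and the per-module initialize_network_polynomial_layers calls in __init__ are gone, replaced by one config-gated pass over the fully assembled model. A minimal sketch of the resulting flow; maybe_linear_init is a hypothetical wrapper, and only the import and the initializer call are taken from the commit:

# Hypothetical helper; the import and the call are from the diff above.
from high_order_layers_torch.networks import initialize_network_polynomial_layers

def maybe_linear_init(model, cfg):
    # "kaiming" (the config default) keeps the layers' built-in
    # initialization; only "linear" triggers the extra pass.
    if cfg.initialize.type == "linear":
        initialize_network_polynomial_layers(
            model,
            max_slope=cfg.initialize.max_slope,
            max_offset=cfg.initialize.max_offset,
        )
    return model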
tests/test_attention_network.py (8 additions & 1 deletion)

@@ -7,6 +7,7 @@
 )
 from language_interpolation.lightning_datamodule import TransformerDataModule
 from high_order_layers_torch.layers import MaxAbsNormalizationLast
+from high_order_layers_torch.networks import initialize_network_polynomial_layers
 from omegaconf import DictConfig
 from language_interpolation.utils import generate_transformer_text
 import torch
@@ -41,7 +42,7 @@ def test_attention_network():

     network = HighOrderAttentionNetwork(
         layers=[
-            {"input": 10, "output": 10, "hidden": 10, "layers": 1, "segments": 3},
+            {"input": 10, "output": 10, "hidden": 10, "layers": 1, "segments": 3, "input_segments": 3},
             {"input": 10, "output": 5, "segments": 3},
             {"input": 5, "output": 5, "segments": 2},
         ],
@@ -55,6 +56,12 @@
         output_hidden_layers=1,
         output_hidden_width=5,
     )
+
+    initialize_network_polynomial_layers(
+        network, max_slope=1.0, max_offset=0.0
+    )
+
+
     result = network(input_data)
     print("final result", result)
     print("result", result.shape)
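The test picks up two of the changes: the first layer spec now carries an input_segments key, and the network-wide initializer is exercised directly on the attention network. A hedged restatement of the new first-layer spec; the values are copied from the test, while the reading of input_segments is an assumption, not something the commit documents:

# Values copied from the updated test. Interpreting "input_segments" as the
# segment count for the input/embedding layer, distinct from the hidden
# "segments", is an assumption; the commit does not document the key.
first_layer = {
    "input": 10,
    "output": 10,
    "hidden": 10,
    "layers": 1,
    "segments": 3,
    "input_segments": 3,  # added by this commit
}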
