Adding linear initialization for all networks
jloveric committed Dec 15, 2023
1 parent ad53b61 commit d8047d5
Showing 3 changed files with 21 additions and 15 deletions.
config/high_order_interpolation.yaml (5 additions & 0 deletions)

@@ -39,6 +39,11 @@ data:
   # each book is processed by a single process!
   pre_process_workers: 0
 
+initialize:
+  type: kaiming #linear
+  max_offset: 0.1
+  max_slope: 0.5
+
 max_epochs: 100
 accelerator: cuda
 batch_size: 256
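The new initialize block defaults to kaiming, so existing configs behave as before; setting the type to linear turns on the network-wide polynomial initialization that select_network (below) now performs. A minimal sketch of how such a block is consumed, assuming OmegaConf (which the repo already uses via DictConfig); the inline cfg literal is illustrative, not part of the commit:

# Sketch only: reading the new initialize block. The gate mirrors
# select_network() in networks.py below; values are from the YAML above.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {"initialize": {"type": "linear", "max_offset": 0.1, "max_slope": 0.5}}
)

if cfg.initialize.type == "linear":
    # Presumably max_slope / max_offset bound the slope and offset of the
    # linear functions the polynomial layers are initialized to represent.
    print(cfg.initialize.max_slope, cfg.initialize.max_offset)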
language_interpolation/networks.py (8 additions & 14 deletions)

@@ -9,11 +9,10 @@
     HighOrderMLP,
     HighOrderFullyConvolutionalNetwork,
     HighOrderTailFocusNetwork,
-    initialize_network_polynomial_layers,
-    initialize_polynomial_layer,
 )
 from high_order_layers_torch.positional_embeddings import ClassicSinusoidalEmbedding
 from high_order_layers_torch.layers import MaxAbsNormalizationLast, high_order_fc_layers
+from high_order_layers_torch.networks import initialize_network_polynomial_layers
 from torchmetrics import Accuracy
 from torch import Tensor
 import torch.nn.functional as F
@@ -280,11 +279,6 @@ def high_order_attention_block(
         device=device,
     )
 
-    initialize_polynomial_layer(query, max_slope=1.0, max_offset=0.0)
-    initialize_polynomial_layer(key, max_slope=1.0, max_offset=0.0)
-    initialize_polynomial_layer(value, max_slope=1.0, max_offset=0.0)
-    initialize_polynomial_layer(output, max_slope=1.0, max_offset=0.0)
-
     layer = HighOrderAttention(
         embed_dim=embed_dim,
         out_dim=out_dim,
@@ -366,10 +360,6 @@ def __init__(
             normalization=mlp_normalization,
         )
 
-        # initialize_network_polynomial_layers(
-        #     self._embedding_layer, max_slope=1.0, max_offset=0.0
-        # )
-
         for index, element in enumerate(layers[1:]):
             input_scale = 2.0
             # if index == 0:
@@ -407,9 +397,7 @@ def __init__(
             normalization=mlp_normalization,
         )
 
-        initialize_network_polynomial_layers(
-            self._output_layer, max_slope=1.0, max_offset=0.0
-        )
+
 
         # Make the positions 0 to max_context-1
         self.positional_embedding = (
@@ -581,6 +569,12 @@ def select_network(cfg: DictConfig, device: str = None):
f"Unrecognized model_type {cfg.model_type} should be high_order, high_order_input or high_order_conv!"
)

if cfg.initialize.type == 'linear':
logger.info('Performing linear initialization')
initialize_network_polynomial_layers(
model, max_slope=cfg.initialize.max_slope, max_offset=cfg.initialize.max_offset
)

return model


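Net effect in networks.py: the hard-coded initialize_polynomial_layer calls in high_order_attention_block and the per-module initialize_network_polynomial_layers calls in __init__ are gone, replaced by one config-gated pass over the fully assembled model. A minimal sketch of the resulting flow; maybe_linear_init is a hypothetical wrapper, and only the import and the initializer call are taken from the commit:

# Hypothetical helper; the import and the call are from the diff above.
from high_order_layers_torch.networks import initialize_network_polynomial_layers

def maybe_linear_init(model, cfg):
    # "kaiming" (the config default) keeps the layers' built-in
    # initialization; only "linear" triggers the extra pass.
    if cfg.initialize.type == "linear":
        initialize_network_polynomial_layers(
            model,
            max_slope=cfg.initialize.max_slope,
            max_offset=cfg.initialize.max_offset,
        )
    return model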
tests/test_attention_network.py (8 additions & 1 deletion)

@@ -7,6 +7,7 @@
 )
 from language_interpolation.lightning_datamodule import TransformerDataModule
 from high_order_layers_torch.layers import MaxAbsNormalizationLast
+from high_order_layers_torch.networks import initialize_network_polynomial_layers
 from omegaconf import DictConfig
 from language_interpolation.utils import generate_transformer_text
 import torch
@@ -41,7 +42,7 @@ def test_attention_network():

     network = HighOrderAttentionNetwork(
         layers=[
-            {"input": 10, "output": 10, "hidden": 10, "layers": 1, "segments": 3},
+            {"input": 10, "output": 10, "hidden": 10, "layers": 1, "segments": 3, "input_segments": 3},
             {"input": 10, "output": 5, "segments": 3},
             {"input": 5, "output": 5, "segments": 2},
         ],
@@ -55,6 +56,12 @@
         output_hidden_layers=1,
         output_hidden_width=5,
     )
+
+    initialize_network_polynomial_layers(
+        network, max_slope=1.0, max_offset=0.0
+    )
+
+
     result = network(input_data)
     print("final result", result)
     print("result", result.shape)
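The test picks up two of the changes: the first layer spec now carries an input_segments key, and the network-wide initializer is exercised directly on the attention network. A hedged restatement of the new first-layer spec; the values are copied from the test, while the reading of input_segments is an assumption, not something the commit documents:

# Values copied from the updated test. Interpreting "input_segments" as the
# segment count for the input/embedding layer, distinct from the hidden
# "segments", is an assumption; the commit does not document the key.
first_layer = {
    "input": 10,
    "output": 10,
    "hidden": 10,
    "layers": 1,
    "segments": 3,
    "input_segments": 3,  # added by this commit
}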
