diff --git a/config/high_order_interpolation.yaml b/config/high_order_interpolation.yaml
index 727cab2..8d5d2de 100644
--- a/config/high_order_interpolation.yaml
+++ b/config/high_order_interpolation.yaml
@@ -48,6 +48,7 @@
 max_epochs: 100
 accelerator: cuda
 batch_size: 256
 gradient_clip: null # 5.0
+accumulate_grad_batches: 1
 # Are you training? Otherwise plot the result
 train: True
diff --git a/config/net/transformer.yaml b/config/net/transformer.yaml
index 65fbaed..42efba6 100644
--- a/config/net/transformer.yaml
+++ b/config/net/transformer.yaml
@@ -9,30 +9,41 @@
 normalize: true
 # number of Fourier components.
 model_type: high_order_transformer
 n: 3
-input_segments: 64
+
 segments: 2
-# Layers in the form [input, output, segments]
+base_width: 64
 # attention blocks
-#layers: [[10, 10, 64], [10, 5, 2], [5, 5, 2]]
 inner: 10
-#layers: [[1, ${inner}, 128], [${inner}, ${inner}, 10], [${inner}, ${inner}, 10]]
+
 layers:
-  [
-    {
-      "input": 1,
-      "hidden": 16,
-      "output": 16,
-      "layers": 1,
-      "segments": 16,
-      "input_segments": 128,
-    },
-    { "input": 16, "output": 16, "segments": 16 },
-    { "input": 16, "output": 16, "segments": 16 },
-    { "input": 16, "output": 16, "segments": 16 },
-    { "input": 16, "output": 16, "segments": 16 },
-    { "input": 16, "hidden": 100, "layers": 1, "segments": 10 },
-  ]
+  - input: 1
+    hidden: 16
+    output: 16
+    layers: 1
+    segments: 16
+    input_segments: 128
+
+  - input: 16
+    output: ${net.base_width}
+    segments: ${net.segments}
+
+  - input: ${net.base_width}
+    output: ${net.base_width}
+    segments: ${net.segments}
+
+  - input: ${net.base_width}
+    output: ${net.base_width}
+    segments: ${net.segments}
+
+  - input: ${net.base_width}
+    output: ${net.base_width}
+    segments: ${net.segments}
+
+  - input: ${net.base_width}
+    hidden: 100
+    layers: 1
+    segments: 10
 
 # Note! output dimension is c=heads*output so different than normal
 # And then similarity size is c*c*batch_size
diff --git a/examples/high_order_interpolation.py b/examples/high_order_interpolation.py
index 28bb3e5..e866de3 100644
--- a/examples/high_order_interpolation.py
+++ b/examples/high_order_interpolation.py
@@ -27,7 +27,9 @@
 logging.getLogger().setLevel(logging.DEBUG)
 
 
-@hydra.main(config_path="../config", config_name="high_order_interpolation", version_base="1.3")
+@hydra.main(
+    config_path="../config", config_name="high_order_interpolation", version_base="1.3"
+)
 def run_language_interpolation(cfg: DictConfig):
     logger.info(OmegaConf.to_yaml(cfg))
     logger.info("Working directory : {}".format(os.getcwd()))
@@ -36,7 +38,6 @@ def run_language_interpolation(cfg: DictConfig):
     create_gutenberg_cache(parent_directory=hydra.utils.get_original_cwd())
 
     if cfg.train is True:
-
         try:  # Try is needed for multirun case
             if cfg.data.type in dataset_registry:
                 dataset_generator = dataset_registry[cfg.data.type]
@@ -46,7 +47,6 @@ def run_language_interpolation(cfg: DictConfig):
                 )
 
             if cfg.net.model_type == "high_order_transformer":
-
                 # dataset_generator is only one type so using the default
                 datamodule = TransformerDataModule(
                     characters_per_feature=cfg.data.characters_per_feature,
@@ -101,6 +101,7 @@ def run_language_interpolation(cfg: DictConfig):
                max_epochs=cfg.max_epochs,
                accelerator=cfg.accelerator,
                gradient_clip_val=cfg.gradient_clip,
+               accumulate_grad_batches=cfg.accumulate_grad_batches,
            )
 
            model = ASCIIPredictionNet(cfg)
diff --git a/language_interpolation/networks.py b/language_interpolation/networks.py
index c6e01a3..5b6cc43 100644
--- a/language_interpolation/networks.py
+++ b/language_interpolation/networks.py
@@ -561,11 +561,14 @@ def select_network(cfg: DictConfig, device: str = None):
 
     if cfg.initialize.type == "linear":
         logger.info("Performing linear initialization")
+        start_init = time.perf_counter()
         initialize_network_polynomial_layers(
             model,
             max_slope=cfg.initialize.max_slope,
             max_offset=cfg.initialize.max_offset,
         )
+        finish_time = time.perf_counter() - start_init
+        logger.info(f"Finished linear initialization {finish_time}")
 
     return model
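The new accumulate_grad_batches key defaults to 1, so training behavior is unchanged until it is overridden; values above 1 make Lightning accumulate gradients over that many batches before each optimizer step, giving a larger effective batch size without increasing per-batch GPU memory. Below is a minimal sketch of that relationship, not code from this patch: the pytorch_lightning import path and the override value of 4 are assumptions for illustration.

# Sketch only: how accumulate_grad_batches relates to the effective batch size
# once it is forwarded to the Lightning Trainer, as the patch above does.
# Assumes the pytorch_lightning import path; 4 is a hypothetical override
# (the default added in this patch is accumulate_grad_batches: 1).
from pytorch_lightning import Trainer

batch_size = 256             # batch_size in config/high_order_interpolation.yaml
accumulate_grad_batches = 4  # hypothetical Hydra override

trainer = Trainer(
    max_epochs=100,
    accelerator="cuda",
    gradient_clip_val=None,
    accumulate_grad_batches=accumulate_grad_batches,
)

# Lightning sums gradients over accumulate_grad_batches forward/backward passes
# before each optimizer step, so the optimizer effectively sees:
effective_batch_size = batch_size * accumulate_grad_batches  # 256 * 4 = 1024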
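The layers rewrite in config/net/transformer.yaml replaces the flow-style list with a block-style list whose widths and segment counts are OmegaConf interpolations, so every attention block now reads its size from the single base_width and segments keys. The following is a minimal sketch of how those interpolations resolve, not code from this patch: it assumes the file is composed under the net key (as the ${net.*} references imply) and abbreviates the list to two entries.

# Sketch only: ${net.base_width} / ${net.segments} resolve against the composed
# config, so one override rewires every block that references them.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "net": {
            "base_width": 64,
            "segments": 2,
            "layers": [
                {
                    "input": 16,
                    "output": "${net.base_width}",
                    "segments": "${net.segments}",
                },
                {
                    "input": "${net.base_width}",
                    "output": "${net.base_width}",
                    "segments": "${net.segments}",
                },
            ],
        }
    }
)

print(cfg.net.layers[0].output)    # 64 -- pulled from net.base_width
print(cfg.net.layers[1].segments)  # 2  -- pulled from net.segments

# Interpolations resolve lazily, so changing the shared value once is enough:
cfg.net.base_width = 128
print(cfg.net.layers[1].output)    # 128

Because resolution happens at access time, a command-line override or multirun sweep such as net.base_width=32,64,128 (assuming this file is selected as the net default) changes the width of every attention block consistently.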