diff --git a/.github/workflows/dev-env-aws.yml b/.github/workflows/dev-env-aws.yml
index ebbaaa9e..9b9b6e83 100644
--- a/.github/workflows/dev-env-aws.yml
+++ b/.github/workflows/dev-env-aws.yml
@@ -77,6 +77,7 @@ jobs:
   deploy-vscode:
     name: Deploy development environment
+    timeout-minutes: 1500
     needs:
       - start-runner
     runs-on: ${{ needs.start-runner.outputs.label }}
@@ -125,8 +126,7 @@ jobs:
         curl -Lk 'https://code.visualstudio.com/sha/download?build=stable&os=cli-alpine-x64' --output vscode_cli.tar.gz
         tar -xf vscode_cli.tar.gz
     - name: Serve VSCode
-      continue-on-error: true
-      timeout-minutes: 999999999999
+      continue-on-error: true
       run: ./code tunnel

   stop-runner:
diff --git a/pipelines/capacity.py b/pipelines/capacity.py
deleted file mode 100644
index 9f408887..00000000
--- a/pipelines/capacity.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import os
-import asyncio
-import subprocess
-from pathlib import Path
-
-import asyncio
-from contextlib import contextmanager
-from typing import Literal
-from prefect import flow, serve, get_run_logger, task, variables
-from prefect.runner.storage import GitRepository
-import boto3
-from prefect_aws import AwsCredentials
-import os
-
-aws_credentials_block = AwsCredentials.load("spot")
-
-os.environ["REGION_NAME"] = aws_credentials_block.region_name
-os.environ["AWS_ACCESS_KEY_ID"] = aws_credentials_block.aws_access_key_id
-os.environ["AWS_SECRET_ACCESS_KEY"] = aws_credentials_block.aws_secret_access_key.get_secret_value()
-
-
-@task
-def get_client():
-    return boto3.client('autoscaling')
-
-@task
-def set_capacity(client, capacity: int) -> None:
-    logger = get_run_logger()
-    response = client.update_auto_scaling_group(
-        AutoScalingGroupName='Infra-ECS-Cluster-spot-gpu-2-d30de970-ECSAutoScalingGroup-Vif1pCjC51Hq',
-        DesiredCapacity = capacity,
-        MaxSize=1,
-        MinSize=0,
-    )
-
-    print(response)
-
-
-@flow
-def main(capacity: int):
-    logger = get_run_logger()
-    client = get_client()
-    set_capacity(client, capacity)
-
-
-if __name__ == "__main__":
-
-    git_repo = GitRepository(
-        url="https://github.com/Digital-Defiance/llm-voice-chat.git",
-        branch = "main",
-    )
-
-    main_flow = main.from_source(
-        entrypoint="pipelines/capacity.py:main",
-        source=git_repo,
-    )
-
-    main_flow.deploy(
-        name="change-scaling-capacity",
-        work_pool_name = "workpool-prefect",
-    )
diff --git a/run.sh b/scripts/run.sh
similarity index 100%
rename from run.sh
rename to scripts/run.sh
diff --git a/src/attention/avg_pooling.rs b/src/attention/avg_pooling.rs
new file mode 100644
index 00000000..c708b462
--- /dev/null
+++ b/src/attention/avg_pooling.rs
@@ -0,0 +1,46 @@
+
+use tch::nn::Module;
+use tch::Tensor;
+
+
+
+/// implicit zero paddings on both sides of the input. Can be a single number or a tuple (padH, padW). Default: 0
+const DEFAULT_PADDING: i64 = 0;
+
+/// when True, will use ceil instead of floor in the formula to compute the output shape. Default: False
+const DEFAULT_CEIL_MODE: bool = false;
+
+/// when True, will include the zero-padding in the averaging calculation. Default: True
+const DEFAULT_COUNT_INCLUDE_PAD: bool = true;
+
+/// if specified, it will be used as divisor, otherwise size of the pooling region will be used. Default: None
+const DEFAULT_DIVISOR_OVERRIDE: core::option::Option<i64> = None;
+
+
+/// Average pooling layer
+#[derive(Debug)]
+pub struct AvgPooling {
+    /// Size of the pooling region. Can be a single number or a tuple (kH, kW)
+    kernel_size: i64,
+}
+
+
+impl AvgPooling {
+    pub fn new(kernel_size: i64) -> Self {
+        AvgPooling { kernel_size }
+    }
+}
+
+impl Module for AvgPooling {
+    fn forward(&self, x_bcd: &Tensor) -> Tensor {
+        x_bcd.avg_pool2d(
+            self.kernel_size,
+            // stride of the pooling operation. Can be a single number or a tuple (sH, sW). Default: kernel_size
+            self.kernel_size,
+            DEFAULT_PADDING,
+            DEFAULT_CEIL_MODE,
+            DEFAULT_COUNT_INCLUDE_PAD,
+            DEFAULT_DIVISOR_OVERRIDE,
+        )
+    }
+}
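+
+// Usage sketch (annotation, not part of the upstream change; values are
+// hypothetical). With kernel_size = 2, the stride also defaults to 2, so the
+// last two dimensions are halved:
+//
+//     use tch::{nn::Module, Device, Kind, Tensor};
+//     let pool = AvgPooling::new(2);
+//     let x = Tensor::randn(&[1, 3, 8, 8], (Kind::Float, Device::Cpu));
+//     let y = pool.forward(&x); // shape [1, 3, 4, 4]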
diff --git a/src/attention/identity.rs b/src/attention/identity.rs
new file mode 100644
index 00000000..c5eb6b70
--- /dev/null
+++ b/src/attention/identity.rs
@@ -0,0 +1,21 @@
+
+use tch::nn;
+use tch::Tensor;
+
+
+
+#[derive(Debug)]
+pub struct Identity { }
+
+impl Identity {
+    pub fn new() -> Self {
+        Identity { }
+    }
+}
+
+impl nn::Module for Identity {
+    fn forward(&self, x_bcd: &Tensor) -> Tensor {
+        x_bcd.g_mul_scalar(1)
+    }
+}
+
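+
+// Annotation (not part of the upstream change): `Identity` is the trivial
+// token mixer, useful as an ablation baseline in the spirit of the MetaFormer
+// results (https://arxiv.org/abs/2111.11418). Multiplying by 1 is one simple
+// way to return an owned tensor from `forward(&self, &Tensor) -> Tensor`.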
diff --git a/src/attention/metric_tensor.rs b/src/attention/metric_tensor.rs
index e69de29b..338f7920 100644
--- a/src/attention/metric_tensor.rs
+++ b/src/attention/metric_tensor.rs
@@ -0,0 +1 @@
+//! To be implemented with CUDA kernels
\ No newline at end of file
diff --git a/src/attention/mod.rs b/src/attention/mod.rs
index 7fff1ab5..dbec3405 100644
--- a/src/attention/mod.rs
+++ b/src/attention/mod.rs
@@ -1,5 +1,4 @@
 pub mod quadratic_form;
-
-
-
-
+pub mod scaled_dot_product;
+pub mod identity;
+pub mod avg_pooling;
diff --git a/src/attention/quadratic_form.rs b/src/attention/quadratic_form.rs
index 0d62cb52..9413e490 100644
--- a/src/attention/quadratic_form.rs
+++ b/src/attention/quadratic_form.rs
@@ -1,71 +1,93 @@
 use tch::nn;
+use tch::Tensor;

 pub fn generate_init() -> nn::Init {
     nn::Init::Randn { mean: 0., stdev: 1. }
 }

+#[derive(Debug)]
+pub struct QuadraticAttention {
+    projections_1ndq: Tensor,
+    metric_tensors_1nqq: Tensor,
+    adapter_1pd: Tensor,
+    sqrt_q: f64,
+    cp: (i64, i64),
+}
+
+
 /// Performs self attention N times using the quadratic form $x W_n x^T$, where $W_n$ is a learnable matrix.
 /// This is an early version of the metric self attention, where $W$ is forced to have the properties of a metric tensor.
 /// https://arxiv.org/abs/2111.11418 - evidence that any of the attention mechanisms might have similar performance
-pub fn quadratic_self_attention_module(
-    vs_path: &nn::Path,
-    n: i64,
-    d: i64,
-    q: i64,
-    c: i64,
-) -> impl nn::Module {
-
-    assert!(d % n == 0, "Embeddings dimension must be divisible by the requested number of heads.");
-    debug_assert_eq!(n*q, d);
-
-    let projections_1ndq = vs_path.var("projections_1ndq", &[1, n, d, q], generate_init());
-    let metric_tensors_1nqq = vs_path.var("metric_tensors_1nqq", &[1, n, q, q], generate_init());
-    let mixer_1dd = vs_path.var("mixer_1dd", &[1, d, d], generate_init());
-
-    debug_assert_eq!(projections_1ndq.size(), vec![1, n, d, q]);
-    debug_assert_eq!(metric_tensors_1nqq.size(), vec![1, n, q, q]);
-    debug_assert_eq!(mixer_1dd.size(), vec![1, d, d]);
-
-    let sqrt_q = f64::sqrt(q as f64);
+impl QuadraticAttention {
+    pub fn new(
+        vs_path: &nn::Path,
+        number_of_heads: i64,
+        embedding_dimension: i64,
+        latent_dimension: i64,
+        sequence_length: i64,
+    ) -> Self {
+
+        let n = number_of_heads;
+        let d = embedding_dimension;
+        let c = sequence_length;
+        let q = latent_dimension;
+        let p = latent_dimension*number_of_heads;
+
+        let projections_1ndq = vs_path.var("projections_1ndq", &[1, n, d, q], generate_init());
+        let metric_tensors_1nqq = vs_path.var("metric_tensors_1nqq", &[1, n, q, q], generate_init());
+        let adapter_1pd = vs_path.var("adapter_1pd", &[1, p, d], generate_init());
+
+        let sqrt_q = f64::sqrt(q as f64);
+        QuadraticAttention {
+            projections_1ndq,
+            metric_tensors_1nqq,
+            adapter_1pd,
+            sqrt_q,
+            cp: (c, p)
+        }
+    }
+}

-
+// Implement the nn::Module trait for QuadraticAttention.
+impl nn::Module for QuadraticAttention {
+    fn forward(&self, x_bcd: &Tensor) -> Tensor {

-    nn::func(move |x_bcd: &tch::Tensor| {
-        let b = x_bcd.size()[0];
-        assert_eq!(x_bcd.size(), vec![b, c, d]);
-
+        let b = x_bcd.size()[0];
+        // assert_eq!(x_bcd.size(), vec![b, c, d]);
         // Apply n projections to the input
         let x_b1cd = &x_bcd.unsqueeze(1);
-        let x_bncq = &x_b1cd.matmul(&projections_1ndq);
-        debug_assert_eq!(x_bncq.size(), vec![b, n, c, q]);
-
+        let x_bncq = &x_b1cd.matmul(&self.projections_1ndq);
+        // debug_assert_eq!(x_bncq.size(), vec![b, n, c, q]);

         // Use n custom dot products to generate n score tables
         let x_bnqc = &x_bncq.transpose(-1, -2);
-        let dotproducts_bncc = &x_bncq.matmul(&metric_tensors_1nqq.matmul(x_bnqc));
-        debug_assert!(dotproducts_bncc.size() == vec![b, n, c, c]);
+        let dotproducts_bncc = &x_bncq.matmul(&self.metric_tensors_1nqq.matmul(x_bnqc));
+        // debug_assert!(dotproducts_bncc.size() == vec![b, n, c, c]);

         // From scaled dot product attention introduced in https://arxiv.org/abs/1706.03762
-        let scaled_dotproducts_bncc = &dotproducts_bncc.divide_scalar(sqrt_q);
+        let scaled_dotproducts_bncc = &dotproducts_bncc.divide_scalar(self.sqrt_q);
         let softmaxed_scaled_dotproducts_bncc = &scaled_dotproducts_bncc.softmax(-1, tch::kind::Kind::Float);
         let y_bnqc = &x_bncq.transpose(-1, -2).matmul(softmaxed_scaled_dotproducts_bncc);
-        debug_assert!(y_bnqc.size() == vec![b, n, q, c]);
+        // debug_assert!(y_bnqc.size() == vec![b, n, q, c]);

-        let y_bcd = &y_bnqc.reshape(x_bcd.size());
-        debug_assert!(y_bcd.size() == vec![b, c, d]);
+        let y_bcp = &y_bnqc.reshape(&[b, self.cp.0, self.cp.1]);
+        // debug_assert!(y_bcp.size() == vec![b, c, p]);

-        y_bcd.matmul(&mixer_1dd)
-    })
+        y_bcp.matmul(&self.adapter_1pd)
+    }
 }
+
+
+/*
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -94,4 +116,8 @@ mod tests {
     }

-}
\ No newline at end of file
+}
+
+*/
+
+
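+// Shape walk-through (annotation, not part of the upstream change), with
+// b = batch, c = context length, d = embedding dim, n = heads, q = latent
+// dim per head, p = n * q:
+//
+//     x_bcd  (b, c, d)    -- projected -->  x_bncq (b, n, c, q)
+//     scores x_bncq (W x_bncq^T)            (b, n, c, c), scaled by sqrt(q)
+//     y_bnqc softmax-weighted values        (b, n, q, c)
+//     y_bcp  reshape to (b, c, p); adapter_1pd then maps p back to d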
diff --git a/src/attention/scaled_dot_product.rs b/src/attention/scaled_dot_product.rs
index e69de29b..313b125d 100644
--- a/src/attention/scaled_dot_product.rs
+++ b/src/attention/scaled_dot_product.rs
@@ -0,0 +1,129 @@
+
+use tch::nn;
+use tch::Tensor;
+
+pub fn generate_init() -> nn::Init {
+    nn::Init::Randn { mean: 0., stdev: 1. }
+}
+
+
+
+#[derive(Debug)]
+pub struct ScaledDotProductAttention {
+    query_weights_1ndq: Tensor,
+    key_weights_1ndq: Tensor,
+    value_weights_1ndq: Tensor,
+    adapter_1pd: Tensor,
+    sqrt_q: f64,
+    cp: (i64, i64),
+}
+
+
+/// Multi-head scaled dot-product self attention with learnable query, key and
+/// value projections, as introduced in https://arxiv.org/abs/1706.03762
+impl ScaledDotProductAttention {
+    pub fn new(
+        vs_path: &nn::Path,
+        number_of_heads: i64,
+        embedding_dimension: i64,
+        latent_dimension: i64,
+        sequence_length: i64,
+    ) -> Self {
+
+        let n = number_of_heads;
+        let d = embedding_dimension;
+        let c = sequence_length;
+        let q = latent_dimension;
+        let p = latent_dimension*number_of_heads;
+
+        let query_weights_1ndq = vs_path.var("query_weights_1ndq", &[1, n, d, q], generate_init());
+        let key_weights_1ndq = vs_path.var("key_weights_1ndq", &[1, n, d, q], generate_init());
+        let value_weights_1ndq = vs_path.var("value_weights_1ndq", &[1, n, d, q], generate_init());
+        let adapter_1pd = vs_path.var("adapter_1pd", &[1, p, d], generate_init());
+
+        let sqrt_q = f64::sqrt(q as f64);
+        ScaledDotProductAttention {
+            query_weights_1ndq,
+            key_weights_1ndq,
+            value_weights_1ndq,
+            adapter_1pd,
+            sqrt_q,
+            cp: (c, p)
+        }
+    }
+}
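+
+// Construction sketch (annotation, not part of the upstream change; the
+// dimensions are hypothetical):
+//
+//     use tch::{nn, Device};
+//     let vs = nn::VarStore::new(Device::Cpu);
+//     // 4 heads, model dim 64, per-head dim 16 (so p = 4 * 16 = 64), context 128
+//     let attn = ScaledDotProductAttention::new(&vs.root(), 4, 64, 16, 128);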
+// Implement the nn::Module trait for ScaledDotProductAttention.
+impl nn::Module for ScaledDotProductAttention {
+    fn forward(&self, x_bcd: &Tensor) -> Tensor {
+
+        let b = x_bcd.size()[0];
+        // assert_eq!(x_bcd.size(), vec![b, self.c, self.d]);
+
+        // Apply n projections to the input
+        let x_b1cd = &x_bcd.unsqueeze(1);
+
+        let queries_bncq = &x_b1cd.matmul(&self.query_weights_1ndq);
+        let keys_bncq = &x_b1cd.matmul(&self.key_weights_1ndq);
+        let values_bncq = &x_b1cd.matmul(&self.value_weights_1ndq);
+
+        // debug_assert_eq!(queries_bncq.size(), vec![b, n, c, q]);
+        // debug_assert_eq!(keys_bncq.size(), vec![b, n, c, q]);
+        // debug_assert_eq!(values_bncq.size(), vec![b, n, c, q]);
+
+        // Use n custom dot products to generate n score tables
+        let keys_bnqc = &keys_bncq.transpose(-1, -2);
+        let scores_bncc = &queries_bncq.matmul(keys_bnqc);
+        // debug_assert!(scores_bncc.size() == vec![b, n, c, c]);
+
+        // From scaled dot product attention introduced in https://arxiv.org/abs/1706.03762
+        let scaled_scores_bncc = &scores_bncc.divide_scalar(self.sqrt_q);
+
+        let softmaxed_scaled_scores_bncc = &scaled_scores_bncc.softmax(-1, tch::kind::Kind::Float);
+        let y_bnqc = &values_bncq.transpose(-1, -2).matmul(softmaxed_scaled_scores_bncc);
+        // debug_assert!(y_bnqc.size() == vec![b, n, q, c]);
+
+        let y_bcp = &y_bnqc.reshape(&[b, self.cp.0, self.cp.1]);
+        // debug_assert!(y_bcp.size() == vec![b, c, p]);
+
+        y_bcp.matmul(&self.adapter_1pd)
+    }
+}
+
+
+
+
+/*
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tch::{nn, Device, Kind, Tensor};
+    use tch::nn::Module;
+
+
+    #[test]
+    pub fn test_layer(){
+
+
+        let vs = nn::VarStore::new(Device::Cpu);
+        let vs_path = &vs.root();
+
+        let b = 10;
+        let c = 5;
+        let d = 4;
+        let n = 2;
+        let q = 2;
+
+        let input_bcd = Tensor::randn( &[b, c, d], (Kind::Float, Device::Cpu));
+        let layer = ScaledDotProductAttention::new(vs_path, n, d, q, c);
+        let output_bcd = layer.forward(&input_bcd);
+
+        debug_assert!(output_bcd.size() == input_bcd.size());
+
+    }
+
+}
+
+*/
diff --git a/src/config.rs b/src/config.rs
index d76706a9..d6a05192 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -1,11 +1,20 @@
+
 use clap::Parser;
 use tch::Device;
+use serde::Deserialize;
+

-use crate::metaformer::AttentionKind;
-// use crate::metaformer::AttentionKind;
+#[derive(PartialEq, Clone, Copy, Deserialize)]
+pub enum AttentionKind {
+    Quadratic,
+    ScaledDotProduct,
+    Metric,
+    Identity,
+    AveragePooling,
+}

 /// Train a MetaFormer model.
 #[derive(Parser)]
@@ -59,7 +68,6 @@ pub struct Cli {
     #[clap(long, env)]
     pub batch_size: i64,

-
     #[clap(long, env)]
     pub learning_rate: f64,

@@ -72,9 +80,12 @@ pub struct Cli {
     #[clap(long, env)]
     pub use_gpu: String,

+    #[clap(long, env)]
+    pub kernel_size: Option<i64>,
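+    // Annotation (not part of the upstream change): `kernel_size` is only
+    // consumed by the average-pooling token mixer; src/main.rs unwraps it
+    // when `attention_kind` is "average_pooling".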
 }

+
 impl Cli {
     pub fn get_device(&self) -> Device {
         let cuda = Device::cuda_if_available();
@@ -91,18 +102,4 @@ impl Cli {
             panic!("Invalid device configuration. Check USE_GPU env var.");
         }
     }
-
-    pub fn get_attention_kind(&self) -> AttentionKind {
-        if self.attention_kind == "Quadratic" {
-            AttentionKind::Quadratic
-        } else {
-            AttentionKind::Quadratic
-        }
-    }
 }
-
-
-pub fn read_config() -> Cli {
-    Cli::parse()
-}
-
diff --git a/src/main.rs b/src/main.rs
index fad6210f..9b779f56 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -12,35 +12,65 @@ pub mod files;
 use std::str::FromStr;
-use tch::kind;
-use tch::nn::Module;
+use clap::Parser;
+use tch::{kind, nn::Module};
 use tch::nn::OptimizerConfig;
-use metaformer::MetaFormer;
+use metaformer::metaformer;
 use mlflow::{MLFlowClient, MetricAccumulator };
 use tch;
 use tch::nn;
 use files::read_dataslice;
-use config::{Cli, read_config};
+use config::Cli;
+
+
+const QUADRATIC: &str = "quadratic";
+const SCALED_DOT_PRODUCT: &str = "scaled_dot_product";
+const IDENTITY: &str = "identity";
+const AVERAGE_POOLING: &str = "average_pooling";
+const METRIC: &str = "metric";
+

 /// Implementation of gradient descent
 fn main() {
-
-
-    let config: Cli = read_config();
+    let config: Cli = Cli::parse();
     let training_device = config.get_device();
-    let metaformer: MetaFormer = MetaFormer::new(&config);

     let vs: nn::VarStore = nn::VarStore::new(training_device);
     let vs_path: &nn::Path<'_> = &vs.root();

-    let attention_kind = config.get_attention_kind();
-    let model = metaformer.create(vs_path, attention_kind, training_device);
+
+    let mut model = metaformer(
+        vs_path,
+        config.dimension,
+        config.input_vocabolary,
+        config.context_window,
+        config.get_device()
+    );
+
+
+    for _ in 0..config.depth {
+
+        // The token mixer is selected by the attention_kind CLI/env option.
+        model = match config.attention_kind.as_str() {
+            QUADRATIC => model.add_quadratic_form(vs_path, config.heads),
+            SCALED_DOT_PRODUCT => model.add_scaled_dot_product(vs_path, config.heads),
+            IDENTITY => model,
+            AVERAGE_POOLING => model.add_avg_pooling(vs_path, config.kernel_size.unwrap()),
+            METRIC => todo!(),
+            _ => panic!("Not supported")
+        };
+
+        model = model.add_mlp(vs_path);
+    }
+
+    model = model.finish(vs_path, config.output_vocabolary);
+

     let mut opt: nn::Optimizer = tch::nn::Adam::default().build(&vs, config.learning_rate).unwrap(); // https://paperswithcode.com/method/adam
     let total_slices: i64 = config.slices*config.epochs;

+
     for global_idx in 0..total_slices {

         let avg_train_loss = {
             let mut loss_accumulator = MetricAccumulator::new("loss/train");
diff --git a/src/metaformer/mod.rs b/src/metaformer/mod.rs
index 9a649def..25c3e40f 100644
--- a/src/metaformer/mod.rs
+++ b/src/metaformer/mod.rs
@@ -1,124 +1,219 @@
-use crate::attention::quadratic_form::quadratic_self_attention_module;
-use crate::config::Cli;
 use self::mlp::create_mlp;
 use self::embedder::create_embedder_module;
+use crate::attention::quadratic_form::QuadraticAttention;
+use crate::attention::scaled_dot_product::ScaledDotProductAttention;

 pub mod layer_norm;
 pub mod commons;
 pub mod embedder;
 pub mod mlp;

+
 use layer_norm::create_layer_norm;
 use commons::generate_init;
-use serde::Deserialize;
 use tch::nn;
 use tch::nn::Module;
+use crate::attention::avg_pooling::AvgPooling;
+use tch::nn::func;
+use tch::Tensor;
+use tch;

-#[derive(PartialEq, Clone, Copy, Deserialize)]
-pub enum AttentionKind {
-    Quadratic,
-    ScaledDotProduct,
-    Metric,
-}
-
 /// Defines structure of the metaformer model
 /// GPT2 paper - https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf
 /// MetaFormer paper - https://arxiv.org/abs/2111.11418
+#[derive(Debug)]
 pub struct MetaFormer {
-
-
-    /// Dimension of the vector space that the network
-    /// uses internally to represent tokens
     embedding_dimension: i64,
+    size_of_context_window: i64,
+    layers: Vec<Box<dyn Module>>,
+}

-    /// Number of transformer blocks
-    model_depth: i64,
-
-    /// Number of attention modules per transformer block
-    number_of_heads: i64,
+impl Module for MetaFormer {
+    fn forward(&self, xs: &Tensor) -> Tensor {
+        let xs = self.layers[0].forward(xs);
+        self.layers.iter().skip(1).fold(xs, |xs, layer| layer.forward(&xs))
+    }
+}

-    /// Maximum number of tokens in the input sequence
-    size_of_context_window: i64,

-    /// Total number of tokens that the network recognizes
+/// Creates a new MetaFormer containing only the embedding module.
+pub fn metaformer(
+    vs_path: &nn::Path,
+    embedding_dimension: i64,
     size_of_vocabolary: i64,
+    size_of_context_window: i64,
+    device: tch::Device,
+) -> MetaFormer {
+
+    let embedder = create_embedder_module(
+        vs_path,
+        embedding_dimension,
+        size_of_vocabolary,
+        size_of_context_window,
+        device
+    );

-    output_tokens: i64,
-
-
+    MetaFormer {
+        embedding_dimension,
+        size_of_context_window,
+        layers: vec![ Box::new(embedder) ]
+    }
 }

+
 impl MetaFormer {

-    pub fn new(config: &Cli) -> MetaFormer {
-        MetaFormer {
-            embedding_dimension: config.dimension,
-            model_depth: config.depth,
-            number_of_heads: config.heads,
-            size_of_context_window: config.context_window,
-            size_of_vocabolary: config.input_vocabolary,
-            output_tokens: config.output_vocabolary,
-        }
+    pub fn add_mlp(self, vs_path: &nn::Path) -> Self {
+        let layer = create_mlp(vs_path, self.embedding_dimension);
+        self.add(vs_path, layer)
     }

-    fn create_attention(&self, vs: &nn::Path, kind: AttentionKind) -> impl nn::Module {
-
-        match kind {
-            AttentionKind::Quadratic => quadratic_self_attention_module(
-                vs,
-                self.number_of_heads,
-                self.embedding_dimension,
-                self.embedding_dimension / self.number_of_heads,
-                self.size_of_context_window,
-            ),
-            AttentionKind::Metric => todo!(),
-            AttentionKind::ScaledDotProduct => todo!()
-        }
+    pub fn add_avg_pooling(self, vs_path: &nn::Path, kernel_size: i64) -> Self {
+        let layer = AvgPooling::new(kernel_size);
+        self.add(vs_path, layer)
     }

+    pub fn add_scaled_dot_product(self, vs_path: &nn::Path, number_of_heads: i64) -> Self {
+        let layer = ScaledDotProductAttention::new(
+            vs_path,
+            number_of_heads,
+            self.embedding_dimension,
+            self.embedding_dimension / number_of_heads,
+            self.size_of_context_window,
+        );
+        self.add(vs_path, layer)
+    }
+
+    pub fn add_quadratic_form(self, vs_path: &nn::Path, number_of_heads: i64) -> Self {
+        let layer = QuadraticAttention::new(
+            vs_path,
+            number_of_heads,
+            self.embedding_dimension,
+            self.embedding_dimension / number_of_heads,
+            self.size_of_context_window,
+        );
+        self.add(vs_path, layer)
+    }

-    fn create_output_layer(&self, vs: &nn::Path) -> impl nn::Module {
+    pub fn finish(mut self, vs_path: &nn::Path, output_tokens: i64) -> Self {
+
         let d = self.embedding_dimension;
-        let t = self.output_tokens;
+        let t = output_tokens;

-        let linear_norm = create_layer_norm(vs, self.embedding_dimension);
-        let projection_1dt = vs.var("projection_1dt", &[1, d, t], generate_init());
+        let linear_norm = create_layer_norm(vs_path, self.embedding_dimension);
+        let projection_1dt = vs_path.var("projection_1dt", &[1, d, t], generate_init());

-        nn::func(move |x_bcd| {
+        let final_layer = nn::func(move |x_bcd| {
             let y_bcd = &linear_norm.forward(x_bcd);
             y_bcd.matmul(&projection_1dt)
-        })
+        });
+
+        self.layers.push(Box::new(final_layer));
+        self
     }
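+    // Annotation (not part of the upstream change): typical call chain with
+    // hypothetical dimensions, mirroring src/main.rs:
+    //
+    //     let model = metaformer(vs_path, 64, 1024, 128, tch::Device::Cpu)
+    //         .add_quadratic_form(vs_path, 4)
+    //         .add_mlp(vs_path)
+    //         .finish(vs_path, 1024);
+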
-    pub fn create(&self, vs_path: & nn::Path, kind: AttentionKind, device: tch::Device) -> impl nn::Module {
-        let mut model = nn::seq().add(
-            create_embedder_module(
-                vs_path,
-                self.embedding_dimension,
-                self.size_of_vocabolary,
-                self.size_of_context_window,
-                device
-            ));
+    /// Appends a layer after all the current layers.
+    #[allow(clippy::should_implement_trait)]
+    fn add<M: Module + 'static>(mut self, vs_path: &nn::Path, layer: M) -> Self {

-        for _ in 0..self.model_depth {
-            let attention_module = self.create_attention(vs_path, kind);
-            model = model.add(attention_module);
-            model = model.add(create_mlp(vs_path, self.embedding_dimension));
-        }
+        let layer_norm = create_layer_norm(vs_path, self.embedding_dimension);

-        model.add(self.create_output_layer(vs_path))
+        // Pre-norm residual block: y = layer(layer_norm(x)) + x
+        self.layers.push(Box::new(func(
+            move |x: &tch::Tensor| layer.forward(&layer_norm.forward(x)) + x
+        )));
+        self
     }
 }
+
+
+
+/*
+pub struct MetaFormer {
+
+
+    /// Dimension of the vector space that the network
+    /// uses internally to represent tokens
+    embedding_dimension: i64,
+
+    /// Number of transformer blocks
+    model_depth: i64,
+
+    /// Number of attention modules per transformer block
+    number_of_heads: i64,
+
+    /// Maximum number of tokens in the input sequence
+    size_of_context_window: i64,
+
+    /// Total number of tokens that the network recognizes
+    size_of_vocabolary: i64,
+
+    output_tokens: i64,
+
+    kernel_size: Option<i64>,
+
+
+}
+
+*/
+
+
 /*
 #[test]
 pub fn test_model_creation(){