diff --git a/.github/workflows/dev-env-aws.yml b/.github/workflows/dev-env-aws.yml
index ebbaaa9e..9b9b6e83 100644
--- a/.github/workflows/dev-env-aws.yml
+++ b/.github/workflows/dev-env-aws.yml
@@ -77,6 +77,7 @@ jobs:
   deploy-vscode:
     name: Deploy development environment
+    timeout-minutes: 1500
     needs:
       - start-runner
     runs-on: ${{ needs.start-runner.outputs.label }}
@@ -125,8 +126,7 @@ jobs:
         curl -Lk 'https://code.visualstudio.com/sha/download?build=stable&os=cli-alpine-x64' --output vscode_cli.tar.gz
         tar -xf vscode_cli.tar.gz
     - name: Serve VSCode
-      continue-on-error: true
-      timeout-minutes: 999999999999
+      continue-on-error: true
       run: ./code tunnel

   stop-runner:
diff --git a/pipelines/capacity.py b/pipelines/capacity.py
deleted file mode 100644
index 9f408887..00000000
--- a/pipelines/capacity.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import os
-import asyncio
-import subprocess
-from pathlib import Path
-
-import asyncio
-from contextlib import contextmanager
-from typing import Literal
-from prefect import flow, serve, get_run_logger, task, variables
-from prefect.runner.storage import GitRepository
-import boto3
-from prefect_aws import AwsCredentials
-import os
-
-aws_credentials_block = AwsCredentials.load("spot")
-
-os.environ["REGION_NAME"] = aws_credentials_block.region_name
-os.environ["AWS_ACCESS_KEY_ID"] = aws_credentials_block.aws_access_key_id
-os.environ["AWS_SECRET_ACCESS_KEY"] = aws_credentials_block.aws_secret_access_key.get_secret_value()
-
-
-@task
-def get_client():
-    return boto3.client('autoscaling')
-
-@task
-def set_capacity(client, capacity: int) -> None:
-    logger = get_run_logger()
-    response = client.update_auto_scaling_group(
-        AutoScalingGroupName='Infra-ECS-Cluster-spot-gpu-2-d30de970-ECSAutoScalingGroup-Vif1pCjC51Hq',
-        DesiredCapacity = capacity,
-        MaxSize=1,
-        MinSize=0,
-    )
-
-    print(response)
-
-
-@flow
-def main(capacity: int):
-    logger = get_run_logger()
-    client = get_client()
-    set_capacity(client, capacity)
-
-
-if __name__ == "__main__":
-
-    git_repo = GitRepository(
-        url="https://github.com/Digital-Defiance/llm-voice-chat.git",
-        branch = "main",
-    )
-
-    main_flow = main.from_source(
-        entrypoint="pipelines/capacity.py:main",
-        source=git_repo,
-    )
-
-    main_flow.deploy(
-        name="change-scaling-capacity",
-        work_pool_name = "workpool-prefect",
-    )
diff --git a/run.sh b/scripts/run.sh
similarity index 100%
rename from run.sh
rename to scripts/run.sh
diff --git a/src/attention/avg_pooling.rs b/src/attention/avg_pooling.rs
new file mode 100644
index 00000000..c708b462
--- /dev/null
+++ b/src/attention/avg_pooling.rs
@@ -0,0 +1,46 @@
+
+use tch::nn::Module;
+use tch::Tensor;
+
+
+
+/// implicit zero paddings on both sides of the input. Can be a single number or a tuple (padH, padW). Default: 0
+const DEFAULT_PADDING: i64 = 0;
+
+/// when True, will use ceil instead of floor in the formula to compute the output shape. Default: False
+const DEFAULT_CEIL_MODE: bool = false;
+
+/// when True, will include the zero-padding in the averaging calculation. Default: True
+const DEFAULT_COUNT_INCLUDE_PAD: bool = true;
+
+/// if specified, it will be used as divisor, otherwise size of the pooling region will be used. Default: None
+const DEFAULT_DIVISOR_OVERRIDE: core::option::Option<i64> = None;
+
+
+/// Average pooling layer
+#[derive(Debug)]
+pub struct AvgPooling {
+    /// Size of the pooling region. Can be a single number or a tuple (kH, kW)
+    kernel_size: i64,
+}
+
+
+impl AvgPooling {
+    pub fn new(kernel_size: i64) -> Self {
+        AvgPooling { kernel_size }
+    }
+}
+
+impl Module for AvgPooling {
+    fn forward(&self, x_bcd: &Tensor) -> Tensor {
+        x_bcd.avg_pool2d(
+            self.kernel_size,
+            // stride of the pooling operation. Can be a single number or a tuple (sH, sW). Default: kernel_size
+            self.kernel_size,
+            DEFAULT_PADDING,
+            DEFAULT_CEIL_MODE,
+            DEFAULT_COUNT_INCLUDE_PAD,
+            DEFAULT_DIVISOR_OVERRIDE,
+        )
+    }
+}
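+
+// Usage sketch (annotation, not part of the upstream change; values are
+// hypothetical). With kernel_size = 2, the stride also defaults to 2, so the
+// last two dimensions are halved:
+//
+//     use tch::{nn::Module, Device, Kind, Tensor};
+//     let pool = AvgPooling::new(2);
+//     let x = Tensor::randn(&[1, 3, 8, 8], (Kind::Float, Device::Cpu));
+//     let y = pool.forward(&x); // shape [1, 3, 4, 4]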
diff --git a/src/attention/identity.rs b/src/attention/identity.rs
new file mode 100644
index 00000000..c5eb6b70
--- /dev/null
+++ b/src/attention/identity.rs
@@ -0,0 +1,21 @@
+
+use tch::nn;
+use tch::Tensor;
+
+
+
+#[derive(Debug)]
+pub struct Identity { }
+
+impl Identity {
+    pub fn new() -> Self {
+        Identity { }
+    }
+}
+
+impl nn::Module for Identity {
+    fn forward(&self, x_bcd: &Tensor) -> Tensor {
+        x_bcd.g_mul_scalar(1)
+    }
+}
+
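+
+// Annotation (not part of the upstream change): `Identity` is the trivial
+// token mixer, useful as an ablation baseline in the spirit of the MetaFormer
+// results (https://arxiv.org/abs/2111.11418). Multiplying by 1 is one simple
+// way to return an owned tensor from `forward(&self, &Tensor) -> Tensor`.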
diff --git a/src/attention/metric_tensor.rs b/src/attention/metric_tensor.rs
index e69de29b..338f7920 100644
--- a/src/attention/metric_tensor.rs
+++ b/src/attention/metric_tensor.rs
@@ -0,0 +1 @@
+//! To be implemented with CUDA kernels
\ No newline at end of file
diff --git a/src/attention/mod.rs b/src/attention/mod.rs
index 7fff1ab5..dbec3405 100644
--- a/src/attention/mod.rs
+++ b/src/attention/mod.rs
@@ -1,5 +1,4 @@
 pub mod quadratic_form;
-
-
-
-
+pub mod scaled_dot_product;
+pub mod identity;
+pub mod avg_pooling;
diff --git a/src/attention/quadratic_form.rs b/src/attention/quadratic_form.rs
index 0d62cb52..9413e490 100644
--- a/src/attention/quadratic_form.rs
+++ b/src/attention/quadratic_form.rs
@@ -1,71 +1,93 @@
 use tch::nn;
+use tch::Tensor;

 pub fn generate_init() -> nn::Init {
     nn::Init::Randn { mean: 0., stdev: 1. }
 }

+#[derive(Debug)]
+pub struct QuadraticAttention {
+    projections_1ndq: Tensor,
+    metric_tensors_1nqq: Tensor,
+    adapter_1pd: Tensor,
+    sqrt_q: f64,
+    cp: (i64, i64),
+}
+
+
 /// Performs self attention N times using the quadratic form $x W_n x^T$, where $W_n$ is a learnable matrix.
 /// This is an early version of the metric self attention, where $W$ is forced to have the properties of a metric tensor.
 /// https://arxiv.org/abs/2111.11418 - evidence that any of the attention mechanisms might have similar performance
-pub fn quadratic_self_attention_module(
-    vs_path: &nn::Path,
-    n: i64,
-    d: i64,
-    q: i64,
-    c: i64,
-) -> impl nn::Module {
-
-    assert!(d % n == 0, "Embeddings dimension must be divisible by the requested number of heads.");
-    debug_assert_eq!(n*q, d);
-
-    let projections_1ndq = vs_path.var("projections_1ndq", &[1, n, d, q], generate_init());
-    let metric_tensors_1nqq = vs_path.var("metric_tensors_1nqq", &[1, n, q, q], generate_init());
-    let mixer_1dd = vs_path.var("mixer_1dd", &[1, d, d], generate_init());
-
-    debug_assert_eq!(projections_1ndq.size(), vec![1, n, d, q]);
-    debug_assert_eq!(metric_tensors_1nqq.size(), vec![1, n, q, q]);
-    debug_assert_eq!(mixer_1dd.size(), vec![1, d, d]);
-
-    let sqrt_q = f64::sqrt(q as f64);
+impl QuadraticAttention {
+    pub fn new(
+        vs_path: &nn::Path,
+        number_of_heads: i64,
+        embedding_dimension: i64,
+        latent_dimension: i64,
+        sequence_length: i64,
+    ) -> Self {
+
+        let n = number_of_heads;
+        let d = embedding_dimension;
+        let c = sequence_length;
+        let q = latent_dimension;
+        let p = latent_dimension*number_of_heads;
+
+        let projections_1ndq = vs_path.var("projections_1ndq", &[1, n, d, q], generate_init());
+        let metric_tensors_1nqq = vs_path.var("metric_tensors_1nqq", &[1, n, q, q], generate_init());
+        let adapter_1pd = vs_path.var("adapter_1pd", &[1, p, d], generate_init());
+
+        let sqrt_q = f64::sqrt(q as f64);
+        QuadraticAttention {
+            projections_1ndq,
+            metric_tensors_1nqq,
+            adapter_1pd,
+            sqrt_q,
+            cp: (c, p)
+        }
+    }
+}

-
+// Implement the nn::Module trait for QuadraticAttention.
+impl nn::Module for QuadraticAttention {
+    fn forward(&self, x_bcd: &Tensor) -> Tensor {

-    nn::func(move |x_bcd: &tch::Tensor| {
-        let b = x_bcd.size()[0];
-        assert_eq!(x_bcd.size(), vec![b, c, d]);
-
+        let b = x_bcd.size()[0];
+        // assert_eq!(x_bcd.size(), vec![b, c, d]);
         // Apply n projections to the input
         let x_b1cd = &x_bcd.unsqueeze(1);
-        let x_bncq = &x_b1cd.matmul(&projections_1ndq);
-        debug_assert_eq!(x_bncq.size(), vec![b, n, c, q]);
-
+        let x_bncq = &x_b1cd.matmul(&self.projections_1ndq);
+        // debug_assert_eq!(x_bncq.size(), vec![b, n, c, q]);

         // Use n custom dot products to generate n score tables
         let x_bnqc = &x_bncq.transpose(-1, -2);
-        let dotproducts_bncc = &x_bncq.matmul(&metric_tensors_1nqq.matmul(x_bnqc));
-        debug_assert!(dotproducts_bncc.size() == vec![b, n, c, c]);
+        let dotproducts_bncc = &x_bncq.matmul(&self.metric_tensors_1nqq.matmul(x_bnqc));
+        // debug_assert!(dotproducts_bncc.size() == vec![b, n, c, c]);

         // From scaled dot product attention introduced in https://arxiv.org/abs/1706.03762
-        let scaled_dotproducts_bncc = &dotproducts_bncc.divide_scalar(sqrt_q);
+        let scaled_dotproducts_bncc = &dotproducts_bncc.divide_scalar(self.sqrt_q);
         let softmaxed_scaled_dotproducts_bncc = &scaled_dotproducts_bncc.softmax(-1, tch::kind::Kind::Float);
         let y_bnqc = &x_bncq.transpose(-1, -2).matmul(softmaxed_scaled_dotproducts_bncc);
-        debug_assert!(y_bnqc.size() == vec![b, n, q, c]);
+        // debug_assert!(y_bnqc.size() == vec![b, n, q, c]);

-        let y_bcd = &y_bnqc.reshape(x_bcd.size());
-        debug_assert!(y_bcd.size() == vec![b, c, d]);
+        let y_bcp = &y_bnqc.reshape(&[b, self.cp.0, self.cp.1]);
+        // debug_assert!(y_bcp.size() == vec![b, c, p]);

-        y_bcd.matmul(&mixer_1dd)
-    })
+        y_bcp.matmul(&self.adapter_1pd)
+    }
 }
+
+
+/*
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -94,4 +116,8 @@ mod tests {
     }

-}
\ No newline at end of file
+}
+
+*/
+
+
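+// Shape walk-through (annotation, not part of the upstream change), with
+// b = batch, c = context length, d = embedding dim, n = heads, q = latent
+// dim per head, p = n * q:
+//
+//     x_bcd  (b, c, d)    -- projected -->  x_bncq (b, n, c, q)
+//     scores x_bncq (W x_bncq^T)            (b, n, c, c), scaled by sqrt(q)
+//     y_bnqc softmax-weighted values        (b, n, q, c)
+//     y_bcp  reshape to (b, c, p); adapter_1pd then maps p back to d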
diff --git a/src/attention/scaled_dot_product.rs b/src/attention/scaled_dot_product.rs
index e69de29b..313b125d 100644
--- a/src/attention/scaled_dot_product.rs
+++ b/src/attention/scaled_dot_product.rs
@@ -0,0 +1,129 @@
+
+use tch::nn;
+use tch::Tensor;
+
+pub fn generate_init() -> nn::Init {
+    nn::Init::Randn { mean: 0., stdev: 1. }
+}
+
+
+
+#[derive(Debug)]
+pub struct ScaledDotProductAttention {
+    query_weights_1ndq: Tensor,
+    key_weights_1ndq: Tensor,
+    value_weights_1ndq: Tensor,
+    adapter_1pd: Tensor,
+    sqrt_q: f64,
+    cp: (i64, i64),
+}
+
+
+/// Multi-head scaled dot-product self attention with learnable query, key and
+/// value projections, as introduced in https://arxiv.org/abs/1706.03762
+impl ScaledDotProductAttention {
+    pub fn new(
+        vs_path: &nn::Path,
+        number_of_heads: i64,
+        embedding_dimension: i64,
+        latent_dimension: i64,
+        sequence_length: i64,
+    ) -> Self {
+
+        let n = number_of_heads;
+        let d = embedding_dimension;
+        let c = sequence_length;
+        let q = latent_dimension;
+        let p = latent_dimension*number_of_heads;
+
+        let query_weights_1ndq = vs_path.var("query_weights_1ndq", &[1, n, d, q], generate_init());
+        let key_weights_1ndq = vs_path.var("key_weights_1ndq", &[1, n, d, q], generate_init());
+        let value_weights_1ndq = vs_path.var("value_weights_1ndq", &[1, n, d, q], generate_init());
+        let adapter_1pd = vs_path.var("adapter_1pd", &[1, p, d], generate_init());
+
+        let sqrt_q = f64::sqrt(q as f64);
+        ScaledDotProductAttention {
+            query_weights_1ndq,
+            key_weights_1ndq,
+            value_weights_1ndq,
+            adapter_1pd,
+            sqrt_q,
+            cp: (c, p)
+        }
+    }
+}
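+
+// Construction sketch (annotation, not part of the upstream change; the
+// dimensions are hypothetical):
+//
+//     use tch::{nn, Device};
+//     let vs = nn::VarStore::new(Device::Cpu);
+//     // 4 heads, model dim 64, per-head dim 16 (so p = 4 * 16 = 64), context 128
+//     let attn = ScaledDotProductAttention::new(&vs.root(), 4, 64, 16, 128);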
+// Implement the nn::Module trait for ScaledDotProductAttention.
+impl nn::Module for ScaledDotProductAttention {
+    fn forward(&self, x_bcd: &Tensor) -> Tensor {
+
+        let b = x_bcd.size()[0];
+        // assert_eq!(x_bcd.size(), vec![b, self.c, self.d]);
+
+        // Apply n projections to the input
+        let x_b1cd = &x_bcd.unsqueeze(1);
+
+        let queries_bncq = &x_b1cd.matmul(&self.query_weights_1ndq);
+        let keys_bncq = &x_b1cd.matmul(&self.key_weights_1ndq);
+        let values_bncq = &x_b1cd.matmul(&self.value_weights_1ndq);
+
+        // debug_assert_eq!(queries_bncq.size(), vec![b, n, c, q]);
+        // debug_assert_eq!(keys_bncq.size(), vec![b, n, c, q]);
+        // debug_assert_eq!(values_bncq.size(), vec![b, n, c, q]);
+
+        // Use n custom dot products to generate n score tables
+        let keys_bnqc = &keys_bncq.transpose(-1, -2);
+        let scores_bncc = &queries_bncq.matmul(keys_bnqc);
+        // debug_assert!(scores_bncc.size() == vec![b, n, c, c]);
+
+        // From scaled dot product attention introduced in https://arxiv.org/abs/1706.03762
+        let scaled_scores_bncc = &scores_bncc.divide_scalar(self.sqrt_q);
+
+        let softmaxed_scaled_scores_bncc = &scaled_scores_bncc.softmax(-1, tch::kind::Kind::Float);
+        let y_bnqc = &values_bncq.transpose(-1, -2).matmul(softmaxed_scaled_scores_bncc);
+        // debug_assert!(y_bnqc.size() == vec![b, n, q, c]);
+
+        let y_bcp = &y_bnqc.reshape(&[b, self.cp.0, self.cp.1]);
+        // debug_assert!(y_bcp.size() == vec![b, c, p]);
+
+        y_bcp.matmul(&self.adapter_1pd)
+    }
+}
+
+
+
+
+/*
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tch::{nn, Device, Kind, Tensor};
+    use tch::nn::Module;
+
+
+    #[test]
+    pub fn test_layer(){
+
+
+        let vs = nn::VarStore::new(Device::Cpu);
+        let vs_path = &vs.root();
+
+        let b = 10;
+        let c = 5;
+        let d = 4;
+        let n = 2;
+        let q = 2;
+
+        let input_bcd = Tensor::randn( &[b, c, d], (Kind::Float, Device::Cpu));
+        let layer = ScaledDotProductAttention::new(vs_path, n, d, q, c);
+        let output_bcd = layer.forward(&input_bcd);
+
+        debug_assert!(output_bcd.size() == input_bcd.size());
+
+    }
+
+}
+
+*/
diff --git a/src/config.rs b/src/config.rs
index d76706a9..d6a05192 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -1,11 +1,20 @@
+
 use clap::Parser;
 use tch::Device;
+use serde::Deserialize;
+

-use crate::metaformer::AttentionKind;
-// use crate::metaformer::AttentionKind;
+#[derive(PartialEq, Clone, Copy, Deserialize)]
+pub enum AttentionKind {
+    Quadratic,
+    ScaledDotProduct,
+    Metric,
+    Identity,
+    AveragePooling,
+}

 /// Train a MetaFormer model.
 #[derive(Parser)]
@@ -59,7 +68,6 @@ pub struct Cli {
     #[clap(long, env)]
     pub batch_size: i64,

-
     #[clap(long, env)]
     pub learning_rate: f64,

@@ -72,9 +80,12 @@ pub struct Cli {
     #[clap(long, env)]
     pub use_gpu: String,

+    #[clap(long, env)]
+    pub kernel_size: Option<i64>,
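+    // Annotation (not part of the upstream change): `kernel_size` is only
+    // consumed by the average-pooling token mixer; src/main.rs unwraps it
+    // when `attention_kind` is "average_pooling".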
 }

+
 impl Cli {
     pub fn get_device(&self) -> Device {
         let cuda = Device::cuda_if_available();
@@ -91,18 +102,4 @@ impl Cli {
             panic!("Invalid device configuration. Check USE_GPU env var.");
         }
     }
-
-    pub fn get_attention_kind(&self) -> AttentionKind {
-        if self.attention_kind == "Quadratic" {
-            AttentionKind::Quadratic
-        } else {
-            AttentionKind::Quadratic
-        }
-    }
 }
-
-
-pub fn read_config() -> Cli {
-    Cli::parse()
-}
-
diff --git a/src/main.rs b/src/main.rs
index fad6210f..9b779f56 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -12,35 +12,65 @@ pub mod files;
 use std::str::FromStr;
-use tch::kind;
-use tch::nn::Module;
+use clap::Parser;
+use tch::{kind, nn::Module};
 use tch::nn::OptimizerConfig;
-use metaformer::MetaFormer;
+use metaformer::metaformer;
 use mlflow::{MLFlowClient, MetricAccumulator };
 use tch;
 use tch::nn;
 use files::read_dataslice;
-use config::{Cli, read_config};
+use config::Cli;
+
+
+const QUADRATIC: &str = "quadratic";
+const SCALED_DOT_PRODUCT: &str = "scaled_dot_product";
+const IDENTITY: &str = "identity";
+const AVERAGE_POOLING: &str = "average_pooling";
+const METRIC: &str = "metric";
+

 /// Implementation of gradient descent
 fn main() {
-
-
-    let config: Cli = read_config();
+    let config: Cli = Cli::parse();
     let training_device = config.get_device();
-    let metaformer: MetaFormer = MetaFormer::new(&config);

     let vs: nn::VarStore = nn::VarStore::new(training_device);
     let vs_path: &nn::Path<'_> = &vs.root();

-    let attention_kind = config.get_attention_kind();
-    let model = metaformer.create(vs_path, attention_kind, training_device);
+
+    let mut model = metaformer(
+        vs_path,
+        config.dimension,
+        config.input_vocabolary,
+        config.context_window,
+        config.get_device()
+    );
+
+
+    for _ in 0..config.depth {
+
+        // The token mixer is selected by the attention_kind CLI/env option.
+        model = match config.attention_kind.as_str() {
+            QUADRATIC => model.add_quadratic_form(vs_path, config.heads),
+            SCALED_DOT_PRODUCT => model.add_scaled_dot_product(vs_path, config.heads),
+            IDENTITY => model,
+            AVERAGE_POOLING => model.add_avg_pooling(vs_path, config.kernel_size.unwrap()),
+            METRIC => todo!(),
+            _ => panic!("Not supported")
+        };
+
+        model = model.add_mlp(vs_path);
+    }
+
+    model = model.finish(vs_path, config.output_vocabolary);
+

     let mut opt: nn::Optimizer = tch::nn::Adam::default().build(&vs, config.learning_rate).unwrap(); // https://paperswithcode.com/method/adam
     let total_slices: i64 = config.slices*config.epochs;

+
     for global_idx in 0..total_slices {

         let avg_train_loss = {
             let mut loss_accumulator = MetricAccumulator::new("loss/train");
diff --git a/src/metaformer/mod.rs b/src/metaformer/mod.rs
index 9a649def..25c3e40f 100644
--- a/src/metaformer/mod.rs
+++ b/src/metaformer/mod.rs
@@ -1,124 +1,219 @@
-use crate::attention::quadratic_form::quadratic_self_attention_module;
-use crate::config::Cli;
 use self::mlp::create_mlp;
 use self::embedder::create_embedder_module;
+use crate::attention::quadratic_form::QuadraticAttention;
+use crate::attention::scaled_dot_product::ScaledDotProductAttention;

 pub mod layer_norm;
 pub mod commons;
 pub mod embedder;
 pub mod mlp;

+
 use layer_norm::create_layer_norm;
 use commons::generate_init;
-use serde::Deserialize;
 use tch::nn;
 use tch::nn::Module;
+use crate::attention::avg_pooling::AvgPooling;
+use tch::nn::func;
+use tch::Tensor;
+use tch;

-#[derive(PartialEq, Clone, Copy, Deserialize)]
-pub enum AttentionKind {
-    Quadratic,
-    ScaledDotProduct,
-    Metric,
-}
-
 /// Defines structure of the metaformer model
 /// GPT2 paper - https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf
 /// MetaFormer paper - https://arxiv.org/abs/2111.11418
+#[derive(Debug)]
 pub struct MetaFormer {
-
-
-    /// Dimension of the vector space that the network
-    /// uses internally to represent tokens
     embedding_dimension: i64,
+    size_of_context_window: i64,
+    layers: Vec<Box<dyn Module>>,
+}

-    /// Number of transformer blocks
-    model_depth: i64,
-
-    /// Number of attention modules per transformer block
-    number_of_heads: i64,
+impl Module for MetaFormer {
+    fn forward(&self, xs: &Tensor) -> Tensor {
+        let xs = self.layers[0].forward(xs);
+        self.layers.iter().skip(1).fold(xs, |xs, layer| layer.forward(&xs))
+    }
+}

-    /// Maximum number of tokens in the input sequence
-    size_of_context_window: i64,

-    /// Total number of tokens that the network recognizes
+/// Creates a new MetaFormer containing only the embedding module.
+pub fn metaformer(
+    vs_path: &nn::Path,
+    embedding_dimension: i64,
     size_of_vocabolary: i64,
+    size_of_context_window: i64,
+    device: tch::Device,
+) -> MetaFormer {
+
+    let embedder = create_embedder_module(
+        vs_path,
+        embedding_dimension,
+        size_of_vocabolary,
+        size_of_context_window,
+        device
+    );

-    output_tokens: i64,
-
-
+    MetaFormer {
+        embedding_dimension,
+        size_of_context_window,
+        layers: vec![ Box::new(embedder) ]
+    }
 }

+
 impl MetaFormer {

-    pub fn new(config: &Cli) -> MetaFormer {
-        MetaFormer {
-            embedding_dimension: config.dimension,
-            model_depth: config.depth,
-            number_of_heads: config.heads,
-            size_of_context_window: config.context_window,
-            size_of_vocabolary: config.input_vocabolary,
-            output_tokens: config.output_vocabolary,
-        }
+    pub fn add_mlp(self, vs_path: &nn::Path) -> Self {
+        let layer = create_mlp(vs_path, self.embedding_dimension);
+        self.add(vs_path, layer)
     }

-    fn create_attention(&self, vs: &nn::Path, kind: AttentionKind) -> impl nn::Module {
-
-        match kind {
-            AttentionKind::Quadratic => quadratic_self_attention_module(
-                vs,
-                self.number_of_heads,
-                self.embedding_dimension,
-                self.embedding_dimension / self.number_of_heads,
-                self.size_of_context_window,
-            ),
-            AttentionKind::Metric => todo!(),
-            AttentionKind::ScaledDotProduct => todo!()
-        }
+    pub fn add_avg_pooling(self, vs_path: &nn::Path, kernel_size: i64) -> Self {
+        let layer = AvgPooling::new(kernel_size);
+        self.add(vs_path, layer)
     }

+    pub fn add_scaled_dot_product(self, vs_path: &nn::Path, number_of_heads: i64) -> Self {
+        let layer = ScaledDotProductAttention::new(
+            vs_path,
+            number_of_heads,
+            self.embedding_dimension,
+            self.embedding_dimension / number_of_heads,
+            self.size_of_context_window,
+        );
+        self.add(vs_path, layer)
+    }
+
+    pub fn add_quadratic_form(self, vs_path: &nn::Path, number_of_heads: i64) -> Self {
+        let layer = QuadraticAttention::new(
+            vs_path,
+            number_of_heads,
+            self.embedding_dimension,
+            self.embedding_dimension / number_of_heads,
+            self.size_of_context_window,
+        );
+        self.add(vs_path, layer)
+    }

-    fn create_output_layer(&self, vs: &nn::Path) -> impl nn::Module {
+    pub fn finish(mut self, vs_path: &nn::Path, output_tokens: i64) -> Self {
+
         let d = self.embedding_dimension;
-        let t = self.output_tokens;
+        let t = output_tokens;

-        let linear_norm = create_layer_norm(vs, self.embedding_dimension);
-        let projection_1dt = vs.var("projection_1dt", &[1, d, t], generate_init());
+        let linear_norm = create_layer_norm(vs_path, self.embedding_dimension);
+        let projection_1dt = vs_path.var("projection_1dt", &[1, d, t], generate_init());

-        nn::func(move |x_bcd| {
+        let final_layer = nn::func(move |x_bcd| {
             let y_bcd = &linear_norm.forward(x_bcd);
             y_bcd.matmul(&projection_1dt)
-        })
+        });
+
+        self.layers.push(Box::new(final_layer));
+        self
     }
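+    // Annotation (not part of the upstream change): typical call chain with
+    // hypothetical dimensions, mirroring src/main.rs:
+    //
+    //     let model = metaformer(vs_path, 64, 1024, 128, tch::Device::Cpu)
+    //         .add_quadratic_form(vs_path, 4)
+    //         .add_mlp(vs_path)
+    //         .finish(vs_path, 1024);
+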
-    pub fn create(&self, vs_path: & nn::Path, kind: AttentionKind, device: tch::Device) -> impl nn::Module {
-        let mut model = nn::seq().add(
-            create_embedder_module(
-                vs_path,
-                self.embedding_dimension,
-                self.size_of_vocabolary,
-                self.size_of_context_window,
-                device
-            ));
+    /// Appends a layer after all the current layers.
+    #[allow(clippy::should_implement_trait)]
+    fn add<M: Module + 'static>(mut self, vs_path: &nn::Path, layer: M) -> Self {

-        for _ in 0..self.model_depth {
-            let attention_module = self.create_attention(vs_path, kind);
-            model = model.add(attention_module);
-            model = model.add(create_mlp(vs_path, self.embedding_dimension));
-        }
+        let layer_norm = create_layer_norm(vs_path, self.embedding_dimension);

-        model.add(self.create_output_layer(vs_path))
+        // Pre-norm residual block: y = layer(layer_norm(x)) + x
+        self.layers.push(Box::new(func(
+            move |x: &tch::Tensor| layer.forward(&layer_norm.forward(x)) + x
+        )));
+        self
     }
 }
+
+
+
+/*
+pub struct MetaFormer {
+
+
+    /// Dimension of the vector space that the network
+    /// uses internally to represent tokens
+    embedding_dimension: i64,
+
+    /// Number of transformer blocks
+    model_depth: i64,
+
+    /// Number of attention modules per transformer block
+    number_of_heads: i64,
+
+    /// Maximum number of tokens in the input sequence
+    size_of_context_window: i64,
+
+    /// Total number of tokens that the network recognizes
+    size_of_vocabolary: i64,
+
+    output_tokens: i64,
+
+    kernel_size: Option<i64>,
+
+
+}
+
+*/
+
+
 /*
 #[test]
 pub fn test_model_creation(){