---
# config.yaml
# Settings for the training run; every key below lives in a single mapping.
# (The file name used to sit here as a bare scalar, which made the document
# unparseable because the first `key: value` line cannot follow a plain scalar.)

# ---- Experiment selection ----
dataset: 'CREMAD'
modulation: 'OGM_GE'
# Accepted values: 'sum', 'concat', 'film', 'gated'.
fusion_method: 'concat'
fps: 1
# NOTE(review): CREMA-D is usually described with 6 emotion labels; confirm
# that 7 matches the label mapping used by the data loader.
num_classes: 7

# ---- Dataset locations ----
# Update both paths to wherever the dataset is mounted.
audio_path: '/kaggle/input/cremad-1/cremad/AudioWAV/'
visual_path: '/kaggle/input/cremad-1/cremad/'

# ---- Optimisation ----
batch_size: 16
# NOTE(review): `epochs` (30) and `epoch` (100, set further down) are two
# different keys with different values; confirm which one the code reads.
epochs: 30
optimizer: 'sgd'
learning_rate: 0.0002
# NOTE(review): if this step is counted in epochs, 70 is never reached when
# training stops after `epochs: 30`; verify the intended schedule.
lr_decay_step: 70
lr_decay_ratio: 0.1
momentum: 0.9

# ---- Run control and logging ----
ckpt_path: '/kaggle/working/ckpt'
train: true
use_tensorboard: false
tensorboard_path: null
random_seed: 0

# The same value is also set as `ASTModel.input_tdim`.
input_tdim: 256
# Number of training epochs (see the note on `epochs` above).
epoch: 100

# ---- Weight files, output and data splits ----
weight_visual: '/kaggle/input/weight-cremad/weight.pth'
weight_audio: '/kaggle/input/weight-asr/audioset_10_10_0.4593.pth'
# Where the model is saved.
save_path: '/kaggle/working/weights/model.pth'
# Path to train.csv.
train_csv: '/kaggle/working/Speech_project_Vin/data/CREMAD/train.csv'
# Path to test.csv.
test_csv: '/kaggle/working/Speech_project_Vin/data/CREMAD/test.csv'
# ---- PatchEmbed parameters ----
# Image size.
img_size: 224
# Patch size.
patch_size: 16
# Number of input channels.
in_chans: 3
# Embedding dimension.
embed_dim: 768

# ---- ASTModel parameters ----
ASTModel:
  verbose: false
  # Stride along the frequency axis.
  fstride: 10
  # Stride along the time axis.
  tstride: 10
  # Number of frequency bins.
  input_fdim: 128
  # Number of time frames; the top-level `input_tdim` key carries the same
  # value (256).
  input_tdim: 256
  # Whether to use an ImageNet-pretrained model.
  imagenet_pretrain: false
  # Whether to use an AudioSet-pretrained model.
  audioset_pretrain: true
  # Model size variant.
  model_size: 'base384'

# ---- Visual model parameters ----
visual_model:
  # Reduction ratio for ChannelGate.
  reduction_ratio: 16
  # Pooling types used in ChannelGate.
  pool_types: ['avg', 'max']

# ---- MANet parameters ----
manet:
  # Layer configuration for MANet.
  layers: [2, 2, 2, 2]
  # Number of output classes for MANet.
  # NOTE(review): this differs from the top-level `num_classes` (7);
  # presumably it matches a pretrained MANet checkpoint — confirm.
  num_classes: 12666

# ---- Fusion method parameters ----
fusion:
  # Fusion type: 'sum', 'concat', 'film', or 'gated'. The top-level
  # `fusion_method` key holds the same value ('concat').
  type: 'concat'
  # Input dimension for the fusion.
  input_dim: 1088
  # Output dimension for the fusion; equal to the top-level `num_classes` (7).
  output_dim: 7
  film_x_film: false
  gated_x_gate: false
# ---- Training parameters ----
# NOTE(review): learning_rate, lr_decay_step, lr_decay_ratio and epoch repeat
# the top-level keys of the same name with the same values, whereas save_path
# differs from the top-level one ('/kaggle/working/weights/model.pth').
# Confirm which copy the training code actually reads.
training:
  # Learning rate for the optimizer.
  learning_rate: 0.0002
  # Step size for learning rate decay.
  lr_decay_step: 70
  # Decay ratio for learning rate.
  lr_decay_ratio: 0.1
  # Number of epochs for training.
  epoch: 100
  # Path to save the model.
  save_path: '/kaggle/working/model.pth'