---
# config.yaml
# ---- Dataset and inputs ----
dataset: 'CREMAD'
# Dataset locations (original note: "fix link dataset") -- point these at
# wherever the dataset is mounted in your environment.
audio_path: '/kaggle/input/cremad-1/cremad/AudioWAV/'
visual_path: '/kaggle/input/cremad-1/cremad/'
# Path to train.csv
train_csv: '/kaggle/working/Speech_project_Vin/data/CREMAD/train.csv'
# Path to test.csv
test_csv: '/kaggle/working/Speech_project_Vin/data/CREMAD/test.csv'
fps: 1
input_tdim: 256
# NOTE(review): confirm this matches the label set used in train.csv.
num_classes: 7

# ---- Model ----
modulation: 'OGM_GE'
# Can be 'sum', 'concat', 'film', or 'gated'
fusion_method: 'concat'
weight_visual: '/kaggle/input/weight-cremad/weight.pth'
weight_audio: '/kaggle/input/weight-asr/audioset_10_10_0.4593.pth'

# ---- Optimisation ----
train: true
batch_size: 16
optimizer: 'sgd'
learning_rate: 0.0002
momentum: 0.9
lr_decay_step: 70
lr_decay_ratio: 0.1
random_seed: 0
# NOTE(review): both `epochs` and `epoch` are defined, with different
# values; confirm which one the training script reads. If it is `epochs`
# and the decay step is counted in epochs, lr_decay_step (70) would never
# be reached -- TODO confirm.
epochs: 30
# Number of training epochs
epoch: 100

# ---- Outputs and logging ----
ckpt_path: '/kaggle/working/ckpt'
# Model save path
save_path: '/kaggle/working/weights/model.pth'
use_tensorboard: false
tensorboard_path: null
# ---- PatchEmbed parameters ----
# Image size for PatchEmbed
img_size: 224
# Patch size for PatchEmbed
patch_size: 16
# Input channels for PatchEmbed
in_chans: 3
# Embedding dimension for PatchEmbed
embed_dim: 768
# ASTModel parameters.
# The settings are nested under `ASTModel` so that the section header is a
# mapping (not null) and `input_tdim` here stays distinct from the
# top-level `input_tdim` key instead of being a duplicate of it.
ASTModel:
  verbose: false
  fstride: 10  # Frequency stride
  tstride: 10  # Time stride
  input_fdim: 128  # Number of frequency bins
  input_tdim: 256  # Number of time frames
  imagenet_pretrain: false  # Use ImageNet pretrained model
  audioset_pretrain: true  # Use AudioSet pretrained model
  model_size: 'base384'  # Model size for ASTModel
# Visual model parameters.
# Nested under `visual_model` so the header holds these settings rather
# than parsing as an empty (null) key.
visual_model:
  reduction_ratio: 16  # Reduction ratio for ChannelGate
  pool_types: ['avg', 'max']  # Pool types used in ChannelGate
# MANet parameters.
# Nested under `manet`: a flat `num_classes` here would duplicate the
# top-level `num_classes` key and, with last-wins loaders, replace its
# value.
manet:
  layers: [2, 2, 2, 2]  # Layers for MANet
  num_classes: 12666  # Number of output classes for MANet
# Fusion method parameters.
# Nested under `fusion` so generic names such as `type` and `input_dim`
# are scoped to this section instead of becoming top-level keys.
fusion:
  type: 'concat'  # Type of fusion: 'sum', 'concat', 'film', or 'gated'
  input_dim: 1088  # Input dimension for fusion
  output_dim: 7  # Output dimension for the fusion
  # NOTE(review): presumably options for the 'film' and 'gated' fusion
  # types respectively -- confirm against the fusion module.
  film_x_film: false
  gated_x_gate: false
# Training parameters.
# Nested under `training`: left flat, every key below would duplicate a
# top-level key of the same name, and `save_path` would silently replace
# the top-level value with last-wins loaders.
# NOTE(review): these repeat the top-level learning_rate / lr_decay_* /
# epoch settings, and `save_path` differs from the top-level `save_path`;
# confirm which set the training script actually reads.
training:
  learning_rate: 0.0002  # Learning rate for the optimizer
  lr_decay_step: 70  # Step size for learning rate decay
  lr_decay_ratio: 0.1  # Decay ratio for learning rate
  epoch: 100  # Number of epochs for training
  save_path: '/kaggle/working/model.pth'  # Path to save the model