Merge branch 'decoder' of github.com:Algo-Boys/SWR2-cool-projekt into decoder
Algo-Boys · Sep 18, 2023 · d568904 · d568904
2 parents 9475900 + e062272
commit d568904
Show file tree

Hide file tree

Showing 6 changed files with 44 additions and 23 deletions.
diff --git a/config.cluster.yaml b/config.cluster.yaml
@@ -4,18 +4,18 @@ model:
   rnn_dim: 512
   n_feats: 128 # number of mel features
   stride: 2
-  dropout: 0.25 # recommended to be around 0.4-0.6 for smaller datasets, 0.1 for really large datasets
+  dropout: 0.2 # recommended to be around 0.4-0.6 for smaller datasets, 0.1 for really large datasets
 
 training:
   learning_rate: 0.0005
-  batch_size: 64 # recommended: the largest batch size that fits on the GPU (a batch size of 32 fits on a 12GB GPU)
+  batch_size: 400 # recommended: the largest batch size that fits on the GPU (a batch size of 32 fits on a 12GB GPU)
   epochs: 150 
   eval_every_n: 5 # evaluate every n epochs
-  num_workers: 8 # number of workers for dataloader
+  num_workers: 12 # number of workers for dataloader
   device: "cuda" # device to run training on if gpu is available, else "cpu" will be set automatically
 
 dataset:
-  download: True
+  download: False
   dataset_root_path: "/mnt/lustre/mladm/mfa252/data" # files will be downloaded into this dir
   language_name: "mls_german_opus"
   limited_supervision: False # set to True if you want to use limited supervision
@@ -26,9 +26,9 @@ tokenizer:
   tokenizer_path: "data/tokenizers/char_tokenizer_german.json"
 
 checkpoints:
-  model_load_path: "data/runs/epoch31" # path to load model from
+  model_load_path: "data/runs/epoch50" # path to load model from
   model_save_path: "data/runs/epoch" # path to save model to
 
 inference:
   model_load_path: ~ # path to load model from
-  device: "cuda" # device to run inference on if gpu is available, else "cpu" will be set automatically
+  device: "cuda" # device to run inference on if gpu is available, else "cpu" will be set automatically
diff --git a/data/own/swabian.flac b/data/own/swabian.flac
diff --git a/hpc.sh b/hpc.sh
@@ -2,15 +2,15 @@
 
 #SBATCH --job-name=swr-teamprojekt
 #SBATCH --partition=a100
-#SBATCH --time=00:30:00
+#SBATCH --time=24:00:00
 
 ### Note: x in --gres=gpu:x should be equal to --ntasks-per-node
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1
-#SBATCH --gres=gpu:a100:1
-#SBATCH --cpus-per-task=8
-#SBATCH --mem=64gb
-#SBATCH --chdir=/mnt/lustre/mladm/mfa252/SWR2-cool-projekt-main/
+#SBATCH --gres=gpu:a100:4
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=32gb
+#SBATCH --chdir=/mnt/lustre/mladm/mfa252/ref/
 #SBATCH --output=/mnt/lustre/mladm/mfa252/%x-%j.out
 
 source venv/bin/activate

diff --git a/hpc_train.sh b/hpc_train.sh
@@ -1,3 +1,3 @@
 #!/bin/sh
 
-yes no | python -m swr2_asr.train --config_path config.cluster.yaml  
+python -m swr2_asr.train --config_path config.cluster.yaml  
diff --git a/swr2_asr/inference.py b/swr2_asr/inference.py
@@ -66,7 +66,12 @@ def main(config_path: str, file_path: str) -> None:
     ).to(device)
 
     checkpoint = torch.load(inference_config["model_load_path"], map_location=device)
-    model.load_state_dict(checkpoint["model_state_dict"], strict=True)
+
+    state_dict = {
+        k[len("module.") :] if k.startswith("module.") else k: v
+        for k, v in checkpoint["model_state_dict"].items()
+    }
+    model.load_state_dict(state_dict, strict=True)
     model.eval()
 
     waveform, sample_rate = torchaudio.load(file_path)  # pylint: disable=no-member

diff --git a/swr2_asr/utils/visualization.py b/swr2_asr/utils/visualization.py
@@ -4,19 +4,35 @@
 import torch
 
 
-def plot(epochs, path):
+def plot(path):
     """Plots the losses over the epochs"""
-    losses = []
+    train_losses = []
     test_losses = []
     cers = []
     wers = []
-    for epoch in range(1, epochs + 1):
-        current_state = torch.load(path + str(epoch))
-        losses.append(current_state["loss"])
-        test_losses.append(current_state["test_loss"])
-        cers.append(current_state["avg_cer"])
-        wers.append(current_state["avg_wer"])
 
-    plt.plot(losses)
-    plt.plot(test_losses)
+    epoch = 5
+    while True:
+        try:
+            current_state = torch.load(path + str(epoch), map_location=torch.device("cpu"))
+        except FileNotFoundError:
+            break
+        train_losses.append((epoch, current_state["train_loss"].item()))
+        test_losses.append((epoch, current_state["test_loss"]))
+        cers.append((epoch, current_state["avg_cer"]))
+        wers.append((epoch, current_state["avg_wer"]))
+        epoch += 5
+
+    plt.plot(*zip(*train_losses), label="train_loss")
+    plt.plot(*zip(*test_losses), label="test_loss")
+    plt.plot(*zip(*cers), label="cer")
+    plt.plot(*zip(*wers), label="wer")
+    plt.xlabel("epoch")
+    plt.ylabel("score")
+    plt.title("Model performance for 5n epochs")
+    plt.legend()
     plt.savefig("losses.svg")
+
+
+if __name__ == "__main__":
+    plot("data/runs/epoch")