-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
kubernetes training on optocycle cluster
- Loading branch information
Showing
7 changed files
with
1,331 additions
and
876 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
FROM pytorch/pytorch:2.1.1-cuda12.1-cudnn8-runtime

ARG USERNAME=dev
ARG USER_UID=1000
ARG USER_GID=$USER_UID

# Create a non-root user with UID/GID 1000 so a VS Code devcontainer can map
# the host user onto it, and grant passwordless sudo for interactive use.
# apt lists are removed in the same layer so they never persist in the image.
RUN groupadd --gid $USER_GID $USERNAME \
    && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \
    && apt-get update \
    && apt-get install -y --no-install-recommends sudo \
    && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
    && chmod 0440 /etc/sudoers.d/$USERNAME \
    && rm -rf /var/lib/apt/lists/*

# System packages, one alphabetized group per purpose.
# FIX: the original had "ffmpeg\" with no space before the continuation
# backslash; after the parser strips the in-instruction comment line, the
# token can fuse with the next package name. DEBIAN_FRONTEND is set inline
# (not via ENV) so it does not leak into the runtime environment.
RUN apt-get update -y && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    # dev dependencies
    curl ffmpeg iputils-ping nano screen tar vim \
    # kenlm dependencies
    build-essential cmake libboost-all-dev libbz2-dev liblzma-dev zlib1g-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying sources so this layer stays
# cached until requirements.txt itself changes.
WORKDIR /app
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Everything below runs (and is owned) as the unprivileged user.
USER $USERNAME

# Copy the application with --chown so the non-root user can write to these
# paths at runtime (plain COPY after USER would leave them root-owned).
COPY --chown=$USER_UID:$USER_GID swr2_asr ./swr2_asr
COPY --chown=$USER_UID:$USER_GID data ./data
COPY --chown=$USER_UID:$USER_GID config.* ./

# Just keep the container alive; training is started manually inside the pod.
ENTRYPOINT ["tail", "-f", "/dev/null"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# Training/inference configuration for the swr2-asr model.
dataset:
  download: True
  dataset_root_path: "data/datasets" # files will be downloaded into this dir
  language_name: "mls_german_opus"
  limited_supervision: False # set to True if you want to use limited supervision
  dataset_percentage: 1 # percentage of dataset to use (1.0 = 100%)
  shuffle: True

model:
  n_cnn_layers: 3
  n_rnn_layers: 7
  rnn_dim: 512
  n_feats: 256 # number of mel features
  stride: 2
  dropout: 0.2 # recommended to be around 0.4 for smaller datasets, 0.1 for really large datasets

tokenizer:
  tokenizer_path: "data/tokenizers/char_tokenizer_german.json"

decoder:
  type: "greedy" # greedy, or lm (beam search)

  lm: # config for lm decoder
    language_model_path: "data" # path where model and supplementary files are stored
    language: "german"
    n_gram: 5 # n-gram size of the language model, 3 or 5
    beam_size: 500
    beam_threshold: 150
    n_best: 1
    lm_weight: 1
    word_score: 1

training:
  learning_rate: 0.0005
  batch_size: 32 # recommended to maximum number that fits on the GPU (batch size of 32 fits on a 12GB GPU)
  epochs: 100
  eval_every_n: 5 # evaluate every n epochs
  num_workers: 4 # number of workers for dataloader

checkpoints: # use "~" to disable saving/loading
  model_load_path: ~ # path to load model from
  model_save_path: "data/runs/01/epoch" # path to save model to

inference:
  model_load_path: "data/epoch67" # path to load model from
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
# Pod that runs the swr2-asr training image on a GPU node.
# NOTE(review): the original header comment referred to "testing the torch
# implementation of nms", which does not match this workload — corrected.
apiVersion: v1
kind: Pod
metadata:
  name: swr-asr
  labels:
    app: swr-asr
spec:
  initContainers:
    # Seed the persistent volume with the image's /app contents: the main
    # container mounts the PVC *over* /app, which would otherwise shadow
    # the files baked into the image.
    - name: swr-asr-init
      image: pherkel/swr2-asr
      imagePullPolicy: Always
      command: ["/bin/sh", "-c", "cp -R /app/* /tmp/"]
      volumeMounts:
        - name: swr-asr-vol
          mountPath: /tmp

      securityContext:
        allowPrivilegeEscalation: false
        capabilities:
          drop: ["ALL"]

  containers:
    - name: swr-asr
      image: pherkel/swr2-asr
      imagePullPolicy: Always
      # Relies on the image ENTRYPOINT (tail -f /dev/null) to keep the pod
      # alive; uncomment to override with an explicit keep-alive loop.
      # command: ["/bin/bash", "-c", "while sleep 1000; do :; done"]

      volumeMounts:
        - name: swr-asr-vol
          mountPath: /app

        # workaround for increasing worker shared memory size
        # (PyTorch DataLoader workers need more than the 64Mi default /dev/shm)
        - name: shm-vol
          mountPath: /dev/shm

      resources:
        requests:
          nvidia.com/gpu: "1"
          memory: "8Gi"
          cpu: "4"
        limits:
          nvidia.com/gpu: "1"
          memory: "8Gi"
          cpu: "4"

      securityContext:
        allowPrivilegeEscalation: false
        capabilities:
          drop: ["ALL"]

  volumes:
    - name: swr-asr-vol
      persistentVolumeClaim:
        claimName: swr-asr-vol
    # NOTE(review): memory-backed emptyDir has no sizeLimit, so /dev/shm
    # usage counts against the container memory limit — consider adding
    # `sizeLimit` if workers ever OOM the pod.
    - name: shm-vol
      emptyDir:
        medium: Memory

  # Matches the image, which creates and switches to UID/GID 1000.
  securityContext:
    runAsNonRoot: true
    runAsUser: 1000
    runAsGroup: 1000
    fsGroup: 1000
    seccompProfile:
      type: RuntimeDefault

  nodeSelector:
    optocycle.com/infrastructure-provider: ocs
Oops, something went wrong.