Skip to content

Commit

Permalink
kubernetes training on optocycle cluster
Browse files Browse the repository at this point in the history
  • Loading branch information
Pherkel committed Nov 23, 2023
1 parent 9a90a72 commit b49ec5a
Show file tree
Hide file tree
Showing 7 changed files with 1,331 additions and 876 deletions.
37 changes: 37 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Image for developing and running swr2_asr on top of the PyTorch CUDA runtime.
# Root installs the OS and Python dependencies; the container itself runs as an
# unprivileged user that owns /app.
FROM pytorch/pytorch:2.1.1-cuda12.1-cudnn8-runtime

# Identity of the unprivileged user (override with --build-arg if required).
ARG USERNAME=dev
ARG USER_UID=1000
ARG USER_GID=$USER_UID

# set up user with uid 1000 for vscode devcontainer and grant it passwordless sudo.
# The apt package index is removed in the same layer so it does not bloat the image.
RUN groupadd --gid $USER_GID $USERNAME \
    && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \
    && apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y sudo \
    && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
    && chmod 0440 /etc/sudoers.d/$USERNAME \
    && rm -rf /var/lib/apt/lists/*

# OS packages. Every continued line ends in " \" so package names can never be
# glued together, regardless of how the following line is indented.
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        # dev dependencies
        curl vim nano tar iputils-ping screen ffmpeg \
        # kenlm dependencies
        build-essential libboost-all-dev cmake zlib1g-dev libbz2-dev liblzma-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# WORKDIR creates /app as root; hand it to the unprivileged user so the
# application can create files (datasets, checkpoints) below it.
RUN chown $USER_UID:$USER_GID /app

# Install Python dependencies before copying the sources so this layer stays
# cached until requirements.txt changes; --no-cache-dir keeps pip's download
# cache out of the layer.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# set the default user
USER $USERNAME

# copy the rest of the files, owned by the default user
# (COPY always writes as root unless --chown is given, even after USER)
COPY --chown=$USER_UID:$USER_GID swr2_asr ./swr2_asr
COPY --chown=$USER_UID:$USER_GID data ./data
COPY --chown=$USER_UID:$USER_GID config.* ./

# just keep the container alive
ENTRYPOINT ["tail", "-f", "/dev/null"]
45 changes: 45 additions & 0 deletions config.docker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Settings for swr2_asr when run from the container image; the Dockerfile copies config.* into /app.
# NOTE(review): the relative paths below are presumably resolved against the working directory (/app in the image) — confirm.

# dataset selection and download
dataset:
download: True
dataset_root_path: "data/datasets" # files will be downloaded into this dir
language_name: "mls_german_opus"
limited_supervision: False # set to True if you want to use limited supervision
dataset_percentage: 1 # fraction of the dataset to use (1.0 = 100%)
shuffle: True

# model size and regularisation
model:
n_cnn_layers: 3
n_rnn_layers: 7
rnn_dim: 512
n_feats: 256 # number of mel features
stride: 2
dropout: 0.2 # recommended to be around 0.4 for smaller datasets, 0.1 for really large datasets

tokenizer:
tokenizer_path: "data/tokenizers/char_tokenizer_german.json"

# decoding strategy
decoder:
type: "greedy" # greedy, or lm (beam search)

lm: # config for lm decoder
language_model_path: "data" # path where model and supplementary files are stored
language: "german"
n_gram: 5 # n-gram size of the language model, 3 or 5
beam_size: 500
beam_threshold: 150
n_best: 1
lm_weight: 1
word_score: 1

training:
learning_rate: 0.0005
batch_size: 32 # recommended: the largest number that fits on the GPU (a batch size of 32 fits on a 12GB GPU)
epochs: 100
eval_every_n: 5 # evaluate every n epochs
num_workers: 4 # number of workers for dataloader

checkpoints: # use "~" to disable saving/loading
model_load_path: ~ # path to load model from
model_save_path: "data/runs/01/epoch" # path to save model to

inference:
model_load_path: "data/epoch67" # path to load model from
70 changes: 70 additions & 0 deletions pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# a pod that runs the pherkel/swr2-asr image with one gpu and a persistent /app directory
apiVersion: v1
kind: Pod
metadata:
name: swr-asr
labels:
app: swr-asr
spec:
# The init container mounts the persistent volume at /tmp and copies the image's /app
# into it; the main container then mounts the same volume at /app.
# NOTE(review): the copy runs on every pod start and overwrites files of the same
# name that already exist on the volume — confirm this is intended.
initContainers:
- name: swr-asr-init
image: pherkel/swr2-asr
imagePullPolicy: Always
command: ["/bin/sh", "-c", "cp -R /app/* /tmp/"]
volumeMounts:
- name: swr-asr-vol
mountPath: /tmp

securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]


# No command is set, so the image's own entrypoint is used.
containers:
- name: swr-asr
image: pherkel/swr2-asr
imagePullPolicy: Always
# command: ["/bin/bash", "-c", "while sleep 1000; do :; done"]

volumeMounts:
# persistent copy of the application directory, seeded by the init container
- name: swr-asr-vol
mountPath: /app

# workaround for increasing worker shared memory size
- name: shm-vol
mountPath: /dev/shm

# requests and limits are set to the same values
resources:
requests:
nvidia.com/gpu: "1"
memory: "8Gi"
cpu: "4"
limits:
nvidia.com/gpu: "1"
memory: "8Gi"
cpu: "4"

securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]

volumes:
- name: swr-asr-vol
persistentVolumeClaim:
claimName: swr-asr-vol
# memory-backed emptyDir mounted at /dev/shm in the main container
# NOTE(review): no sizeLimit is set here — confirm how this interacts with the 8Gi memory limit.
- name: shm-vol
emptyDir:
medium: Memory

# uid/gid 1000 is the default USER_UID/USER_GID of the user created in the Dockerfile
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault

# schedule only on nodes that carry this label
nodeSelector:
optocycle.com/infrastructure-provider: ocs
Loading

0 comments on commit b49ec5a

Please sign in to comment.