# Dockerfile
# Global build arguments. Declared before the first FROM so they can be
# referenced in FROM lines:
#   BASE_IMAGE - image the `base` stage is built on.
#   DEVICE     - selects the "<DEVICE>-builder-base" stage used by `builder`.
ARG BASE_IMAGE=nvidia/cuda:11.2.0-devel-ubuntu20.04
ARG DEVICE=cuda

# Common base stage shared by the builder and runtime images.
FROM ${BASE_IMAGE} AS base
# DEVICE is re-declared (without a default) so the global value is visible
# to RUN instructions inside this stage.
ARG DEVICE
# Versions consumed by the conda install step of this stage.
ARG PYTHON_VERSION=3.8
ARG PYTORCH_VERSION=1.8
ARG MAGMA_CUDA_VERSION=magma-cuda110
# Install OS packages (compiler toolchain, JDK, SSH, networking and shell
# utilities). The apt package lists are removed in this same RUN so they are
# not stored in the layer; deleting them in a later instruction would not
# shrink the image. Packages are listed one per line in alphabetical order.
RUN apt-get -y update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        apt-utils \
        build-essential \
        ca-certificates \
        curl \
        ethtool \
        git \
        iputils-ping \
        libgfortran-8-dev \
        net-tools \
        openjdk-11-jdk \
        openssh-server \
        procps \
        rlwrap \
        ssh \
        telnet \
        vim \
        wget \
        zsh \
    && rm -rf /var/lib/apt/lists/*
# Install Miniconda into /opt/conda, then the Python/PyTorch stack.
#   DEVICE=cuda     -> MAGMA + CUDA build of PyTorch, plus the bagua-cuda113 wheel.
#   any other value -> CPU-only PyTorch, plus scikit-learn.
# Every step is chained with `&&` (including inside the if/else branches) so
# that any failing install aborts the build. With `;` separators the exit
# status of the whole RUN would be that of the final `conda clean`, hiding
# earlier failures.
# `curl -f` makes an HTTP error fail the step instead of saving an error page
# as the installer; `-L` follows redirects.
RUN curl -fsSL -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    chmod +x ~/miniconda.sh && \
    ~/miniconda.sh -b -p /opt/conda && \
    rm ~/miniconda.sh && \
    /opt/conda/bin/conda install -y python=${PYTHON_VERSION} numpy scipy mkl mkl-include ninja cython typing && \
    /opt/conda/bin/conda install -y -c conda-forge mpi4py && \
    ln -s /usr/share/pyshared /opt/conda/lib/python${PYTHON_VERSION}/site-packages && \
    if [ "${DEVICE}" = "cuda" ]; then \
        /opt/conda/bin/conda install -y -c pytorch -c conda-forge ${MAGMA_CUDA_VERSION} pytorch=${PYTORCH_VERSION} torchvision && \
        /opt/conda/bin/pip3 install bagua-cuda113 --no-cache-dir; \
    else \
        /opt/conda/bin/conda install -y -c pytorch -c conda-forge pytorch=${PYTORCH_VERSION} torchvision cpuonly && \
        /opt/conda/bin/pip3 install scikit-learn --no-cache-dir; \
    fi && \
    /opt/conda/bin/conda install torchserve torch-model-archiver torch-workflow-archiver -c pytorch -y && \
    /opt/conda/bin/conda clean -yapf
# Download Hadoop 3.3.1 and unpack it to /opt/hadoop/hadoop-3.3.1.
# - The tarball is fetched from archive.apache.org, which retains old
#   releases; the dlcdn.apache.org mirror only carries current ones.
# - Steps are chained with `&&` so a failed download or extraction aborts the
#   build instead of being masked by the exit status of the final `rm`.
# - The tarball is removed in the same layer so it is not stored in the image.
RUN mkdir -p /opt/hadoop && \
    wget -O /tmp/hadoop-3.3.1.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz && \
    tar -zxf /tmp/hadoop-3.3.1.tar.gz -C /opt/hadoop && \
    rm /tmp/hadoop-3.3.1.tar.gz
# Install additional Python packages with the conda environment's pip
# (one per line, alphabetical), then drop unused apt packages and the apt
# archive cache.
# `-y` is passed to `apt-get purge` because the build is non-interactive:
# without it apt aborts at the confirmation prompt as soon as there is
# anything to auto-remove.
RUN /opt/conda/bin/pip install --no-cache-dir \
        captum \
        grpcio \
        grpcio-tools \
        ipython \
        pandas \
        protobuf \
        pytest \
        remote-pdb \
        tensorboard \
        tqdm \
    && apt-get purge -y --auto-remove \
    && apt-get clean
# Environment for the base image and every stage derived from it:
#   PATH            - conda and Hadoop binaries are searched before the inherited PATH.
#   JAVA_HOME       - the OpenJDK 11 installation directory.
#   LIBRARY_PATH    - link-time library search directories.
#   LD_LIBRARY_PATH - torch's bundled libraries and conda's lib directory.
#                     This replaces (does not extend) any value inherited from
#                     the base image.
ENV PATH=/opt/conda/bin:/opt/hadoop/hadoop-3.3.1/bin/:$PATH \
    JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64/ \
    LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:/usr/lib" \
    LD_LIBRARY_PATH="/opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch/lib/:/opt/conda/lib/"
# The two stages below exist so that `FROM ${DEVICE}-builder-base` can pick
# the matching one by name.

# CPU variant: the base stage, unchanged.
FROM base AS cpu-builder-base

# CUDA variant: the base stage with USE_CUDA set and the CUDA stub libraries
# appended to the link-time search path.
FROM base AS cuda-builder-base
ARG DEVICE
ENV USE_CUDA=1 \
    LIBRARY_PATH="${LIBRARY_PATH}:/usr/local/cuda/lib64/stubs/"
# Builder stage: the device-specific builder base plus a Rust toolchain.
FROM ${DEVICE}-builder-base AS builder
# rustup and cargo are installed under /rust and /cargo respectively.
ENV RUSTUP_HOME=/rust
ENV CARGO_HOME=/cargo
ENV PATH=/cargo/bin:/rust/bin:/opt/conda/bin:$PATH
# Install the stable Rust toolchain.
# The installer script is downloaded to a file and then executed, rather than
# piped into `sh`: the default /bin/sh has no pipefail, so a failed download
# in a `curl | sh` pipeline would not fail the build. The TLS flags are the
# ones the rustup installation instructions recommend.
RUN curl --proto '=https' --tlsv1.2 -sSf -o /tmp/rustup-init.sh https://sh.rustup.rs && \
    sh /tmp/rustup-init.sh --default-toolchain stable -y --profile default --no-modify-path && \
    rm /tmp/rustup-init.sh
# Build the project wheel from the build context into /root/dist.
FROM builder AS persia-builder
WORKDIR /workspace
COPY . /workspace
# WORKDIR already makes /workspace the current directory, so no `cd` is
# needed. The source tree is removed once the wheel has been written to
# /root/dist. `--no-cache-dir` keeps pip's download cache out of the layer.
RUN pip3 install --no-cache-dir colorama setuptools setuptools-rust setuptools_scm \
    && python setup.py bdist_wheel --dist-dir=/root/dist && rm -rf /workspace
# Build bagua distributed training framework manually
# RUN if [ "${DEVICE}" = "cuda" ]; then \
# rm -rf /etc/apt/sources.list.d; \
# apt-get update; \
# DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends zlib1g-dev libhwloc-dev; \
# git clone https://github.com/BaguaSys/bagua.git; \
# cd bagua; \
# pip3 install cmake setuptools-rust colorama tqdm wheel --no-cache-dir; \
# git submodule update --init --recursive; \
# python setup.py bdist_wheel --dist-dir=/root/dist; \
# cd ..; \
# rm -rf bagua; \
# /opt/conda/bin/conda clean -yapf; \
# fi
ARG DEVICE
# Runtime image: base stage + wheels built in persia-builder + nats-server
# + prepared example data.
FROM base AS runtime
# nats-server release tag to download. The default matches the previously
# hard-coded version; override with --build-arg NATS_SERVER_VERSION=<tag>.
ARG NATS_SERVER_VERSION=v2.6.6
# Install the persia-runtime and bagua (Optional for cpu-runtime)
COPY --from=persia-builder /root/dist .
# The wheels are deleted after installation; `--no-cache-dir` keeps pip's
# cache out of the layer as well.
RUN pip3 install --no-cache-dir *.whl && rm -rf *.whl
# Install nats server
# Download, extract, copy the binary to /usr/bin and clean up in one layer.
RUN wget https://github.com/nats-io/nats-server/releases/download/${NATS_SERVER_VERSION}/nats-server-${NATS_SERVER_VERSION}-linux-amd64.tar.gz && \
    tar -zxvf nats-server-${NATS_SERVER_VERSION}-linux-amd64.tar.gz && \
    cp nats-server-${NATS_SERVER_VERSION}-linux-amd64/nats-server /usr/bin/ && \
    rm -rf nats-server-${NATS_SERVER_VERSION}-linux-amd64/ && \
    rm nats-server-${NATS_SERVER_VERSION}-linux-amd64.tar.gz
# Prepare examples
# COPY creates the destination directory, so no separate `mkdir` layer is needed.
COPY examples /home/PERSIA/examples
# `cd` is used here instead of WORKDIR so that the image's default working
# directory is left unchanged.
RUN cd /home/PERSIA/examples/src/adult-income/data/ && ./prepare_data.sh