Dockerfile.spark
FROM centos:7
# Build ARGs
# Pinning Python 3.8 to stay in sync with the local dev environment:
# the driver and workers cannot run different Python versions when the driver is the local dev machine rather than a spark-submit container
#ARG PYTHON_VERSION=3.8.10
ARG PYTHON_VERSION=3.8.16
ARG HADOOP_VERSION=3.3.1
ARG SPARK_VERSION=3.2.1
ARG PROJECT_LOG_DIR=/logs
RUN yum -y update && yum clean all
# sqlite-devel is a prerequisite for the coverage Python lib, used by the pytest-cov plugin
RUN yum -y install wget gcc openssl-devel bzip2-devel libffi libffi-devel zlib-devel sqlite-devel
RUN yum -y groupinstall "Development Tools"
# Building Python 3.x
WORKDIR /usr/src
RUN wget --quiet https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
&& tar xzf Python-${PYTHON_VERSION}.tgz
WORKDIR /usr/src/Python-${PYTHON_VERSION}
RUN ./configure --enable-optimizations \
&& make altinstall \
&& ln -sf /usr/local/bin/python`echo ${PYTHON_VERSION} | awk -F. '{short_version=$1 FS $2; print short_version}'` /usr/bin/python3 \
&& echo "Installed $(python3 --version)"
# Ensure Python STDOUT gets sent to container logs
ENV PYTHONUNBUFFERED=1
# Install Java 1.8.x
RUN yum -y install java-1.8.0-openjdk
ENV JAVA_HOME=/usr/lib/jvm/jre
# Install Hadoop and Spark
WORKDIR /usr/local
RUN wget --quiet https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz \
&& tar xzf hadoop-${HADOOP_VERSION}.tar.gz \
&& ln -sfn /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop \
&& wget --quiet https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz \
&& tar xzf spark-${SPARK_VERSION}-bin-without-hadoop.tgz \
&& ln -sfn /usr/local/spark-${SPARK_VERSION}-bin-without-hadoop /usr/local/spark \
&& echo "Installed $(/usr/local/hadoop/bin/hadoop version)"
ENV HADOOP_HOME=/usr/local/hadoop
ENV SPARK_HOME=/usr/local/spark
# A Dockerfile ENV cannot be set from a command's output [i.e. ENV SPARK_DIST_CLASSPATH=$(${HADOOP_HOME}/bin/hadoop classpath) does not work], so the Hadoop classpath is spelled out the long way
ENV SPARK_DIST_CLASSPATH="$HADOOP_HOME/etc/hadoop/*:$HADOOP_HOME/share/hadoop/common/lib/*:$HADOOP_HOME/share/hadoop/common/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/hdfs/lib/*:$HADOOP_HOME/share/hadoop/yarn/lib/*:$HADOOP_HOME/share/hadoop/yarn/*:$HADOOP_HOME/share/hadoop/mapreduce/lib/*:$HADOOP_HOME/share/hadoop/mapreduce/*:$HADOOP_HOME/share/hadoop/tools/lib/*"
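# An alternative sketch (not used here): Spark's "Hadoop free" builds can also pick up the
# Hadoop classpath at runtime from conf/spark-env.sh, which does allow command substitution:
#   RUN cp $SPARK_HOME/conf/spark-env.sh.template $SPARK_HOME/conf/spark-env.sh \
#    && echo 'export SPARK_DIST_CLASSPATH=$(hadoop classpath)' >> $SPARK_HOME/conf/spark-env.sh
# The hard-coded ENV above keeps the classpath visible to every process without sourcing spark-env.sh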
ENV PATH=${SPARK_HOME}/bin:${HADOOP_HOME}/bin:${JAVA_HOME}/bin:${PATH}
RUN echo "Installed Spark" && echo "$(${SPARK_HOME}/bin/pyspark --version)"
# Config for starting up the Spark History Server
# This allows a web entrypoint to see past job runs, and bring up their Spark UI
# It allows job-run event logs to be monitored (e.g. at /tmp/spark-events)
# NOTE: The specified event-log dir does not exist inside the image; a host dir is assumed to be bind-mounted at /project
# so that all containers that do the same can share the event logs.
RUN cp $SPARK_HOME/conf/spark-defaults.conf.template $SPARK_HOME/conf/spark-defaults.conf \
&& sed -i "s|^# spark.eventLog.dir.*$|spark.eventLog.dir file:///project/$PROJECT_LOG_DIR/spark-events|g" $SPARK_HOME/conf/spark-defaults.conf \
&& sed -i '/spark.eventLog.enabled/s/^# //g' $SPARK_HOME/conf/spark-defaults.conf \
&& echo "spark.history.fs.logDirectory file:///project/$PROJECT_LOG_DIR/spark-events" >> $SPARK_HOME/conf/spark-defaults.conf
# Bake project dependencies into any container built from this spark-base image that will run project Python code (and therefore needs these dependencies)
# NOTE: Any host-machine dependencies installed in a virtual environment (assumed to live under ./.venv) must be excluded
WORKDIR /project
COPY requirements/ /project/requirements
RUN rm -rf /project/.venv
RUN python3 -m pip install -r requirements/requirements.txt
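# A complementary .dockerignore sketch (assumption: kept next to this Dockerfile) that keeps the
# host virtual environment out of the build context in the first place:
#   .venv/
#   **/__pycache__/
# The 'rm -rf /project/.venv' above then acts only as a belt-and-braces guard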
# Leave the working directory here so entrypoints and commands can assume they run relative to this directory
WORKDIR ${SPARK_HOME}