# Forked from panovvv/hadoop-hive-spark-docker
# Alpine 3.11 ships Python 3.8, but PySpark only supports Python up to 3.7, so we stay on Alpine 3.10
################################-- Start Packaging --######################################
FROM alpine:3.10.4 AS env_package
# Tsinghua mirror of the Apache archive: https://mirrors.tuna.tsinghua.edu.cn/apache/
# Switch the Alpine package repositories to the Tsinghua mirror (faster inside China)
RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.tuna.tsinghua.edu.cn/g' /etc/apk/repositories
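# After the substitution, /etc/apk/repositories should look roughly like this
# (a sketch assuming the stock Alpine 3.10 repository list):
#   http://mirrors.tuna.tsinghua.edu.cn/alpine/v3.10/main
#   http://mirrors.tuna.tsinghua.edu.cn/alpine/v3.10/community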
ENV USR_PROGRAM_DIR=/usr/program
ENV USR_BIN_DIR="${USR_PROGRAM_DIR}/source_dir"
RUN mkdir -p "${USR_BIN_DIR}"
# Common settings
ENV JAVA_HOME "/usr/lib/jvm/java-1.8-openjdk"
ENV PATH="${PATH}:${JAVA_HOME}/bin"
# http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
ENV PYTHONHASHSEED=0
ENV PYTHONIOENCODING=UTF-8
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
# Hadoop
ENV HADOOP_VERSION=3.2.3
ENV HADOOP_HOME=/usr/program/hadoop
ENV HADOOP_NAMENODE_HOSTNAME=master
ENV HADOOP_PACKAGE="hadoop-${HADOOP_VERSION}.tar.gz"
ENV PATH="${PATH}:${HADOOP_HOME}/bin"
ENV PATH="${PATH}:${HADOOP_HOME}/sbin"
ENV HDFS_NAMENODE_USER="root"
ENV HDFS_DATANODE_USER="root"
ENV HDFS_SECONDARYNAMENODE_USER="root"
ENV YARN_RESOURCEMANAGER_USER="root"
ENV YARN_NODEMANAGER_USER="root"
ENV LD_LIBRARY_PATH="${HADOOP_HOME}/lib/native:${LD_LIBRARY_PATH}"
ENV HADOOP_CONF_DIR="${HADOOP_HOME}/etc/hadoop"
ENV HADOOP_LOG_DIR="${HADOOP_HOME}/logs"
# Required for S3 support: without this line you'll get a "Class org.apache.hadoop.fs.s3a.S3AFileSystem not found" exception when accessing S3 from Hadoop
ENV HADOOP_CLASSPATH="${HADOOP_HOME}/share/hadoop/tools/lib/*"
# Hive
ENV HIVE_VERSION=3.1.2
ENV HIVE_HOME=/usr/program/hive
ENV HIVE_CONF_DIR="${HIVE_HOME}/conf"
ENV HIVE_LOG_DIR="${HIVE_HOME}/logs"
ENV HIVE_PACKAGE="apache-hive-${HIVE_VERSION}-bin.tar.gz"
ENV PATH="${PATH}:${HIVE_HOME}/bin"
ENV HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:${HIVE_HOME}/lib/*"
# Hive Hudi support
ENV HIVE_AUX_JARS_PATH=/usr/program/hive/lib/hudi-hadoop-mr-bundle-0.10.0.jar,/usr/program/hive/lib/hudi-hive-sync-bundle-0.10.0.jar
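# With these bundles on the aux path, Hive can read Hudi tables; a minimal
# sketch (the table name is hypothetical, and HiveServer2 is assumed to be
# running on the master node):
#   beeline -u jdbc:hive2://master:10000 \
#     -e 'SELECT * FROM hudi_table LIMIT 10;'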
# HBase
ENV HBASE_VERSION=2.3.6
ENV HBASE_HOME=/usr/program/hbase
ENV HBASE_CONF_DIR="${HBASE_HOME}/conf/"
ENV HBASE_PACKAGE="hbase-${HBASE_VERSION}-bin.tar.gz"
ENV PATH="${PATH}:${HBASE_HOME}/bin"
ENV HBASE_LOG_DIR="${HBASE_HOME}/logs"
# Spark
ENV SPARK_VERSION=3.0.0
ENV SPARK_HADOOP_VERSION=3.2
ENV SPARK_HOME=/usr/program/spark
ENV SPARK_PACKAGE="spark-${SPARK_VERSION}-bin-hadoop${SPARK_HADOOP_VERSION}.tgz"
ENV PATH="${PATH}:${SPARK_HOME}/bin"
ENV SPARK_CONF_DIR="${SPARK_HOME}/conf"
ENV SPARK_LOG_DIR="${SPARK_HOME}/logs"
ENV SPARK_DIST_CLASSPATH="${HADOOP_CONF_DIR}:${HADOOP_HOME}/share/hadoop/tools/lib/*:${HADOOP_HOME}/share/hadoop/common/lib/*:${HADOOP_HOME}/share/hadoop/common/*:${HADOOP_HOME}/share/hadoop/hdfs:${HADOOP_HOME}/share/hadoop/hdfs/lib/*:${HADOOP_HOME}/share/hadoop/hdfs/*:${HADOOP_HOME}/share/hadoop/mapreduce/lib/*:${HADOOP_HOME}/share/hadoop/mapreduce/*:${HADOOP_HOME}/share/hadoop/yarn:${HADOOP_HOME}/share/hadoop/yarn/lib/*:${HADOOP_HOME}/share/hadoop/yarn/*"
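# With SPARK_DIST_CLASSPATH pointing at this Hadoop install, Spark jobs can run
# on YARN; a hedged smoke test once HDFS and YARN are started (the example jar
# ships with this Spark distribution):
#   spark-submit --master yarn --deploy-mode client \
#     --class org.apache.spark.examples.SparkPi \
#     ${SPARK_HOME}/examples/jars/spark-examples_2.12-3.0.0.jar 100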
# FLINK
ENV FLINK_VERSION=1.13.6
ENV FLINK_SCALA_VERSION=2.12
ENV FLINK_HOME=/usr/program/flink
ENV FLINK_PACKAGE="flink-${FLINK_VERSION}-bin-scala_${FLINK_SCALA_VERSION}.tgz"
ENV PATH="${PATH}:${FLINK_HOME}/bin"
ENV FLINK_CONF_DIR="${FLINK_HOME}/conf"
ENV FLINK_LOG_DIR="${FLINK_HOME}/logs"
# Sqoop
ENV SQOOP_VERSION=1.4.7
ENV HADOOP_SQOOP_VERSION=2.6.0
ENV SQOOP_HOME=/usr/program/sqoop
ENV SQOOP_PACKAGE="sqoop-${SQOOP_VERSION}.bin__hadoop-${HADOOP_SQOOP_VERSION}.tar.gz"
ENV PATH="${PATH}:${SQOOP_HOME}/bin"
ENV HADOOP_COMMON_HOME="${HADOOP_HOME}"
ENV HADOOP_MAPRED_HOME="${HADOOP_HOME}"
ENV SQOOP_CONF_DIR="${SQOOP_HOME}/conf"
ENV SQOOP_LOG_DIR="${SQOOP_HOME}/logs"
# Zookeeper
ENV ZK_VERSION=3.6.3
ENV ZK_HOME=/usr/program/zookeeper
ENV ZK_CONF_DIR=${ZK_HOME}/conf
ENV ZK_PACKAGE="apache-zookeeper-${ZK_VERSION}-bin.tar.gz"
# Tez
ENV TEZ_VERSION=0.9.2
ENV TEZ_HOME=/usr/program/tez
ENV TEZ_PACKAGE="apache-tez-${TEZ_VERSION}-bin.tar.gz"
ENV TEZ_CONF_DIR=${HADOOP_CONF_DIR}
ENV TEZ_JARS=${TEZ_HOME}/*:${TEZ_HOME}/lib/*
ENV HADOOP_CLASSPATH=${TEZ_CONF_DIR}:${TEZ_JARS}:${HADOOP_CLASSPATH}
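# Putting Tez on the classpath alone does not switch Hive over; the engine has
# to be selected per session or in hive-site.xml. A minimal sketch (the table
# name is a placeholder, and tez.lib.uris must point at the Tez jars on HDFS):
#   hive -e 'SET hive.execution.engine=tez; SELECT COUNT(*) FROM some_table;'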
# Hudi
ENV HUDI_VERSION=0.10.0
# Trino (formerly PrestoSQL)
ENV TRINO_VERSION=378
ENV TRINO_HOME=/usr/program/trino-server
ENV TRINO_CONF_DIR=${TRINO_HOME}/etc
ENV ZULU_JDK11_HOME=/usr/program/zulu-jdk11
# Livy
# 2022-04-26: 0.8.0 is not an official release version
ENV LIVY_VERSION=0.8.0-incubating
ENV LIVY_HOME=/usr/program/livy
ENV LIVY_CONF_DIR="${LIVY_HOME}/conf"
ENV LIVY_LOG_DIR="${LIVY_HOME}/logs"
ENV LIVY_PACKAGE="apache-livy-${LIVY_VERSION}-bin.zip"
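# Livy serves Spark over REST on port 8998 by default; a hedged example of
# opening a PySpark session once the server is running (the hostname is an
# assumption):
#   curl -s -X POST -H 'Content-Type: application/json' \
#     -d '{"kind": "pyspark"}' http://master:8998/sessions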
RUN apk add --no-cache \
'curl=~7.66' \
'unzip=~6.0' \
'openjdk8=~8' \
'bash=~5.0' \
'coreutils=~8.31' \
'procps=~3.3' \
'findutils=~4.6' \
'ncurses=~6.1' \
'g++=~8.3' \
'libc6-compat=~1.1' \
tcl tk expect \
&& ln -s /lib64/ld-linux-x86-64.so.2 /lib/ld-linux-x86-64.so.2
# https://github.com/hadolint/hadolint/wiki/DL4006
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
# For OpenRC: create an anonymous volume so OpenRC doesn't fail with read-only file system errors on /sys/fs/cgroup
VOLUME [ "/sys/fs/cgroup" ]
# Set the default root password for the Alpine image
ENV ROOT_PWD=123456
# Add SSH; see https://blog.csdn.net/Gekkoou/article/details/90430603
RUN apk update && \
apk add openssh-server openssh-client openrc tzdata && \
cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
rc-update add sshd && \
mkdir -p /run/openrc && touch /run/openrc/softlevel && \
openrc && \
rc-status && \
sed -i "s/#Port 22/Port 22/g" /etc/ssh/sshd_config && \
sed -i "s/#PermitRootLogin.*/PermitRootLogin yes/g" /etc/ssh/sshd_config && \
mkdir -p /root/.ssh && chmod 700 /root/.ssh/ && \
ssh-keygen -t rsa -N '' -f /root/.ssh/id_rsa && \
service sshd restart && \
echo "root:${ROOT_PWD}" | chpasswd && \
apk del tzdata && \
rm -rf /var/cache/apk/*
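# For passwordless SSH between cluster containers, the generated public key
# still has to be authorized; a sketch of what a startup script typically does
# (this repo's own scripts may handle it differently):
#   cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && \
#     chmod 600 /root/.ssh/authorized_keys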
# PySpark - comment out if you don't want it in order to save image space
RUN apk add --no-cache \
'python3=~3.7' \
'python3-dev=~3.7' \
&& ln -s /usr/bin/python3 /usr/bin/python \
&& rm -rf /var/cache/apk/*
# The original CRAN repo is: http://cran.us.r-project.org
# Chinese mirror: http://mirrors.tuna.tsinghua.edu.cn/CRAN
# SparkR - comment out if you don't want it in order to save image space
#RUN apk add --no-cache \
# 'R=~3.6' \
# 'R-dev=~3.6' \
# 'libc-dev=~0.7' \
# && R -e 'install.packages("knitr", repos = "http://mirrors.tuna.tsinghua.edu.cn/CRAN")'
# # && R -e 'install.packages("rmarkdown", repos = "http://cran.us.r-project.org")'
# Multi-tail for logging: merges multiple log files into one stream
COPY scripts/ /scripts
RUN apk add 'linux-headers=~4.19' \
&& gcc /scripts/watchdir.c -o /scripts/watchdir \
&& chmod +x /scripts/*
################################-- End Packaging --######################################