diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 02dfb3d..4482f81 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -135,10 +135,82 @@ jobs: scala: "2.13" with_hive: "true" with_pyspark: "true" + - spark: "3.5.1" + java: "8" + hadoop: "3.3.4" + scala: "2.12" + with_hive: "true" + with_pyspark: "true" + - spark: "3.5.1" + java: "8" + hadoop: "3.3.4" + scala: "2.13" + with_hive: "true" + with_pyspark: "true" + - spark: "3.5.1" + java: "8" + hadoop: "3.3.6" + scala: "2.12" + with_hive: "true" + with_pyspark: "true" + - spark: "3.5.1" + java: "8" + hadoop: "3.3.6" + scala: "2.13" + with_hive: "true" + with_pyspark: "true" + - spark: "3.5.1" + java: "11" + hadoop: "3.3.4" + scala: "2.12" + with_hive: "true" + with_pyspark: "true" + - spark: "3.5.1" + java: "11" + hadoop: "3.3.4" + scala: "2.13" + with_hive: "true" + with_pyspark: "true" + - spark: "3.5.1" + java: "11" + hadoop: "3.3.6" + scala: "2.12" + with_hive: "true" + with_pyspark: "true" + - spark: "3.5.1" + java: "11" + hadoop: "3.3.6" + scala: "2.13" + with_hive: "true" + with_pyspark: "true" + - spark: "3.5.1" + java: "17" + hadoop: "3.3.4" + scala: "2.12" + with_hive: "true" + with_pyspark: "true" + - spark: "3.5.1" + java: "17" + hadoop: "3.3.4" + scala: "2.13" + with_hive: "true" + with_pyspark: "true" + - spark: "3.5.1" + java: "17" + hadoop: "3.3.6" + scala: "2.12" + with_hive: "true" + with_pyspark: "true" + - spark: "3.5.1" + java: "17" + hadoop: "3.3.6" + scala: "2.13" + with_hive: "true" + with_pyspark: "true" runs-on: ubuntu-20.04 env: IMAGE_NAME: "spark-k8s" - SELF_VERSION: "v3" + SELF_VERSION: "v4" SPARK_VERSION: "${{ matrix.version.spark }}" HADOOP_VERSION: "${{ matrix.version.hadoop }}" SCALA_VERSION: "${{ matrix.version.scala }}" diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ff7de0..eaa1734 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # CHANGELOG +## v4 +- Drop support for all Spark versions less than 3.y.z +- Add Spark 3.5.1 +- Add 
Hadoop 3.3.6 +- Add support for Java 17 for Spark 3.5.1 +- Fix Ubuntu-based images to use the `jre-focal` variant instead of `jre`, which was recently upgraded to Ubuntu Jammy (v22.y.z) and causes system-level Python package installation to fail due to [PEP 668](https://issues.apache.org/jira/browse/SPARK-49068) +- Remove docker tags without self version + ## v3 - (Temporarily drop support for R due to keyserver issues) diff --git a/README.md b/README.md index e3c9202..56626d2 100644 --- a/README.md +++ b/README.md @@ -13,20 +13,21 @@ Debian: - `3.2.2` - `3.1.3` - `3.4.1` +- `3.5.1` -## Note +## Notes (R builds are temporarily suspended due to keyserver issues at current time.) -Build image for Spark 3.4.1 is Ubuntu based because openjdk is deprecated and -going forward the official Spark repository uses `eclipse-temurin:-jre` +Build image for Spark 3.4.1/3.5.1 is Ubuntu based because openjdk is deprecated and +going forward the official Spark repository uses `eclipse-temurin:-jre-focal` where slim variants of jre images are not available at the moment. All the build images with Spark before v3.4.0 are Debian based as the official Spark repository now uses `openjdk:-jre-slim-buster` as the base image for Kubernetes build. Because currently the official Dockerfiles do not pin the Debian distribution, they are incorrectly using the latest Debian `bullseye`, -which does not have support for Python 2, and its Python 3.9 do not work well +which does not have support for Python 2, and its Python 3.9 do not work well with PySpark. 
Hence some Dockerfile overrides are in-place to make sure that Spark 2 builds @@ -48,11 +49,11 @@ For quick testing of local build, you should do the following commands: ```bash export IMAGE_NAME=spark-k8s -export SELF_VERSION="v3" +export SELF_VERSION="v4" export SCALA_VERSION="2.12" -export SPARK_VERSION="3.3.0" -export HADOOP_VERSION="3.3.2" -export JAVA_VERSION="11" +export SPARK_VERSION="3.5.1" +export HADOOP_VERSION="3.3.6" +export JAVA_VERSION="17" export WITH_HIVE="true" export WITH_PYSPARK="true" bash make-distribution.sh diff --git a/build.sh b/build.sh index b1e9643..34d5f58 100755 --- a/build.sh +++ b/build.sh @@ -1,9 +1,9 @@ export IMAGE_NAME=spark-k8s -export SELF_VERSION="v3" +export SELF_VERSION="v4" export SCALA_VERSION="2.12" -export SPARK_VERSION="3.3.0" -export HADOOP_VERSION="3.3.2" -export JAVA_VERSION="11" +export SPARK_VERSION="3.5.1" +export HADOOP_VERSION="3.3.6" +export JAVA_VERSION="8" export WITH_HIVE="true" export WITH_PYSPARK="true" bash make-distribution.sh diff --git a/make-distribution.sh b/make-distribution.sh index 1f4e2e2..4e3591a 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -40,57 +40,25 @@ TERM=xterm-color ./dev/make-distribution.sh \ ${HIVE_INSTALL_FLAG:+"-Phive"} \ -DskipTests -SPARK_MAJOR_VERSION="$(echo "${SPARK_VERSION}" | cut -d '.' -f1)" -HADOOP_MAJOR_VERSION="$(echo "${HADOOP_VERSION}" | cut -d '.' 
-f1)" -HIVE_HADOOP3_HIVE_EXEC_URL=${HIVE_HADOOP3_HIVE_EXEC_URL:-https://github.com/guangie88/hive-exec-jar/releases/download/1.2.1.spark2-hadoop3/hive-exec-1.2.1.spark2.jar} - -# Replace Hive for Hadoop 3 since Hive 1.2.1 does not officially support Hadoop 3 when using Spark 2.y.z -# Note docker-image-tool.sh takes the jars from assembly/target/scala-2.*/jars -if [[ "${WITH_HIVE}" = "true" ]] && [[ "${SPARK_MAJOR_VERSION}" -eq 2 ]] && [[ "${HADOOP_MAJOR_VERSION}" -eq 3 ]]; then - HIVE_EXEC_JAR_NAME="hive-exec-1.2.1.spark2.jar" - TARGET_JAR_PATH="$(find assembly -type f -name "${HIVE_EXEC_JAR_NAME}")" - curl -LO "${HIVE_HADOOP3_HIVE_EXEC_URL}" && mv "${HIVE_EXEC_JAR_NAME}" "${TARGET_JAR_PATH}" - # Spark <= 2.4 uses ${TARGET_JAR_PATH} for Docker COPY, but Spark >= 3 uses dist/jars/ - cp "${TARGET_JAR_PATH}" "dist/jars/" -fi - SPARK_MAJOR_VERSION="$(echo "${SPARK_VERSION}" | cut -d '.' -f1)" SPARK_MINOR_VERSION="$(echo "${SPARK_VERSION}" | cut -d '.' -f2)" +HADOOP_MAJOR_VERSION="$(echo "${HADOOP_VERSION}" | cut -d '.' 
-f1)" -if [[ ${SPARK_MAJOR_VERSION} -eq 2 && ${SPARK_MINOR_VERSION} -eq 4 ]]; then # 2.4.z - # Same Dockerfiles as Spark v2.4.8, but allow override of base image to use Debian Buster - # and not using PYTHONENV and instead copies pyspark out like Spark 3.y.z - DOCKERFILE_BASE="../overrides/base/2.4.z/Dockerfile" - DOCKERFILE_PY="../overrides/python/2.4.z/Dockerfile" -else - DOCKERFILE_BASE="./resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile" - DOCKERFILE_PY="./resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/python/Dockerfile" -fi +DOCKERFILE_BASE="./resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile" +DOCKERFILE_PY="./resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/python/Dockerfile" if [[ ${SPARK_MAJOR_VERSION} -eq 3 && ${SPARK_MINOR_VERSION} -ge 4 ]]; then # >=3.4 # From Spark v3.4.0 onwards, openjdk is not the prefered base image source as it i # deprecated and taken over by eclipse-temurin. slim-buster variants are not available # on eclipse-temurin at the moment. 
- IMAGE_VARIANT="jre" + IMAGE_VARIANT="jre-focal" else IMAGE_VARIANT="jre-slim-buster" fi -# Temporarily remove R build due to keyserver issue -# DOCKERFILE_R="./resource-managers/kubernetes/docker/src/main/dockerfiles/R/Dockerfile" - SPARK_LABEL="${SPARK_VERSION}" TAG_NAME="${SELF_VERSION}_${SPARK_LABEL}_hadoop-${HADOOP_VERSION}_scala-${SCALA_VERSION}_java-${JAVA_VERSION}" -# ./bin/docker-image-tool.sh \ -# -b java_image_tag=${JAVA_VERSION}-jre-slim-buster \ -# -r "${IMAGE_NAME}" \ -# -t "${TAG_NAME}" \ -# -f "${DOCKERFILE_BASE}" \ -# -p "${DOCKERFILE_PY}" \ -# -R "${DOCKERFILE_R}" \ -# build - ./bin/docker-image-tool.sh \ -b java_image_tag=${JAVA_VERSION}-${IMAGE_VARIANT} \ -r "${IMAGE_NAME}" \ @@ -101,6 +69,5 @@ TAG_NAME="${SELF_VERSION}_${SPARK_LABEL}_hadoop-${HADOOP_VERSION}_scala-${SCALA_ docker tag "${IMAGE_NAME}/spark:${TAG_NAME}" "${IMAGE_NAME}:${TAG_NAME}" docker tag "${IMAGE_NAME}/spark-py:${TAG_NAME}" "${IMAGE_NAME}-py:${TAG_NAME}" -# docker tag "${IMAGE_NAME}/spark-r:${TAG_NAME}" "${IMAGE_NAME}-r:${TAG_NAME}" popd >/dev/null diff --git a/push-images.sh b/push-images.sh index 9d3547d..d1539a8 100755 --- a/push-images.sh +++ b/push-images.sh @@ -11,21 +11,10 @@ else fi TAG_NAME="${SELF_VERSION}_${SPARK_LABEL}_hadoop-${HADOOP_VERSION}_scala-${SCALA_VERSION}_java-${JAVA_VERSION}" -ALT_TAG_NAME="${SPARK_LABEL}_hadoop-${HADOOP_VERSION}_scala-${SCALA_VERSION}_java-${JAVA_VERSION}" docker tag "${IMAGE_NAME}:${TAG_NAME}" "${IMAGE_ORG}/${IMAGE_NAME}:${TAG_NAME}" docker push "${IMAGE_ORG}/${IMAGE_NAME}:${TAG_NAME}" -docker tag "${IMAGE_NAME}:${TAG_NAME}" "${IMAGE_ORG}/${IMAGE_NAME}:${ALT_TAG_NAME}" -docker push "${IMAGE_ORG}/${IMAGE_NAME}:${ALT_TAG_NAME}" # Python image push docker tag "${IMAGE_NAME}-py:${TAG_NAME}" "${IMAGE_ORG}/${IMAGE_NAME}-py:${TAG_NAME}" docker push "${IMAGE_ORG}/${IMAGE_NAME}-py:${TAG_NAME}" -docker tag "${IMAGE_NAME}-py:${TAG_NAME}" "${IMAGE_ORG}/${IMAGE_NAME}-py:${ALT_TAG_NAME}" -docker push "${IMAGE_ORG}/${IMAGE_NAME}-py:${ALT_TAG_NAME}" 
- -# R image push -# docker tag "${IMAGE_NAME}-r:${TAG_NAME}" "${IMAGE_ORG}/${IMAGE_NAME}-r:${TAG_NAME}" -# docker push "${IMAGE_ORG}/${IMAGE_NAME}-r:${TAG_NAME}" -# docker tag "${IMAGE_NAME}-r:${TAG_NAME}" "${IMAGE_ORG}/${IMAGE_NAME}-r:${ALT_TAG_NAME}" -# docker push "${IMAGE_ORG}/${IMAGE_NAME}-r:${ALT_TAG_NAME}" diff --git a/templates/vars.yml b/templates/vars.yml index 32720f7..e224834 100644 --- a/templates/vars.yml +++ b/templates/vars.yml @@ -1,4 +1,4 @@ -self_version: 'v3' +self_version: 'v4' versions: - spark: ['3.1.3'] @@ -19,4 +19,9 @@ versions: - spark: ['3.4.1'] java: ['8', '11'] hadoop: ['3.3.4'] + scala: ['2.12', '2.13'] + +- spark: ['3.5.1'] + java: ['8', '11', '17'] + hadoop: ['3.3.4', '3.3.6'] scala: ['2.12', '2.13'] \ No newline at end of file