From 947a1ab61ad92f66fbf858233dfd10b9be0e132a Mon Sep 17 00:00:00 2001 From: Fan Ting Wei Date: Mon, 11 Sep 2023 11:15:31 +0800 Subject: [PATCH] feat(build): add Spark v3.4.1 (#40) * update to 3.4.1 --- .github/workflows/ci.yml | 24 ++++++++++++++++++++++++ CHANGELOG.md | 3 ++- README.md | 16 +++++++++++----- make-distribution.sh | 11 ++++++++++- templates/vars.yml | 5 +++++ 5 files changed, 52 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 92141b5..02dfb3d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -111,6 +111,30 @@ jobs: scala: "2.13" with_hive: "true" with_pyspark: "true" + - spark: "3.4.1" + java: "8" + hadoop: "3.3.4" + scala: "2.12" + with_hive: "true" + with_pyspark: "true" + - spark: "3.4.1" + java: "8" + hadoop: "3.3.4" + scala: "2.13" + with_hive: "true" + with_pyspark: "true" + - spark: "3.4.1" + java: "11" + hadoop: "3.3.4" + scala: "2.12" + with_hive: "true" + with_pyspark: "true" + - spark: "3.4.1" + java: "11" + hadoop: "3.3.4" + scala: "2.13" + with_hive: "true" + with_pyspark: "true" runs-on: ubuntu-20.04 env: IMAGE_NAME: "spark-k8s" diff --git a/CHANGELOG.md b/CHANGELOG.md index 235ef00..2ff7de0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,8 +3,9 @@ ## v3 - (Temporarily drop support for R due to keyserver issues) -- Only supports for for 3.1.3, 3.2.2, 3.3.0 (dropped 2.4.8). +- Only supports 3.1.3, 3.2.2, 3.3.0, 3.4.1 (dropped 2.4.8). - Supports both Java 8 and 11 for Spark 3 builds. +- Add Ubuntu-based image following the migration to eclipse-temurin as the JRE image source. ## v2 diff --git a/README.md b/README.md index de0891f..e3c9202 100644 --- a/README.md +++ b/README.md @@ -12,16 +12,22 @@ Debian: - `3.3.0` - `3.2.2` - `3.1.3` +- `3.4.1` ## Note (R builds are temporarily suspended due to keyserver issues at current time.) 
-All the build images here are Debian based as the official Spark repository now -uses `openjdk:-jdk-slim-buster` as the base image for Kubernetes build. -Because currently the official Dockerfiles do not pin the Debian distribution, -they are incorrectly using the latest Debian `bullseye`, which does not have -support for Python 2, and its Python 3.9 do not work well with PySpark. +Build image for Spark 3.4.1 is Ubuntu based because openjdk is deprecated and +going forward the official Spark repository uses `eclipse-temurin:-jre` +where slim variants of jre images are not available at the moment. + +All the build images with Spark before v3.4.0 are Debian based as the official +Spark repository now uses `openjdk:-jre-slim-buster` as the base image +for Kubernetes build. Because currently the official Dockerfiles do not pin +the Debian distribution, they are incorrectly using the latest Debian `bullseye`, +which does not have support for Python 2, and its Python 3.9 does not work well +with PySpark. Hence some Dockerfile overrides are in-place to make sure that Spark 2 builds can still work. diff --git a/make-distribution.sh b/make-distribution.sh index d7cd5c6..1f4e2e2 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -67,6 +67,15 @@ else DOCKERFILE_PY="./resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/python/Dockerfile" fi +if [[ ${SPARK_MAJOR_VERSION} -eq 3 && ${SPARK_MINOR_VERSION} -ge 4 ]]; then # >=3.4 + # From Spark v3.4.0 onwards, openjdk is not the preferred base image source as it is + # deprecated and taken over by eclipse-temurin. slim-buster variants are not available + # on eclipse-temurin at the moment. 
+ IMAGE_VARIANT="jre" +else + IMAGE_VARIANT="jre-slim-buster" +fi + # Temporarily remove R build due to keyserver issue # DOCKERFILE_R="./resource-managers/kubernetes/docker/src/main/dockerfiles/R/Dockerfile" @@ -83,7 +92,7 @@ TAG_NAME="${SELF_VERSION}_${SPARK_LABEL}_hadoop-${HADOOP_VERSION}_scala-${SCALA_ # build ./bin/docker-image-tool.sh \ - -b java_image_tag=${JAVA_VERSION}-jre-slim-buster \ + -b java_image_tag=${JAVA_VERSION}-${IMAGE_VARIANT} \ -r "${IMAGE_NAME}" \ -t "${TAG_NAME}" \ -f "${DOCKERFILE_BASE}" \ diff --git a/templates/vars.yml b/templates/vars.yml index 1c8e674..32720f7 100644 --- a/templates/vars.yml +++ b/templates/vars.yml @@ -15,3 +15,8 @@ versions: java: ['8', '11'] hadoop: ['3.3.2'] scala: ['2.12', '2.13'] + +- spark: ['3.4.1'] + java: ['8', '11'] + hadoop: ['3.3.4'] + scala: ['2.12', '2.13'] \ No newline at end of file