Switch to Spark 4 preview 2 (#136)
* Use Scala 2.13.13, a Spark 4 snapshot, and a spark-testing-base snapshot. Add a local Maven resolver for snapshots

Add an upsert example

Update sbt version and plugins

Update for Spark 4 / Scala 2.13

* Drop JDK 11 from the build matrix

* We need JDK 17
holdenk authored Nov 3, 2024
1 parent c1adb45 commit da5958d
Showing 5 changed files with 30 additions and 13 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/ci.yml
@@ -9,7 +9,6 @@ jobs:
       matrix:
         include:
           - java: 17
-          - java: 11
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
@@ -179,6 +178,12 @@ jobs:
           path: |
             data/fetched/*
           key: data-fetched
+      - name: Setup JDK
+        uses: actions/setup-java@v3
+        with:
+          distribution: temurin
+          java-version: 17
+          cache: sbt
       - name: Run PySpark examples
         run:
           ./run_pyspark_examples.sh
16 changes: 9 additions & 7 deletions build.sbt
@@ -1,5 +1,3 @@
-scalaVersion := "2.13.8"
-
 lazy val root = (project in file("."))
   .aggregate(core, native)
 
@@ -16,6 +14,7 @@ organization := "com.highperformancespark"
 
 lazy val V = _root_.scalafix.sbt.BuildInfo
 
+scalaVersion := "2.13.13"
 addCompilerPlugin(scalafixSemanticdb)
 scalacOptions ++= List(
   "-Yrangepos",
@@ -38,7 +37,8 @@ resolvers ++= Seq(
   "Typesafe repository" at "https://repo.typesafe.com/typesafe/releases/",
   "Second Typesafe repo" at "https://repo.typesafe.com/typesafe/maven-releases/",
   "Mesosphere Public Repository" at "https://downloads.mesosphere.io/maven",
-  Resolver.sonatypeRepo("public")
+  Resolver.sonatypeRepo("public"),
+  Resolver.mavenLocal
 )
 
 licenses := Seq("Apache License 2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html"))
@@ -67,17 +67,18 @@ val sparkTestingVersion = settingKey[String]("Spark testing base version without
 lazy val core = (project in file("core")) // regular scala code with @native methods
   .dependsOn(native % Runtime)
   .settings(javah / target := (native / nativeCompile / sourceDirectory).value / "include")
+  .settings(scalaVersion := "2.13.13")
   .settings(sbtJniCoreScope := Compile)
   .settings(
-    scalaVersion := "2.13.8",
-    javacOptions ++= Seq("-source", "1.8", "-target", "1.8"),
+    javacOptions ++= Seq("-source", "17", "-target", "17"),
     parallelExecution in Test := false,
     fork := true,
     javaOptions ++= Seq("-Xms4048M", "-Xmx4048M", "-Djna.nosys=true"),
     Test / javaOptions ++= specialOptions,
-    // 2.4.5 is the highest version we have with the old spark-testing-base deps
-    sparkVersion := System.getProperty("sparkVersion", "3.5.1"),
-    sparkTestingVersion := "1.5.2",
+    sparkVersion := System.getProperty("sparkVersion", "4.0.0-preview2"),
+    sparkTestingVersion := "2.0.1",
     // additional libraries
     libraryDependencies ++= Seq(
       "org.apache.spark" %% "spark-core" % sparkVersion.value % Provided,
@@ -95,12 +96,13 @@ lazy val core = (project in file("core")) // regular scala code with @native met
       "net.java.dev.jna" % "jna" % "5.12.1"),
     scalacOptions ++= Seq("-deprecation", "-unchecked"),
     pomIncludeRepository := { x => false },
+    resolvers += Resolver.mavenLocal
   )
 
 // JNI Magic!
 lazy val native = (project in file("native")) // native code and build script
   .settings(nativeCompile / sourceDirectory := sourceDirectory.value)
-  .settings(scalaVersion := "2.13.8")
+  .settings(scalaVersion := "2.13.13")
   .enablePlugins(JniNative) // JniNative needs to be explicitly enabled
 
 //tag::xmlVersionConflict[]
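For context: `Resolver.mavenLocal` points sbt at `~/.m2/repository`, which is presumably where the locally built Spark 4 and spark-testing-base snapshots were installed (per the commit message's "local maven resolver for snapshots"). A minimal sketch of consuming such snapshots, assuming they were first installed into the local repo (e.g. with `sbt publishM2` in each checkout); the version strings below are illustrative, not necessarily the exact coordinates this commit resolves:

```scala
// build.sbt sketch: resolve snapshot artifacts from the local Maven repository.
// Assumes the artifacts were previously installed into ~/.m2/repository;
// versions are illustrative.
resolvers += Resolver.mavenLocal // searches ~/.m2/repository

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % "4.0.0-preview2" % Provided,
  // spark-testing-base artifacts follow a "<sparkVersion>_<baseVersion>" scheme
  "com.holdenkarau" %% "spark-testing-base" % "4.0.0-preview2_2.0.1" % Test
)
```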
@@ -88,6 +88,16 @@ case class LoadSave(sc: SparkContext, session: SparkSession) {
   }
   //end::saveAppend[]
 
+  def upsertPandas(input: DataFrame): Unit = {
+    //tag::upsert[]
+    input.mergeInto("pandaInfo", $"source.id" === $"target.id")
+      .whenMatched() // Note you can override the general match condition above if desired
+      .updateAll()
+      .whenNotMatched()
+      .insertAll()
+    //end::upsert[]
+  }
+
   def createJDBC() = {
     session.read.jdbc("jdbc:dialect:serverName;user=user;password=pass",
       "table", new Properties)
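For context, `mergeInto` is new in Spark 4: it returns a `MergeIntoWriter`, so the chain above only describes the upsert, and nothing runs until `merge()` is called. A minimal sketch of exercising the new method, assuming a `pandaInfo` target table in a catalog that supports MERGE (for example a Delta or Iceberg table) and an illustrative `updates` DataFrame, reusing the join condition from the example above:

```scala
import org.apache.spark.sql.SparkSession

object UpsertExample {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder().appName("upsert-example").getOrCreate()
    import session.implicits._

    // Illustrative source rows: ids already present in pandaInfo are updated,
    // everything else is inserted.
    val updates = Seq((1L, "Ada"), (2L, "Bao")).toDF("id", "name")

    updates.mergeInto("pandaInfo", $"source.id" === $"target.id")
      .whenMatched()
      .updateAll()
      .whenNotMatched()
      .insertAll()
      .merge() // without this call the merge never executes
  }
}
```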
@@ -38,7 +38,7 @@ class SimpleNaiveBayes(val uid: String)
   // Note this estimator assumes the labels start at 0 and go to numClasses
   val numClasses = getNumClasses(ds)
   // Get the number of features by peeking at the first row
-  val numFeatures: Integer = ds.select(col($(featuresCol))).head
+  val numFeatures: Integer = ds.select(col($(featuresCol))).head()
     .get(0).asInstanceOf[Vector].size
   // Determine the number of records for each class
   val groupedByLabel = ds.select(col($(labelCol)).as[Double]).groupByKey(x => x)
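For context, the `.head` to `.head()` change lines up with newer Scala rules: calling a Scala-defined empty-paren method without its parentheses ("auto-application") is deprecated in Scala 2.13 and rejected by Scala 3, and `Dataset.head()` is declared with parens. A minimal standalone sketch of the rule; the `Frame` class is illustrative, not a Spark type:

```scala
// Auto-application: calling an empty-paren method without writing ().
class Frame {
  def head(): Int = 42 // declared with empty parens, like Dataset.head()
}

object AutoApplicationDemo extends App {
  val f = new Frame
  // f.head         // deprecated in Scala 2.13, a hard error in Scala 3
  println(f.head()) // explicit parentheses compile cleanly on both
}
```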
8 changes: 4 additions & 4 deletions project/plugins.sbt
@@ -5,16 +5,16 @@ resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositori
 resolvers += "sonatype-snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/"
 
 
-addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1")
+addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.2")
 
 addDependencyTreePlugin
 
 //tag::scalaFix[]
-addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.10.4")
+addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.12.1")
 //end::scalaFix[]
 
 //tag::sbtJNIPlugin[]
-addSbtPlugin("com.github.sbt" %% "sbt-jni" % "1.5.4")
+addSbtPlugin("com.github.sbt" %% "sbt-jni" % "1.7.0")
 //end::sbtJNIPlugin[]
 
 //tag::xmlVersionConflict[]
@@ -24,4 +24,4 @@ ThisBuild / libraryDependencySchemes ++= Seq(
 )
 //end::xmlVersionConflict[]
 
-addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.3")
+addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.2.0")
