Upgrade to Iceberg 1.4 & Spark 3.5 (#115)
* Start upgrade to iceberg 1.4

* And Spark 3.5

* And update sbt build too.

* Update spark testing base version to match.

* Drop the simple word2vec MLlib example; Spark no longer depends on BLAS transitively.

* Drop the BLAS import.

* Fix cleanup

* Bump to 1.4.5 to try and avoid the issue with metadata.

* Go 1.4.7
holdenk authored Oct 14, 2023
1 parent 9d5395c commit 61c0b31
Showing 3 changed files with 6 additions and 41 deletions.
4 changes: 2 additions & 2 deletions build.sbt
@@ -73,8 +73,8 @@ lazy val core = (project in file("core")) // regular scala code with @native methods
 javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-Djna.nosys=true"),
 Test / javaOptions ++= specialOptions,
 // 2.4.5 is the highest version we have with the old spark-testing-base deps
-sparkVersion := System.getProperty("sparkVersion", "3.3.0"),
-sparkTestingVersion := "1.4.0",
+sparkVersion := System.getProperty("sparkVersion", "3.5.0"),
+sparkTestingVersion := "1.4.7",
 // additional libraries
 libraryDependencies ++= Seq(
 "org.apache.spark" %% "spark-core" % sparkVersion.value,
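The hunk ends before the spark-testing-base dependency that consumes these two settings, but spark-testing-base releases are versioned as <Spark version>_<library version>, so after this change the resolved artifact version would be 3.5.0_1.4.7. A minimal sketch of how such a dependency line is typically wired up from the two settings (the repository's exact entry sits below the visible part of the diff):

// Sketch only: how sparkVersion and sparkTestingVersion usually combine
// into the spark-testing-base coordinate; not copied from this build.sbt.
libraryDependencies ++= Seq(
  "com.holdenkarau" %% "spark-testing-base" %
    s"${sparkVersion.value}_${sparkTestingVersion.value}" % Test
)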
37 changes: 0 additions & 37 deletions (GoldilocksMLlib, Scala source)
@@ -14,7 +14,6 @@ import org.apache.spark.mllib.linalg.{Vector => SparkVector}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.rdd.RDD
 
-import com.github.fommil.netlib.BLAS.{getInstance => blas}
 import com.highperformancespark.examples.dataframe._
 //end::imports[]

@@ -97,42 +96,6 @@ object GoldilocksMLlib {
   }
   //end::trainScaler[]
 
-  //tag::word2vecSimple[]
-  def word2vec(sc: SparkContext, rdd: RDD[String]): RDD[SparkVector] = {
-    // Tokenize our data
-    val tokenized = rdd.map(_.split(" ").toIterable)
-    // Construct our word2vec model
-    val wv = new Word2Vec()
-    val wvm = wv.fit(tokenized)
-    val wvmb = sc.broadcast(wvm)
-    // WVM can now transform single words
-    println(wvm.transform("panda"))
-    // Vector size is 100 - we use this to build a transformer on top of WVM that
-    // works on sentences.
-    val vectorSize = 100
-    // The transform function works on a per-word basis, but we have
-    // sentences as input.
-    tokenized.map{words =>
-      // If there is nothing in the sentence output a null vector
-      if (words.isEmpty) {
-        Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double])
-      } else {
-        // If there are sentences construct a running sum of the
-        // vectors for each word
-        val sum = Array[Double](vectorSize)
-        words.foreach { word =>
-          blas.daxpy(
-            vectorSize, 1.0, wvmb.value.transform(word).toArray, 1, sum, 1)
-        }
-        // Then scale it by the number of words
-        blas.dscal(sum.length, 1.0 / words.size, sum, 1)
-        // And wrap it in a Spark vector
-        Vectors.dense(sum)
-      }
-    }
-  }
-  //end::word2vecSimple[]
-
   //tag::hashingTFPreserve[]
   def toVectorPerserving(rdd: RDD[RawPanda]): RDD[(RawPanda, SparkVector)] = {
     val ht = new HashingTF()
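The removed example used netlib BLAS (daxpy/dscal) to average word vectors, and Spark 3.5 no longer brings com.github.fommil.netlib in transitively. For readers who still want an equivalent sentence-embedding helper without re-adding that dependency, here is a minimal sketch, not part of this commit, that does the same averaging with plain Scala loops; the object and method names are illustrative only, and it assumes (as the original did) that every word is in the model's vocabulary:

import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors}

object Word2VecAverageSketch {
  // Average the word vectors of one tokenized sentence without BLAS.
  def sentenceVector(
      words: Iterable[String],
      wvm: Word2VecModel,
      vectorSize: Int = 100): SparkVector = {
    if (words.isEmpty) {
      // Nothing in the sentence: return an all-zero sparse vector.
      Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double])
    } else {
      // Running sum of the per-word vectors.
      val sum = new Array[Double](vectorSize)
      words.foreach { word =>
        val v = wvm.transform(word).toArray
        var i = 0
        while (i < vectorSize) { sum(i) += v(i); i += 1 }
      }
      // Scale by the number of words to get the mean.
      var i = 0
      while (i < vectorSize) { sum(i) /= words.size; i += 1 }
      Vectors.dense(sum)
    }
  }
}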
6 changes: 4 additions & 2 deletions env_setup.sh
@@ -3,12 +3,12 @@
 
 # Download Spark and iceberg if not present
 SPARK_MAJOR=${SPARK_MAJOR:-"3.4"}
-SPARK_VERSION=${SPARK_VERSION:-"3.4.1"}
+SPARK_VERSION=${SPARK_VERSION:-"3.5.0"}
 SCALA_VERSION=${SCALA_VERSION:-"2.12"}
 HADOOP_VERSION="3"
 SPARK_PATH="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
 SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz"
-ICEBERG_VERSION=${ICEBERG_VERSION:-"1.3.1"}
+ICEBERG_VERSION=${ICEBERG_VERSION:-"1.4.0"}
 if [ ! -f "${SPARK_FILE}" ]; then
   wget "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" &
 fi
@@ -26,6 +26,8 @@ fi
 export SPARK_HOME="${SPARK_PATH}"
 
 if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then
+  # Delete the old JAR first.
+  rm "${SPARK_PATH}/jars/iceberg-spark-runtime*.jar" || echo "No old version to delete."
   cp "${ICEBERG_FILE}" "${SPARK_PATH}/jars/${ICEBERG_FILE}"
 fi

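One shell detail worth noting if you adapt this cleanup step: because the glob sits inside double quotes, rm is handed the literal string iceberg-spark-runtime*.jar rather than an expanded list of files, so the || echo branch runs whenever no file has that exact literal name. A small variant (not part of the commit) that lets the shell expand the pattern and stays quiet when nothing matches:

# Keep the variable quoted but the glob unquoted so it can expand;
# -f makes rm exit cleanly when no old runtime JAR is present.
rm -f "${SPARK_PATH}"/jars/iceberg-spark-runtime*.jar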
