Upgrade iceberg14 & spark 3.5 #115

Merged 9 commits on Oct 14, 2023
4 changes: 2 additions & 2 deletions build.sbt
@@ -73,8 +73,8 @@ lazy val core = (project in file("core")) // regular scala code with @native methods
javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-Djna.nosys=true"),
Test / javaOptions ++= specialOptions,
// 2.4.5 is the highest version we have with the old spark-testing-base deps
-sparkVersion := System.getProperty("sparkVersion", "3.3.0"),
-sparkTestingVersion := "1.4.0",
+sparkVersion := System.getProperty("sparkVersion", "3.5.0"),
+sparkTestingVersion := "1.4.7",
// additional libraries
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % sparkVersion.value,
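The two bumped settings above drive the rest of the build: sparkVersion is read from a JVM system property at sbt load time, so the Spark release can be switched per invocation (for example, sbt -DsparkVersion=3.5.1 test) without editing the build. A minimal sketch of the pattern follows; the settingKey declarations and the spark-testing-base coordinate are assumptions based on that library's usual <sparkVersion>_<version> artifact naming, not lines from this PR.

// Minimal sketch, assuming sparkVersion and sparkTestingVersion are
// declared as settingKeys in this build (names match the diff; the
// rest is illustrative).
lazy val sparkVersion = settingKey[String]("Spark version to build against")
lazy val sparkTestingVersion = settingKey[String]("spark-testing-base release suffix")

sparkVersion := System.getProperty("sparkVersion", "3.5.0")
sparkTestingVersion := "1.4.7"

// spark-testing-base artifacts are versioned "<sparkVersion>_<suffix>",
// so the two settings combine into a single dependency coordinate:
libraryDependencies += "com.holdenkarau" %% "spark-testing-base" %
  s"${sparkVersion.value}_${sparkTestingVersion.value}" % "test"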
@@ -14,7 +14,6 @@ import org.apache.spark.mllib.linalg.{Vector => SparkVector}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

-import com.github.fommil.netlib.BLAS.{getInstance => blas}
import com.highperformancespark.examples.dataframe._
//end::imports[]

@@ -97,42 +96,6 @@ object GoldilocksMLlib {
}
//end::trainScaler[]

-  //tag::word2vecSimple[]
-  def word2vec(sc: SparkContext, rdd: RDD[String]): RDD[SparkVector] = {
-    // Tokenize our data
-    val tokenized = rdd.map(_.split(" ").toIterable)
-    // Construct our word2vec model
-    val wv = new Word2Vec()
-    val wvm = wv.fit(tokenized)
-    val wvmb = sc.broadcast(wvm)
-    // WVM can now transform single words
-    println(wvm.transform("panda"))
-    // Vector size is 100 - we use this to build a transformer on top of WVM that
-    // works on sentences.
-    val vectorSize = 100
-    // The transform function works on a per-word basis, but we have
-    // sentences as input.
-    tokenized.map{words =>
-      // If there is nothing in the sentence output a null vector
-      if (words.isEmpty) {
-        Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double])
-      } else {
-        // If there are sentences construct a running sum of the
-        // vectors for each word
-        val sum = Array[Double](vectorSize)
-        words.foreach { word =>
-          blas.daxpy(
-            vectorSize, 1.0, wvmb.value.transform(word).toArray, 1, sum, 1)
-        }
-        // Then scale it by the number of words
-        blas.dscal(sum.length, 1.0 / words.size, sum, 1)
-        // And wrap it in a Spark vector
-        Vectors.dense(sum)
-      }
-    }
-  }
-  //end::word2vecSimple[]
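The deleted block built sentence vectors by summing per-word Word2Vec vectors with netlib-java's daxpy and scaling the result with dscal, which is what made the com.github.fommil.netlib import (also removed above) necessary. The same averaging needs nothing beyond plain Scala arrays. A minimal sketch of the equivalent logic, written as it might sit inside GoldilocksMLlib; the method name sentenceVector is hypothetical and not part of this PR.

import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vectors, Vector => SparkVector}

// Average a sentence's word vectors into one fixed-size vector without BLAS.
def sentenceVector(wvm: Word2VecModel, words: Iterable[String],
    vectorSize: Int = 100): SparkVector = {
  if (words.isEmpty) {
    // No tokens: emit an all-zero sparse vector, as the removed code did.
    Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double])
  } else {
    // new Array[Double](n) allocates n zeros; note that Array[Double](n)
    // without `new` would instead build a one-element array containing n.
    val sum = new Array[Double](vectorSize)
    words.foreach { word =>
      // transform throws for out-of-vocabulary words, like the old code.
      val vec = wvm.transform(word).toArray
      var i = 0
      while (i < vectorSize) { sum(i) += vec(i); i += 1 }
    }
    // Scale the running sum by the word count to get the mean vector.
    var i = 0
    while (i < vectorSize) { sum(i) /= words.size; i += 1 }
    Vectors.dense(sum)
  }
}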

//tag::hashingTFPreserve[]
def toVectorPerserving(rdd: RDD[RawPanda]): RDD[(RawPanda, SparkVector)] = {
val ht = new HashingTF()
6 changes: 4 additions & 2 deletions env_setup.sh
@@ -3,12 +3,12 @@

# Download Spark and iceberg if not present
SPARK_MAJOR=${SPARK_MAJOR:-"3.4"}
-SPARK_VERSION=${SPARK_VERSION:-"3.4.1"}
+SPARK_VERSION=${SPARK_VERSION:-"3.5.0"}
SCALA_VERSION=${SCALA_VERSION:-"2.12"}
HADOOP_VERSION="3"
SPARK_PATH="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz"
-ICEBERG_VERSION=${ICEBERG_VERSION:-"1.3.1"}
+ICEBERG_VERSION=${ICEBERG_VERSION:-"1.4.0"}
if [ ! -f "${SPARK_FILE}" ]; then
wget "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" &
fi
@@ -26,6 +26,8 @@ fi
export SPARK_HOME="${SPARK_PATH}"

if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then
+# Delete the old JAR first; keep the glob outside the quotes so it expands.
+rm "${SPARK_PATH}/jars/"iceberg-spark-runtime*.jar || echo "No old version to delete."
cp "${ICEBERG_FILE}" "${SPARK_PATH}/jars/${ICEBERG_FILE}"
fi
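With the new runtime jar in place, a quick smoke test from spark-shell confirms that Spark loaded the intended Iceberg build rather than a stale jar. A hedged sketch: it assumes org.apache.iceberg.IcebergBuild (from iceberg-core, which the spark-runtime jar bundles) is available on the classpath.

// Run inside spark-shell after env_setup.sh completes: check the bundled
// Iceberg runtime against the ICEBERG_VERSION the script installed.
import org.apache.iceberg.IcebergBuild
val found = IcebergBuild.version()
assert(found == "1.4.0", s"Expected Iceberg 1.4.0 but found $found")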
