Commit 6b12d71: Add spark
martinprobson committed Nov 7, 2017 · 1 parent e557c4d

Showing 7 changed files with 147 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,4 +1,6 @@
deploy/
resources/vim/vimdir
resources/miniconda/miniconda.sh
.vagrant
*.log
*.tgz
2 changes: 2 additions & 0 deletions Vagrantfile
@@ -65,6 +65,7 @@ Vagrant.configure("2") do |config|
# Enable provisioning with a shell script. Additional provisioners such as
# Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the
# documentation for more information about their specific syntax and use.
config.vm.network :private_network, ip: '10.211.55.102'
config.vm.hostname = "devvm"
config.vm.provision "shell", path: "scripts/10_InstallBasePackages.sh"
config.vm.provision "shell", path: "scripts/20_setPassword.sh"
@@ -73,5 +74,6 @@ Vagrant.configure("2") do |config|
config.vm.provision "shell", path: "scripts/30_setupJava.sh"
config.vm.provision "shell", path: "scripts/40_setupEclipse.sh"
config.vm.provision "shell", path: "scripts/50_setupMiniconda.sh", privileged: false
config.vm.provision "shell", path: "scripts/60_setup_spark.sh"

end
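
The fixed private-network address and the new 60_setup_spark.sh provisioner only take effect when the provisioners actually run. A minimal way to exercise them from the host (a sketch, assuming the stock Vagrant CLI):

    # First boot runs every provisioner in order, including the new Spark step;
    # on a VM that is already running, re-run them explicitly.
    vagrant up
    vagrant provision

    # The static private IP makes the guest directly reachable from the host:
    ping -c 1 10.211.55.102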
17 changes: 17 additions & 0 deletions resources/spark/spark-defaults.conf
@@ -0,0 +1,17 @@
# Default system properties included when running spark-submit.
# This is useful for setting default environment settings.

# Example:
# spark.master spark://master:7077
# spark.eventLog.enabled true
# spark.eventLog.dir hdfs://namenode:8021/directory
# spark.serializer org.apache.spark.serializer.KryoSerializer
# spark.driver.memory 5g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"

#spark.eventLog.dir=hdfs://10.211.55.101/user/spark/applicationHistory
#spark.eventLog.dir hdfs://node1:8021/user/spark/applicationHistory
#spark.eventLog.dir hdfs://node1/user/spark/applicationHistory
#spark.eventLog.dir hdfs:///user/spark/applicationHistory
#spark.yarn.historyServer.address=10.211.55.101:18080
#spark.eventLog.enabled=true
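
All of the eventLog lines above are parked as comments while the HDFS destination is still being settled. Once one is uncommented, spark-submit --verbose echoes every property it loads from spark-defaults.conf before the job starts, which is a quick way to confirm which variant took effect. A sketch, assuming the examples jar shipped inside the 2.1.1 distribution:

    # --verbose prints "Using properties file: ..." plus each key/value it read.
    spark-submit --verbose \
      --class org.apache.spark.examples.SparkPi \
      /usr/local/spark/examples/jars/spark-examples_2.11-2.1.1.jar 10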
48 changes: 48 additions & 0 deletions resources/spark/spark-env.sh
@@ -0,0 +1,48 @@
#!/usr/bin/env bash

# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.

# Options read when launching programs locally with
# ./bin/run-example or ./bin/spark-submit
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append

# Options read by executors and drivers running inside the cluster
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append
# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
# - MESOS_NATIVE_LIBRARY, to point to your libmesos.so if you use Mesos

# Options read in YARN client mode
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2)
# - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1).
# - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
# - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 512 MB)
# - SPARK_YARN_APP_NAME, The name of your application (Default: Spark)
# - SPARK_YARN_QUEUE, The Hadoop queue to use for allocation requests (Default: 'default')
# - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job.
# - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job.

# Options for the daemons used in the standalone deploy mode:
# - SPARK_MASTER_IP, to bind the master to a different IP address or hostname
# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
# - SPARK_WORKER_DIR, to set the working directory of worker processes
# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
# - SPARK_PUBLIC_DNS, to set the public DNS name of the master or workers

#SPARK_DAEMON_JAVA_OPTS="-Dspark.history.fs.logDirectory=hdfs:///user/spark/applicationHistory"
#SPARK_DAEMON_JAVA_OPTS="-Dspark.history.fs.logDirectory=hdfs://node1/user/spark/applicationHistory"


4 changes: 4 additions & 0 deletions resources/spark/spark.sh
@@ -0,0 +1,4 @@
#!/bin/sh

export SPARK_HOME=/usr/local/spark
export PATH=${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${PATH}
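
Dropped into /etc/profile.d (which the setup script below does), this puts both bin and sbin on PATH for every login shell. A quick check after provisioning, as a sketch:

    # New login shells pick the file up automatically; an existing shell
    # can source it by hand and confirm the tools resolve via the symlink.
    source /etc/profile.d/spark.sh
    spark-submit --version
    command -v start-history-server.sh    # expect /usr/local/spark/sbin/...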
66 changes: 66 additions & 0 deletions scripts/60_setup_spark.sh
@@ -0,0 +1,66 @@
#!/bin/bash

# http://www.cloudera.com/content/cloudera/en/documentation/core/v5-2-x/topics/cdh_ig_spark_configure.html

source "/vagrant/scripts/common.sh"

function installLocalSpark {
echo "install spark from local file"
FILE=/vagrant/resources/$SPARK_ARCHIVE
tar -xzf $FILE -C /usr/local
}

function installRemoteSpark {
echo "install spark from remote file"
curl -sS -L -o /vagrant/resources/$SPARK_ARCHIVE $SPARK_MIRROR_DOWNLOAD
tar -xzf /vagrant/resources/$SPARK_ARCHIVE -C /usr/local
}

function setupSpark {
echo "setup spark"
# cp -f /vagrant/resources/spark/slaves /usr/local/spark/conf
cp -f /vagrant/resources/spark/spark-env.sh /usr/local/spark/conf
cp -f /vagrant/resources/spark/spark-defaults.conf /usr/local/spark/conf
# ln -s $HADOOP_CONF/yarn-site.xml /usr/local/spark/conf/yarn-site.xml
# ln -s $HADOOP_CONF/core-site.xml /usr/local/spark/conf/core-site.xml
# ln -s $HADOOP_CONF/hdfs-site.xml /usr/local/spark/conf/hdfs-site.xml
# ln -s $HIVE_CONF/hive-site.xml /usr/local/spark/conf/hive-site.xml
}

function setupEnvVars {
echo "creating spark environment variables"
cp -f $SPARK_RES_DIR/spark.sh /etc/profile.d/spark.sh
. /etc/profile.d/spark.sh
}

function setupHistoryServer {
echo "setup history server"
. /etc/profile.d/hadoop.sh
hdfs dfs -mkdir -p /user/spark/applicationHistory
hdfs dfs -chmod -R 777 /user/spark
}

function installSpark {
if resourceExists $SPARK_ARCHIVE; then
installLocalSpark
else
installRemoteSpark
fi
ln -s /usr/local/$SPARK_VERSION-bin-hadoop2.7 /usr/local/spark
mkdir -p /usr/local/spark/logs/history
}

function startServices {
echo "starting Spark history service"
/usr/local/spark/sbin/start-history-server.sh
}

echo "setup spark"

installSpark
setupSpark
setupEnvVars
#setupHistoryServer
#startServices

echo "spark setup complete"
8 changes: 8 additions & 0 deletions scripts/common.sh
@@ -55,3 +55,11 @@ MINICONDA_ARCHIVE=${MINICONDA_MAJOR_VERSION}-${MINICONDA_VERSION}-${MINICONDA_AR
MINICONDA_INSTALL_LOCATION=/home/ubuntu/miniconda
MINICONDA_INSTALLER=/vagrant/resources/miniconda/miniconda.sh
MINICONDA_PROFILE=/vagrant/resources/miniconda/miniconda-profile.sh

# spark
SPARK_VERSION=spark-2.1.1
SPARK_ARCHIVE=$SPARK_VERSION-bin-hadoop2.7.tgz
SPARK_MIRROR_DOWNLOAD=http://archive.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
SPARK_RES_DIR=/vagrant/resources/spark
SPARK_CONF_DIR=/usr/local/spark/conf
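
The install path above hinges on resourceExists, which is defined in the unchanged part of common.sh. A minimal sketch of the contract the Spark script relies on — an assumption shown for readability, not the actual helper:

    # Hypothetical sketch; the real definition lives earlier in common.sh.
    # Succeeds when the named archive is already cached under /vagrant/resources.
    function resourceExists {
      [ -e /vagrant/resources/$1 ]
    }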
