Commit 6b12d71: Add spark
martinprobson committed Nov 7, 2017 · 1 parent e557c4d

Showing 7 changed files with 147 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,4 +1,6 @@
deploy/
resources/vim/vimdir
resources/miniconda/miniconda.sh
.vagrant
*.log
*.tgz
2 changes: 2 additions & 0 deletions Vagrantfile
@@ -65,6 +65,7 @@ Vagrant.configure("2") do |config|
# Enable provisioning with a shell script. Additional provisioners such as
# Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the
# documentation for more information about their specific syntax and use.
config.vm.network :private_network, ip: '10.211.55.102'
config.vm.hostname = "devvm"
config.vm.provision "shell", path: "scripts/10_InstallBasePackages.sh"
config.vm.provision "shell", path: "scripts/20_setPassword.sh"
@@ -73,5 +74,6 @@ Vagrant.configure("2") do |config|
config.vm.provision "shell", path: "scripts/30_setupJava.sh"
config.vm.provision "shell", path: "scripts/40_setupEclipse.sh"
config.vm.provision "shell", path: "scripts/50_setupMiniconda.sh", privileged: false
config.vm.provision "shell", path: "scripts/60_setup_spark.sh"

end
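
The fixed private-network address and the new 60_setup_spark.sh provisioner only take effect when the provisioners actually run. A minimal way to exercise them from the host (a sketch, assuming the stock Vagrant CLI):

    # First boot runs every provisioner in order, including the new Spark step;
    # on a VM that is already running, re-run them explicitly.
    vagrant up
    vagrant provision

    # The static private IP makes the guest directly reachable from the host:
    ping -c 1 10.211.55.102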
17 changes: 17 additions & 0 deletions resources/spark/spark-defaults.conf
@@ -0,0 +1,17 @@
# Default system properties included when running spark-submit.
# This is useful for setting default environment settings.

# Example:
# spark.master spark://master:7077
# spark.eventLog.enabled true
# spark.eventLog.dir hdfs://namenode:8021/directory
# spark.serializer org.apache.spark.serializer.KryoSerializer
# spark.driver.memory 5g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"

#spark.eventLog.dir=hdfs://10.211.55.101/user/spark/applicationHistory
#spark.eventLog.dir hdfs://node1:8021/user/spark/applicationHistory
#spark.eventLog.dir hdfs://node1/user/spark/applicationHistory
#spark.eventLog.dir hdfs:///user/spark/applicationHistory
#spark.yarn.historyServer.address=10.211.55.101:18080
#spark.eventLog.enabled=true
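
All of the eventLog lines above are parked as comments while the HDFS destination is still being settled. Once one is uncommented, spark-submit --verbose echoes every property it loads from spark-defaults.conf before the job starts, which is a quick way to confirm which variant took effect. A sketch, assuming the examples jar shipped inside the 2.1.1 distribution:

    # --verbose prints "Using properties file: ..." plus each key/value it read.
    spark-submit --verbose \
      --class org.apache.spark.examples.SparkPi \
      /usr/local/spark/examples/jars/spark-examples_2.11-2.1.1.jar 10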
48 changes: 48 additions & 0 deletions resources/spark/spark-env.sh
@@ -0,0 +1,48 @@
#!/usr/bin/env bash

# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.

# Options read when launching programs locally with
# ./bin/run-example or ./bin/spark-submit
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append

# Options read by executors and drivers running inside the cluster
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append
# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
# - MESOS_NATIVE_LIBRARY, to point to your libmesos.so if you use Mesos

# Options read in YARN client mode
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2)
# - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1).
# - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
# - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 512 MB)
# - SPARK_YARN_APP_NAME, The name of your application (Default: Spark)
# - SPARK_YARN_QUEUE, The Hadoop queue to use for allocation requests (Default: 'default')
# - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job.
# - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job.

# Options for the daemons used in the standalone deploy mode:
# - SPARK_MASTER_IP, to bind the master to a different IP address or hostname
# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
# - SPARK_WORKER_DIR, to set the working directory of worker processes
# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
# - SPARK_PUBLIC_DNS, to set the public DNS name of the master or workers

#SPARK_DAEMON_JAVA_OPTS="-Dspark.history.fs.logDirectory=hdfs:///user/spark/applicationHistory"
#SPARK_DAEMON_JAVA_OPTS="-Dspark.history.fs.logDirectory=hdfs://node1/user/spark/applicationHistory"


4 changes: 4 additions & 0 deletions resources/spark/spark.sh
@@ -0,0 +1,4 @@
#!/bin/sh

export SPARK_HOME=/usr/local/spark
export PATH=${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${PATH}
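
Dropped into /etc/profile.d (which the setup script below does), this puts both bin and sbin on PATH for every login shell. A quick check after provisioning, as a sketch:

    # New login shells pick the file up automatically; an existing shell
    # can source it by hand and confirm the tools resolve via the symlink.
    source /etc/profile.d/spark.sh
    spark-submit --version
    command -v start-history-server.sh    # expect /usr/local/spark/sbin/...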
66 changes: 66 additions & 0 deletions scripts/60_setup_spark.sh
@@ -0,0 +1,66 @@
#!/bin/bash

# http://www.cloudera.com/content/cloudera/en/documentation/core/v5-2-x/topics/cdh_ig_spark_configure.html

source "/vagrant/scripts/common.sh"

function installLocalSpark {
echo "install spark from local file"
FILE=/vagrant/resources/$SPARK_ARCHIVE
tar -xzf $FILE -C /usr/local
}

function installRemoteSpark {
echo "install spark from remote file"
curl -sS -L -o /vagrant/resources/$SPARK_ARCHIVE $SPARK_MIRROR_DOWNLOAD
tar -xzf /vagrant/resources/$SPARK_ARCHIVE -C /usr/local
}

function setupSpark {
echo "setup spark"
# cp -f /vagrant/resources/spark/slaves /usr/local/spark/conf
cp -f /vagrant/resources/spark/spark-env.sh /usr/local/spark/conf
cp -f /vagrant/resources/spark/spark-defaults.conf /usr/local/spark/conf
# ln -s $HADOOP_CONF/yarn-site.xml /usr/local/spark/conf/yarn-site.xml
# ln -s $HADOOP_CONF/core-site.xml /usr/local/spark/conf/core-site.xml
# ln -s $HADOOP_CONF/hdfs-site.xml /usr/local/spark/conf/hdfs-site.xml
# ln -s $HIVE_CONF/hive-site.xml /usr/local/spark/conf/hive-site.xml
}

function setupEnvVars {
echo "creating spark environment variables"
cp -f $SPARK_RES_DIR/spark.sh /etc/profile.d/spark.sh
. /etc/profile.d/spark.sh
}

function setupHistoryServer {
echo "setup history server"
. /etc/profile.d/hadoop.sh
hdfs dfs -mkdir -p /user/spark/applicationHistory
hdfs dfs -chmod -R 777 /user/spark
}

function installSpark {
if resourceExists $SPARK_ARCHIVE; then
installLocalSpark
else
installRemoteSpark
fi
ln -s /usr/local/$SPARK_VERSION-bin-hadoop2.7 /usr/local/spark
mkdir -p /usr/local/spark/logs/history
}

function startServices {
echo "starting Spark history service"
/usr/local/spark/sbin/start-history-server.sh
}

echo "setup spark"

installSpark
setupSpark
setupEnvVars
#setupHistoryServer
#startServices

echo "spark setup complete"
8 changes: 8 additions & 0 deletions scripts/common.sh
@@ -55,3 +55,11 @@ MINICONDA_ARCHIVE=${MINICONDA_MAJOR_VERSION}-${MINICONDA_VERSION}-${MINICONDA_AR
MINICONDA_INSTALL_LOCATION=/home/ubuntu/miniconda
MINICONDA_INSTALLER=/vagrant/resources/miniconda/miniconda.sh
MINICONDA_PROFILE=/vagrant/resources/miniconda/miniconda-profile.sh

# spark
SPARK_VERSION=spark-2.1.1
SPARK_ARCHIVE=$SPARK_VERSION-bin-hadoop2.7.tgz
SPARK_MIRROR_DOWNLOAD=http://archive.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
SPARK_RES_DIR=/vagrant/resources/spark
SPARK_CONF_DIR=/usr/local/spark/conf
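
The install path above hinges on resourceExists, which is defined in the unchanged part of common.sh. A minimal sketch of the contract the Spark script relies on — an assumption shown for readability, not the actual helper:

    # Hypothetical sketch; the real definition lives earlier in common.sh.
    # Succeeds when the named archive is already cached under /vagrant/resources.
    function resourceExists {
      [ -e /vagrant/resources/$1 ]
    }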
