diff --git a/.gitignore b/.gitignore
index a0a22bb..4faa219 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
+deploy/
 resources/vim/vimdir
 resources/miniconda/miniconda.sh
 .vagrant
 *.log
+*.tgz
diff --git a/Vagrantfile b/Vagrantfile
index 2608dc2..6c6f8de 100644
--- a/Vagrantfile
+++ b/Vagrantfile
@@ -65,6 +65,7 @@ Vagrant.configure("2") do |config|
   # Enable provisioning with a shell script. Additional provisioners such as
   # Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the
   # documentation for more information about their specific syntax and use.
+  config.vm.network :private_network, ip: '10.211.55.102'
   config.vm.hostname = "devvm"
   config.vm.provision "shell", path: "scripts/10_InstallBasePackages.sh"
   config.vm.provision "shell", path: "scripts/20_setPassword.sh"
@@ -73,5 +74,6 @@
   config.vm.provision "shell", path: "scripts/30_setupJava.sh"
   config.vm.provision "shell", path: "scripts/40_setupEclipse.sh"
   config.vm.provision "shell", path: "scripts/50_setupMiniconda.sh", privileged: false
+  config.vm.provision "shell", path: "scripts/60_setup_spark.sh"
 
 end
diff --git a/resources/spark/spark-defaults.conf b/resources/spark/spark-defaults.conf
new file mode 100644
index 0000000..d0a05c7
--- /dev/null
+++ b/resources/spark/spark-defaults.conf
@@ -0,0 +1,17 @@
+# Default system properties included when running spark-submit.
+# This is useful for setting default environmental settings.
+
+# Example:
+# spark.master                     spark://master:7077
+# spark.eventLog.enabled           true
+# spark.eventLog.dir               hdfs://namenode:8021/directory
+# spark.serializer                 org.apache.spark.serializer.KryoSerializer
+# spark.driver.memory              5g
+# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
+
+#spark.eventLog.dir=hdfs://10.211.55.101/user/spark/applicationHistory
+#spark.eventLog.dir hdfs://node1:8021/user/spark/applicationHistory
+#spark.eventLog.dir hdfs://node1/user/spark/applicationHistory
+#spark.eventLog.dir hdfs:///user/spark/applicationHistory
+#spark.yarn.historyServer.address=10.211.55.101:18080
+#spark.eventLog.enabled=true
diff --git a/resources/spark/spark-env.sh b/resources/spark/spark-env.sh
new file mode 100755
index 0000000..b0882be
--- /dev/null
+++ b/resources/spark/spark-env.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+# This file is sourced when running various Spark programs.
+# Copy it as spark-env.sh and edit that to configure Spark for your site.
+
+# Options read when launching programs locally with
+# ./bin/run-example or ./bin/spark-submit
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
+# - SPARK_CLASSPATH, default classpath entries to append
+
+# Options read by executors and drivers running inside the cluster
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
+# - SPARK_CLASSPATH, default classpath entries to append
+# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
+# - MESOS_NATIVE_LIBRARY, to point to your libmesos.so if you use Mesos
+
+# Options read in YARN client mode
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+# - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2)
+# - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1).
+# - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
+# - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)
+# - SPARK_YARN_APP_NAME, The name of your application (Default: Spark)
+# - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: 'default')
+# - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job.
+# - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job.
+
+# Options for the daemons used in the standalone deploy mode:
+# - SPARK_MASTER_IP, to bind the master to a different IP address or hostname
+# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
+# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
+# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
+# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
+# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
+# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
+# - SPARK_WORKER_DIR, to set the working directory of worker processes
+# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
+# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
+# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
+# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers
+
+#SPARK_DAEMON_JAVA_OPTS="-Dspark.history.fs.logDirectory=hdfs:///user/spark/applicationHistory"
+#SPARK_DAEMON_JAVA_OPTS="-Dspark.history.fs.logDirectory=hdfs://node1/user/spark/applicationHistory"
+
+
diff --git a/resources/spark/spark.sh b/resources/spark/spark.sh
new file mode 100755
index 0000000..fce3805
--- /dev/null
+++ b/resources/spark/spark.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+export SPARK_HOME=/usr/local/spark
+export PATH=${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${PATH}
\ No newline at end of file
diff --git a/scripts/60_setup_spark.sh b/scripts/60_setup_spark.sh
new file mode 100755
index 0000000..eff563e
--- /dev/null
+++ b/scripts/60_setup_spark.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# http://www.cloudera.com/content/cloudera/en/documentation/core/v5-2-x/topics/cdh_ig_spark_configure.html
+
+source "/vagrant/scripts/common.sh"
+
+function installLocalSpark {
+    echo "install spark from local file"
+    FILE=/vagrant/resources/$SPARK_ARCHIVE
+    tar -xzf $FILE -C /usr/local
+}
+
+function installRemoteSpark {
+    echo "install spark from remote file"
+    curl -sS -L -o /vagrant/resources/$SPARK_ARCHIVE $SPARK_MIRROR_DOWNLOAD
+    tar -xzf /vagrant/resources/$SPARK_ARCHIVE -C /usr/local
+}
+
+function setupSpark {
+    echo "setup spark"
+#    cp -f /vagrant/resources/spark/slaves /usr/local/spark/conf
+    cp -f /vagrant/resources/spark/spark-env.sh /usr/local/spark/conf
+    cp -f /vagrant/resources/spark/spark-defaults.conf /usr/local/spark/conf
+#    ln -s $HADOOP_CONF/yarn-site.xml /usr/local/spark/conf/yarn-site.xml
+#    ln -s $HADOOP_CONF/core-site.xml /usr/local/spark/conf/core-site.xml
+#    ln -s $HADOOP_CONF/hdfs-site.xml /usr/local/spark/conf/hdfs-site.xml
+#    ln -s $HIVE_CONF/hive-site.xml /usr/local/spark/conf/hive-site.xml
+}
+
+function setupEnvVars {
+    echo "creating spark environment variables"
+    cp -f $SPARK_RES_DIR/spark.sh /etc/profile.d/spark.sh
+    . /etc/profile.d/spark.sh
+}
+
+function setupHistoryServer {
+    echo "setup history server"
+    . /etc/profile.d/hadoop.sh
+    hdfs dfs -mkdir -p /user/spark/applicationHistory
+    hdfs dfs -chmod -R 777 /user/spark
+}
+
+function installSpark {
+    if resourceExists $SPARK_ARCHIVE; then
+        installLocalSpark
+    else
+        installRemoteSpark
+    fi
+    ln -s /usr/local/$SPARK_VERSION-bin-hadoop2.7 /usr/local/spark
+    mkdir -p /usr/local/spark/logs/history
+}
+
+function startServices {
+    echo "starting Spark history service"
+    /usr/local/spark/sbin/start-history-server.sh
+}
+
+echo "setup spark"
+
+installSpark
+setupSpark
+setupEnvVars
+#setupHistoryServer
+#startServices
+
+echo "spark setup complete"
diff --git a/scripts/common.sh b/scripts/common.sh
index 72245bb..89f1920 100755
--- a/scripts/common.sh
+++ b/scripts/common.sh
@@ -55,3 +55,11 @@ MINICONDA_ARCHIVE=${MINICONDA_MAJOR_VERSION}-${MINICONDA_VERSION}-${MINICONDA_ARCH}.sh
 MINICONDA_INSTALL_LOCATION=/home/ubuntu/miniconda
 MINICONDA_INSTALLER=/vagrant/resources/miniconda/miniconda.sh
 MINICONDA_PROFILE=/vagrant/resources/miniconda/miniconda-profile.sh
+
+# spark
+SPARK_VERSION=spark-2.1.1
+SPARK_ARCHIVE=$SPARK_VERSION-bin-hadoop2.7.tgz
+SPARK_MIRROR_DOWNLOAD=http://archive.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
+SPARK_RES_DIR=/vagrant/resources/spark
+SPARK_CONF_DIR=/usr/local/spark/conf
+
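
Note: with this diff applied, scripts/60_setup_spark.sh runs as the last provisioner, so the quickest smoke test is to re-provision and confirm that the profile script installed from resources/spark/spark.sh puts Spark on the PATH. A minimal sketch of that check, using only standard Vagrant and Spark CLI commands and the paths taken from this diff:

    # re-run provisioners on an existing VM (or `vagrant up` for a fresh one)
    vagrant provision

    # the profile script should resolve spark-submit inside the guest
    vagrant ssh -c '. /etc/profile.d/spark.sh && spark-submit --version'

    # the symlink created by installSpark should point at the extracted release
    vagrant ssh -c 'readlink -f /usr/local/spark'   # expect /usr/local/spark-2.1.1-bin-hadoop2.7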