#!/usr/bin/env bash
#--------------------------------------------
# Run one of the Hadoop sample jobs. Run
#   ./run.sh -h
# for help on the options.
# Hint: To see the hadoop command that would be invoked without actually
# invoking it, run this script as follows:
# NOOP=echo ./run.sh [args]
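#
# Example invocations (the stock symbol "IBM" is just an illustration):
#   ./run.sh WordCount --local buffer
#   ./run.sh SecondarySort --local --symbol IBM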
# Find the application's assembly jar, built by "sbt assembly".
root_dir=$(dirname "$0")
jars=("$root_dir"/target/ScalaHadoop*.jar)
# If the glob matched nothing, the pattern itself is left in the array,
# so also verify that the single entry is a real file.
if [ ${#jars[@]} -ne 1 ] || [ ! -f "${jars[0]}" ]
then
  echo "You have no jar assembly OR too many versions of it in the target directory:"
  echo "  ${jars[@]}"
  echo "Delete the oldest jar(s) or run 'sbt clean assembly'"
  exit 1
fi
export APP_JAR="${jars[0]}"
# Use an absolute path, so the hint in the help message works from any directory.
[ "$root_dir" = "." ] && root_dir="$PWD"
help() {
  cat <<EOF
usage: run.sh job [options] [which_mapper]

where the options include the following:
  job              One of the implemented MR jobs. Currently:
                     WordCount
                     SecondarySort
  -h | --help      Show this help and exit.
  --local          Use local/standalone mode. This is converted into the
                   corresponding Hadoop arguments:
                     -fs file:/// -jt localhost
                   See also --hdfs.
  --hdfs           Use pseudo-distributed (the default) or distributed mode,
                   depending on how your installation is configured. See also
                   the "generic" options below.
  --input=input    Input to process. Defaults to data/<job>/input, where
                   "<job>" is the job argument converted to lower case (e.g.,
                   "wordcount" from "WordCount"). (See the notes on input data
                   in the README.)
  --output=outdir  Where the output goes. Defaults to
                   data/<job>/output/<which_mapper>/YYYYMMDD-hhmmss.
                   (See --input for <job>.)
  --use-combiner   Use the reducer as a combiner.

When running WordCount, the following values can be used for "which_mapper":
  1 | no | no-buffer    Use the most naive mapper, which emits "(word,1)" pairs.
  2 | not | no-buffer-use-tokenizer
                        Use the naive mapper, but with better text tokenization.
  3 | buffer            Use buffered counts of "(word,N)" pairs.
  4 | buffer-flush      Use buffered word counts with periodic flushing to
                        conserve memory.
For other jobs, this option is ignored.

When running SecondarySort, you must specify a stock symbol to select, even if
your data contains only one symbol:
  --symbol symbol

Any other arguments are expected to be Hadoop "generic" options, which are the
following (see also the Hadoop javadocs for "GenericOptionsParser"):
  -conf config_file          Use this configuration file.
  -D property=value          Use the given value for the property.
  -fs local|namenode:port    Specify the name node. (Inferred from
                             --local | --hdfs above.)
  -jt local|jobtracker:port  Specify the job tracker. (Inferred from
                             --local | --hdfs above.)
  -files file1,file2,...     Copy the files to the MapReduce cluster
                             (e.g., support data files).
  -libjars jar1,jar2,...     Include the jar files in the classpath.
  -archives zip1,targz2,...  Copy and unarchive each file on every node.

Hint: To see the hadoop command that would be invoked without actually
invoking it, run this script as follows:
  NOOP=echo $root_dir/run.sh [args]
EOF
}
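# Option defaults, overridden by the command-line arguments parsed below.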
job=
map_kind=
use_combiner=
stock_symbol_arg=
hdfs_root=hdfs://localhost/user/$USER/
other_args=()
fs_jt=
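# Parse the command-line arguments; see the help message for details.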
while [ $# -ne 0 ]
do
  case $1 in
    -h|--help)
      help
      exit 0
      ;;
    1|no|no-buffer)
      map_kind=no-buffer
      ;;
    2|not|no-buffer-use-tokenizer)
      map_kind=no-buffer-use-tokenizer
      ;;
    3|buffer)
      map_kind=buffer
      ;;
    4|buffer-flush)
      map_kind=buffer-flush
      ;;
    --hdfs)
      # Use the installation's configured defaults for the file system
      # and job tracker.
      fs_jt=
      ;;
    --local)
      fs_jt="-fs file:/// -jt localhost"
      ;;
    --input)
      shift
      input=$1
      ;;
    --input=*)
      input=${1#--input=}
      ;;
    --output)
      shift
      output=$1
      ;;
    --output=*)
      output=${1#--output=}
      ;;
    --use-combiner)
      use_combiner=--use-combiner
      ;;
    --symbol)
      shift
      stock_symbol=$1
      stock_symbol_arg="--symbol $1"
      ;;
    *)
      # The first unrecognized argument is the job name; the rest are
      # passed through to hadoop.
      if [[ -z $job ]]
      then
        job=$1
      else
        other_args+=("$1")
      fi
      ;;
  esac
  shift
done
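# Validate the arguments, then compute the job's fully qualified class name:
# the lower-cased job name is the package, e.g., "wordcount.WordCount".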
if [[ -z $job ]]
then
  echo "Must specify a job:"
  help
  exit 1
elif [[ $job = "WordCount" ]] && [[ -z $map_kind ]]
then
  echo "Must specify a mapper for the WordCount job:"
  help
  exit 1
fi
job_lc=$(echo "$job" | tr '[:upper:]' '[:lower:]')
job2="$job_lc.$job"
# Default the input and output locations, unless --input and --output
# were specified.
: ${input:=data/$job_lc/input}
: ${output:=data/$job_lc/output/$map_kind/$(date +'%Y%m%d-%H%M%S')}
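# Summarize the configuration before running.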
echo "Using:"
echo " App. Jar: $APP_JAR"
echo " Job: $job"
echo " Mapper: $map_kind"
echo " Combiner? $(if [[ -n $use_combiner ]]; then echo yes; else echo no; fi)"
echo " Input: $input"
echo " Output: $output"
echo " Local mode? $(if [[ -n $fs_jt ]]; then echo yes; else echo no; fi)"
# Echo the stock symbol only if one was specified (i.e., for SecondarySort).
if [[ -n $stock_symbol ]]
then
  echo " Stock Symbol: $stock_symbol"
fi
echo " Other args: ${other_args[@]} $fs_jt"
echo " Running: hadoop jar $APP_JAR $job2 $fs_jt ${other_args[@]} $map_kind $use_combiner $input $output" $stock_symbol_arg
[[ -z $NOOP ]] && hadoop jar $APP_JAR $fs_jt $job2 ${other_args[@]} $map_kind $use_combiner "$input" "$output" $stock_symbol_arg