-
Notifications
You must be signed in to change notification settings - Fork 0
/
spark_notebook.py
157 lines (125 loc) · 4.67 KB
/
spark_notebook.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python
# https://github.com/felixcheung/vagrant-projects
from os import chdir, getenv, system, umask
from os.path import exists, expanduser
from subprocess import check_output, check_call
from sys import executable
def setenv(var, value, overwrite=True):
from os import environ
if overwrite or not var in environ:
environ[var] = value
#-----------------------
# PySpark
#
master = 'local[*]'
num_executors = 12 # 24
executor_cores = 2
executor_memory = '1g' # 10g
driver_memory = '2536M'
umask(0077) # ensure that always chmod go-wrx
chdir(expanduser("~"))
default_pyspark_submit_args = '--driver-memory %s --num-executors %d --executor-cores %d --executor-memory %s' % (driver_memory, num_executors, executor_cores, executor_memory)
pyspark_submit_args = getenv('PYSPARK_SUBMIT_ARGS', None) or default_pyspark_submit_args
pyspark_submit_args = '--master %s %s' % (master, pyspark_submit_args)
setenv('PYSPARK_PYTHON', executable, overwrite=False)
setenv('PYSPARK_DRIVER_PYTHON', 'ipython') # PySpark Driver (ie. IPython)
profile_name = 'pyspark'
setenv('PYSPARK_DRIVER_PYTHON_OPTS', 'notebook --profile=%s' % profile_name)
#-----------------------
# IPython Notebook
#
ipython_notebook_config_template = '''c = get_config()
c.NotebookApp.ip = '{ip}'
c.NotebookApp.port = {port}
c.NotebookApp.open_browser = False
'''
pyspark_setup_template = '''from os import getenv
if not getenv('PYSPARK_SUBMIT_ARGS', None):
raise ValueError('PYSPARK_SUBMIT_ARGS environment variable is not set')
spark_home = getenv('SPARK_HOME', None)
if not spark_home:
raise ValueError('SPARK_HOME environment variable is not set')
'''
pyspark_custom_css = '''
.rendered_html img {
display: inline-block;
}
.rendered_html h1 {
font-family: "Source Sans Pro",sans-serif;
font-weight: 200;
line-height: 36px;
font-size: 33px
}
.rendered_html h2,h2 .job-name {
font-family: "Source Sans Pro",sans-serif;
font-weight: 200;
line-height: 30px;
font-size: 27px
}
.rendered_html h3 {
font-family: "Source Sans Pro",sans-serif;
font-weight: 200;
line-height: 24px;
font-size: 22px
}
.rendered_html h4 {
font-family: "Source Sans Pro",sans-serif;
font-weight: 200;
line-height: 21px;
font-size: 18px
}
.rendered_html h5 {
font-family: "Source Sans Pro",sans-serif;
font-weight: 200;
line-height: 19px;
font-size: 16px
}
'''
pyspark_custom_js = ''
matplotlib_inline = '''
c.IPKernelApp.matplotlib = 'inline'
c.InlineBackend.rc = {}
'''
ip = '*' # Warning: this is potentially insecure
port = 8001
#-----------------------
# Create profile and start
#
try:
ipython_profile_path = check_output(['ipython', 'locate']).rstrip('\n') + '/profile_%s' % profile_name
setup_py_path = ipython_profile_path + '/startup/00-pyspark-setup.py'
ipython_notebook_config_path = ipython_profile_path + '/ipython_notebook_config.py'
ipython_kernel_config_path = ipython_profile_path + '/ipython_kernel_config.py'
ipython_custom_css_path = ipython_profile_path + '/static/custom/custom.css'
ipython_custom_js_path = ipython_profile_path + '/static/custom/custom.js'
if not exists(ipython_profile_path):
print 'Creating IPython Notebook profile\n'
check_call(['ipython', 'profile', 'create', profile_name])
print 'Writing custom CSS/JS\n'
with open(ipython_custom_css_path, 'w') as css_file:
css_file.write(pyspark_custom_css)
with open(ipython_custom_js_path, 'w') as js_file:
js_file.write(pyspark_custom_js)
print '\n'
if not exists(setup_py_path):
print 'Writing PySpark setup\n'
with open(setup_py_path, 'w') as setup_file:
setup_file.write(pyspark_setup_template)
# matplotlib inline
kernel_config = open(ipython_kernel_config_path).read()
if "c.IPKernelApp.matplotlib = 'inline'" not in kernel_config:
print 'Writing IPython kernel config\n'
new_kernel_config = kernel_config.replace('# c.IPKernelApp.matplotlib = None', matplotlib_inline)
with open(ipython_kernel_config_path, 'w') as kernel_file:
kernel_file.write(new_kernel_config)
if not exists(ipython_notebook_config_path) or 'open_browser = False' not in open(ipython_notebook_config_path).read():
print 'Writing IPython Notebook config\n'
with open(ipython_notebook_config_path, 'w') as config_file:
config_file.write(ipython_notebook_config_template.format(ip=ip, port=port))
print 'Launching PySpark with IPython Notebook\n'
cmd = 'pyspark %s' % pyspark_submit_args
system(cmd)
exit(0)
except KeyboardInterrupt:
print 'Aborted\n'
exit(1)