Use pdsh for executing remote commands in parallel
anirajk committed Oct 13, 2016
1 parent 75670f6 commit ab31144
Showing 1 changed file with 96 additions and 45 deletions.
141 changes: 96 additions & 45 deletions scripts/emulabconfig.py
@@ -1,7 +1,9 @@
#!/usr/bin/env python
"""
This module defines an Emulab specific cluster hooks and exposes configuration information such as RAMCloud hosts and binaries
Expects EMULAB_HOST environment variable to point to node-0 in the emulab experiment and passwordless ssh access to the node
This module defines Emulab-specific cluster hooks and exposes configuration information
such as the location of RAMCloud binaries and the list of hosts. The EMULAB_HOST environment
variable must be exported to point to node-0 in the Cloudlab experiment, where the caller has
passwordless ssh access. Check https://www.cloudlab.us/ssh-keys.php if you haven't exported keys already
"""

import subprocess
@@ -10,7 +12,6 @@
import re
import socket
import xml.etree.ElementTree as ET
import traceback

__all__ = ['getHosts', 'local_scripts_path', 'top_path', 'obj_path',
'default_disk1', 'default_disk2', 'EmulabClusterHooks', 'log']
@@ -20,29 +21,35 @@
def log(msg):
print '[%s] %s' % (hostname, msg)


# Populates the host list from the experiment manifest. Since this is also invoked
# when compiling RAMCloud (rawmetrics.py), the default is to try fetching the manifest
# locally; if that fails, it connects to EMULAB_HOST and gets the manifest from there.
def getHosts():
nodeId = 0
serverList = []
try:
log("trying to get manifest locally")
out = subprocess.check_output("/usr/bin/geni-get manifest",shell=True, stderr=subprocess.STDOUT)
out = captureSh("/usr/bin/geni-get manifest", shell=True, stderr=subprocess.STDOUT)
except:
log("trying EMULAB_HOST to get manifest")
if 'EMULAB_HOST' not in os.environ:
log("'EMULAB_HOST' not exported")
sys.exit(1)
out = subprocess.check_output("ssh %s /usr/bin/geni-get manifest" % os.environ['EMULAB_HOST'],
shell=True, stderr=subprocess.STDOUT)
shell=True, stderr=subprocess.STDOUT)

root = ET.fromstring(out)
for child in root.getchildren():
if child.tag.endswith('node'):
for host in child.getchildren():
if host.tag.endswith('host'):
serverList.append((host.attrib['name'], host.attrib['ipv4'], nodeId))
nodeId += 1
nodeId += 1
return serverList
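# getHosts() above walks a GENI rspec manifest; tag names in the rspec are
# namespace-qualified, hence the endswith() checks. A minimal, hypothetical
# fragment it would accept:
#   <node client_id="node-0">
#     <host name="node-0.exp.proj.utah.cloudlab.us" ipv4="10.10.1.1"/>
#   </node>
# which yields a serverList entry ('node-0.exp.proj.utah.cloudlab.us', '10.10.1.1', 0).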

def ssh(server, cmd, checked=True):
""" Runs command on a remote machine over ssh."""
if checked:
return subprocess.check_call('ssh %s "%s"' % (server, cmd),
shell=True, stdout=sys.stdout)
@@ -51,6 +58,15 @@ def ssh(server, cmd, checked=True):
shell=True, stdout=sys.stdout)


def pdsh(cmd, checked=True):
""" Runs command on remote hosts using pdsh on remote hosts"""
if checked:
return subprocess.check_call('pdsh -w^/tmp/emulab-hosts "%s"' % cmd,
shell=True, stdout=sys.stdout)
else:
return subprocess.call('pdsh -w^/tmp/emulab-hosts "%s"' % cmd,
shell=True, stdout=sys.stdout)
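# pdsh's -w^<file> form reads one target host per line from the named file,
# which is why __init__ below writes /tmp/emulab-hosts. A quick sanity check
# of the wrapper (hypothetical hosts):
#   $ printf 'node-1\nnode-2\n' > /tmp/emulab-hosts
#   $ pdsh -w^/tmp/emulab-hosts "hostname"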

def captureSh(command, **kwargs):
"""Execute a local command and capture its output."""

@@ -87,6 +103,17 @@ class EmulabClusterHooks:
def __init__(self):
self.remotewd = None
self.hosts = getHosts()
self.parallel = self.cmd_exists("pdsh")
if not self.parallel:
log("WARNING: Remote commands could be faster if you install and configure pdsh")
else:
with open("/tmp/emulab-hosts",'w') as f:
for host in self.hosts:
f.write(host[0]+'\n')


def cmd_exists(self, cmd):
return subprocess.call("type " + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0

def get_remote_wd(self):
if self.remotewd is None:
@@ -100,66 +127,91 @@ def get_remote_scripts_path(self):
def get_remote_obj_path(self):
return os.path.join(self.get_remote_wd(), obj_dir)

def send_code(self, server):
def send_code(self, server, parallel=False):
log("Sending code to %s" % server)
if parallel:
for host in self.hosts:
self.send_code(host[0])
return
subprocess.check_call("rsync -ave ssh --exclude 'logs/*' " +
"--exclude 'docs/*' " +
"./ %s:%s/ > /dev/null" % (server,
self.get_remote_wd()),
shell=True, stdout=sys.stdout)

#def write_local_config(self, server):
# log("Writing localconfig on %s" % server)
# hostlines = str(self.hosts).split("(")
# ssh(server, 'cd %s && echo hosts=[ | tee scripts/localconfig.py;' % self.get_remote_wd())
# for line in hostlines[1:]:
# ssh(server, 'cd %s && echo \(%s | tee -a scripts/localconfig.py;' %(self.get_remote_wd(), re.escape(line)))

def compile_code(self, server, clean=False):

def compile_code(self, server, clean=False, parallel=False):
log("Compiling code on %s" % server)
clean_cmd = ''
if clean:
clean_cmd = 'make clean;'
ssh(server,
'(cd %s; (%s make -j 8) > ' % (self.get_remote_wd(), clean_cmd) +
'%s/build.log)' % self.get_remote_wd())

def kill_procs(self, server):
log("Killing existing RAMCloud processes")
ssh(server, 'sudo pkill -f RAMCloud')

def create_log_dir(self, server):
if parallel:
pdsh('(cd %s; (%s make -j 8) > ' % (self.get_remote_wd(), clean_cmd) +
'%s/build.log)' % self.get_remote_wd())
else:
ssh(server,
'(cd %s; (%s make -j 8) > ' % (self.get_remote_wd(), clean_cmd) +
'%s/build.log)' % self.get_remote_wd())

def kill_procs(self, server, parallel=False):
log("Killing existing RAMCloud processes on %s" % server)
if parallel:
pdsh('sudo pkill -f RAMCloud')
else:
ssh(server, 'sudo pkill -f RAMCloud')

def create_log_dir(self, server, parallel=False):
log("Creating %s on %s" % (self.cluster.log_subdir, server))
ssh(server,
'(cd %s; ' % self.get_remote_wd() +
'mkdir -p $(dirname %s)/shm; ' % self.cluster.log_subdir +
'mkdir -p %s; ' % self.cluster.log_subdir +
'rm logs/latest; ' +
'ln -sf $(basename %s) logs/latest)' % self.cluster.log_subdir)

def fix_disk_permissions(self, server):
if parallel:
pdsh(
'(cd %s; ' % self.get_remote_wd() +
'mkdir -p $(dirname %s)/shm; ' % self.cluster.log_subdir +
'mkdir -p %s; ' % self.cluster.log_subdir +
'rm logs/latest; ' +
'ln -sf $(basename %s) logs/latest)' % self.cluster.log_subdir)

else:
ssh(server,
'(cd %s; ' % self.get_remote_wd() +
'mkdir -p $(dirname %s)/shm; ' % self.cluster.log_subdir +
'mkdir -p %s; ' % self.cluster.log_subdir +
'rm logs/latest; ' +
'ln -sf $(basename %s) logs/latest)' % self.cluster.log_subdir)

def fix_disk_permissions(self, server, parallel=False):
log("Fixing disk permissions on %s" % server)
disks = default_disk2.split(' ')[1].split(',')
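# Assumes default_disk2 looks like '-f /dev/sdb,/dev/sdc' (hypothetical
# devices): splitting on the space and then on commas yields the list of
# devices that need world-writable permissions.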
cmds = ['sudo chmod 777 %s' % disk for disk in disks]
ssh(server, '(%s)' % ';'.join(cmds))
if parallel:
pdsh('(%s)' % ';'.join(cmds))
else:
ssh(server, '(%s)' % ';'.join(cmds))

def cluster_enter(self, cluster):
self.cluster = cluster
log('== Connecting to Emulab via %s ==' % self.hosts[0][0])
for host in self.hosts:
hostName = host[0]
log('-- Preparing host ' + hostName)
#self.write_local_config(hostName)
self.kill_procs(hostName)
self.send_code(hostName)
self.compile_code(hostName)
self.create_log_dir(hostName)
self.fix_disk_permissions(hostName)
if self.parallel:
self.kill_procs('all', parallel=self.parallel)
self.send_code('all', parallel=self.parallel)
self.compile_code('all', parallel=self.parallel)
self.create_log_dir('all', parallel=self.parallel)
self.fix_disk_permissions('all', parallel=self.parallel)
else:
for host in self.hosts:
hostName = host[0]
log('-- Preparing host ' + hostName)
self.kill_procs(hostName)
self.send_code(hostName)
self.compile_code(hostName)
self.create_log_dir(hostName)
self.fix_disk_permissions(hostName)
log('== Emulab Cluster Configured ==')
log("If you are running clusterperf, it might take a while!")

def collect_logs(self, server):
def collect_logs(self, server, parallel=False):
log('Collecting logs from host ' + server)
if parallel:
for host in self.hosts:
self.collect_logs(host[0])
return
subprocess.check_call("rsync -ave ssh " +
"%s:%s/logs/ logs/> /dev/null" % (server,
self.get_remote_wd()),
@@ -176,4 +228,3 @@ def cluster_exit(self):
local_scripts_path = os.path.dirname(os.path.abspath(__file__))
top_path = os.path.abspath(local_scripts_path + '/..')
obj_path = os.path.join(top_path, obj_dir)
