forked from aws-samples/aws-plugin-for-slurm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
slurm-compute.sh
69 lines (54 loc) · 2.19 KB
/
slurm-compute.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/bin/bash
# Identify the distribution this node runs so later steps can pick the
# matching package commands. RELEASE ends up as centos, rhel, amazon or
# unknown; Ubuntu is recognised but the script stops there.
RELEASE=unknown
if [[ -f /etc/system-release ]]; then
  # Entries are "<grep pattern>:<release id>"; the first pattern found wins.
  for marker in CentOS:centos Red:rhel Amazon:amazon; do
    if grep -q "${marker%%:*}" /etc/system-release; then
      RELEASE=${marker##*:}
      break
    fi
  done
elif lsb_release -d | grep -q "Ubuntu"; then
  # Ubuntu is detected but not handled: report it and end with status 0.
  RELEASE=ubuntu
  echo "Slurm Install on Ubuntu/Debian is not supported at this moment"
  exit 0
fi
echo "$RELEASE"
# Setup MUNGE
# Installs the MUNGE packages for the detected release, writes the key file,
# restricts ownership/permissions on the MUNGE directories, then enables and
# starts the daemon.
if [[ "$RELEASE" == "centos" || "$RELEASE" == "rhel" ]]; then
  sudo yum --nogpgcheck install epel-release -y
  sudo yum --nogpgcheck install munge munge-libs munge-devel -y
elif [[ "$RELEASE" == "amazon" ]]; then
  # The repo definition lives under /etc, so edit it through sudo like every
  # other privileged step in this script.
  sudo sed -i "s|enabled=0|enabled=1|g" /etc/yum.repos.d/epel.repo
  sudo yum install munge munge-libs munge-devel -y
fi
# NOTE(review): the key is a fixed string committed to the repository;
# presumably it has to match the key on the other cluster nodes - confirm, and
# consider provisioning it from a secret store instead.
# tee's stdout is discarded so the key is not echoed into the console/boot log.
echo "welcometoslurmamazonuserwelcometoslurmamazonuserwelcometoslurmamazonuser" \
  | sudo tee /etc/munge/munge.key >/dev/null
sudo chown munge:munge /etc/munge/munge.key
sudo chmod 600 /etc/munge/munge.key
sudo chown -R munge /etc/munge/ /var/log/munge/
sudo chmod 0700 /etc/munge/ /var/log/munge/
sudo systemctl enable munge
sudo systemctl start munge
# Fixed pause after starting the daemon before the script continues.
sleep 15
#Setup SLURM
# $1 is the NFS server (host name or address) that exports /nfs. Later steps
# read slurmd and its configuration from that share, so stop right away with
# a clear message when the argument is missing.
if [[ -z "${1:-}" ]]; then
  echo "slurm-compute.sh: missing NFS server address (first argument)" >&2
  exit 1
fi
# Packages installed before the Slurm share is mounted.
sudo yum install openssl openssl-devel pam-devel numactl numactl-devel hwloc hwloc-devel lua lua-devel \
  readline-devel rrdtool-devel ncurses-devel man2html libibmad libibumad rpm-build -y
#Mount SLURM NFS
sudo mkdir -p /nfs
# Skip the mount when /nfs is already a mountpoint so a re-run does not stack
# a second mount or fail on "already mounted".
if ! mountpoint -q /nfs; then
  if ! sudo mount -t nfs "$1:/nfs" /nfs; then
    echo "slurm-compute.sh: failed to mount $1:/nfs on /nfs" >&2
    exit 1
  fi
fi
export SLURM_HOME=/nfs/slurm
#Calculate n GPUs

#######################################
# Count NVIDIA GPU device nodes (nvidia0, nvidia1, ...) in a directory.
# Entries whose name does not continue with a digit (e.g. nvidiactl,
# nvidia-uvm) are not counted.
# Arguments: $1 - directory to scan (defaults to /dev)
# Outputs:   the count on stdout
#######################################
count_nvidia_gpus() {
  local dev_dir=${1:-/dev}
  local node
  local count=0
  for node in "$dev_dir"/nvidia[0-9]*; do
    # An unmatched glob stays literal; skip it instead of counting it.
    [[ -e "$node" ]] || continue
    count=$((count + 1))
  done
  printf '%s\n' "$count"
}

#######################################
# Print selected space-separated fields of the `slurmd -C` output.
# Arguments: $1 - field list in cut(1) syntax, e.g. "1" or "1,2,5,6,7"
# Outputs:   the selected fields on stdout
#######################################
slurmd_config_fields() {
  # Left unquoted on purpose: word splitting folds any multi-line output into
  # one space-separated line before cut selects the fields.
  # shellcheck disable=SC2046
  echo $(/nfs/slurm/sbin/slurmd -C) | cut -d " " -f"$1"
}

NUM_GPUS=$(count_nvidia_gpus /dev)
if ((NUM_GPUS > 0)); then
  export GPU_STANZA="Gres=gpu:${NUM_GPUS}"
  # The first field does not change between iterations, so query it once.
  SLURM_COMPUTE_NODE=$(slurmd_config_fields 1)
  export SLURM_COMPUTE_NODE
  for ((i = 0; i < NUM_GPUS; i++)); do
    gres_entry="$SLURM_COMPUTE_NODE Name=gpu File=/dev/nvidia$i"
    # gres.conf is appended to on every run; skip entries that are already
    # present. A missing file simply means nothing matches yet.
    if ! sudo grep -qxF -- "$gres_entry" "$SLURM_HOME/etc/gres.conf" 2>/dev/null; then
      echo "$gres_entry" | sudo -E tee -a "$SLURM_HOME/etc/gres.conf"
    fi
  done
fi
SLURM_COMPUTE=$(slurmd_config_fields 1,2,5,6,7)
export SLURM_COMPUTE
#echo NodeName=$2 $SLURM_COMPUTE NodeHostname=$HOSTNAME State=CLOUD $GPU_STANZA | sudo -E tee -a $SLURM_HOME/etc/slurm.conf.d/slurm_nodes.conf
# Create slurmd's spool directory, copy the unit file from $SLURM_HOME into
# systemd's unit directory, then enable and start the service.
slurmd_unit=slurmd.service
sudo mkdir -p /var/spool/slurm
sudo -E cp "$SLURM_HOME/etc/slurm/$slurmd_unit" /lib/systemd/system
for action in enable start; do
  sudo systemctl "$action" "$slurmd_unit"
done