diff --git a/.github/workflows/test_and_build.yml b/.github/workflows/test_and_build.yml index ab5ea86d..4ef209ea 100644 --- a/.github/workflows/test_and_build.yml +++ b/.github/workflows/test_and_build.yml @@ -64,6 +64,7 @@ jobs: - ComputeCanada_Graham_slurm - EPCC_Cirrus_slurm - HPCC_MagicCastle_slurm + - LLNL_Pascal_slurm - Magic_Castle_EESSI_slurm - NIST_CTCMS_slurm - Norway_SIGMA2_SAGA_slurm diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/_config_options.yml b/_includes/snippets_library/LLNL_Pascal_slurm/_config_options.yml new file mode 100644 index 00000000..c1da5a6d --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/_config_options.yml @@ -0,0 +1,70 @@ +#------------------------------------------------------------ +# LLNL Pascal Slurm +#------------------------------------------------------------ + +# Cluster host and scheduler options: the defaults come from +# Graham at Compute Canada, running Slurm. Other options can +# be found in the library of snippets, +# `_includes/snippets_library`. To use one, replace options +# below with those in `_config_options.yml` from the +# library. E.g., to customise for Cirrus at EPCC, running +# Slurm, we could replace the options below with those from +# +# _includes/snippets_library/EPCC_Cirrus_slurm/_config_options.yml +# +# If your cluster is not represented in the library, please +# copy an existing folder, rename it, and customize for your +# installation. Remember to keep the leading slash on the +# `snippets` variable below! +--- +snippets: "/snippets_library/LLNL_Pascal_slurm" + +local: + prompt: "[user@laptop ~]$" + bash_shebang: "#!/usr/bin/bash" + +remote: + name: "pascal83" + login: "pascal.llnl.gov" + host: "pascal83" + node: "pascal17" + location: "Lawrence Livermore National Laboratory" + homedir: "/g/g0/" + user: "yourUsername" + prompt: "yourUsername@pascal83" + bash_shebang: "#!/bin/bash" + +sched: + name: "Slurm" + submit: + name: "sbatch" + options: "--partition=pvis" + queue: + debug: "pdebug" + testing: "pvis" + status: "squeue" + flag: + user: "-u yourUsername" + interactive: "" + histdetail: "--format=JobName,Submit,Start,State,ReqCPUS,Reserved,Elapsed,MaxRSS -j" + name: "-J" + time: "-t" + queue: "-p" + partition: "-p pdebug" + del: "scancel" + interactive: "srun" + info: "sinfo" + comment: "#SBATCH" + hist: "sacct -u yourUsername" + hist_filter: "" + +episode_order: + - 10-hpc-intro + - 11-connecting + - 12-cluster + - 13-scheduler + - 14-environment-variables + - 16-transferring-files + - 17-parallel + - 18-resources + - 19-responsibility diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/cluster/queue-info.snip b/_includes/snippets_library/LLNL_Pascal_slurm/cluster/queue-info.snip new file mode 100644 index 00000000..9570ca27 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/cluster/queue-info.snip @@ -0,0 +1,25 @@ +``` +PARTITION AVAIL TIMELIMIT NODES STATE NODELIST +rack1 up 30-00:00:0 12 alloc r[061-072] +rack2 up 30-00:00:0 10 alloc r[003-012] +rack3 up 30-00:00:0 1 mix r036 +rack3 up 30-00:00:0 6 alloc r[032-035,037-038] +rack4 up 30-00:00:0 1 drain r048 +rack4 up 30-00:00:0 1 mix r047 +rack4 up 30-00:00:0 9 alloc r[041-046,049-051] +rack4e up 30-00:00:0 1 mix r073 +rack4e up 30-00:00:0 5 alloc r[013-016,074] +rack4e up 30-00:00:0 2 idle r[075-076] +rack5 up 30-00:00:0 3 mix r[021-022,028] +rack5 up 30-00:00:0 5 alloc r[023-027] +rack5 up 30-00:00:0 3 idle r[019-020,029] +rack6i up 30-00:00:0 2 idle r[059-060] +rack6 up 30-00:00:0 1 drain* r057 +rack6 up 30-00:00:0 1
down* r053 +rack6 up 30-00:00:0 5 alloc r[052,054-056,058] +{{ site.sched.queue.testing }} up 12:00:00 1 idle r001 +{{ site.sched.queue.debug }} up 14-00:00:0 1 idle r002 +gpu up 7-00:00:00 3 idle rgpu,rgpu[4-5] +gpu up 7-00:00:00 2 down rgpu[2-3] +``` +{: .output} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/cluster/root-folders.snip b/_includes/snippets_library/LLNL_Pascal_slurm/cluster/root-folders.snip new file mode 100644 index 00000000..715de741 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/cluster/root-folders.snip @@ -0,0 +1,6 @@ +``` +bin etc lib64 proc sbin sys var +boot {{ site.remote.homedir | replace: "/", "" }} mnt root scratch tmp working +dev lib opt run srv usr +``` +{: .output} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/cluster/specific-node-info.snip b/_includes/snippets_library/LLNL_Pascal_slurm/cluster/specific-node-info.snip new file mode 100644 index 00000000..c8a3775f --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/cluster/specific-node-info.snip @@ -0,0 +1,11 @@ +> ## Explore a Worker Node +> +> Finally, let's look at the resources available on the worker nodes where your +> jobs will actually run. Try running this command to see the name, CPUs and +> memory available on the worker nodes: +> +> ``` +> {{ site.remote.prompt }} sinfo -n {{ site.remote.node }} -o "%n %c %m" +> ``` +> {: .language-bash} +{: .challenge} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/modules/available-modules.snip b/_includes/snippets_library/LLNL_Pascal_slurm/modules/available-modules.snip new file mode 100644 index 00000000..7ae2ec26 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/modules/available-modules.snip @@ -0,0 +1 @@ + diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/modules/default-modules.snip b/_includes/snippets_library/LLNL_Pascal_slurm/modules/default-modules.snip new file mode 100644 index 00000000..a448dd96 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/modules/default-modules.snip @@ -0,0 +1,4 @@ +``` +No Modulefiles Currently Loaded. 
+``` +{: .output} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/modules/missing-python.snip b/_includes/snippets_library/LLNL_Pascal_slurm/modules/missing-python.snip new file mode 100644 index 00000000..054ad9d8 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/modules/missing-python.snip @@ -0,0 +1,3 @@ +``` +``` +{: .output} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/modules/module-load-python.snip b/_includes/snippets_library/LLNL_Pascal_slurm/modules/module-load-python.snip new file mode 100644 index 00000000..d18252dd --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/modules/module-load-python.snip @@ -0,0 +1,4 @@ +``` +{{ site.remote.prompt }} which python3 +``` +{: .language-bash} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/modules/python-executable-dir.snip b/_includes/snippets_library/LLNL_Pascal_slurm/modules/python-executable-dir.snip new file mode 100644 index 00000000..584eae91 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/modules/python-executable-dir.snip @@ -0,0 +1,4 @@ +``` +/usr/bin/python3 +``` +{: .output} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/modules/python-ls-dir-command.snip b/_includes/snippets_library/LLNL_Pascal_slurm/modules/python-ls-dir-command.snip new file mode 100644 index 00000000..6fe59e0f --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/modules/python-ls-dir-command.snip @@ -0,0 +1,4 @@ +``` +{{ site.remote.prompt }} ls /usr/bin/py* +``` +{: .language-bash} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/modules/python-ls-dir-output.snip b/_includes/snippets_library/LLNL_Pascal_slurm/modules/python-ls-dir-output.snip new file mode 100644 index 00000000..1f4a0c7a --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/modules/python-ls-dir-output.snip @@ -0,0 +1,12 @@ +``` +py3clean pydoc3.5 python2 python3-config +py3compile pygettext python2.7 python3-futurize +py3versions pygettext2.7 python2.7-config python3m +pybuild pygettext3 python2-config python3m-config +pyclean pygettext3.5 python3 python3-pasteurize +pycompile pygobject-codegen-2.0 python3.5 python-config +pydoc pygtk-codegen-2.0 python3.5-config pyversions +pydoc2.7 pygtk-demo python3.5m +pydoc3 python python3.5m-config +``` +{: .output} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/modules/python-module-path.snip b/_includes/snippets_library/LLNL_Pascal_slurm/modules/python-module-path.snip new file mode 100644 index 00000000..054ad9d8 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/modules/python-module-path.snip @@ -0,0 +1,3 @@ +``` +``` +{: .output} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/modules/software-dependencies.snip b/_includes/snippets_library/LLNL_Pascal_slurm/modules/software-dependencies.snip new file mode 100644 index 00000000..68f82c30 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/modules/software-dependencies.snip @@ -0,0 +1 @@ + diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/modules/wrong-gcc-version.snip b/_includes/snippets_library/LLNL_Pascal_slurm/modules/wrong-gcc-version.snip new file mode 100644 index 00000000..68f82c30 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/modules/wrong-gcc-version.snip @@ -0,0 +1 @@ + diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/parallel/eight-tasks-jobscript.snip b/_includes/snippets_library/LLNL_Pascal_slurm/parallel/eight-tasks-jobscript.snip new file mode 100644 index 00000000..7fa5d183 
--- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/parallel/eight-tasks-jobscript.snip @@ -0,0 +1,11 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job +{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} +{{ site.sched.comment }} -N 1 +{{ site.sched.comment }} -n 8 + +# Execute the task +mpiexec amdahl +``` +{: .language-bash} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/parallel/four-tasks-jobscript.snip b/_includes/snippets_library/LLNL_Pascal_slurm/parallel/four-tasks-jobscript.snip new file mode 100644 index 00000000..0303186a --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/parallel/four-tasks-jobscript.snip @@ -0,0 +1,11 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job +{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} +{{ site.sched.comment }} -N 1 +{{ site.sched.comment }} -n 4 + +# Execute the task +mpiexec amdahl +``` +{: .language-bash} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/parallel/one-task-jobscript.snip b/_includes/snippets_library/LLNL_Pascal_slurm/parallel/one-task-jobscript.snip new file mode 100644 index 00000000..e5fe4b59 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/parallel/one-task-jobscript.snip @@ -0,0 +1,11 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} solo-job +{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} +{{ site.sched.comment }} -N 1 +{{ site.sched.comment }} -n 1 + +# Execute the task +amdahl +``` +{: .language-bash} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/resources/account-history.snip b/_includes/snippets_library/LLNL_Pascal_slurm/resources/account-history.snip new file mode 100644 index 00000000..61ac15e5 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/resources/account-history.snip @@ -0,0 +1,16 @@ +``` + JobID JobName Partition AllocCPUS State Exit +------------ ---------- ---------- ---------- ---------- ---- +212339 hostname {{ site.sched.queue.debug }} 2 COMPLETED +212340 hostname {{ site.sched.queue.debug }} 2 COMPLETED +212341 env {{ site.sched.queue.debug }} 2 COMPLETED +212342 mpirun {{ site.sched.queue.testing }} 2 COMPLETED +212343 mpirun {{ site.sched.queue.testing }} 2 COMPLETED +212344 amdahl {{ site.sched.queue.testing }} 2 COMPLETED +212345 amdahl {{ site.sched.queue.testing }} 2 COMPLETED +212346 bash {{ site.sched.queue.testing }} 2 COMPLETED +212346.0 bash 2 COMPLETED +212346.1 amdahl 2 COMPLETED +212347 amdahl {{ site.sched.queue.testing }} 2 FAILED +``` +{: .output} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/resources/hist-fields.snip b/_includes/snippets_library/LLNL_Pascal_slurm/resources/hist-fields.snip new file mode 100644 index 00000000..f0e215ba --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/resources/hist-fields.snip @@ -0,0 +1,6 @@ +* **Hostname**: Where did your job run? +* **MaxRSS**: What was the maximum amount of memory used? +* **Elapsed**: How long did the job take? +* **State**: What is the job currently doing/what happened to it? +* **MaxDiskRead**: Amount of data read from disk. +* **MaxDiskWrite**: Amount of data written to disk. 
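The history snippets above lean on the `sched.hist` and `sched.flag.histdetail` values defined in `_config_options.yml`. For orientation, here is a rough sketch of the commands a learner would end up typing on Pascal once those fragments are rendered; the prompt and the reuse of job ID 212347 from the sample history are illustrative assumptions, and exactly how the lesson templates assemble the fragments is assumed rather than verified.

```
# Summary of recent jobs (rendered from sched.hist)
yourUsername@pascal83 sacct -u yourUsername

# Detailed view of one job (rendered from sched.flag.histdetail;
# 212347 is the failed job from the sample history above)
yourUsername@pascal83 sacct -u yourUsername --format=JobName,Submit,Start,State,ReqCPUS,Reserved,Elapsed,MaxRSS -j 212347
```
{: .language-bash}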
diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/resources/monitor-processes-top.snip b/_includes/snippets_library/LLNL_Pascal_slurm/resources/monitor-processes-top.snip new file mode 100644 index 00000000..30348d4e --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/resources/monitor-processes-top.snip @@ -0,0 +1,19 @@ +``` +top - 15:47:18 up 21 days, 6:25, 2 users, load average: 0.02, 0.04, 0.04 +Tasks: 223 total, 1 running, 222 sleeping, 0 stopped, 0 zombie +%Cpu(s): 0.2 us, 0.1 sy, 0.0 ni, 99.6 id, 0.1 wa, 0.0 hi, 0.0 si, 0.0 st +KiB Mem : 32950812 total, 1594456 free, 502696 used, 30853660 buff/cache +KiB Swap: 64002952 total, 64002952 free, 0 used. 31913980 avail Mem + + PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND + 1693 jeff 20 0 4270580 346944 171372 S 29.8 2.1 9:31.89 gnome-shell + 3140 jeff 20 0 3142044 928972 389716 S 27.5 5.7 13:30.29 Web Content + 3057 jeff 20 0 3115900 521368 231288 S 18.9 3.2 10:27.71 firefox + 6007 jeff 20 0 813992 112336 75592 S 4.3 0.7 0:28.25 tilix + 1742 jeff 20 0 975080 164508 130624 S 2.0 1.0 3:29.83 Xwayland + 1 root 20 0 230484 11924 7544 S 0.3 0.1 0:06.08 systemd + 68 root 20 0 0 0 0 I 0.3 0.0 0:01.25 kworker/4:1 + 2913 jeff 20 0 965620 47892 37432 S 0.3 0.3 0:11.76 code + 2 root 20 0 0 0 0 S 0.0 0.0 0:00.02 kthreadd +``` +{: .output} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/resources/system-memory-free.snip b/_includes/snippets_library/LLNL_Pascal_slurm/resources/system-memory-free.snip new file mode 100644 index 00000000..8a81401c --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/resources/system-memory-free.snip @@ -0,0 +1,7 @@ +``` + total used free shared buff/cache available +Mem: 31G 501M 1.5G 64M 29G 30G +Swap: 61G 0B 61G + +``` +{: .output} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/basic-job-script.snip b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/basic-job-script.snip new file mode 100644 index 00000000..e31c6fc0 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/basic-job-script.snip @@ -0,0 +1,4 @@ +``` +Submitted batch job 36855 +``` +{: .output} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/basic-job-status.snip b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/basic-job-status.snip new file mode 100644 index 00000000..6d397724 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/basic-job-status.snip @@ -0,0 +1,9 @@ +``` + JOBID PARTITION NAME ST TIME NODES NODELIST(REASON) +212201 {{ site.sched.queue.debug }} example- R 0:05 1 r002 +``` +{: .output} + +We can see all the details of our job, most importantly that it is in the `R` +or `RUNNING` state. Sometimes our jobs might need to wait in a queue +(`PENDING`) or have an error (`E`). diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/email-notifications.snip b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/email-notifications.snip new file mode 100644 index 00000000..e681b3c0 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/email-notifications.snip @@ -0,0 +1,19 @@ +> Jobs on an HPC system might run for days or even weeks. We probably have +> better things to do than constantly check on the status of our job with +> `{{ site.sched.status }}`. Looking at the manual page for +> `{{ site.sched.submit.name }}`, can you set up our test job to send you an email +> when it finishes? 
+> +> > ## Hint +> > +> > You can use the *manual pages* for {{ site.sched.name }} utilities to find +> > more about their capabilities. On the command line, these are accessed +> > through the `man` utility: run `man <program-name>`. You can find the same +> > information online by searching "man <program-name>". +> > +> > ``` +> > {{ site.remote.prompt }} man {{ site.sched.submit.name }} +> > ``` +> > {: .language-bash} +> {: .solution} +{: .challenge} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/job-with-name-status.snip b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/job-with-name-status.snip new file mode 100644 index 00000000..583fbff9 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/job-with-name-status.snip @@ -0,0 +1,6 @@ +``` + JOBID PARTITION NAME ST TIME NODES NODELIST(REASON) +212202 {{ site.sched.queue.debug }} hello-wo R 0:02 1 r002 + +``` +{: .output} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/option-flags-list.snip b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/option-flags-list.snip new file mode 100644 index 00000000..5e80b164 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/option-flags-list.snip @@ -0,0 +1,15 @@ +* `--ntasks=<ntasks>` or `-n <ntasks>`: How many CPU cores does your job need, + in total? + +* `--time <days-hours:minutes:seconds>` or `-t <days-hours:minutes:seconds>`: + How much real-world time (walltime) will your job take to run? The `<days>` + part can be omitted. + +* `--mem=<megabytes>`: How much memory on a node does your job need in + megabytes? You can also specify gigabytes by adding a little "g" + afterwards (example: `--mem=5g`). + +* `--nodes=<nnodes>` or `-N <nnodes>`: How many separate machines does your job + need to run on? Note that if you set `ntasks` to a number greater than what + one machine can offer, {{ site.sched.name }} will set this value + automatically. diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/print-sched-variables.snip b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/print-sched-variables.snip new file mode 100644 index 00000000..90e7dbf8 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/print-sched-variables.snip @@ -0,0 +1,31 @@ +> ## Job environment variables +> +> When {{ site.sched.name }} runs a job, it sets a number of environment +> variables for the job. One of these will let us check what directory our job +> script was submitted from. The `SLURM_SUBMIT_DIR` variable is set to the +> directory from which our job was submitted. Using the `SLURM_SUBMIT_DIR` +> variable, modify your job so that it prints out the location from which the +> job was submitted.
+> +> > ## Solution +> > +> > ``` +> > {{ site.remote.prompt }} nano example-job.sh +> > {{ site.remote.prompt }} cat example-job.sh +> > ``` +> > {: .language-bash} +> > +> > ``` +> > {{ site.remote.bash_shebang }} +> > {{ site.sched.comment }} {{ site.sched.flag.partition }} +> > {{ site.sched.comment }} {{ site.sched.flag.time }} 00:00:20 +> > +> > echo -n "This script is running on " +> > hostname +> > +> > echo "This job was launched in the following directory:" +> > echo ${SLURM_SUBMIT_DIR} +> > ``` +> > {: .output} +> {: .solution} +{: .challenge} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/runtime-exceeded-job.snip b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/runtime-exceeded-job.snip new file mode 100644 index 00000000..875df1c3 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/runtime-exceeded-job.snip @@ -0,0 +1,4 @@ +``` +{{ site.remote.prompt }} cat slurm-38193.out +``` +{: .language-bash} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/runtime-exceeded-output.snip b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/runtime-exceeded-output.snip new file mode 100644 index 00000000..1f4948ae --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/runtime-exceeded-output.snip @@ -0,0 +1,7 @@ +``` +This job is running on: +{{ site.remote.node }} +slurmstepd: error: *** JOB 38193 ON {{ site.remote.node }} CANCELLED AT +2017-07-02T16:35:48 DUE TO TIME LIMIT *** +``` +{: .output} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/terminate-job-begin.snip b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/terminate-job-begin.snip new file mode 100644 index 00000000..552c2f12 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/terminate-job-begin.snip @@ -0,0 +1,7 @@ +``` +Submitted batch job 212203 + + JOBID PARTITION NAME ST TIME NODES NODELIST(REASON) +212203 {{ site.sched.queue.debug }} hello-wo R 0:03 1 r002 +``` +{: .output} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/terminate-job-cancel.snip b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/terminate-job-cancel.snip new file mode 100644 index 00000000..dcddf2c8 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/terminate-job-cancel.snip @@ -0,0 +1,4 @@ +``` + JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) +``` +{: .output} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/terminate-multiple-jobs.snip b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/terminate-multiple-jobs.snip new file mode 100644 index 00000000..6f661ea8 --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/terminate-multiple-jobs.snip @@ -0,0 +1,27 @@ +> ## Cancelling multiple jobs +> +> We can also cancel all of our jobs at once using the `-u` option. This will delete +> all jobs for a specific user (in this case, yourself). Note that you can only +> delete your own jobs. +> +> Try submitting multiple jobs and then cancelling them all.
+> +> > ## Solution +> > +> > First, submit a trio of jobs: +> > +> > ``` +> > {{ site.remote.prompt }} {{ site.sched.submit.name }} {% if site.sched.submit.options != '' %}{{ site.sched.submit.options }} {% endif %}example-job.sh +> > {{ site.remote.prompt }} {{ site.sched.submit.name }} {% if site.sched.submit.options != '' %}{{ site.sched.submit.options }} {% endif %}example-job.sh +> > {{ site.remote.prompt }} {{ site.sched.submit.name }} {% if site.sched.submit.options != '' %}{{ site.sched.submit.options }} {% endif %}example-job.sh +> > ``` +> > {: .language-bash} +> > +> > Then, cancel them all: +> > +> > ``` +> > {{ site.remote.prompt }} {{ site.sched.del }} -u yourUsername +> > ``` +> > {: .language-bash} +> {: .solution} +{: .challenge} diff --git a/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/using-nodes-interactively.snip b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/using-nodes-interactively.snip new file mode 100644 index 00000000..48d2419a --- /dev/null +++ b/_includes/snippets_library/LLNL_Pascal_slurm/scheduler/using-nodes-interactively.snip @@ -0,0 +1,70 @@ +`{{ site.sched.interactive }}` runs a single command on the cluster and then +exits. Let's demonstrate this by running the `hostname` command with `{{ +site.sched.interactive }}`. (We can cancel an `{{ site.sched.interactive }}` +job with `Ctrl-c`.) + +``` +{{ site.remote.prompt }} {{ site.sched.interactive }} hostname +``` +{: .language-bash} + +``` +{{ site.remote.node }} +``` +{: .output} + +`{{ site.sched.interactive }}` accepts all of the same options as `{{ +site.sched.submit.name }}`. However, instead of specifying these in a script, +these options are specified on the command line when starting a job. To submit +a job that uses 2 CPUs, for instance, we could use the following command: + +``` +{{ site.remote.prompt }} {{ site.sched.interactive }} -n 2 echo "This job will use 2 CPUs." +``` +{: .language-bash} + +``` +This job will use 2 CPUs. +This job will use 2 CPUs. +``` +{: .output} + +Typically, the resulting shell environment will be the same as that for +`{{ site.sched.submit.name }}`. + +### Interactive jobs + +Sometimes, you will need a lot of resources for interactive use. Perhaps it's +your first time running an analysis or you are attempting to debug something that +went wrong with a previous job. Fortunately, {{ site.sched.name }} makes it +easy to start an interactive job with `{{ site.sched.interactive }}`: + +``` +{{ site.remote.prompt }} {{ site.sched.interactive }} --pty bash +``` +{: .language-bash} + +You should be presented with a bash prompt. Note that the prompt will likely +change to reflect your new location, in this case the compute node we are +logged on to. You can also verify this with `hostname`. + +> ## Creating remote graphics +> +> To see graphical output inside your jobs, you need to use X11 forwarding. To +> connect with this feature enabled, use the `-Y` option when you login with +> the `ssh` command, e.g., +> `ssh -Y {{ site.remote.user }}@{{ site.remote.login }}`. +> +> To demonstrate what happens when you create a graphics window on the remote +> node, use the `xeyes` command. A relatively adorable pair of eyes should pop +> up (press `Ctrl-C` to stop). If you are using a Mac, you must have installed +> XQuartz (and restarted your computer) for this to work.
+> +> If your cluster has the +> [slurm-spank-x11](https://github.com/hautreux/slurm-spank-x11) plugin +> installed, you can ensure X11 forwarding within interactive jobs by using the +> `--x11` option for `{{ site.sched.interactive }}` with the command +> `{{ site.sched.interactive }} --x11 --pty bash`. +{: .callout} + +When you are done with the interactive job, type `exit` to quit your session.
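As a companion to the interactive-use snippet above, here is a minimal sketch of what a rendered session on this cluster might look like. The choice of the `pdebug` partition, the single core, and the 30-minute limit are assumptions for illustration rather than values taken from the snippets, and the compute-node prompt simply follows the `remote.node` setting.

```
# Request an interactive shell on a single core of one debug node
yourUsername@pascal83 srun -p pdebug -N 1 -n 1 -t 00:30:00 --pty bash

# We should now be on a compute node; check, then end the session
yourUsername@pascal17 hostname
yourUsername@pascal17 exit
```
{: .language-bash}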