/gpfs/work/onboarding.sh
module avail
Key:
loaded auto-loaded modulepath
module load MODULE
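For example (the module name below is hypothetical; pick a real one from the `module avail` listing):

```bash
# Hypothetical module name -- check `module avail` for what actually exists.
module load cuda-12.1
# Confirm which modules are currently loaded.
module list
```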
sinfo
PARTITION | AVAIL | TIMELIMIT | NODES | STATE | NODELIST |
---|---|---|---|---|---|
TrixieMain* | up | 12:00:00 | 4 | drain | cn[131-134] |
TrixieMain* | up | 12:00:00 | 2 | mix | cn[108-109] |
TrixieMain* | up | 12:00:00 | 22 | idle | cn[107,110-130] |
TrixieLong | up | 2-00:00:00 | 1 | drain | cn131 |
TrixieLong | up | 2-00:00:00 | 2 | mix | cn[108-109] |
TrixieLong | up | 2-00:00:00 | 22 | idle | cn[107,110-130] |
JobTesting | up | 6:00:00 | 2 | idle | cn[135-136] |
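Before submitting, a single partition can be checked on its own, e.g.:

```bash
# Show only the JobTesting partition and the state of its nodes.
sinfo --partition=JobTesting
```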
sbatch --partition=JobTesting ...
See 📑 Account-Codes for a list of codes
DT Digital Technologies / Technologies Numériques
sbatch --account=account_code ...
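The partition and account are normally given together, either on the command line or as `#SBATCH` lines in the job script; a minimal sketch (`my_job.sh` is a placeholder, `dt-mtp` is the account code used later in this document):

```bash
sbatch --partition=TrixieMain --account=dt-mtp my_job.sh
```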
sinfo --Node --responding --long
NODELIST | NODES | PARTITION | STATE | CPUS | S:C:T | MEMORY | TMP_DISK | WEIGHT | AVAIL_FE | REASON |
---|---|---|---|---|---|---|---|---|---|---|
cn106 | 1 | DevTest | idle | 64 | 2:16:2 | 192777 | 0 | 1 | (null) | none |
cn107 | 1 | TrixieLong | idle | 64 | 2:16:2 | 192777 | 0 | 1 | (null) | none |
cn107 | 1 | TrixieMain* | idle | 64 | 2:16:2 | 192777 | 0 | 1 | (null) | none |
cn108 | 1 | TrixieLong | idle | 64 | 2:16:2 | 192777 | 0 | 1 | (null) | none |
... |
scontrol show nodes
NodeName=cn136 Arch=x86_64 CoresPerSocket=16
   CPUAlloc=0 CPUTot=64 CPULoad=0.01
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=gpu:4
   NodeAddr=cn136 NodeHostName=cn136
   OS=Linux 3.10.0-1160.62.1.el7.x86_64 #1 SMP Tue Apr 5 16:57:59 UTC 2022
   RealMemory=192777 AllocMem=0 FreeMem=183181 Sockets=2 Boards=1
   State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=JobTesting
   BootTime=2024-05-29T14:23:15 SlurmdStartTime=2024-05-29T14:23:36
   CfgTRES=cpu=64,mem=192777M,billing=64,gres/gpu=4
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
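The same information can be requested for a single node, which is handy when checking a node's GPUs, memory, or why it is drained:

```bash
# Show details for one node only.
scontrol show node cn136
```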
#!/bin/bash
# vim:nowrap:
#SBATCH --job-name=My_Wonderful
#SBATCH --comment="My Wonderful Script"
# On Trixie
#SBATCH --partition=TrixieMain
#SBATCH --account=dt-mtp
#SBATCH --gres=gpu:4
#SBATCH --time=12:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=6
#SBATCH --mem=96G
# To reserve a whole node for yourself
##SBATCH --exclusive
#SBATCH --open-mode=append
#SBATCH --requeue
#SBATCH --signal=B:USR1@30
#SBATCH --output=%x-%j.out
# Requeueing on Trixie
function _requeue {
    echo "requeueing $SLURM_JOBID"
    date
    scontrol requeue $SLURM_JOBID
}
if [[ -n "$SLURM_JOBID" ]]; then
    # Only if the job was submitted to SLURM.
    SACC_FORMAT="JobID,Submit,Start,End,Elapsed,ExitCode"
    SACC_FORMAT+=",State,CPUTime,MaxRSS,MaxVMSize"
    SACC_FORMAT+=",MaxDiskRead,MaxDiskWrite,AllocCPUs"
    SACC_FORMAT+=",AllocGRES,AllocTRES%-50,NodeList"
    SACC_FORMAT+=",JobName%-30,Comment%-80"
    # On exit (pseudo-signal 0), print a resource-usage summary for this job.
    trap "sacct --jobs $SLURM_JOBID --format=$SACC_FORMAT" 0
    # On USR1 (sent 30 seconds before the time limit), requeue the job.
    trap _requeue USR1
fi
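The header above stops right after the traps; the actual workload would follow it. A minimal sketch, with a hypothetical module and training script: since `--signal=B:USR1@30` delivers the signal to the batch shell, and bash only runs a trap handler once the foreground command returns, the long-running command is backgrounded and `wait`ed on so `_requeue` can fire before the time limit.

```bash
# --- hypothetical payload; module and script names are placeholders ---
module load cuda-12.1

# Background the workload and wait on it so the USR1 trap can run promptly.
python train.py --config my_config.yaml &
wait
```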
sbatch my_wonderful.sh
sbatch \
--job-name=My_Wonderful \
--comment="My Wonderful Script" \
--partition=TrixieMain \
--account=dt-mtp \
--gres=gpu:4 \
--time=12:00:00 \
--nodes=1 \
--ntasks-per-node=4 \
--cpus-per-task=6 \
--mem=96G \
--open-mode=append \
--requeue \
--signal=B:USR1@30 \
--output=%x-%j.out \
my_wonderful.sh args ...
😨
Overriding what is different
sbatch --job-name=OtherName my_wonderful.sh
😏
#SBATCH --mem=96G
Otherwise the scheduler assumes that you want all of the memory, which implies exclusive access to the node and prevents other jobs from using the remainder of that node's resources.
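To size `--mem` against what the nodes actually have, the configured memory of each node can be listed (`%N` is the node name, `%m` its memory in megabytes):

```bash
# Node name and configured memory (MB) for every node.
sinfo --Node --format="%N %m"
```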
#SBATCH --requeue
#SBATCH --signal=B:USR1@30
# Requeueing on Trixie
# [source](https://www.sherlock.stanford.edu/docs/user-guide/running-jobs/)
# [source](https://hpc-uit.readthedocs.io/en/latest/jobs/examples.html)
function _requeue {
    echo "BASH - trapping signal 10 - requeueing $SLURM_JOBID"
    date
    scontrol requeue $SLURM_JOBID
}
if [[ -n "$SLURM_JOBID" ]]; then
    # Only if the job was submitted to SLURM.
    trap _requeue USR1
fi
Line | What it does |
---|---|
--signal=B:USR1@30 | Ask the scheduler to send a USR1 signal 30 seconds before the time limit |
trap _requeue USR1 | Act on the USR1 signal by calling _requeue() |
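The trap mechanism itself is plain bash and can be tried without SLURM; a toy sketch where the signal is sent by hand instead of by the scheduler:

```bash
#!/bin/bash
# Toy demonstration of the trap pattern, outside of SLURM.
function _requeue {
    echo "got USR1 -- this is where 'scontrol requeue \$SLURM_JOBID' would run"
}
trap _requeue USR1

# Simulate the scheduler: send USR1 to this script after 2 seconds.
( sleep 2; kill -USR1 $$ ) &

# The "workload" is backgrounded and wait-ed on, because bash only runs a
# trap handler once the foreground command returns.
sleep 5 &
wait   # interrupted by USR1, _requeue runs, then execution continues
wait   # wait for the remaining background jobs to finish
echo "done"
```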
sbatch my_wonderful.sh
squeue
JOBID | NAME | USER | ST | TIME | NODES | NODELIST(REASON) | SUBMIT_TIME | COMMENT |
---|---|---|---|---|---|---|---|---|
733 | My_Wonderful | larkins | R | 7:43:44 | 1 | trixie-cn101 | 2024-07-17T02:26:0 | My Wonderful Script |
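SUBMIT_TIME and COMMENT are not part of `squeue`'s default columns; they can be requested with a format string, roughly like this (field widths are arbitrary; `%V` is the submission time and `%k` the comment):

```bash
squeue --format="%.8i %.20j %.10u %.2t %.10M %.6D %R %V %k"
```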
nvidia-smi -l for Good GPU Usage
ssh -t cn101 nvidia-smi -l
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.01              Driver Version: 565.57.01      CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  Tesla V100-SXM2-32GB           On  |   00000000:89:00.0 Off |                    0 |
| N/A   54C    P0            139W /  300W |   28888MiB /  32768MiB |    100%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla V100-SXM2-32GB           On  |   00000000:8A:00.0 Off |                    0 |
| N/A   68C    P0            282W /  300W |   28846MiB /  32768MiB |     99%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   2  Tesla V100-SXM2-32GB           On  |   00000000:B2:00.0 Off |                    0 |
| N/A   58C    P0            289W /  300W |   28918MiB /  32768MiB |     99%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   3  Tesla V100-SXM2-32GB           On  |   00000000:B3:00.0 Off |                    0 |
| N/A   68C    P0            284W /  300W |   28918MiB /  32768MiB |     98%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A         3077932      C   ...C-Senate/nmt/tools/venv/bin/python3 28884MiB |
|    1   N/A  N/A         3077933      C   ...C-Senate/nmt/tools/venv/bin/python3 28842MiB |
|    2   N/A  N/A         3077934      C   ...C-Senate/nmt/tools/venv/bin/python3 28914MiB |
|    3   N/A  N/A         3077935      C   ...C-Senate/nmt/tools/venv/bin/python3 28914MiB |
+-----------------------------------------------------------------------------------------+
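For a lighter check than the full table, `nvidia-smi` can also report just utilization and memory, refreshing every few seconds:

```bash
ssh -t cn101 nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total --format=csv -l 5
```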
sbatch --mem=400G my_wonderful.sh
Please refer to 📑 Jobs Conda JupyterLab as it is a bit more involved
WARNING: Don't leave your worker node running if you are not using it
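An idle job still holds GPUs that other users could be using; when finished, find the job and cancel it (JOBID comes from `squeue`):

```bash
# List your own jobs, then release the node.
squeue --user=$USER
scancel JOBID
```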