环境基于 CentOS 7.6, slurm 版本为 v20.11.9; 前置条件是镜像已经完整集成 slurm 运行时依赖
munge
准备munge用户
export MUNGE_USER_ID=2000
groupadd -g $MUNGE_USER_ID munge
useradd -m -c "MUNGE User" -d /var/lib/munge -u $MUNGE_USER_ID -g munge -s /sbin/nologin munge
安装 munge
yum -y install munge munge-devel
A. 生成 munge.key (在生成密钥的节点上执行)
/usr/sbin/create-munge-key
生成/etc/munge/munge.key文件,并将此文件复制到其他节点
scp /etc/munge/munge.key compute-XXX:/etc/munge/munge.key
B. 接收 /etc/munge/munge.key 文件并设置文件归属 (在其他节点上执行)
chown munge:munge /etc/munge/munge.key
开机自启/启动 munge 服务
systemctl enable munge
systemctl start munge
systemctl status munge
slurm
准备依赖
- hwloc-devel cgroup Task Constraining
- hdf5-devel HDF5 Job Profiling
- man2html HTML Man Pages
- libibumad and libibmad-devel InfiniBand Accounting
- lua-devel Lua Support
- mariadb-devel MySQL support for accounting
- pam-devel PAM Support
- numactl-devel NUMA Affinity
- readline-devel Readline Support
- rrdtool-devel RRD External Sensor Data Collection
- gtk2 and gtk2-devel sview
- http-parser-devel slurmrestd
- json-c-devel slurmrestd
- libyaml and libyaml-devel slurmrestd
- libcurl and libcurl-devel slurmrestd
- libjwt and libjwt-devel slurmrestd
运行及编译依赖:
yum -y install gcc
yum -y install tcl tk dwz libtirpc zip
rpm -ivh --force \
https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.7.1908/os/x86_64/Packages/python3-3.6.8-10.el7.x86_64.rpm \
https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.7.1908/os/x86_64/Packages/python3-tkinter-3.6.8-10.el7.x86_64.rpm \
https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.7.1908/os/x86_64/Packages/python3-libs-3.6.8-10.el7.x86_64.rpm \
https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.7.1908/os/x86_64/Packages/python3-pip-9.0.3-5.el7.noarch.rpm \
https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.7.1908/os/x86_64/Packages/python3-setuptools-39.2.0-10.el7.noarch.rpm \
https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.7.1908/os/x86_64/Packages/python3-debug-3.6.8-10.el7.x86_64.rpm \
https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.7.1908/os/x86_64/Packages/python3-devel-3.6.8-10.el7.x86_64.rpm \
https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.7.1908/os/x86_64/Packages/python3-idle-3.6.8-10.el7.x86_64.rpm \
https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.7.1908/os/x86_64/Packages/python3-test-3.6.8-10.el7.x86_64.rpm \
https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.7.1908/os/x86_64/Packages/redhat-rpm-config-9.1.0-88.el7.centos.noarch.rpm \
https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.7.1908/os/x86_64/Packages/python3-rpm-macros-3-32.el7.noarch.rpm \
https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.7.1908/os/x86_64/Packages/python3-rpm-generators-6-2.el7.noarch.rpm \
https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.7.1908/os/x86_64/Packages/python-rpm-macros-3-32.el7.noarch.rpm \
https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.7.1908/os/x86_64/Packages/perl-srpm-macros-1-8.el7.noarch.rpm \
https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.7.1908/os/x86_64/Packages/python-srpm-macros-3-32.el7.noarch.rpm
yum -y install openssl openssl-devel \
hwloc hwloc-devel \
hdf5 hdf5-devel \
man2html \
libibumad \
libibmad libibmad-devel \
lua lua-devel \
mariadb mariadb-devel \
pam pam-devel \
numactl numactl-devel \
readline readline-devel \
rrdtool rrdtool-devel \
ncurses ncurses-devel \
gtk2 gtk2-devel \
http-parser http-parser-devel \
json-c json-c-devel \
libyaml libyaml-devel \
libcurl libcurl-devel \
libjwt libjwt-devel
准备slurm用户
export SLURM_USER_ID=2001
groupadd -g $SLURM_USER_ID slurm
useradd -m -c "SLURM workload manager" -d /var/lib/slurm -u $SLURM_USER_ID -g slurm -s /bin/bash slurm
安装 slurm
安装 slurm-v20.11.9-centos-7.6-amd64.tar.gz
tar xf slurm-v20.11.9-centos-7.6-amd64.tar.gz -C /opt
export PREFIX=/opt/slurm/v20.11.9
拷贝 service 文件
find $PREFIX/etc/systemd/system | grep service$ | xargs -i cp -v {} /etc/systemd/system/
systemctl daemon-reload
生成配置文件
$PREFIX/etc/slurmdbd.conf
cp $PREFIX/etc/slurmdbd.conf.example $PREFIX/etc/slurmdbd.conf
chmod 600 $PREFIX/etc/slurmdbd.conf
chown -R slurm:slurm $PREFIX/etc/slurmdbd.conf
安装数据库
yum -y install mariadb-server
systemctl start mariadb.service
systemctl enable mariadb.service
# 设置 root 密码
# 移除 anonymous 用户
# 禁止 root 远程登录
# 移除 test 数据库
/usr/bin/mysql_secure_installation
创建 MySQL 管理账户
mysql -u root -p
create user 'slurm'@'%' identified by '123456';
grant all privileges on *.* to 'slurm'@'%' identified by '123456';
flush privileges;
配置 $PREFIX/etc/slurmdbd.conf (注意: slurm 配置文件不会展开环境变量, 下文中的 $PREFIX 需替换为实际路径 /opt/slurm/v20.11.9)
#
# Example slurmdbd.conf file.
#
# See the slurmdbd.conf man page for more information.
#
# Archive info
#ArchiveJobs=yes
#ArchiveDir="/tmp"
#ArchiveSteps=yes
#ArchiveScript=
#JobPurge=12
#StepPurge=1
#
# Authentication info
AuthType=auth/munge
#AuthInfo=/var/run/munge/munge.socket.2
#
# slurmDBD info
DbdAddr=localhost
DbdHost=localhost
#DbdPort=7031
SlurmUser=slurm
#MessageTimeout=300
DebugLevel=verbose
#DefaultQOS=normal,standby
LogFile=$PREFIX/var/log/slurmdbd.log
PidFile=$PREFIX/run/slurmdbd.pid
#PluginDir=/usr/lib/slurm
#PrivateData=accounts,users,usage,jobs
#TrackWCKey=yes
#
# Database info
StorageType=accounting_storage/mysql
#StorageHost=localhost
#StoragePort=1234
StoragePass=123456
StorageUser=slurm
#StorageLoc=slurm_acct_db
启动 slurmdbd
systemctl start slurmdbd
systemctl enable slurmdbd
systemctl status slurmdbd
准备 $PREFIX/etc/slurm.conf
cp $PREFIX/etc/slurm.conf.example $PREFIX/etc/slurm.conf
$PREFIX/etc/slurm.conf 配置如下 (注意: slurm 配置文件不会展开环境变量, 文件内容中的 $PREFIX 需替换为实际路径 /opt/slurm/v20.11.9)
#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName=linux
ControlMachine=linux0
#ControlAddr=
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=$PREFIX/var/spool
SlurmdSpoolDir=$PREFIX/var/spool
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=$PREFIX/run/slurmctld.pid
SlurmdPidFile=$PREFIX/run/slurmd.pid
ProctrackType=proctrack/pgid
#PluginDir=
#FirstJobId=
ReturnToService=0
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
#SchedulerAuth=
SelectType=select/cons_tres
SelectTypeParameters=CR_Core
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=info
SlurmctldLogFile=$PREFIX/var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=$PREFIX/var/log/slurmd.log
JobCompType=jobcomp/none
#JobCompLoc=
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
AuthAltTypes=auth/jwt
AuthAltParameters=jwt_key=$PREFIX/var/spool/jwt.key
# COMPUTE NODES
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
NodeName=linux0 CPUs=4 Boards=1 SocketsPerBoard=1 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=1998
注意: 先删除配置中自带的 NodeName=xxxx 行, 再将 $PREFIX/sbin/slurmd -C 的输出追加到 $PREFIX/etc/slurm.conf, 然后删除追加内容中的 UpTime 行; 另外需要添加 jwt 认证配置 (AuthAltTypes / AuthAltParameters) 并生成 jwt.key
sed -i '/^NodeName=/d' $PREFIX/etc/slurm.conf
$PREFIX/sbin/slurmd -C >> $PREFIX/etc/slurm.conf
sed -i '/^UpTime=/d' $PREFIX/etc/slurm.conf
dd if=/dev/random of=$PREFIX/var/spool/jwt.key bs=32 count=1
chown slurm:slurm $PREFIX/var/spool/jwt.key
chmod 0600 $PREFIX/var/spool/jwt.key
$PREFIX/etc/cgroup.conf
cp $PREFIX/etc/cgroup.conf.example $PREFIX/etc/cgroup.conf
$PREFIX/etc/cgroup.conf 文件内容如下
###
#
# Slurm cgroup support configuration file
#
# See man slurm.conf and man cgroup.conf for further
# information on cgroup configuration parameters
#--
CgroupAutomount=yes
ConstrainCores=no
ConstrainRAMSpace=no
启动 slurmctld/slurmd
systemctl enable slurmctld
systemctl start slurmctld
systemctl status slurmctld
systemctl enable slurmd
systemctl start slurmd
systemctl status slurmd
启动 slurmrestd
systemctl daemon-reload
systemctl start slurmrestd
systemctl enable slurmrestd
systemctl status slurmrestd
验证srun
$PREFIX/bin/srun -n4 hostname
验证sinfo
$PREFIX/bin/sinfo
验证 slurmrestd
export `$PREFIX/bin/scontrol token username=slurm`
curl -H "X-SLURM-USER-NAME:slurm" -H "X-SLURM-USER-TOKEN:${SLURM_JWT}" localhost:6820/openapi
curl -H "X-SLURM-USER-NAME:slurm" -H "X-SLURM-USER-TOKEN:${SLURM_JWT}" localhost:6820/slurm/v0.0.35/diag
curl -H "X-SLURM-USER-NAME:slurm" -H "X-SLURM-USER-TOKEN:${SLURM_JWT}" localhost:6820/slurm/v0.0.35/nodes
curl -H "X-SLURM-USER-NAME:slurm" -H "X-SLURM-USER-TOKEN:${SLURM_JWT}" localhost:6820/slurm/v0.0.35/jobs