SLURM(Simple Linux Utility for Resource Management)是一个开源、高性能、可扩展的集群管理和作业调度系统,被广泛应用于大型计算集群和超级计算机中。它能够有效地管理集群中的计算资源(如CPU、内存、GPU等),并根据用户的需求对作业进行调度,从而提高集群的使用率。
yum --nogpgcheck localinstall * -y
mkdir -p /var/log/slurm/
chown slurm: /var/log/slurm/
# vi /etc/slurm/slurm.conf
#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ClusterName=cluster
SlurmctldHost=Donau(172.16.45.29)
#SlurmctldHost=
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=67043328
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=lua
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxStepCount=40000
#MaxTasksPerNode=512
MpiDefault=none
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
ProctrackType=proctrack/cgroup
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/affinity
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_tres
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStoragePass=
#AccountingStoragePort=
AccountingStorageType=accounting_storage/none
#AccountingStorageUser=
#AccountingStoreFlags=
#JobCompHost=
#JobCompLoc=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurm/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
NodeName=rabbitmq-node1 NodeAddr=172.16.45.2 CPUs=128 State=UNKNOWN
NodeName=gczxagenta2 NodeAddr=172.16.45.4 CPUs=128 State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP