计算化学公社
标题:
分享一个手动自动化提交多个GMX任务的bash脚本
[打印本页]
作者Author:
laoman
时间:
2023-11-3 23:45
标题:
分享一个手动自动化提交多个GMX任务的bash脚本
本帖最后由 laoman 于 2023-11-4 21:51 编辑
楼主一直在用一台单个任务最长只允许运行24小时的超算,这对于长时间的MD模拟来说不算太友好。SLURM系统有sbatch array(作业数组)的选项可以自动化续算,但可控制的操作不是太多,于是用bash写了一个脚本。这个脚本用一个死循环监视squeue队列里的任务:任务在跑或者排队(PD)的时候就sleep,每隔1个小时查看一次squeue,并ssh到计算节点查看各个任务已经跑了多少ns;队列里没有对应任务时,就自动提交下一段。脚本把12个体系的production任务提交到3个GPU计算节点(每个节点有4张MI250X计算卡,每个节点同时跑4个体系)。因为文件系统的网络经常卡顿,干脆把任务放到节点本地的/dev/shm(脚本里用它充当本地盘)去跑,跑完再copy回NFS的文件夹里。
#!/bin/bash
# Print an sbatch job script for one group of GROMACS systems to stdout.
#
# Arguments:
#   $1 - job name; also used as the basename of the SLURM log file
#   $2 - number of OpenMP threads per MPI rank (srun -c / mdrun -ntomp)
# Globals:
#   USER (read) - baked into the scratch path at generation time
# Outputs:
#   The job script. It contains the placeholder word that the caller is
#   expected to replace with the space-separated list of system directories
#   (see the systs=(...) line).
#
# The here-document is unquoted on purpose: plain $1/$2/$USER are expanded
# now, while everything written as \$ is left for the job to expand at run
# time on the compute node.
sub()
{
cat <<_EOF
#!/bin/bash
#SBATCH -A xxxx
#SBATCH -J $1
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=4
#SBATCH -t 24:00:00
#SBATCH -p gpu
#SBATCH -e ${1}.log -o ${1}.log
ml gromacs/2023.2-cpeGNU-22.06-gpu-mpi
systs=(SED_SYST)
workD="\$SLURM_SUBMIT_DIR/.."
scratch="/dev/shm/$USER"
export HIP_VISIBLE_DEVICES=0,2,4,6
omp=$2
# Never wipe /dev/shm itself: the user name may have been empty at generation time.
[[ "\$scratch" == /dev/shm/?* ]] || { echo "refusing to use scratch dir: \$scratch" >&2; exit 1; }
mkdir -p "\$scratch" || exit 1
rm -rf "\$scratch"/*
cd "\$workD" || exit 1
for f in "\${systs[@]}"; do
mkdir -p "\$scratch/\$f"
# Keep a copy of the last checkpoint next to the original before continuing.
[[ -f "\$f/prod.cpt" ]] && cp "\$f/prod.cpt" "\$f/prod.cpt.bak"
cp -rvf \$f/prod*log \$f/prod*cpt \$f/prod*tpr "\$scratch/\$f"
done
cd "\$scratch" || exit 1
# Continue from the checkpoint only when at least one was staged.
[[ -z \$(find |grep "\\.cpt") ]] && conti=" " || conti="-cpi prod.cpt -noappend"
echo flag_conti \$conti
_cmd="srun -n 4 -c \$omp gmx_mpi mdrun -deffnm prod \$conti
-ntomp \$omp -nb gpu -pme gpu -bonded gpu -update gpu
-s prod -maxh 23.6 -multidir \${systs[@]}"
echo \$_cmd
eval \$_cmd
# Copy the results from node-local scratch back to the shared file system.
cp -r * "\$workD"
_EOF
}
# Print the simulation time reached in a GROMACS log, in nanoseconds.
#
# Arguments:
#   $1 - path to an mdrun log file
# Outputs:
#   "<time>ns" with three decimals and no trailing newline, taken from the
#   line that follows the last "Step  Time" header; nothing if the log has
#   no such header.
#
# The header is column-aligned in the log, so any run of blanks between
# "Step" and "Time" is accepted. The second column of the following line is
# divided by 1000 (ps -> ns).
gettid()
{
  awk '
    /Step[ \t]+Time/ { want = 1; next }
    want             { t = $2; want = 0 }
    END              { if (t != "") printf "%.3fns", t / 1000 }
  ' "$1"
}
# Report the progress of the four systems that belong to one group.
#
# Arguments:
#   $1 - group letter; systems are named S<group>m<lig>_<ff> and are looked
#        up as sibling directories of the current directory (../<system>)
# Outputs:
#   One line per system, in the order F_ff19, F_c36m, J_ff19, J_c36m:
#     "0ns"                 when the system has no prod*.log yet
#     "<logname>@<time>"    otherwise, where <logname> is the last log in
#                           glob order and <time> comes from gettid
check()
{
  local lig ff syst curlog
  local -a logs
  for lig in F J; do
    for ff in ff19 c36m; do
      syst="S${1}m${lig}_${ff}"
      # Glob instead of parsing ls; an unmatched pattern stays literal (or
      # expands to nothing under nullglob), which the -e test below catches.
      logs=(../"$syst"/prod*.log)
      if [[ ! -e ${logs[0]} ]]; then
        echo 0ns
      else
        # prod.log sorts before prod.partNNNN.log, so the last entry is the
        # most recent run.
        curlog=${logs[${#logs[@]}-1]}
        echo "${curlog##*/}@$(gettid "$curlog")"
      fi
    done
  done
}
# Print the current local date and time as "YYYY-MM-DD HH:MM:SS".
ddd()
{
  date +'%F %T'
}
# Endless monitor loop: once per hour, for each of the groups A, B and C,
# either submit the next job segment or report on the one in the queue.
#
# Arguments:
#   $1 - optional; when non-empty the job script is still written but not
#        submitted with sbatch (dry run)
# Globals:
#   HOME, USER (read); NoTHiNggg (read) - never assigned in this script, so
#   the loop runs until the process is killed.
# External commands:
#   $HOME/soft/squ            - presumably a squeue wrapper; its output is
#                               searched for the job name and " PD "
#   $HOME/soft/get_gmxtime.sh - run on the compute node through ssh
# Outputs:
#   Progress messages on stdout; writes S<group>.sh in the current directory.
cycle()
{
  local SA="SAmF_c36m SAmF_ff19 SAmJ_c36m SAmJ_ff19"
  local SB="SBmF_c36m SBmF_ff19 SBmJ_c36m SBmJ_ff19"
  local SC="SCmF_c36m SCmF_ff19 SCmJ_c36m SCmJ_ff19"
  local f prelog pstate nstate pst queued subarg listvar curlog nid
  local -a parray

  while [[ -z $NoTHiNggg ]]; do
    for f in A B C; do
      # One word per system (4 per group); word splitting is intended here.
      parray=($(check "$f"))

      # Work out the segment number of the next (or currently queued) run
      # from the newest log of the first system in the group.
      if [[ ${parray[0]} == '0ns' ]]; then
        nstate=1
      else
        prelog=${parray[0]%%@*}
        if [[ $prelog == 'prod.log' ]]; then
          nstate=2
        else
          # prod.partNNNN.log -> NNNN. Force base 10: a leading zero would
          # otherwise make 0008 and 0009 invalid octal numbers.
          pstate=${prelog//[!0-9]/}
          nstate=$(( 10#${pstate:-1} + 1 ))
        fi
      fi
      printf -v pst '%04d' "$nstate"

      queued=$("$HOME/soft/squ" 2> /dev/null | grep "GPU_S$f")
      if [[ -z $queued ]]; then
        # Nothing queued or running for this group: generate and submit.
        echo "$(ddd): previous S$f run: ${parray[*]}"
        subarg="GPU_S${f}_part${pst}"
        listvar="S$f"
        # Indirect expansion instead of eval: with eval the system list was
        # word-split and sed received a truncated script.
        echo "sub $subarg 16 | sed \"s/SED_SYST/${!listvar}/g\" > S$f.sh"
        sub "$subarg" 16 | sed "s/SED_SYST/${!listvar}/g" > "S$f.sh"
        if [[ -z $1 ]]; then
          sbatch "S${f}.sh"
        fi
      elif ! grep -q " PD " <<< "$queued"; then
        # Running: the first segment logs to prod.log, later ones to
        # prod.partNNNN.log. Compare numerically, not as strings.
        if (( nstate < 2 )); then
          curlog='prod.log'
        else
          curlog="prod.part${pst}.log"
        fi
        nid=$("$HOME/soft/squ" -nl 14 2> /dev/null | grep "GPU_S$f" | awk '{print $NF}')
        echo "$(ddd): GPU_S$f is still running at $nid, current log: $curlog"
        ssh "$nid" "$HOME/soft/get_gmxtime.sh /dev/shm/$USER/S${f}*/$curlog"
      else
        echo "$(ddd): GPU_S${f}_part${pst} is Pending..."
      fi
    done
    sleep 1h
    echo
  done
}
# Start the monitor loop in the background. Its stdout and stderr go to
# <script name>.log and its PID is recorded in <script name>.pid, both in the
# current directory, so the loop can be stopped later with kill.
# The names are quoted so a script name containing whitespace does not cause
# an "ambiguous redirect".
log_base=${0##*/}
cycle > "${log_base}.log" 2>&1 &
echo "$!" > "${log_base}.pid"
复制代码
供有需要的同学参考。
欢迎光临 计算化学公社 (http://bbs.keinsci.com/)
Powered by Discuz! X3.3