计算化学公社

 找回密码 Forget password
 注册 Register
Views: 17351|回复 Reply: 10
打印 Print 上一主题 Last thread 下一主题 Next thread

[任务提交/队列管理] 服务器pbsnodes后节点状态均为down

[复制链接 Copy URL]

184

帖子

2

威望

4878

eV
积分
5102

Level 6 (一方通行)

能科top50

跳转到指定楼层 Go to specific reply
楼主
各位老师,最近我在实验室的服务器上计算MS 7.0时出现如下问题:
成功提交任务后,软件中显示running但是事实上任务并没有任何计算进展。不论多小的体系提交之后都一直没有反应,然后在xshell中输入pbsnodes -l命令,发现所有服务器均处于down的状态。这是否就是我计算没有反应的原因呢?请问各位老师这是否说明是我服务器的计算节点出了问题,应该如何解决?(本人是linux小白,懂的知识不是很多,请各位老师海涵!)
输入pbsnodes -l 后显示如下:

[root@mu01 ~]# pbsnodes -l
cu01                 down
cu02                 down
cu03                 down
cu04                 down
cu05                 down
cu06                 down
cu07                 down
cu08                 down
cu09                 down
cu10                 down
cu11                 down
cu12                 down
cu13                 down
cu14                 down
cu15                 down
cu16                 down
cu17                 down
cu18                 down



In defeat, malice. In victory, revenge.

184

帖子

2

威望

4878

eV
积分
5102

Level 6 (一方通行)

能科top50

2#
 楼主 Author| 发表于 Post on 2018-6-18 21:19:34 | 只看该作者 Only view this author
另外试了下ssh cu01 发现会出现“connect to host cu01 port 22: Connection timed out”,请问各位老师应该如何解决啊
In defeat, malice. In victory, revenge.

903

帖子

37

威望

5324

eV
积分
6967

Level 6 (一方通行)

3#
发表于 Post on 2018-6-19 00:32:43 | 只看该作者 Only view this author
直接运行pbsnodes看看是什么

184

帖子

2

威望

4878

eV
积分
5102

Level 6 (一方通行)

能科top50

4#
 楼主 Author| 发表于 Post on 2018-6-19 08:06:27 | 只看该作者 Only view this author
ggdh 发表于 2018-6-19 00:32
直接运行pbsnodes看看是什么

感谢您的回复,这是输入pbsnodes后的结果:
[root@mu01 ~]# pbsnodes
cu01
     state = down
     np = 16
     properties = hp
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu02
     state = down
     np = 16
     properties = hp
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu03
     state = down
     np = 16
     properties = hp
     ntype = cluster
     status = rectime=1529170026,varattr=,jobs=,state=free,size=48310748kb:51475068kb,netload=39396174789085,gres=,loadave=0.03,ncpus=16,physmem=16318180kb,availmem=24185032kb,totmem=24551136kb,idletime=5591297,nusers=0,nsessions=0,uname=Linux cu03 2.6.32-642.el6.x86_64 #1 SMP Wed Apr 13 00:51:26 EDT 2016 x86_64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu04
     state = down
     np = 16
     properties = hp
     ntype = cluster
     status = rectime=1529170024,varattr=,jobs=,state=free,size=112123968kb:122290004kb,netload=10840079290,gres=,loadave=0.00,ncpus=16,physmem=16432240kb,availmem=32486524kb,totmem=33212124kb,idletime=1788183,nusers=1,nsessions=1,sessions=4627,uname=Linux cu04 2.6.18-238.el5 #1 SMP Sun Dec 19 14:22:44 EST 2010 x86_64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu05
     state = down
     np = 16
     properties = hp
     ntype = cluster
     status = rectime=1529170027,varattr=,jobs=,state=free,size=48237848kb:51475068kb,netload=2494284301,gres=,loadave=0.00,ncpus=16,physmem=16318180kb,availmem=23732232kb,totmem=24551136kb,idletime=6471685,nusers=0,nsessions=0,uname=Linux cu05 2.6.32-642.el6.x86_64 #1 SMP Wed Apr 13 00:51:26 EDT 2016 x86_64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu06
     state = down
     np = 16
     properties = hp
     ntype = cluster
     status = rectime=1529170029,varattr=,jobs=39182.mu01,state=free,size=112127064kb:122290004kb,netload=41518578735,gres=,loadave=0.00,ncpus=16,physmem=16432240kb,availmem=31542912kb,totmem=33212124kb,idletime=6455431,nusers=1,nsessions=1,sessions=4607,uname=Linux cu06 2.6.18-238.el5 #1 SMP Sun Dec 19 14:22:44 EST 2010 x86_64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu07
     state = down
     np = 16
     properties = hp
     ntype = cluster
     status = rectime=1529170027,varattr=,jobs=,state=free,size=112505312kb:122290004kb,netload=41480943584,gres=,loadave=0.00,ncpus=16,physmem=16432240kb,availmem=31544780kb,totmem=33212124kb,idletime=6455026,nusers=1,nsessions=1,sessions=4639,uname=Linux cu07 2.6.18-238.el5 #1 SMP Sun Dec 19 14:22:44 EST 2010 x86_64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu08
     state = down
     np = 16
     properties = hp
     ntype = cluster
     status = rectime=1529170028,varattr=,jobs=,state=free,size=48336000kb:51475068kb,netload=2354409926,gres=,loadave=0.00,ncpus=16,physmem=16318180kb,availmem=24016672kb,totmem=24551136kb,idletime=6483622,nusers=0,nsessions=0,uname=Linux cu08 2.6.32-642.el6.x86_64 #1 SMP Wed Apr 13 00:51:26 EDT 2016 x86_64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu09
     state = down
     np = 12
     properties = inspur
     ntype = cluster
     status = rectime=1529169986,varattr=,jobs=,state=free,size=209870832kb:219997772kb,netload=40229829663,gres=,loadave=0.00,ncpus=12,physmem=16426668kb,availmem=32408436kb,totmem=33206552kb,idletime=6346858,nusers=1,nsessions=1,sessions=4033,uname=Linux cu09 2.6.18-238.el5 #1 SMP Sun Dec 19 14:22:44 EST 2010 x86_64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu10
     state = down
     np = 12
     properties = inspur
     ntype = cluster
     status = rectime=1529169989,varattr=,jobs=,state=free,size=209863900kb:219997772kb,netload=163751965240303,gres=,loadave=0.00,ncpus=12,physmem=16426728kb,availmem=32155688kb,totmem=33206612kb,idletime=48093951,nusers=1,nsessions=1,sessions=4096,uname=Linux cu10 2.6.18-238.el5 #1 SMP Sun Dec 19 14:22:44 EST 2010 x86_64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu11
     state = down
     np = 12
     properties = inspur
     ntype = cluster
     status = rectime=1529170027,varattr=,jobs=,state=free,size=209872152kb:219997772kb,netload=151231809653580,gres=,loadave=0.00,ncpus=12,physmem=16426668kb,availmem=31965444kb,totmem=33206552kb,idletime=48093251,nusers=1,nsessions=1,sessions=4096,uname=Linux cu11 2.6.18-238.el5 #1 SMP Sun Dec 19 14:22:44 EST 2010 x86_64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu12
     state = down
     np = 12
     properties = inspur
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu13
     state = down
     np = 12
     properties = inspur
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu14
     state = down
     np = 12
     properties = inspur
     ntype = cluster
     status = rectime=1529170024,varattr=,jobs=,state=free,size=209980848kb:219997772kb,netload=208792964354471,gres=,loadave=0.00,ncpus=12,physmem=16426664kb,availmem=32709132kb,totmem=33206548kb,idletime=48093777,nusers=1,nsessions=1,sessions=4087,uname=Linux cu14 2.6.18-238.el5 #1 SMP Sun Dec 19 14:22:44 EST 2010 x86_64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu15
     state = down
     np = 12
     properties = inspur
     ntype = cluster
     status = rectime=1529170026,varattr=,jobs=,state=free,size=206494528kb:219997772kb,netload=300889242366159,gres=,loadave=0.00,ncpus=12,physmem=16426664kb,availmem=32419880kb,totmem=33206548kb,idletime=48093883,nusers=1,nsessions=1,sessions=4077,uname=Linux cu15 2.6.18-238.el5 #1 SMP Sun Dec 19 14:22:44 EST 2010 x86_64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu16
     state = down
     np = 12
     properties = inspur
     ntype = cluster
     status = rectime=1529170029,varattr=,jobs=,state=free,size=209828964kb:219997772kb,netload=396686296197277,gres=,loadave=0.01,ncpus=12,physmem=16426664kb,availmem=32813184kb,totmem=33206548kb,idletime=48093624,nusers=0,nsessions=0,uname=Linux cu16 2.6.18-238.el5 #1 SMP Sun Dec 19 14:22:44 EST 2010 x86_64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu17
     state = down
     np = 12
     properties = inspur
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cu18
     state = down
     np = 12
     properties = inspur
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0
In defeat, malice. In victory, revenge.

18

帖子

0

威望

1202

eV
积分
1220

Level 4 (黑子)

5#
发表于 Post on 2018-6-19 10:06:43 | 只看该作者 Only view this author
无法SSH到各个计算节点肯定显示down
检查头节点和计算节点的网络连接,以及SSH免密码登录设置;
最后再查看下各计算节点pbs_mom服务是否正常

216

帖子

3

威望

4470

eV
积分
4746

Level 6 (一方通行)

6#
发表于 Post on 2018-6-19 16:15:22 | 只看该作者 Only view this author
首先,SSH无法登录到计算节点,说明网络出了问题。可能有两种原因:第一,你提交任务的节点被其他任务因内存占用太大而卡死;第二,是硬件问题,仅依靠主节点已经无法解决。排查时先排除交换机的问题,随后检查主机到交换机的网线是否有损坏;如果都没有问题,尝试将主节点和计算节点全部重启。如果自己无法解决,建议联系管理员。

评分 Rate

参与人数
Participants 1
eV +1 收起 理由
Reason
sobereva + 1

查看全部评分 View all ratings

Monte Carlo

184

帖子

2

威望

4878

eV
积分
5102

Level 6 (一方通行)

能科top50

7#
 楼主 Author| 发表于 Post on 2018-6-19 19:11:12 | 只看该作者 Only view this author
legendyao 发表于 2018-6-19 10:06
无法SSH到各个计算节点肯定显示down
检查头节点和计算节点的网络连接,以及SSH免密码登陆设置;
最后再查 ...

感谢您的回复,我可以ping到主节点但是ping不到计算节点,不知您说的检查网络连接是硬件的连接还是节点间的通信?另外输入service pbs_mom restart命令会显示未识别的服务,请问是哪里出了问题啊?我对linux系统不太了解,问题可能比较蠢,请您见谅!
In defeat, malice. In victory, revenge.

184

帖子

2

威望

4878

eV
积分
5102

Level 6 (一方通行)

能科top50

8#
 楼主 Author| 发表于 Post on 2018-6-19 19:13:08 | 只看该作者 Only view this author
youyno 发表于 2018-6-19 16:15
首先SSH无法登陆到计算节点,说明网络出了问题,有两种原因,第一,你提交任务的节点被其他任务因为内存占 ...

感谢您的回复,请问是单独重启节点吗?我试过reboot重启服务器但是并没有效果,不知您可否略微讲解一下如何重启主节点和计算节点,感激不尽!
In defeat, malice. In victory, revenge.

145

帖子

0

威望

3117

eV
积分
3262

Level 5 (御坂)

9#
发表于 Post on 2018-10-29 09:57:17 | 只看该作者 Only view this author
跟楼主一样的情况
torque-6.1.2 安装问题,节点down状态如何启动

qterm -t quick
pbs_server
pbsnodes -a

发现子节点是 state = down
已关防火墙,配置正确,可ssh切换,节点服务都启动,还是出问题

主节点:

[root@calserver calserver]# for i in pbs_server pbs_sched pbs_mom trqauthd; do service $i start; done
Starting pbs_server (via systemctl):                       [  OK  ]
Starting pbs_sched (via systemctl):                        [  OK  ]
Starting pbs_mom (via systemctl):                          [  OK  ]
Starting trqauthd (via systemctl):                         [  OK  ]

[root@calserver calserver]#  ps -ef | grep pbs
root       1160      1  0 01:18 ?        00:00:00 /usr/local/torque/sbin/pbs_server -F -d /var/spool/torque
root       3566      1  0 01:20 ?        00:00:00 /usr/local/torque/sbin/pbs_sched -d /var/spool/torque
root       3593      1  0 01:20 ?        00:00:00 /usr/local/torque/sbin/pbs_mom -F -d /var/spool/torque
root       3659   3428  0 01:21 pts/0    00:00:00 grep --color=auto pbs

[root@calserver calserver]# qnodes
calserver
     state = free
     power_state = Running
     np = 16
     ntype = cluster
     status = opsys=linux,uname=Linux calserver 3.10.0-862.14.4.el7.x86_64 #1 SMP Wed Sep 26 15:12:11 UTC 2018 x86_64,sessions=1593 2113 2237 2247 2501 3135 3185 3240,nsessions=8,nusers=2,idletime=256,totmem=5960692kb,availmem=4875732kb,physmem=3863544kb,ncpus=16,loadave=0.18,gres=,netload=89393,state=free,varattr= ,cpuclock=Fixed,macaddr=00:0c:29:a0:9b:d2,version=6.1.2,rectime=1540660913,jobs=
     mom_service_port = 15002
     mom_manager_port = 15003

calnode02
     state = down
     power_state = Running
     np = 4
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003

calnode03
     state = down
     power_state = Running
     np = 12
     ntype = cluster
     mom_service_port = 15002
     mom_manager_port = 15003


主节点上pbs_server的log

[root@calserver calserver]# systemctl status pbs_server.service -l
● pbs_server.service - TORQUE pbs_server daemon
   Loaded: loaded (/usr/lib/systemd/system/pbs_server.service; enabled; vendor preset: disabled)
   Active: active (running) since Sun 2018-10-28 01:18:08 CST; 35min ago
Main PID: 1160 (pbs_server)
    Tasks: 12
   Memory: 1.6M
   CGroup: /system.slice/pbs_server.service
           └─1160 /usr/local/torque/sbin/pbs_server -F -d /var/spool/torque

Oct 28 01:18:08 calserver systemd[1]: Starting TORQUE pbs_server daemon...
Oct 28 01:18:08 calserver PBS_Server[1160]: LOG_ERROR::tcp_connect_sockaddr, Failed when trying to open tcp connection - connect() failed [rc = -2] [addr = 127.0.0.1:15003]
Oct 28 01:18:08 calserver PBS_Server[1160]: LOG_ERROR::sendHierarchyToNode, Could not send mom hierarchy to host calserver:15003
Oct 28 01:18:08 calserver PBS_Server[1160]: LOG_ERROR::tcp_connect_sockaddr, Failed when trying to open tcp connection - connect() failed [rc = 15096] [addr = 192.168.10.102:15003]
Oct 28 01:18:08 calserver PBS_Server[1160]: LOG_ERROR::sendHierarchyToNode, Could not send mom hierarchy to host calnode02:15003
Oct 28 01:18:08 calserver PBS_Server[1160]: LOG_ERROR::tcp_connect_sockaddr, Failed when trying to open tcp connection - connect() failed [rc = 15096] [addr = 192.168.10.103:15003]
Oct 28 01:18:08 calserver PBS_Server[1160]: LOG_ERROR::sendHierarchyToNode, Could not send mom hierarchy to host calnode03:15003
Oct 28 01:28:09 calserver pbs_server[1160]: Assertion failed, bad pointer in link: file "req_select.c", line 401
Oct 28 01:38:09 calserver pbs_server[1160]: Assertion failed, bad pointer in link: file "req_select.c", line 401
Oct 28 01:48:09 calserver pbs_server[1160]: Assertion failed, bad pointer in link: file "req_select.c", line 401       


计算节点:

[root@calnode02 ~]# systemctl status pbs_mom.service -l
● pbs_mom.service - TORQUE pbs_mom daemon
   Loaded: loaded (/usr/lib/systemd/system/pbs_mom.service; enabled; vendor preset: disabled)
   Active: active (running) since Sun 2018-10-28 01:18:50 CST; 10min ago
Main PID: 1041 (pbs_mom)
    Tasks: 11
   Memory: 101.8M
   CGroup: /system.slice/pbs_mom.service
           └─1041 /usr/local/torque/sbin/pbs_mom -F -d /var/spool/torque

Oct 28 01:29:05 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Could not contact any of the servers to send an update
Oct 28 01:29:05 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Status not successfully updated for 154 MOM status update intervals
Oct 28 01:29:09 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Could not contact any of the servers to send an update
Oct 28 01:29:09 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Status not successfully updated for 155 MOM status update intervals
Oct 28 01:29:14 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Could not contact any of the servers to send an update
Oct 28 01:29:14 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Status not successfully updated for 156 MOM status update intervals
Oct 28 01:29:18 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Could not contact any of the servers to send an update
Oct 28 01:29:18 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Status not successfully updated for 157 MOM status update intervals
Oct 28 01:29:22 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Could not contact any of the servers to send an update
Oct 28 01:29:22 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Status not successfully updated for 158 MOM status update intervals


参考安装方法
MS7、Torque在CentOS6.5上的安装-即MS计算集群搭建(原创) - 第一性原理 - MS - 小木虫论坛-学术科研互动平台  http://muchong.com/t-9836836-1-authorid-1192095
Centos7安装-多节点Torque - u012460749的博客 - CSDN博客  https://blog.csdn.net/u012460749/article/details/78583026

145

帖子

0

威望

3117

eV
积分
3262

Level 5 (御坂)

10#
发表于 Post on 2018-10-29 18:16:23 | 只看该作者 Only view this author
博主,子节点提示无法连通 socket,但是能ping通每个节点的IP,这是为什么??
防火墙、SELinux均已关闭,各服务都启动了   :(

[root@calnode02 torque6]# qnodes
pbs_connect received error code 15096 ('Unable to get connection to socket') from trqauthd

Unable to communicate with calserver(10.133.103.102)
qnodes: cannot connect to server calserver, error=15096 (Unable to get connection to socket)
[root@calnode02 torque6]# ping 10.133.103.102
PING 10.133.103.102 (10.133.103.102) 56(84) bytes of data.
64 bytes from 10.133.103.102: icmp_seq=1 ttl=64 time=0.156 ms

184

帖子

2

威望

4878

eV
积分
5102

Level 6 (一方通行)

能科top50

11#
 楼主 Author| 发表于 Post on 2018-10-29 23:02:07 | 只看该作者 Only view this author
neocc 发表于 2018-10-29 18:16
博主,子节点说没法联通 socket ,但是ping的每个节点的IP,这是为什么??
防火墙,sulinux均关闭,各服 ...

不好意思啊,我至今也没搞明白...服务器的电源好像坏了,拆掉送台湾修去了,这台服务器现在就在那边晾着...
In defeat, malice. In victory, revenge.

本版积分规则 Credits rule

手机版 Mobile version|北京科音自然科学研究中心 Beijing Kein Research Center for Natural Sciences|京公网安备 11010502035419号|计算化学公社 — 北京科音旗下高水平计算化学交流论坛 ( 京ICP备14038949号-1 )|网站地图

GMT+8, 2024-11-27 11:40 , Processed in 0.174736 second(s), 22 queries , Gzip On.

快速回复 返回顶部 返回列表 Return to list