HPC集群安装脚本
install.sh 内容:
#!/bin/bash
# Interactive installer for a small HPC cluster: the menu below drives
# OpenHPC base packages, NIS, NTP, limits, PBS Pro and NFS setup.
# Author:  liuzheng
# Company: Drastically
# Version: 2.0

# `action` (status-line helper used throughout this script) is provided by the
# initscripts helper library when that file is present.
if [[ -r /etc/init.d/functions ]]; then
  . /etc/init.d/functions
fi

# Fallback so the script still works on hosts that do not ship
# /etc/init.d/functions: run the command, print the label with an OK/FAILED
# marker, and return the command's exit status.
if ! declare -F action >/dev/null; then
  action() {
    local label=$1
    local rc=0
    shift
    "$@" || rc=$?
    if (( rc == 0 )); then
      printf '%s [  OK  ]\n' "$label"
    else
      printf '%s [FAILED]\n' "$label"
    fi
    return "$rc"
  }
fi
8
#######################################
# Print the product banner (cyan) followed by the numbered main menu (red).
# Globals:   none
# Arguments: none
# Outputs:   21 coloured lines on stdout
#######################################
function menu() {
  local -r cyan=$'\e[1;36m' red=$'\e[1;31m' reset=$'\e[0m'
  local line

  # Each line is wrapped as <colour><text><space><reset>. The original emitted
  # "\e\033[0m" on the banner lines, i.e. a stray extra ESC before the reset.
  while IFS= read -r line; do
    printf '%s%s %s\n' "${cyan}" "${line}" "${reset}"
  done <<'BANNER'
###########################################################################################################
* Thank you for choosing our product,please visit http://www.ginpie.com to get more support! *
* Version:2.0 *
* ###### ###### # # ######## ###### ######## *
* # ## ## # # # ## # *
* # ## # # # # # ## # *
* # ##### ## # # # ######## ## ######## *
* # ## ## # # # # ## # *
* # # # ## # ## # ## # *
* ##### # ###### # # # ###### ######## *
***********************************************************************************************************
BANNER

  while IFS= read -r line; do
    printf '%s%s %s\n' "${red}" "${line}" "${reset}"
  done <<'OPTIONS'
* 【0】 前期环境准备 *
* 【1】 输入|修改管理节点ip *
* 【2】 输入|修改计算节点 *
* 【3】 输入|修改域名 *
* 【4】 输入|修改密码 *
* 【5】 保存并显示信息 *
* 【6】 进行集群安装 *
* 【7】 增加计算节点 *
* 【8】 退出 *
###########################################################################################################
OPTIONS
}
34
#######################################
# Show the collected settings and save them to ./info.txt.
# An existing info.txt is first renamed to info.txt.<YYYYmmdd-HHMMSS>.
# Globals:   NODE_IP (read only) - associative array, node IP -> hostname
# Arguments: $1 - management node IP
#            $2 - NIS domain name
#            $3 - root password
#            $4 - accepted for backward compatibility and ignored; the
#                 original assigned it to NODE_IP, which created a bogus
#                 key "0" in the caller's associative array
# Outputs:   highlighted summary on stdout; writes info.txt in the cwd
#######################################
function printinfo() {
  local -r info_file=info.txt
  local -r hl=$'\033[43;37m' reset=$'\033[0m'
  local stamp key

  if [[ -e ${info_file} ]]; then
    stamp=$(date +%Y%m%d-%H%M%S)
    mv -- "${info_file}" "${info_file}.${stamp}"
  fi

  # printf '%s' keeps backslashes in user-supplied values (e.g. the password)
  # literal, which `echo -e` did not.
  printf '%s 管理节点IP地址:\t\t %s %s\n' "${hl}" "$1" "${reset}"
  printf '%s 域名:\t\t %s %s\n' "${hl}" "$2" "${reset}"
  printf '%s 密码:\t\t %s %s\n' "${hl}" "$3" "${reset}"

  {
    printf 'MASTER_IP %s\n' "$1"
    printf 'Domainname %s\n' "$2"
    printf 'PASSWORD %s\n' "$3"
    printf '计算节点:\n\n'
  } > "${info_file}"

  for key in "${!NODE_IP[@]}"; do
    printf '%s %s \t: %s %s\n' "${hl}" "${NODE_IP[$key]}" "${key}" "${reset}"
    # Same on-disk layout as before: "<ip> <hostname> " then a line holding
    # a single space.
    printf '%s %s \n \n' "${key}" "${NODE_IP[$key]}" >> "${info_file}"
  done
}
55
56
57
#######################################
# Validate a dotted-quad IPv4 address typed by the operator.
# The literal word "quit" is accepted so that callers can use it as a
# sentinel to leave their input loop.
# Arguments: $1 - candidate address
# Outputs:   a blank line when accepted; an error prompt when rejected
# Returns:   0 when $1 is "quit" or four dot-separated numbers each <= 255,
#            1 otherwise
#######################################
function check_ip() {
  local -r ip=$1
  local -r quad_re='^[0-9]{1,3}(\.[0-9]{1,3}){3}$'
  local -a octets
  local octet
  local in_range=true

  if [[ ${ip} == quit ]]; then
    echo ""
    return 0
  fi

  if [[ ${ip} =~ ${quad_re} ]]; then
    IFS=. read -r -a octets <<< "${ip}"
    for octet in "${octets[@]}"; do
      # 10# forces base 10 so that "08" or "09" are not parsed as octal.
      if (( 10#${octet} > 255 )); then
        in_range=false
      fi
    done
    if [[ ${in_range} == true ]]; then
      echo ""
      return 0
    fi
  fi

  # The original fell through silently (status 0) for well-formed addresses
  # with an out-of-range octet such as 999.1.1.1; they are rejected now.
  echo -e "\e[1;31m ip无效,请重新输入:\e[0m"
  # Existing callers rely on this `continue` restarting THEIR loop, which
  # only happens on bash older than 4.4. Newer bash confines it to the
  # function (and complains, hence the redirect), so also return a failure
  # status that callers can test.
  continue 2>/dev/null
  return 1
}
74
75
76
# ===========================================================================
# Interactive driver: shows the menu, reads one option and runs it.
# Relies on menu, printinfo, check_ip and action being defined earlier in the
# file. Settings live in the globals MASTER_IP, Domainname, PASSWORD and the
# associative array NODE_IP (node IP -> hostname).
# ===========================================================================

# Compute nodes collected by option 2.
declare -A NODE_IP

# Print the compute-node IPs (keys of NODE_IP), one per line.
# One-character keys are skipped, as the original did with `${#key} -ne 1`,
# to ignore a stray "0" key that can end up in the array.
function _node_ips() {
  local key
  for key in "${!NODE_IP[@]}"; do
    if (( ${#key} > 1 )); then
      printf '%s\n' "${key}"
    fi
  done
}

# _must <exit-code> <label> <command...>
# Run the command, report the result through `action`, and terminate the
# script with <exit-code> if the command failed. The original tested $? only
# after an intervening `echo`, so failures were never noticed.
function _must() {
  local -r code=$1 label=$2
  local -r rule="-------------------------------------------------"
  local verdict=/bin/true
  shift 2
  "$@" || verdict=/bin/false
  echo "${rule}"
  action "${label}" "${verdict}"
  echo "${rule}"
  if [[ ${verdict} != /bin/true ]]; then
    exit "${code}"
  fi
}

# Succeed only if every named systemd unit is currently active.
function _units_active() {
  local unit
  for unit in "$@"; do
    systemctl is-active --quiet "${unit}" || return 1
  done
  return 0
}

# Install the cluster public key on every node given as argument.
function _push_ssh_keys() {
  local node
  for node in "$@"; do
    expect send-sshkey.exp /root/.ssh/cluster.pub "${node}" "${PASSWORD}"
  done
}

# Configure every given node as a NIS client of mgt.<Domainname>.
function _setup_node_nis() {
  local node
  for node in "$@"; do
    pdsh -w "${node}" 'yum -y install rpcbind ypbind'
    pdsh -w "${node}" ypdomainname "${Domainname}"
    pdsh -w "${node}" "grep -q ${Domainname} /etc/sysconfig/network || echo NISDOMAIN=${Domainname} >> /etc/sysconfig/network"
    _must 1 "计算节点authconfig" \
      pdsh -w "${node}" "authconfig --enablenis --nisdomain=${Domainname} --nisserver=mgt.${Domainname} --enablemkhomedir --update"
    pdsh -w "${node}" 'systemctl enable rpcbind ypbind'
    pdsh -w "${node}" 'systemctl restart rpcbind ypbind'
  done
}

# Point every given node's ntpd at the management node.
function _setup_node_ntp() {
  local node
  for node in "$@"; do
    pdsh -w "${node}" 'yum -y install ntp'
    # Guarded so that re-running the step does not duplicate the line.
    pdsh -w "${node}" 'grep -q "^server mgt$" /etc/ntp.conf || echo "server mgt" >> /etc/ntp.conf'
    pdsh -w "${node}" 'systemctl enable ntpd'
    pdsh -w "${node}" 'systemctl restart ntpd'
  done
}

# Back up each node's limits.conf, then replace it with the local copy.
function _push_limits() {
  local node stamp
  # Expanded locally: the original used an unset ${datetime} inside single
  # quotes, so every backup was simply named "limits.conf.bak".
  stamp=$(date +%Y%m%d-%H%M%S)
  for node in "$@"; do
    pdsh -w "${node}" "cp -a /etc/security/limits.conf /etc/security/limits.conf.bak${stamp}"
    scp /etc/security/limits.conf "${node}":/etc/security/
  done
}

# Install and start the PBS Pro execution daemon on every given node.
function _setup_node_pbs() {
  local node stamp
  stamp=$(date +%Y%m%d-%H%M%S)
  for node in "$@"; do
    scp pbspro-execution-19.1.1-0.x86_64.rpm "${node}":/root/
    pdsh -w "${node}" 'yum install -y pbspro-execution-19.1.1-0.x86_64.rpm'
    pdsh -w "${node}" "sed -i 's@CHANGE_THIS_TO_PBS_PRO_SERVER_HOSTNAME@mgt@g' /var/spool/pbs/mom_priv/config"
    pdsh -w "${node}" "sed -i 's@CHANGE_THIS_TO_PBS_PRO_SERVER_HOSTNAME@mgt@g' /etc/pbs.conf"
    pdsh -w "${node}" "cp -a /opt/pbs/lib/init.d/limits.pbs_mom /opt/pbs/lib/init.d/limits.pbs_mom.bak${stamp}"
    # The original copied to "$i:", an unset variable, so the file never
    # reached the node.
    scp /opt/pbs/lib/init.d/limits.pbs_mom "${node}":/opt/pbs/lib/init.d/
    pdsh -w "${node}" 'systemctl enable pbs'
    pdsh -w "${node}" 'systemctl restart pbs'
  done
}

# Mount the management node's NFS exports on every given node.
function _setup_node_nfs() {
  local node
  for node in "$@"; do
    pdsh -w "${node}" 'yum install -y nfs-utils'
    pdsh -w "${node}" 'systemctl restart nfs'
    pdsh -w "${node}" 'systemctl enable nfs'
    pdsh -w "${node}" 'mkdir -pv /data /opt/ohpc/pub'
    pdsh -w "${node}" "grep -q /home /etc/fstab || echo ${MASTER_IP}:/home /home nfs nfsvers=4,nodev,nosuid,noatime,_netdev 0 0 >> /etc/fstab"
    pdsh -w "${node}" "grep -q /opt/ohpc/pub /etc/fstab || echo ${MASTER_IP}:/opt/ohpc/pub /opt/ohpc/pub nfs nfsvers=4,nodev,noatime,_netdev 0 0 >> /etc/fstab"
    pdsh -w "${node}" "grep -q /data /etc/fstab || echo ${MASTER_IP}:/data /data nfs nfsvers=4,nodev,nosuid,noatime,_netdev 0 0 >> /etc/fstab"
    pdsh -w "${node}" "mount -a"
  done
}

# Option 0: install the OpenHPC repository, base packages and pdsh.
function _prepare_env() {
  yum -y install http://build.openhpc.community/OpenHPC:/1.3/CentOS_7/x86_64/ohpc-release-1.3-1.el7.x86_64.rpm
  _must 1 "安装ohpc-base ohpc-warewulf" yum -y install --skip-broken ohpc-base ohpc-warewulf
  sleep 5
  _must 2 "安装pdsh" yum install -y pdsh-ohpc
  # NOTE(review): this starts a nested shell and the menu only resumes after
  # that shell exits. Kept from the original; presumably meant to pick up the
  # freshly installed environment - confirm.
  bash
}

# Option 2: read "ip hostname" pairs into NODE_IP until the operator types quit.
function _read_nodes() {
  local ip name
  local count=1
  while true; do
    echo -e "请输入计算节点\e[1;31m ${count} \e[0m的ip和主机名,格式 ip hostname"
    echo -e "输入\e[1;31m quit \e[0m退出该环节"
    read -r ip name || break
    if [[ ${ip} == quit ]]; then
      break
    fi
    check_ip "${ip}" || continue
    # An address without a hostname is ignored and asked for again.
    if [[ -z ${name} ]]; then
      continue
    fi
    NODE_IP[${ip}]=${name}
    count=$(( count + 1 ))
  done
}

# Option 6: configure the management node and all collected compute nodes.
# Returns 1 when the cluster key is missing; several steps exit the script
# on failure.
function _install_cluster() {
  local -a nodes
  local node subnet mom_limits stack_lines
  # NOTE(review): this path is under the package documentation directory;
  # ypserv presumably reads /var/yp/securenets instead - confirm.
  local -r securenets=/usr/share/doc/ypserv-2.31/securenets
  mapfile -t nodes < <(_node_ips)

  # --- password-less ssh ---
  if [[ ! -f /root/.ssh/cluster.pub ]]; then
    echo -e "\e[1;31m请检查ohpc是否安装完成\e[0m"
    return 1
  fi
  _push_ssh_keys "${nodes[@]}"

  # --- hostname ---
  hostnamectl set-hostname mgt
  # The original unquoted `[ -n ... ]` test was true even for empty output.
  hostname | grep -q mgt || exit 5

  # --- /etc/hosts, then distribute it ---
  if grep -q mgt /etc/hosts; then
    # A previous run already added entries: keep only the first two lines.
    sed -i '3,$d' /etc/hosts
  fi
  # Short name and FQDN are written directly (as option 7 already did)
  # instead of being patched in afterwards with sed.
  echo "${MASTER_IP} mgt mgt.${Domainname}" >> /etc/hosts
  for node in "${nodes[@]}"; do
    pdsh -w "${node}" "hostnamectl set-hostname ${NODE_IP[$node]}"
    echo "${node} ${NODE_IP[$node]} ${NODE_IP[$node]}.${Domainname}" >> /etc/hosts
  done
  for node in "${nodes[@]}"; do
    scp /etc/hosts "${node}":/etc/
  done

  # --- NIS server ---
  _must 1 "安装ypserv rpcbind" yum -y install ypserv rpcbind
  ypdomainname "${Domainname}"
  # Network address of the management /24: replace the last octet by 0.
  # (The original referenced the wrong variable names and produced ".0".)
  subnet="${MASTER_IP%.*}.0"
  grep -qF -- "${subnet}" "${securenets}" || echo "255.255.255.0 ${subnet}" >> "${securenets}"
  grep -q -- "${Domainname}" /etc/sysconfig/network || echo "NISDOMAIN=${Domainname}" >> /etc/sysconfig/network
  systemctl restart rpcbind ypserv ypxfrd yppasswdd
  systemctl enable rpcbind ypserv ypxfrd yppasswdd
  # Counting "Active" lines also counted inactive/failed units; ask systemd.
  _must 6 "启动rpcbind ypserv ypxfrd yppasswdd" _units_active rpcbind ypserv ypxfrd yppasswdd
  expect nis.exp
  _setup_node_nis "${nodes[@]}"

  # --- NTP ---
  systemctl enable ntpd.service
  # Append once; the original reused a stale counter from the hosts step.
  grep -qF 'server ntp1.aliyun.com' /etc/ntp.conf || echo "server ntp1.aliyun.com" >> /etc/ntp.conf
  systemctl restart ntpd
  systemctl enable ntpd
  _setup_node_ntp "${nodes[@]}"

  # --- limits ---
  if grep -q 65535 /etc/security/limits.conf; then
    # Entries from a previous run: drop everything from line 62 onwards.
    sed -i '62,$d' /etc/security/limits.conf
  fi
  printf '%s\n' \
    "* soft nofile 65535" \
    "* hard nofile 65535" \
    "* soft memlock unlimited" \
    "* hard memlock unlimited" \
    "* soft stack unlimited" \
    "* hard stack unlimited" >> /etc/security/limits.conf
  _push_limits "${nodes[@]}"

  # --- PBS server ---
  yum -y install pbspro-server-19.1.1-0.x86_64.rpm
  mom_limits=/opt/pbs/lib/init.d/limits.pbs_mom
  stack_lines=$(grep -c -e '-s' "${mom_limits}" 2>/dev/null)
  if (( ${stack_lines:-0} > 2 )); then
    # More than two "-s" lines means a previous run appended ours already.
    sed -i '12,$d' "${mom_limits}"
  fi
  echo "ulimit -s unlimited" >> "${mom_limits}"
  systemctl enable pbs
  systemctl restart pbs
  for node in "${nodes[@]}"; do
    /opt/pbs/bin/qmgr -c "create node ${NODE_IP[$node]}"
  done
  /opt/pbs/bin/qmgr -c 'set server flatuid=true'
  /opt/pbs/bin/qmgr -c "set server job_history_enable=True"
  _setup_node_pbs "${nodes[@]}"

  # --- NFS server ---
  yum install -y nfs-utils
  mkdir -p /data
  grep -q '/home' /etc/exports || echo "/home *(rw,no_subtree_check,fsid=10,no_root_squash)" >> /etc/exports
  grep -q '/opt/ohpc/pub' /etc/exports || echo "/opt/ohpc/pub *(rw,no_subtree_check,fsid=11,no_root_squash)" >> /etc/exports
  grep -q '/data' /etc/exports || echo "/data *(rw,no_subtree_check,fsid=12,no_root_squash)" >> /etc/exports
  systemctl enable nfs
  systemctl enable rpcbind
  systemctl restart rpcbind
  systemctl restart nfs
  _setup_node_nfs "${nodes[@]}"

  return 0
}

# Option 7: add the nodes currently held in NODE_IP to an installed cluster,
# taking MASTER_IP, PASSWORD and Domainname from ./info.txt.
function _add_nodes() {
  local -a nodes
  local node

  if [[ ! -e info.txt ]]; then
    echo "重新输入集群的配置信息"
    return 0
  fi
  MASTER_IP=$(awk '$1 == "MASTER_IP" {print $2}' info.txt)
  PASSWORD=$(awk '$1 == "PASSWORD" {print $2}' info.txt)
  Domainname=$(awk '$1 == "Domainname" {print $2}' info.txt)

  mapfile -t nodes < <(_node_ips)
  # The original `[ -z ${!NODE_IP[*]} ]` errored out with more than one node.
  if (( ${#nodes[@]} == 0 )); then
    echo -e "\e[1;31m 选择选项2,输入要增加的计算节点ip及主机名\e[0m"
    return 0
  fi

  if [[ -f /root/.ssh/cluster.pub ]]; then
    _push_ssh_keys "${nodes[@]}"
  fi

  # hosts entries, hostnames, and PBS registration on the management node
  for node in "${nodes[@]}"; do
    echo "${node}"
    echo "${NODE_IP[$node]}"
    echo "${node} ${NODE_IP[$node]} ${NODE_IP[$node]}.${Domainname}" >> /etc/hosts
    pdsh -w "${node}" "hostnamectl set-hostname ${NODE_IP[$node]}"
    scp /etc/hosts "${node}":/etc/
    /opt/pbs/bin/qmgr -c "create node ${NODE_IP[$node]}"
  done

  _setup_node_nis "${nodes[@]}"
  _setup_node_ntp "${nodes[@]}"
  _push_limits "${nodes[@]}"
  _setup_node_pbs "${nodes[@]}"
  _setup_node_nfs "${nodes[@]}"
  return 0
}

while true; do
  menu
  echo -e "\033[34m 请输入选项: \033[0m"
  stty erase '^H'
  # Leave instead of spinning forever when stdin is closed.
  read -r choice || break
  case "${choice}" in
    0)
      _prepare_env
      ;;
    1)
      while true; do
        echo -e "\e[1;31m请输入管理节点的ip地址:\e[0m"
        read -r MASTER_IP || break
        if check_ip "${MASTER_IP}"; then
          action "管理节点ip设置成功" /bin/true
          break
        fi
      done
      ;;
    2)
      _read_nodes
      ;;
    3)
      echo -e "\e[1;31m 请输入域名:\e[0m"
      read -r Domainname
      action "域名设置成功" /bin/true
      ;;
    4)
      echo -e "\e[1;31m 请输入密码:\e[0m"
      read -r PASSWORD
      action "密码设置成功" /bin/true
      ;;
    5)
      # Quoted so that an empty value cannot shift the later arguments.
      printinfo "${MASTER_IP}" "${Domainname}" "${PASSWORD}"
      ;;
    6)
      # As in the original, a missing cluster key ends the menu loop.
      _install_cluster || break
      ;;
    7)
      _add_nodes
      ;;
    8)
      break
      ;;
  esac
done
509
510
511
512
513
514
515
516
nis.exp 内容:
# Drive `ypinit -m` without operator input: close the server list, then
# confirm it.
spawn /usr/lib64/yp/ypinit -m

# Server-list prompt: end the list by sending Ctrl-D (octal 004).
expect "list, type a <control D>." { send "\004" }

# Confirmation prompt: answer yes.
expect "Is this correct*" { send "y\n" }

# Let ypinit run to completion.
expect eof
530
531
532
533
send-sshkey.exp 内容:
# Copy a public key to root@<host> with ssh-copy-id, answering the host-key
# and password prompts automatically.
# Arguments: <public key file> <host> <root password>
if { $argc != 3 } {
    # All three arguments are required; the original usage text omitted the
    # password and exited with status 0.
    send_user "usage: expect send-sshkey.exp file host password\n"
    exit 1
}

set file [lindex $argv 0]
set host [lindex $argv 1]
set password [lindex $argv 2]

spawn ssh-copy-id -i $file -p 22 root@$host
expect {
    "(yes/no" {
        # First contact with the host: accept its key. The shorter pattern
        # also matches the "(yes/no/\[fingerprint\])?" wording of newer ssh.
        send "yes\n"
        expect "*password*" { send "${password}\n" }
    }
    "*password*" { send "${password}\n" }
}
expect eof
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律