东师理想云平台故障处理一例

pstree -p `ps -e | grep python | awk '{print $1}'`

kill -9 `ps -e|grep python  | awk '{print $1}'`

vi /etc/crontab

# 解决思路:通过 MALLOC_ARENA_MAX 限制 glibc malloc 的 arena 数量,以降低 Java 进程的虚拟内存(VIRT)占用
export MALLOC_ARENA_MAX=1

# Java程序在Linux上运行虚拟内存耗用很大
https://blog.csdn.net/u010686469/article/details/77319599

#Java 进程占用 VIRT 虚拟内存超高的问题研究
https://www.cnblogs.com/seasonsluo/p/java_virt.html

pmap -x 32520 | grep anon

#  重启两个占用虚拟内存大的进程
ps -ef | grep tomcat
kill -9 PID
service rabbitmq-server restart

119486 java8
33419  java7
119486 

 

# 查看内存溢出日志
cat /var/log/messages | grep 'Out of memory' -C 5

#Out of memory: Kill process 解决
https://blog.51cto.com/qiangsh/2066747

[root@localhost ~]# cat /var/log/messages | grep 'Out of memory' -C 5
Aug 20 18:15:59 localhost kernel: [71037]    89 71037    20282      117   5       0             0 cleanup
Aug 20 18:15:59 localhost kernel: [71039]    89 71039    20256      162   6       0             0 bounce
Aug 20 18:15:59 localhost kernel: [72029]     0 72029    35007      155   0       0             0 crond
Aug 20 18:15:59 localhost kernel: [72329]     0 72329    19143       82   1       0             0 sendmail
Aug 20 18:15:59 localhost kernel: [72338]     0 72338     2275       16  17       0             0 sh
Aug 20 18:15:59 localhost kernel: Out of memory: Kill process 56436 (find) score 267 or sacrifice child
Aug 20 18:15:59 localhost kernel: Killed process 56436, UID 0, (find) total-vm:19784052kB, anon-rss:10437824kB, file-rss:8kB
Aug 20 18:20:15 localhost NTP: 20 Aug 18:20:15 ntpdate[72927]: adjust time server 185.198.26.172 offset 0.017783 sec
Aug 20 18:30:15 localhost NTP: 20 Aug 18:30:14 ntpdate[74577]: adjust time server 54.183.204.201 offset -0.012002 sec
Aug 20 18:32:29 localhost kernel: sh invoked oom-killer: gfp_mask=0x200da, order=0, oom_adj=0, oom_score_adj=0
Aug 20 18:32:29 localhost kernel: sh cpuset=/ mems_allowed=0-1
--
Aug 20 18:32:29 localhost kernel: [74838]     0 74838    26489       37  18       0             0 awk
Aug 20 18:32:29 localhost kernel: [74839]     0 74839    26308       28  21       0             0 sed
Aug 20 18:32:29 localhost kernel: [74841]     0 74841     2275       14   0       0             0 sh
Aug 20 18:32:29 localhost kernel: [74842]     0 74842     1541       29  17       0             0 sshpass
Aug 20 18:32:29 localhost kernel: [74843]   497 74843     2275       13  19       0             0 sh
Aug 20 18:32:29 localhost kernel: Out of memory: Kill process 51244 (find) score 408 or sacrifice child
Aug 20 18:32:29 localhost kernel: Killed process 51244, UID 0, (find) total-vm:29107212kB, anon-rss:15434620kB, file-rss:4kB
Aug 20 18:40:12 localhost NTP: 20 Aug 18:40:12 ntpdate[76243]: adjust time server 103.105.51.156 offset 0.012008 sec
Aug 20 18:50:12 localhost NTP: 20 Aug 18:50:11 ntpdate[77522]: adjust time server 103.105.51.156 offset -0.002601 sec
Aug 20 18:57:15 localhost kernel: java invoked oom-killer: gfp_mask=0x201da, order=0, oom_adj=0, oom_score_adj=0
Aug 20 18:57:15 localhost kernel: java cpuset=/ mems_allowed=0-1
--
Aug 20 18:57:15 localhost kernel: [78302]    89 78302    20282      227  16       0             0 cleanup
Aug 20 18:57:15 localhost kernel: [78342]     0 78342    34942       90   0       0             0 crond
Aug 20 18:57:15 localhost kernel: [78348]     0 78348    26519       45   1       0             0 freemem.sh
Aug 20 18:57:15 localhost kernel: [78376]     0 78376       75        9   1       0             0 sync
Aug 20 18:57:15 localhost kernel: [78377]     0 78377      297       11   0       0             0 sh
Aug 20 18:57:15 localhost kernel: Out of memory: Kill process 30747 (find) score 848 or sacrifice child
Aug 20 18:57:15 localhost kernel: Killed process 30747, UID 0, (find) total-vm:58109856kB, anon-rss:30773196kB, file-rss:4kB
Aug 20 19:00:06 localhost NTP: 20 Aug 19:00:06 ntpdate[115182]: adjust time server 74.208.26.225 offset 0.030441 sec
Aug 20 19:10:06 localhost NTP: 20 Aug 19:10:06 ntpdate[59888]: adjust time server 206.55.191.142 offset -0.023210 sec
Aug 20 19:20:06 localhost NTP: 20 Aug 19:20:06 ntpdate[6326]: adjust time server 66.79.136.240 offset 0.017423 sec
Aug 20 19:30:06 localhost NTP: 20 Aug 19:30:06 ntpdate[148943]: adjust time server 45.79.1.70 offset -0.016261 sec
--
Oct 16 11:59:42 localhost kernel: [107168]     0 107168    14347        2   0       0             0 sftp-server
Oct 16 11:59:42 localhost kernel: [108234]    89 108234    20246      224   1       0             0 pickup
Oct 16 11:59:42 localhost kernel: [108512]    89 108512    20282      228   0       0             0 cleanup
Oct 16 11:59:42 localhost kernel: [108514]     0 108514    20331      291   2       0             0 local
Oct 16 11:59:42 localhost kernel: [108516]    89 108516    20256      224   0       0             0 bounce
Oct 16 11:59:42 localhost kernel: Out of memory: Kill process 100492 (find) score 912 or sacrifice child
Oct 16 11:59:42 localhost kernel: Killed process 100492, UID 0, (find) total-vm:62385608kB, anon-rss:31424580kB, file-rss:4kB
Oct 16 12:00:07 localhost NTP: 16 Oct 12:00:07 ntpdate[108604]: adjust time server 72.30.35.89 offset -0.003301 sec
Oct 16 12:10:06 localhost NTP: 16 Oct 12:10:06 ntpdate[109099]: adjust time server 198.255.68.106 offset -0.005266 sec
Oct 16 12:20:06 localhost NTP: 16 Oct 12:20:06 ntpdate[110347]: adjust time server 44.190.6.254 offset 0.019827 sec

 



# 扩大JVM内存
vi  /usr/local/tomcat7/bin/catalina.sh

# 原配置
#JAVA_OPTS="-Xms256m -Xmx512m -Xss1024K -XX:PermSize=128m -XX:MaxPermSize=256m"

# 32G内存参考配置
JAVA_OPTS="-server  -Xms10g -Xmx10g -XX:PermSize=1g -XX:MaxPermSize=2g -Xshare:off -Xmn1024m"

 

cat /usr/local/tomcat7/logs/catalina.out | grep 'com.alibaba.druid.pool.GetConnectionTimeoutException' -C 5

Caused by: com.alibaba.druid.pool.GetConnectionTimeoutException: wait millis 30000, active 50, runningSqlCount 3 : INSERT INTO T_TK_QUESTION_BASE (QUESTION_ID_CHAR,QUESTION_TITLE,QUESTION_TIPS,QUESTION_TYPE_ID,QUESTION_TYPE_NAME,QUESTION_DIFFICULT_ID,QUESTION_DIFFICULT_NAME,QUESTION_DIFFICULT_STAR,QUESTION_ANSWER,CREATE_TIME,CREATE_PERSON,B_USE,SOURCE_ID,TS,USE_COUNT,USE_RANGE,KG_ZG,FILE_ID,HEIGHT,PRODUCT_ID,CHECK_STATUS,CHECK_MESSAGE,PARENT_ID_CHAR,JSON_QUESTION,JSON_ANSWER,APP_TYPE,OPTIONS_COUNT,SUBJECT_ID,HAVE_CHILD,CONTENT_MD5,CONTENT_MD5_NEW_UNIQUE) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
        at com.alibaba.druid.pool.DruidDataSource.getConnectionInternal(DruidDataSource.java:1071)
        at com.alibaba.druid.pool.DruidDataSource.getConnectionDirect(DruidDataSource.java:898)

从上面的日志可以看到 active 50,已达到连接池 maxActive 的上限,说明数据库连接池中的连接没有被释放。解决思路:

 //数据源配置
        druid.setInitialSize(20);
        druid.setMinIdle(10);
        druid.setMaxActive(50);
        druid.setMaxWait(600000);
        druid.setTimeBetweenEvictionRunsMillis(60000);
        druid.setMinEvictableIdleTimeMillis(300000);
        druid.setValidationQuery("SELECT 'x'");
        druid.setTestWhileIdle(true);
        druid.setTestOnBorrow(false);
        druid.setTestOnReturn(false);
        druid.setMaxPoolPreparedStatementPerConnectionSize(20);

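        //增加回收机制
        //注意:setRemoveAbandonedTimeoutMillis 的单位是毫秒;若本意是 300 秒,应传入 300 * 1000,或改用 setRemoveAbandonedTimeout(300)(单位:秒)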
        druid.setRemoveAbandoned(true);
        druid.setRemoveAbandonedTimeoutMillis(300);
        druid.setLogAbandoned(false);

 

posted @ 2019-10-15 16:59  糖豆爸爸  阅读(439)  评论(0)  编辑  收藏  举报
Live2D