定位方法:
(1)如系统存在运行日志,首先分析日志信息。
(2)利用pstack工具打印出此时系统中所有线程的堆栈快照。
(3)有些阻塞栈可能是因为它需要等待前面的线程执行结束才可以执行,所以要先确定依赖关系。
(4)若问题可以复现,可以利用gdb调试系统;但若不确定问题是否可以复现,则建议不要贸然使用gdb,
因为gdb会在一定程度上影响系统运行,可能破坏问题产生的环境,使导致卡住的条件由成立变为不成立。
示例代码:
#include <unistd.h>
#include <pthread.h>
#include <string.h>
/* mutex1 and mutex2 are acquired by func1 (mutex1 first, then mutex2) and by
 * func2 (mutex2 first, then mutex1). */
pthread_mutex_t mutex1 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutex2 = PTHREAD_MUTEX_INITIALIZER;
/* mutex3 and mutex4 are never locked in this listing; main only destroys them. */
pthread_mutex_t mutex3 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutex4 = PTHREAD_MUTEX_INITIALIZER;
/* Counter incremented by func1 and func2 while mutex1 is held. */
static int sequence1 = 0;
/* Counter incremented by func1 and func2 while mutex2 is held. */
static int sequence2 = 0;
/*
 * Increments sequence1 and sequence2, taking mutex1 first and mutex2 second.
 *
 * NOTE: this lock order is the reverse of func2's and is kept on purpose:
 * the example exists to reproduce a deadlock. The sleep() while holding
 * mutex1 gives the thread running func2 time to take mutex2, after which
 * each thread waits for the mutex the other one holds.
 *
 * Returns the value of sequence1 as observed while mutex1 was still held.
 */
int func1() {
    pthread_mutex_lock(&mutex1);
    ++sequence1;
    sleep(1);
    pthread_mutex_lock(&mutex2);
    ++sequence2;
    /* Snapshot under the lock: reading sequence1 after unlocking would race
     * with the increment func2 performs while holding mutex1. */
    int iSnapshot = sequence1;
    pthread_mutex_unlock(&mutex2);
    pthread_mutex_unlock(&mutex1);
    return iSnapshot;
}
/*
 * Increments sequence2 and sequence1, taking mutex2 first and mutex1 second.
 *
 * NOTE: this lock order is the reverse of func1's and is kept on purpose:
 * the example exists to reproduce a deadlock. The sleep() while holding
 * mutex2 gives the thread running func1 time to take mutex1, after which
 * each thread waits for the mutex the other one holds.
 *
 * Returns the value of sequence2 as observed while mutex2 was still held.
 */
int func2() {
    pthread_mutex_lock(&mutex2);
    ++sequence2;
    sleep(1);
    pthread_mutex_lock(&mutex1);
    ++sequence1;
    pthread_mutex_unlock(&mutex1);
    /* Snapshot under the lock: reading sequence2 after unlocking would race
     * with the increment func1 performs while holding mutex2. */
    int iSnapshot = sequence2;
    pthread_mutex_unlock(&mutex2);
    return iSnapshot;
}
/*
 * Thread entry point: calls func1 in an endless loop and terminates the
 * calling thread once func1 returns exactly 100000. The argument is unused.
 */
void* thread1(void* arg) {
    for (;;) {
        if (func1() != 100000) {
            continue;
        }
        pthread_exit(NULL);
    }
}
/*
 * Thread entry point: calls func2 in an endless loop and terminates the
 * calling thread once func2 returns exactly 100000. The argument is unused.
 */
void* thread2(void* arg) {
    for (;;) {
        if (func2() != 100000) {
            continue;
        }
        pthread_exit(NULL);
    }
}
/*
 * Thread entry point: once per second, zero-fills a local 128-byte buffer
 * and copies the string "thread3" into it. The loop has no exit, so the
 * function never returns. The argument is unused.
 */
void* thread3(void* arg) {
    for (;;) {
        char szLabel[128];
        sleep(1);
        memset(szLabel, 0, sizeof szLabel);
        strcpy(szLabel, "thread3");
    }
}
/*
 * Thread entry point: once per second, zero-fills a local 128-byte buffer
 * and copies this thread's label into it. The loop has no exit, so the
 * function never returns. The argument is unused.
 */
void* thread4(void* arg) {
    while (1) {
        sleep(1);
        char szBuf[128];
        memset(szBuf, 0, sizeof(szBuf));
        /* Label corrected from "thread3" (copy-paste slip from thread3). */
        strcpy(szBuf, "thread4");
    }
}
/*
 * Starts thread1..thread4 in that order, exiting the process with status 1
 * via _exit() if any pthread_create call fails. It then sleeps five seconds,
 * joins the four threads in creation order, destroys mutex1..mutex4 and
 * returns 0.
 *
 * thread3 and thread4 loop forever, and thread1/thread2 only exit when their
 * worker function returns 100000, so main blocks in the joins.
 */
int main() {
    void* (*entries[4])(void*) = { thread1, thread2, thread3, thread4 };
    pthread_mutex_t* locks[4] = { &mutex1, &mutex2, &mutex3, &mutex4 };
    pthread_t tid[4];
    int i;

    for (i = 0; i < 4; ++i) {
        if (pthread_create(&tid[i], NULL, entries[i], NULL) != 0) {
            _exit(1);
        }
    }

    sleep(5);
    // pthread_cancel(tid[0]);

    for (i = 0; i < 4; ++i) {
        pthread_join(tid[i], NULL);
    }

    for (i = 0; i < 4; ++i) {
        pthread_mutex_destroy(locks[i]);
    }
    return 0;
}
编译执行:
g++ -g thread.cpp -lpthread
会发现程序如期地卡住了。
利用pstack获取线程快照
pstack 28443 >> stack.txt
线程快照如下
Thread 5 (Thread 0x7f159bb6d700 (LWP 28444)):
#0 0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f159c759d02 in _L_lock_791 () from /lib64/libpthread.so.0
#2 0x00007f159c759c08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x00000000004008d1 in func1 () at thread.cpp:17
#4 0x0000000000400969 in thread1 (arg=0x0) at thread.cpp:38
#5 0x00007f159c757dc5 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f159bc641cd in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x7f159b36c700 (LWP 28445)):
#0 0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f159c759d02 in _L_lock_791 () from /lib64/libpthread.so.0
#2 0x00007f159c759c08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x000000000040092d in func2 () at thread.cpp:29
#4 0x0000000000400992 in thread2 (arg=0x0) at thread.cpp:47
#5 0x00007f159c757dc5 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f159bc641cd in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x7f159ab6b700 (LWP 28446)):
#0 0x00007f159bc2b41d in nanosleep () from /lib64/libc.so.6
#1 0x00007f159bc2b2b4 in sleep () from /lib64/libc.so.6
#2 0x00000000004009c6 in thread3 (arg=0x0) at thread.cpp:56
#3 0x00007f159c757dc5 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f159bc641cd in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7f159a36a700 (LWP 28447)):
#0 0x00007f159bc2b41d in nanosleep () from /lib64/libc.so.6
#1 0x00007f159bc2b2b4 in sleep () from /lib64/libc.so.6
#2 0x0000000000400a0b in thread4 (arg=0x0) at thread.cpp:65
#3 0x00007f159c757dc5 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f159bc641cd in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f159cb72740 (LWP 28443)):
#0 0x00007f159c758ef7 in pthread_join () from /lib64/libpthread.so.0
#1 0x0000000000400b1b in main () at thread.cpp:89
由快照可以看出线程4和5一直处于等锁状态(pthread_mutex_lock),可以推测线程4和5发生了死锁。
接下来还是按之前说的:如果系统存在日志,首先结合这两个线程的日志信息和线程堆栈对应的代码推测卡住原因;如果没有日志(大型系统一般都会存在系统日志),那就只能用gdb调试。
gdb -p 28443
(gdb) info thread
Id Target Id Frame
5 Thread 0x7f159bb6d700 (LWP 28444) "a.out" 0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
4 Thread 0x7f159b36c700 (LWP 28445) "a.out" 0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
3 Thread 0x7f159ab6b700 (LWP 28446) "a.out" 0x00007f159bc2b41d in nanosleep () from /lib64/libc.so.6
2 Thread 0x7f159a36a700 (LWP 28447) "a.out" 0x00007f159bc2b41d in nanosleep () from /lib64/libc.so.6
* 1 Thread 0x7f159cb72740 (LWP 28443) "a.out" 0x00007f159c758ef7 in pthread_join () from /lib64/libpthread.so.0
可以看到与pstack输出一致的5个线程。主线程停在pthread_join属于正常现象,它在等待其他线程结束。
切换到5号线程查看情况
(gdb) t 5
[Switching to thread 5 (Thread 0x7f159bb6d700 (LWP 28444))]
#0 0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
(gdb) bt
#0 0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f159c759d02 in _L_lock_791 () from /lib64/libpthread.so.0
#2 0x00007f159c759c08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x00000000004008d1 in func1 () at thread.cpp:17
#4 0x0000000000400969 in thread1 (arg=0x0) at thread.cpp:38
#5 0x00007f159c757dc5 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f159bc641cd in clone () from /lib64/libc.so.6
(gdb) f 3
#3 0x00000000004008d1 in func1 () at thread.cpp:17
17 pthread_mutex_lock(&mutex2);
(gdb) p mutex2
$1 = {__data = {__lock = 2, __count = 0, __owner = 28445, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000\035o\000\000\001", '\000' <repeats 26 times>, __align = 2}
(gdb) p mutex1
$2 = {__data = {__lock = 2, __count = 0, __owner = 28444, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000\034o\000\000\001", '\000' <repeats 26 times>, __align = 2}
(gdb) p mutex3
$3 = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = '\000' <repeats 39 times>, __align = 0}
(gdb) info thread
Id Target Id Frame
* 5 Thread 0x7f159bb6d700 (LWP 28444) "a.out" 0x00000000004008d1 in func1 () at thread.cpp:17
4 Thread 0x7f159b36c700 (LWP 28445) "a.out" 0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
3 Thread 0x7f159ab6b700 (LWP 28446) "a.out" 0x00007f159bc2b41d in nanosleep () from /lib64/libc.so.6
2 Thread 0x7f159a36a700 (LWP 28447) "a.out" 0x00007f159bc2b41d in nanosleep () from /lib64/libc.so.6
1 Thread 0x7f159cb72740 (LWP 28443) "a.out" 0x00007f159c758ef7 in pthread_join () from /lib64/libpthread.so.0
可以看出线程28444拥有锁1(__owner = 28444),正在等待锁2,
而锁2被线程28445(__owner = 28445)拥有。
之后转到28445线程
(gdb) t 4
[Switching to thread 4 (Thread 0x7f159b36c700 (LWP 28445))]
#0 0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
(gdb) bt
#0 0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f159c759d02 in _L_lock_791 () from /lib64/libpthread.so.0
#2 0x00007f159c759c08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x000000000040092d in func2 () at thread.cpp:29
#4 0x0000000000400992 in thread2 (arg=0x0) at thread.cpp:47
#5 0x00007f159c757dc5 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f159bc641cd in clone () from /lib64/libc.so.6
(gdb) f 3
#3 0x000000000040092d in func2 () at thread.cpp:29
29 pthread_mutex_lock(&mutex1);
(gdb) p mutex1
$4 = {__data = {__lock = 2, __count = 0, __owner = 28444, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000\034o\000\000\001", '\000' <repeats 26 times>, __align = 2}
线程28445拥有锁2,正在等待锁1(被线程28444持有),这样这两个线程就形成了死锁。
根本原因是两个线程都需要同时持有锁1和锁2,但加锁顺序相反;让所有线程按相同的顺序加锁即可避免这种死锁。
参考链接:
欢迎关注公众号Magicio