定位方法:
(1)如系统存在运行日志,首先分析日志信息。
(2)利用pstack工具打印出此时系统中各线程的堆栈快照。
(3)有些阻塞栈可能是因为它需要等待前面的线程执行结束才可以执行,所以要先确定依赖关系。
(4)若问题可以复现,可以利用gdb调试系统;但若不确定问题是否可以复现,则不建议使用gdb,因为gdb会破坏问题产生的环境,在一定程度上影响系统的运行,可能使导致卡住的条件不再成立。

示例代码:

#include <unistd.h>
#include <pthread.h>
#include <string.h>

pthread_mutex_t mutex1 = PTHREAD_MUTEX_INITIALIZER;  /* held around every ++sequence1; func1 locks it first, func2 second */
pthread_mutex_t mutex2 = PTHREAD_MUTEX_INITIALIZER;  /* held around every ++sequence2; func2 locks it first, func1 second */
pthread_mutex_t mutex3 = PTHREAD_MUTEX_INITIALIZER;  /* never locked in the code shown; only destroyed in main */
pthread_mutex_t mutex4 = PTHREAD_MUTEX_INITIALIZER;  /* never locked in the code shown; only destroyed in main */

static int sequence1 = 0;  /* counter incremented by func1 and func2; returned by func1 */
static int sequence2 = 0;  /* counter incremented by func1 and func2; returned by func2 */

int func1() {  /* Increments both counters, locking mutex1 then mutex2; returns sequence1. */
    pthread_mutex_lock(&mutex1);
    ++sequence1;
    sleep(1);  /* mutex1 stays held for this second, before mutex2 is requested */
    pthread_mutex_lock(&mutex2);
    ++sequence2;
    pthread_mutex_unlock(&mutex2);
    pthread_mutex_unlock(&mutex1);
    /* NOTE: sequence1 is read below after mutex1 has already been released. */
    return sequence1;
 }

 int func2() {  /* Increments both counters, locking mutex2 then mutex1 (opposite order to func1); returns sequence2. */
    pthread_mutex_lock(&mutex2);
    ++sequence2;
    sleep(1);  /* mutex2 stays held for this second, before mutex1 is requested */
    pthread_mutex_lock(&mutex1);
    ++sequence1;
    pthread_mutex_unlock(&mutex1);
    pthread_mutex_unlock(&mutex2);
    return sequence2;  /* NOTE: read after mutex2 has already been released */
}

void* thread1(void* arg) {  /* Thread entry point: calls func1() repeatedly; arg is not used. */
    while (1) {
        int iRetValue = func1();
        if (iRetValue == 100000) {  /* the only exit: func1 returned exactly 100000 */
            pthread_exit(NULL);
        }
    }
}

void* thread2(void* arg) {  /* Thread entry point: calls func2() repeatedly; arg is not used. */
    while (1) {
        int iRetValue = func2();
        if (iRetValue == 100000) {  /* the only exit: func2 returned exactly 100000 */
            pthread_exit(NULL);
        }
    }
}

void* thread3(void* arg) {  /* Thread entry point: loops forever, sleeping 1s per pass; arg is not used. */
    while (1) {
        sleep(1);
        char szBuf[128];  /* scratch buffer: rewritten on every pass and never read */
        memset(szBuf, 0, sizeof(szBuf));
        strcpy(szBuf, "thread3");
    }
}

void* thread4(void* arg) {  /* Thread entry point: loops forever, sleeping 1s per pass; arg is not used. */
    while (1) {
        sleep(1);
        char szBuf[128];  /* scratch buffer: rewritten on every pass and never read */
        memset(szBuf, 0, sizeof(szBuf));
        strcpy(szBuf, "thread4");  /* label matches this thread (was a copy-pasted "thread3") */
    }
}

int main() {  /* Starts the four worker threads, then blocks joining them. */
    pthread_t tid[4];
    if (pthread_create(&tid[0], NULL, &thread1, NULL) != 0) {
        _exit(1);  /* thread creation failed: terminate the process immediately */
    }
    if (pthread_create(&tid[1], NULL, &thread2, NULL) != 0) {
        _exit(1);
    }
    if (pthread_create(&tid[2], NULL, &thread3, NULL) != 0) {
        _exit(1);
    }
    if (pthread_create(&tid[3], NULL, &thread4, NULL) != 0) {
        _exit(1);
    }

    sleep(5);  /* NOTE(review): presumably lets the workers run for a while before joining */
    // pthread_cancel(tid[0]);
    pthread_join(tid[0], NULL);
    pthread_join(tid[1], NULL);
    pthread_join(tid[2], NULL);
    pthread_join(tid[3], NULL);
    /* Reached only if every join returns; thread3 and thread4 contain no exit path. */
    pthread_mutex_destroy(&mutex1);
    pthread_mutex_destroy(&mutex2);
    pthread_mutex_destroy(&mutex3);
    pthread_mutex_destroy(&mutex4);

    return 0;
}

 编译执行:

g++ -g -pthread thread.cpp

会发现程序如期的卡住了。

利用pstack获取线程快照

pstack 28443 >> stack.txt

 线程快照如下

Thread 5 (Thread 0x7f159bb6d700 (LWP 28444)):
#0  0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x00007f159c759d02 in _L_lock_791 () from /lib64/libpthread.so.0
#2  0x00007f159c759c08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3  0x00000000004008d1 in func1 () at thread.cpp:17
#4  0x0000000000400969 in thread1 (arg=0x0) at thread.cpp:38
#5  0x00007f159c757dc5 in start_thread () from /lib64/libpthread.so.0
#6  0x00007f159bc641cd in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x7f159b36c700 (LWP 28445)):
#0  0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x00007f159c759d02 in _L_lock_791 () from /lib64/libpthread.so.0
#2  0x00007f159c759c08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3  0x000000000040092d in func2 () at thread.cpp:29
#4  0x0000000000400992 in thread2 (arg=0x0) at thread.cpp:47
#5  0x00007f159c757dc5 in start_thread () from /lib64/libpthread.so.0
#6  0x00007f159bc641cd in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x7f159ab6b700 (LWP 28446)):
#0  0x00007f159bc2b41d in nanosleep () from /lib64/libc.so.6
#1  0x00007f159bc2b2b4 in sleep () from /lib64/libc.so.6
#2  0x00000000004009c6 in thread3 (arg=0x0) at thread.cpp:56
#3  0x00007f159c757dc5 in start_thread () from /lib64/libpthread.so.0
#4  0x00007f159bc641cd in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7f159a36a700 (LWP 28447)):
#0  0x00007f159bc2b41d in nanosleep () from /lib64/libc.so.6
#1  0x00007f159bc2b2b4 in sleep () from /lib64/libc.so.6
#2  0x0000000000400a0b in thread4 (arg=0x0) at thread.cpp:65
#3  0x00007f159c757dc5 in start_thread () from /lib64/libpthread.so.0
#4  0x00007f159bc641cd in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f159cb72740 (LWP 28443)):
#0  0x00007f159c758ef7 in pthread_join () from /lib64/libpthread.so.0
#1  0x0000000000400b1b in main () at thread.cpp:89

由快照可以看出线程4和5一直处于等锁状态(pthread_mutex_lock),可以推测线程4和5发生了死锁。

接下来还是按之前说的,如果系统存在日志,首先看这两个线程的日志信息和线程堆栈代码推测卡住原因,如果没有日志(大型系统一般都会存在系统日志),那只能gdb调试。

gdb -p 28443

 

(gdb) info thread
  Id   Target Id         Frame 
  5    Thread 0x7f159bb6d700 (LWP 28444) "a.out" 0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
  4    Thread 0x7f159b36c700 (LWP 28445) "a.out" 0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
  3    Thread 0x7f159ab6b700 (LWP 28446) "a.out" 0x00007f159bc2b41d in nanosleep () from /lib64/libc.so.6
  2    Thread 0x7f159a36a700 (LWP 28447) "a.out" 0x00007f159bc2b41d in nanosleep () from /lib64/libc.so.6
* 1    Thread 0x7f159cb72740 (LWP 28443) "a.out" 0x00007f159c758ef7 in pthread_join () from /lib64/libpthread.so.0

 可以看到与pstack输出一致的5个线程;主线程阻塞在pthread_join上无可厚非,它在等待其他线程结束。

切换到5号线程查看情况

(gdb) t 5
[Switching to thread 5 (Thread 0x7f159bb6d700 (LWP 28444))]
#0  0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
(gdb) bt
#0  0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x00007f159c759d02 in _L_lock_791 () from /lib64/libpthread.so.0
#2  0x00007f159c759c08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3  0x00000000004008d1 in func1 () at thread.cpp:17
#4  0x0000000000400969 in thread1 (arg=0x0) at thread.cpp:38
#5  0x00007f159c757dc5 in start_thread () from /lib64/libpthread.so.0
#6  0x00007f159bc641cd in clone () from /lib64/libc.so.6

 

(gdb) f 3
#3  0x00000000004008d1 in func1 () at thread.cpp:17
17	    pthread_mutex_lock(&mutex2);
(gdb) p mutex2
$1 = {__data = {__lock = 2, __count = 0, __owner = 28445, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, 
  __size = "\002\000\000\000\000\000\000\000\035o\000\000\001", '\000' <repeats 26 times>, __align = 2}
(gdb) p mutex1
$2 = {__data = {__lock = 2, __count = 0, __owner = 28444, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, 
  __size = "\002\000\000\000\000\000\000\000\034o\000\000\001", '\000' <repeats 26 times>, __align = 2}
(gdb) p mutex3
$3 = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, 
  __size = '\000' <repeats 39 times>, __align = 0}
(gdb) info thread
  Id   Target Id         Frame 
* 5    Thread 0x7f159bb6d700 (LWP 28444) "a.out" 0x00000000004008d1 in func1 () at thread.cpp:17
  4    Thread 0x7f159b36c700 (LWP 28445) "a.out" 0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
  3    Thread 0x7f159ab6b700 (LWP 28446) "a.out" 0x00007f159bc2b41d in nanosleep () from /lib64/libc.so.6
  2    Thread 0x7f159a36a700 (LWP 28447) "a.out" 0x00007f159bc2b41d in nanosleep () from /lib64/libc.so.6
  1    Thread 0x7f159cb72740 (LWP 28443) "a.out" 0x00007f159c758ef7 in pthread_join () from /lib64/libpthread.so.0

可以看出线程28444拥有锁1(mutex1的__owner = 28444),而在等待锁2,

而锁2被线程28445(__owner = 28445)拥有。

之后转到28445线程

(gdb) t 4
[Switching to thread 4 (Thread 0x7f159b36c700 (LWP 28445))]
#0  0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
(gdb) bt
#0  0x00007f159c75df4d in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x00007f159c759d02 in _L_lock_791 () from /lib64/libpthread.so.0
#2  0x00007f159c759c08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3  0x000000000040092d in func2 () at thread.cpp:29
#4  0x0000000000400992 in thread2 (arg=0x0) at thread.cpp:47
#5  0x00007f159c757dc5 in start_thread () from /lib64/libpthread.so.0
#6  0x00007f159bc641cd in clone () from /lib64/libc.so.6
(gdb) f 3
#3  0x000000000040092d in func2 () at thread.cpp:29
29	    pthread_mutex_lock(&mutex1);
(gdb) p mutex1
$4 = {__data = {__lock = 2, __count = 0, __owner = 28444, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, 
  __size = "\002\000\000\000\000\000\000\000\034o\000\000\001", '\000' <repeats 26 times>, __align = 2}

 线程28445拥有锁2,正在等待锁1(锁1被线程28444拥有),这样这两个线程就形成了死锁。

原因是两个线程都想同时拥有锁1和锁2,但加锁顺序相反:func1先锁1后锁2,func2先锁2后锁1。让所有线程按相同的顺序加锁即可避免这种死锁。

参考链接:

https://www.ibm.com/developerworks/cn/linux/l-cn-deadlock/

posted on 2017-08-15 11:27  LyndonYoung  阅读(4993)  评论(0)  编辑  收藏  举报