linux死锁检测
参考
https://www.ibm.com/developerworks/cn/linux/l-cn-deadlock/index.html
https://blog.csdn.net/peng314899581/article/details/79064616
https://www.cnblogs.com/youxin/p/8837771.html
https://www.jianshu.com/p/d451793cab4c?utm_source=oschina-app
http://blog.sina.com.cn/s/blog_a2a6dd380102xtec.html
https://blog.csdn.net/wanxuexiang/article/details/88382808
https://ethanhao.github.io/c++11,/gdb,/multithread,/2017/03/03/Deadlock-detecting-using-GDB-Copy.html
前言
Windows下死锁的排查方法已经很熟悉了。首先,《Windows via C/C++》一书中提供了一个工程LockCop,可以附加到一个进程上,判断是否有死锁。死锁的现象有显著的特点:程序表面上看上去一切正常,但是某些信息或是消息发送过去后,无法得到处理。一般我们先用LockCop判断是否有死锁,发现有死锁之后,用Visual Studio远程附加到进程调试,看看对应线程卡在哪个位置。一般都会卡在加锁的位置,然后看看两个死锁线程在代码中的前几步,是不是相互锁定了对方现在正在请求的锁。这样就可以很快地查到死锁的问题。
Linux下调查死锁的方法与Windows类似,也是先确认是否死锁,然后找到哪两个线程死锁,然后调试具体看线程卡在哪一步。
代码
这段代码创建了4个线程:两个会互相死锁,另外两个不断操作数组。
#include <unistd.h>
#include <pthread.h>
#include <string.h>
// Mutexes taken by the deadlock demo: func1() locks mutex1 then mutex2,
// while func2() locks mutex2 then mutex1.
pthread_mutex_t mutex1 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutex2 = PTHREAD_MUTEX_INITIALIZER;
// mutex3 and mutex4 are never locked in this listing; main() only destroys them.
pthread_mutex_t mutex3 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutex4 = PTHREAD_MUTEX_INITIALIZER;
// Counters bumped by func1() and func2(): sequence1 is only incremented while
// mutex1 is held, sequence2 only while mutex2 is held.
static int sequence1 = 0;
static int sequence2 = 0;
// Locks mutex1, pauses for one second, then locks mutex2 -- the opposite
// order from func2(). The inconsistent lock order is deliberate: it is what
// produces the deadlock this demo exists to show, so it is kept as is.
// Increments sequence1 and sequence2 and returns the value sequence1 had
// right after this call incremented it.
int func1()
{
    pthread_mutex_lock(&mutex1);
    // Snapshot the counter while mutex1 is still held. Reading the global
    // after the unlock (as the code used to) races with func2(), which
    // increments sequence1 under mutex1.
    int iResult = ++sequence1;
    // Pause between the two lock calls; presumably there to give the other
    // thread time to take mutex2 so the deadlock shows up reliably.
    sleep(1);
    pthread_mutex_lock(&mutex2);
    ++sequence2;
    pthread_mutex_unlock(&mutex2);
    pthread_mutex_unlock(&mutex1);
    return iResult;
}
// Locks mutex2, pauses for one second, then locks mutex1 -- the opposite
// order from func1(). The inconsistent lock order is deliberate: it is what
// produces the deadlock this demo exists to show, so it is kept as is.
// Increments sequence2 and sequence1 and returns the value sequence2 had
// right after this call incremented it.
int func2()
{
    pthread_mutex_lock(&mutex2);
    // Snapshot the counter while mutex2 is still held. Reading the global
    // after the unlock (as the code used to) races with func1(), which
    // increments sequence2 under mutex2.
    int iResult = ++sequence2;
    // Pause between the two lock calls; presumably there to give the other
    // thread time to take mutex1 so the deadlock shows up reliably.
    sleep(1);
    pthread_mutex_lock(&mutex1);
    ++sequence1;
    pthread_mutex_unlock(&mutex1);
    pthread_mutex_unlock(&mutex2);
    return iResult;
}
// Thread entry point: calls func1() over and over and terminates the calling
// thread once func1() returns 100000. arg is not used.
void* thread1(void* arg)
{
    for (;;)
    {
        if (func1() == 100000)
        {
            pthread_exit(NULL);
        }
    }
}
// Thread entry point: calls func2() over and over and terminates the calling
// thread once func2() returns 100000. arg is not used.
void* thread2(void* arg)
{
    for (;;)
    {
        if (func2() == 100000)
        {
            pthread_exit(NULL);
        }
    }
}
// Thread entry point that never returns: once per second it clears a local
// 128-byte buffer and copies a fixed label into it. Nothing reads the buffer
// afterwards. arg is not used.
void* thread3(void* arg)
{
    for (;;)
    {
        char szLabel[128];

        sleep(1);
        memset(szLabel, 0, sizeof szLabel);
        strcpy(szLabel, "thread3");
    }
}
// Thread entry point that never returns: once per second it clears a local
// 128-byte buffer and copies a fixed label into it. Nothing reads the buffer
// afterwards. arg is not used.
void* thread4(void* arg)
{
    while (1)
    {
        sleep(1);
        char szBuf[128];
        memset(szBuf, 0, sizeof(szBuf));
        // Label this thread as "thread4"; the earlier "thread3" text was a
        // copy-paste leftover that mislabels the buffer when it is inspected
        // in a debugger.
        strcpy(szBuf, "thread4");
    }
}
// Starts the four worker threads (thread1..thread4, in that order), sleeps
// for 5 seconds, then joins them in creation order and finally destroys the
// four mutexes. If any pthread_create() call fails, the process ends at once
// via _exit(1). thread3 and thread4 loop forever, so the joins on them cannot
// complete even without the deadlock.
int main()
{
    void* (*const apfnEntry[4])(void*) = { &thread1, &thread2, &thread3, &thread4 };
    pthread_mutex_t* const apMutex[4] = { &mutex1, &mutex2, &mutex3, &mutex4 };
    pthread_t tid[4];

    for (int i = 0; i < 4; ++i)
    {
        if (pthread_create(&tid[i], NULL, apfnEntry[i], NULL) != 0)
        {
            _exit(1);
        }
    }

    sleep(5);
    //pthread_cancel(tid[0]);
    for (int i = 0; i < 4; ++i)
    {
        pthread_join(tid[i], NULL);
    }

    for (int i = 0; i < 4; ++i)
    {
        pthread_mutex_destroy(apMutex[i]);
    }
    return 0;
}
编译运行
第一种方式 strace
找到我们的进程
$ ps aux -T |grep a.out
root 6794 6794 0.0 0.0 38416 1664 pts/0 Sl+ 14:23 0:00 ./a.out
root 6794 6795 0.0 0.0 38416 1664 pts/0 Sl+ 14:23 0:00 ./a.out
root 6794 6796 0.0 0.0 38416 1664 pts/0 Sl+ 14:23 0:00 ./a.out
root 6794 6797 0.0 0.0 38416 1664 pts/0 Sl+ 14:23 0:00 ./a.out
root 6794 6798 0.0 0.0 38416 1664 pts/0 Sl+ 14:23 0:00 ./a.out
root 6800 6800 0.0 0.0 3216 892 pts/1 R+ 14:23 0:00 grep --color=auto --exclude-dir=.bzr --exclude-dir=CVS --exclude-dir=.git --exclude-dir=.hg --exclude-dir=.svn --exclude-dir=.idea --exclude-dir=.tox a.out
我们看到6794这个进程,也就是我们跑的程序,有5个线程:程序启动时有一个主线程,之后又创建了4个子线程。
用strace查看每个线程的状态
# root @ debian in ~ [14:27:42] C:130
$ strace -p 6794
strace: Process 6794 attached
futex(0x7f1d36d1a9d0, FUTEX_WAIT, 6795, NULL^Cstrace: Process 6794 detached
<detached ...>
# root @ debian in ~ [14:27:46] C:130
$ strace -p 6795
strace: Process 6795 attached
futex(0x5608207030e0, FUTEX_WAIT_PRIVATE, 2, NULL^Cstrace: Process 6795 detached
<detached ...>
# root @ debian in ~ [14:27:51] C:130
$ strace -p 6796
strace: Process 6796 attached
futex(0x5608207030a0, FUTEX_WAIT_PRIVATE, 2, NULL^Cstrace: Process 6796 detached
<detached ...>
# root @ debian in ~ [14:27:55] C:130
$ strace -p 6797
strace: Process 6797 attached
restart_syscall(<... resuming interrupted nanosleep ...>) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35d17e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35d17e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35d17e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35d17e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, ^Cstrace: Process 6797 detached
<detached ...>
# root @ debian in ~ [14:28:02] C:130
$ strace -p 6798
strace: Process 6798 attached
restart_syscall(<... resuming interrupted nanosleep ...>) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35516e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35516e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35516e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, 0x7f1d35516e20) = 0
nanosleep({tv_sec=1, tv_nsec=0}, ^Cstrace: Process 6798 detached
<detached ...>
我们知道主线程肯定是阻塞的或是循环的,不然程序就执行完退出了,所以6794是一个等待状态,多次调用strace可以看到6795和6796也一直是等待状态,按照正常的程序执行,很难在抓取信息的时候看到是加锁等待状态,更不用说多次执行都是同一个等待状态,这基本上就表示是死锁了。后面6797和6798符合代码的执行流程,就是sleep,然后做一些操作,strace可以记录到每次调用系统nanosleep的日志。有关futex的更多信息请参考futex
gdb调试
$ gdb
GNU gdb (Debian 8.2.1-2+b3) 8.2.1
Copyright (C) 2018 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word".
(gdb) attach 7291
Attaching to process 7291
[New LWP 7292]
[New LWP 7293]
[New LWP 7294]
[New LWP 7295]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
0x00007fea5c5c6495 in __GI___pthread_timedjoin_ex (threadid=140644543452928, thread_return=0x0, abstime=0x0, block=<optimized out>) at pthread_join_common.c:89
89 pthread_join_common.c: No such file or directory.
(gdb) info threads
Id Target Id Frame
* 1 Thread 0x7fea5c0d6740 (LWP 7291) "a.out" 0x00007fea5c5c6495 in __GI___pthread_timedjoin_ex (threadid=140644543452928, thread_return=0x0, abstime=0x0,
block=<optimized out>) at pthread_join_common.c:89
2 Thread 0x7fea5c0d5700 (LWP 7292) "a.out" __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:103
3 Thread 0x7fea5b8d4700 (LWP 7293) "a.out" __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:103
4 Thread 0x7fea5b0d3700 (LWP 7294) "a.out" 0x00007fea5c1a1720 in __GI___nanosleep (requested_time=requested_time@entry=0x7fea5b0d2e20,
remaining=remaining@entry=0x7fea5b0d2e20) at ../sysdeps/unix/sysv/linux/nanosleep.c:28
5 Thread 0x7fea5a8d2700 (LWP 7295) "a.out" 0x00007fea5c1a1720 in __GI___nanosleep (requested_time=requested_time@entry=0x7fea5a8d1e20,
remaining=remaining@entry=0x7fea5a8d1e20) at ../sysdeps/unix/sysv/linux/nanosleep.c:28
(gdb) thread apply all bt
Thread 5 (Thread 0x7fea5a8d2700 (LWP 7295)):
#0 0x00007fea5c1a1720 in __GI___nanosleep (requested_time=requested_time@entry=0x7fea5a8d1e20, remaining=remaining@entry=0x7fea5a8d1e20)
at ../sysdeps/unix/sysv/linux/nanosleep.c:28
#1 0x00007fea5c1a162a in __sleep (seconds=0) at ../sysdeps/posix/sleep.c:55
#2 0x0000558bb85f732c in thread4 (arg=0x0) at test.cpp:80
#3 0x00007fea5c5c4fa3 in start_thread (arg=<optimized out>) at pthread_create.c:486
#4 0x00007fea5c1d44cf in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Thread 4 (Thread 0x7fea5b0d3700 (LWP 7294)):
#0 0x00007fea5c1a1720 in __GI___nanosleep (requested_time=requested_time@entry=0x7fea5b0d2e20, remaining=remaining@entry=0x7fea5b0d2e20)
at ../sysdeps/unix/sysv/linux/nanosleep.c:28
#1 0x00007fea5c1a162a in __sleep (seconds=0) at ../sysdeps/posix/sleep.c:55
#2 0x0000558bb85f72e7 in thread3 (arg=0x0) at test.cpp:69
#3 0x00007fea5c5c4fa3 in start_thread (arg=<optimized out>) at pthread_create.c:486
#4 0x00007fea5c1d44cf in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Thread 3 (Thread 0x7fea5b8d4700 (LWP 7293)):
#0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:103
#1 0x00007fea5c5c7714 in __GI___pthread_mutex_lock (mutex=0x558bb85fa0a0 <mutex1>) at ../nptl/pthread_mutex_lock.c:80
#2 0x0000558bb85f724e in func2 () at test.cpp:31
#3 0x0000558bb85f72b5 in thread2 (arg=0x0) at test.cpp:56
#4 0x00007fea5c5c4fa3 in start_thread (arg=<optimized out>) at pthread_create.c:486
#5 0x00007fea5c1d44cf in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Thread 2 (Thread 0x7fea5c0d5700 (LWP 7292)):
#0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:103
#1 0x00007fea5c5c7714 in __GI___pthread_mutex_lock (mutex=0x558bb85fa0e0 <mutex2>) at ../nptl/pthread_mutex_lock.c:80
#2 0x0000558bb85f71ea in func1 () at test.cpp:18
#3 0x0000558bb85f728e in thread1 (arg=0x0) at test.cpp:43
#4 0x00007fea5c5c4fa3 in start_thread (arg=<optimized out>) at pthread_create.c:486
#5 0x00007fea5c1d44cf in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Thread 1 (Thread 0x7fea5c0d6740 (LWP 7291)):
#0 0x00007fea5c5c6495 in __GI___pthread_timedjoin_ex (threadid=140644543452928, thread_return=0x0, abstime=0x0, block=<optimized out>) at pthread_join_common.c:89
#1 0x0000558bb85f7444 in main () at test.cpp:110
(gdb) p mutex1
$1 = {__data = {__lock = 2, __count = 0, __owner = 7292, __nusers = 1, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000|\034\000\000\001", '\000' <repeats 26 times>, __align = 2}
(gdb) p mutex2
$2 = {__data = {__lock = 2, __count = 0, __owner = 7293, __nusers = 1, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000}\034\000\000\001", '\000' <repeats 26 times>, __align = 2}
(gdb) detach
Detaching from program: /home/arthas/code/a.out, process 7291
[Inferior 1 (process 7291) detached]
(gdb) q
上面的线程id变了,因为一开始没有加-g选项进行编译,所以在打印mutex1的时候,打印不出来具体信息,于是又重新增加了调试信息进行编译,然后执行。
attach 7291
附加到进程
info threads
查看线程概要信息
thread apply all bt
打印所有线程的调用堆栈,从这里找到Thread 2和Thread 3两个死锁线程的详细信息(它们分别运行thread1和thread2函数):执行到哪个函数哪一行,等待哪一个锁。
p mutex1
通过打印对应锁的信息(__owner字段是持有该锁的线程的LWP号),可以看到Thread 2等待的mutex2被Thread 3占有,Thread 3等待的mutex1被Thread 2占有,所以发生了死锁。出问题的地方找到了,调用detach解除附加进程,q退出gdb。
第二种方式 valgrind
valgrind是一个非常有用的工具,可以分析很多运行时错误,比如使用了未初始化的内存,使用了释放的内存,内存泄漏等,同样也包括死锁。
valgrind的方法很简单,这是一个工具包,需要指定使用哪个工具,helgrind是一个分析死锁的工具,可以指出出问题的地方,通过valgrind运行程序,等出现问题后,用Ctrl+C,结束掉,就可以看到打印的信息。
从下面的信息可以看出,valgrind指出了两处错误,给出了线程编号Thread #2和Thread #3,并且列出了每处错误的调用堆栈,我们只需要到代码中查看解决就可以了。
$ valgrind --tool=helgrind ./a.out
==7608== Helgrind, a thread error detector
==7608== Copyright (C) 2007-2017, and GNU GPL'd, by OpenWorks LLP et al.
==7608== Using Valgrind-3.14.0 and LibVEX; rerun with -h for copyright info
==7608== Command: ./a.out
==7608==
^C==7608==
==7608== Process terminating with default action of signal 2 (SIGINT)
==7608== at 0x4866495: __pthread_timedjoin_ex (pthread_join_common.c:89)
==7608== by 0x48398F5: pthread_join_WRK (hg_intercepts.c:553)
==7608== by 0x109443: main (test.cpp:110)
==7608== ---Thread-Announcement------------------------------------------
==7608==
==7608== Thread #2 was created
==7608== at 0x4C984BE: clone (clone.S:71)
==7608== by 0x4863DDE: create_thread (createthread.c:101)
==7608== by 0x486580D: pthread_create@@GLIBC_2.2.5 (pthread_create.c:826)
==7608== by 0x483C6B7: pthread_create_WRK (hg_intercepts.c:427)
==7608== by 0x109379: main (test.cpp:90)
==7608==
==7608== ----------------------------------------------------------------
==7608==
==7608== Thread #2: Exiting thread still holds 1 lock
==7608== at 0x486E29C: __lll_lock_wait (lowlevellock.S:103)
==7608== by 0x4867713: pthread_mutex_lock (pthread_mutex_lock.c:80)
==7608== by 0x4839C66: mutex_lock_WRK (hg_intercepts.c:902)
==7608== by 0x1091E9: func1() (test.cpp:18)
==7608== by 0x10928D: thread1(void*) (test.cpp:43)
==7608== by 0x483C8B6: mythread_wrapper (hg_intercepts.c:389)
==7608== by 0x4864FA2: start_thread (pthread_create.c:486)
==7608== by 0x4C984CE: clone (clone.S:95)
==7608==
==7608== ---Thread-Announcement------------------------------------------
==7608==
==7608== Thread #3 was created
==7608== at 0x4C984BE: clone (clone.S:71)
==7608== by 0x4863DDE: create_thread (createthread.c:101)
==7608== by 0x486580D: pthread_create@@GLIBC_2.2.5 (pthread_create.c:826)
==7608== by 0x483C6B7: pthread_create_WRK (hg_intercepts.c:427)
==7608== by 0x1093AD: main (test.cpp:94)
==7608==
==7608== ----------------------------------------------------------------
==7608==
==7608== Thread #3: Exiting thread still holds 1 lock
==7608== at 0x486E29C: __lll_lock_wait (lowlevellock.S:103)
==7608== by 0x4867713: pthread_mutex_lock (pthread_mutex_lock.c:80)
==7608== by 0x4839C66: mutex_lock_WRK (hg_intercepts.c:902)
==7608== by 0x10924D: func2() (test.cpp:31)
==7608== by 0x1092B4: thread2(void*) (test.cpp:56)
==7608== by 0x483C8B6: mythread_wrapper (hg_intercepts.c:389)
==7608== by 0x4864FA2: start_thread (pthread_create.c:486)
==7608== by 0x4C984CE: clone (clone.S:95)
==7608==
==7608==
==7608== For counts of detected and suppressed errors, rerun with: -v
==7608== Use --history-level=approx or =none to gain increased speed, at
==7608== the cost of reduced accuracy of conflicting-access information
==7608== ERROR SUMMARY: 2 errors from 2 contexts (suppressed: 21 from 4)
第三种方法 pstack
pstack是Solaris、Red Hat系列(Fedora,Centos)和Debian系列(Ubuntu)等下提供的一个打印堆栈的调试工具。
但是在Debian 10下安装好之后,运行时报错:
$ sudo pstack 1798
1798: ./a.out
pstack: Input/output error
failed to read target.