1 什么是死锁
死锁(DeadLock)是指两个或两个以上的进程(线程)在执行过程中,因争夺资源而造成的一种互相等待的现象,若无外力作用,它们都将无法推进下去。此时称系统处于死锁状态或系统产生了死锁,这些永远在互相等待的进程(线程)称为死锁进程(线程)。由于资源占用是互斥的,当某个进程(线程)提出资源申请后,有关进程(线程)在无外力协助下永远分配不到必需的资源而无法继续运行,这就产生了一种特殊现象——死锁。
一种交叉持锁死锁的情形,此时执行程序中两个或多个线程发生永久堵塞(等待),每个线程都在等待被其它线程占用并堵塞了的资源。例如,如果线程 1 锁住了记录 A 并等待记录 B,而线程 2 锁住了记录 B 并等待记录 A,这样两个线程就发生了死锁现象。在计算机系统中 , 如果系统的资源分配策略不当,更常见的可能是程序员写的程序有错误等,则会导致进程因竞争资源不当而产生死锁的现象。
2 产生死锁的四个必要条件
(1)互斥使用(资源独占)
一个资源每次只能给一个进程使用(比如写操作)
(2)占有且等待
进程在申请新的资源的同时,保持对原有资源的占有
(3)不可抢占
资源申请者不能强行从资源占有者手中夺取资源,资源只能由占有者自愿释放
(4)循环等待
P1等待P2占有的资源,P2等待P3的资源,...Pn等待P1的资源,形成一个进程等待回路
3 一个例子及图示
3.1 图示
进程在执行一些代码之后,子线程 1 获得了锁 A,正试图获得锁 B;子线程 2 此时获得了锁 B,正试图获得锁 A。这样子线程 1 得不到锁 B,子线程 2 也得不到锁 A,因为这两把锁分别被对方占有且永远不会释放,从而发生了死锁的现象。
3.2 代码
#include <unistd.h>
#include <pthread.h>
#include <string.h>
/* Two statically initialized mutexes; func1 locks A then B, func2 locks B then A. */
pthread_mutex_t mutexA = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexB = PTHREAD_MUTEX_INITIALIZER;
/* Not locked by any function visible in this file; main only destroys it. */
pthread_mutex_t mutexC = PTHREAD_MUTEX_INITIALIZER;
/* Incremented by func1 and func2 while mutexA is held. */
static int counterA = 0;
/* Incremented by func1 and func2 while mutexB is held. */
static int counterB = 0;
/*
 * Increment counterA under mutexA, then (after a one-second pause) increment
 * counterB under mutexB while still holding mutexA.
 *
 * The A-then-B acquisition order is the reverse of func2's B-then-A order;
 * that ordering is intentionally kept because this program exists to
 * demonstrate a cross-lock deadlock.
 *
 * Returns the value of counterA as observed while mutexA was still held.
 */
int func1(void)
{
    int snapshot;

    pthread_mutex_lock(&mutexA);
    ++counterA;
    /* Hold mutexA across the pause so another thread has time to take mutexB. */
    sleep(1);
    pthread_mutex_lock(&mutexB);
    ++counterB;
    /* Read counterA before unlocking: func2 also modifies it under mutexA,
     * so reading it after the unlock would be an unsynchronized access. */
    snapshot = counterA;
    pthread_mutex_unlock(&mutexB);
    pthread_mutex_unlock(&mutexA);
    return snapshot;
}
/*
 * Increment counterB under mutexB, then (after a one-second pause) increment
 * counterA under mutexA while still holding mutexB.
 *
 * The B-then-A acquisition order is the reverse of func1's A-then-B order;
 * that ordering is intentionally kept because this program exists to
 * demonstrate a cross-lock deadlock.
 *
 * Returns the value of counterB as observed while mutexB was still held.
 */
int func2(void)
{
    int snapshot;

    pthread_mutex_lock(&mutexB);
    ++counterB;
    /* Hold mutexB across the pause so another thread has time to take mutexA. */
    sleep(1);
    pthread_mutex_lock(&mutexA);
    ++counterA;
    pthread_mutex_unlock(&mutexA);
    /* Read counterB before unlocking mutexB: func1 also modifies it under
     * mutexB, so reading it after the unlock would be an unsynchronized access. */
    snapshot = counterB;
    pthread_mutex_unlock(&mutexB);
    return snapshot;
}
/*
 * Thread entry point: keeps calling func1() and ends the calling thread via
 * pthread_exit(NULL) as soon as func1() returns exactly 100000.
 * The arg parameter is not used.
 */
void* start_routine1(void* arg)
{
    for (;;)
    {
        if (func1() != 100000)
        {
            continue;
        }
        pthread_exit(NULL);
    }
}
/*
 * Thread entry point: keeps calling func2() and ends the calling thread via
 * pthread_exit(NULL) as soon as func2() returns exactly 100000.
 * The arg parameter is not used.
 */
void* start_routine2(void* arg)
{
    for (;;)
    {
        if (func2() != 100000)
        {
            continue;
        }
        pthread_exit(NULL);
    }
}
/* Copy src into dst (capacity cap bytes), truncating if needed so that dst is
 * always NUL-terminated; a NULL src yields an empty string. */
static void copy_bounded(char *dst, size_t cap, const char *src)
{
    size_t len;

    if (cap == 0)
    {
        return;
    }
    len = (src != NULL) ? strlen(src) : 0;
    if (len >= cap)
    {
        len = cap - 1;
    }
    if (len > 0)
    {
        memcpy(dst, src, len);
    }
    dst[len] = '\0';
}

/*
 * Thread entry point for the non-locking worker threads: loops forever,
 * sleeping one second and then copying the string passed in arg into a
 * zeroed 128-byte local buffer. It never takes any mutex and never returns.
 *
 * arg: pointer to a NUL-terminated string (main passes a string literal).
 *      The copy is bounded by the buffer size and tolerates NULL, unlike the
 *      unbounded strcpy it replaces.
 */
void* start_routine(void* arg)
{
    while (1)
    {
        char szBuf[128];

        sleep(1);
        memset(szBuf, 0, sizeof(szBuf));
        copy_bounded(szBuf, sizeof(szBuf), (const char*)arg);
    }
}
/*
 * Starts four threads (start_routine1, start_routine2, and two instances of
 * start_routine), exiting the process with status 1 if any creation fails.
 * After a five-second sleep it joins the threads in creation order, then
 * destroys the three mutexes and returns 0.
 */
int main()
{
    enum { THREAD_COUNT = 4 };
    /* Entry point and argument for each thread, in creation order. */
    void *(*const entry[THREAD_COUNT])(void *) = {
        start_routine1, start_routine2, start_routine, start_routine
    };
    void *const param[THREAD_COUNT] = { NULL, NULL, "thread3", "thread3" };
    pthread_t tid[THREAD_COUNT];
    int i;

    for (i = 0; i < THREAD_COUNT; ++i)
    {
        if (pthread_create(&tid[i], NULL, entry[i], param[i]) != 0)
        {
            _exit(1);
        }
    }

    sleep(5);
    //pthread_cancel(tid[0]);

    for (i = 0; i < THREAD_COUNT; ++i)
    {
        pthread_join(tid[i], NULL);
    }

    pthread_mutex_destroy(&mutexA);
    pthread_mutex_destroy(&mutexB);
    pthread_mutex_destroy(&mutexC);
    return 0;
}
3.3编译程序并执行
[hadoop@spark ~]$ gcc -g deadlock.c -o deadlock -lpthread
[hadoop@spark ~]$ ./deadlock
[hadoop@spark ~]$ ps -ef |grep -i deadlock
hadoop 103176 101848 0 15:42 pts/0 00:00:00 ./deadlock
4 使用 pstack 和 gdb 工具对死锁程序进行分析
4.1 pstack
第一次:
[hadoop@spark ~]$ pstack 103176
Thread 5 (Thread 0x7f3af34d2700 (LWP 103177)):
#0 0x00007f3af38a442d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f3af389fdcb in _L_lock_812 () from /lib64/libpthread.so.0
#2 0x00007f3af389fc98 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x00000000004008ee in func1 () at deadlock.c:18
#4 0x000000000040098b in start_routine1 (arg=0x0) at deadlock.c:43
#5 0x00007f3af389de25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f3af35cb34d in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x7f3af2cd1700 (LWP 103178)):
#0 0x00007f3af38a442d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f3af389fdcb in _L_lock_812 () from /lib64/libpthread.so.0
#2 0x00007f3af389fc98 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x000000000040094a in func2 () at deadlock.c:31
#4 0x00000000004009b9 in start_routine2 (arg=0x0) at deadlock.c:56
#5 0x00007f3af389de25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f3af35cb34d in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x7f3af24d0700 (LWP 103179)):
#0 0x00007f3af35921ad in nanosleep () from /lib64/libc.so.6
#1 0x00007f3af3592044 in sleep () from /lib64/libc.so.6
#2 0x00000000004009ed in start_routine (arg=0x400be0) at deadlock.c:69
#3 0x00007f3af389de25 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f3af35cb34d in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7f3af1ccf700 (LWP 103180)):
#0 0x00007f3af35921ad in nanosleep () from /lib64/libc.so.6
#1 0x00007f3af3592044 in sleep () from /lib64/libc.so.6
#2 0x00000000004009ed in start_routine (arg=0x400be0) at deadlock.c:69
#3 0x00007f3af389de25 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f3af35cb34d in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f3af3cbd740 (LWP 103176)):
#0 0x00007f3af389ef57 in pthread_join () from /lib64/libpthread.so.0
#1 0x0000000000400aee in main () at deadlock.c:99
[hadoop@spark ~]$
第二次:
[hadoop@spark ~]$
[hadoop@spark ~]$ pstack 103176
Thread 5 (Thread 0x7f3af34d2700 (LWP 103177)):
#0 0x00007f3af38a442d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f3af389fdcb in _L_lock_812 () from /lib64/libpthread.so.0
#2 0x00007f3af389fc98 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x00000000004008ee in func1 () at deadlock.c:18
#4 0x000000000040098b in start_routine1 (arg=0x0) at deadlock.c:43
#5 0x00007f3af389de25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f3af35cb34d in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x7f3af2cd1700 (LWP 103178)):
#0 0x00007f3af38a442d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f3af389fdcb in _L_lock_812 () from /lib64/libpthread.so.0
#2 0x00007f3af389fc98 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x000000000040094a in func2 () at deadlock.c:31
#4 0x00000000004009b9 in start_routine2 (arg=0x0) at deadlock.c:56
#5 0x00007f3af389de25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f3af35cb34d in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x7f3af24d0700 (LWP 103179)):
#0 0x00007f3af35921ad in nanosleep () from /lib64/libc.so.6
#1 0x00007f3af3592044 in sleep () from /lib64/libc.so.6
#2 0x00000000004009ed in start_routine (arg=0x400be0) at deadlock.c:69
#3 0x00007f3af389de25 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f3af35cb34d in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7f3af1ccf700 (LWP 103180)):
#0 0x00007f3af35921ad in nanosleep () from /lib64/libc.so.6
#1 0x00007f3af3592044 in sleep () from /lib64/libc.so.6
#2 0x00000000004009ed in start_routine (arg=0x400be0) at deadlock.c:69
#3 0x00007f3af389de25 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f3af35cb34d in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f3af3cbd740 (LWP 103176)):
#0 0x00007f3af389ef57 in pthread_join () from /lib64/libpthread.so.0
#1 0x0000000000400aee in main () at deadlock.c:99
[hadoop@spark ~]$
连续多次查看这个进程的函数调用关系堆栈, 死锁线程将一直处于等锁的状态,对比多次的函数调用堆栈输出结果,确定哪两个线程(或者几个线程)一直没有变化且一直处于等锁的状态。
分析:
根据上面的输出对比,线程 2 和线程 3 在两次 pstack 输出中都停在 sleep 函数里:它们只是在循环中周期性休眠,并没有在等锁(如果多采样几次,它们的调用栈还可能落在 memset 等其他位置);线程 1(主线程)则阻塞在 pthread_join 中等待子线程结束。而线程 4 和线程 5 一直处在等锁状态(pthread_mutex_lock),在连续两次的 pstack 信息输出中没有变化,所以我们可以推测线程 4 和线程 5 发生了死锁。
然后通过 gdb attach 到死锁进程
[hadoop@spark ~]$ gdb attach 103176
GNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-100.el7
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
attach: No such file or directory.
Attaching to process 103176
Reading symbols from /home/hadoop/deadlock...done.
Reading symbols from /lib64/libpthread.so.0...(no debugging symbols found)...done.
[New LWP 103180]
[New LWP 103179]
[New LWP 103178]
[New LWP 103177]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Loaded symbols for /lib64/libpthread.so.0
Reading symbols from /lib64/libc.so.6...(no debugging symbols found)...done.
Loaded symbols for /lib64/libc.so.6
Reading symbols from /lib64/ld-linux-x86-64.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/ld-linux-x86-64.so.2
0x00007f3af389ef57 in pthread_join () from /lib64/libpthread.so.0
Missing separate debuginfos, use: debuginfo-install glibc-2.17-196.el7_4.2.x86_64
(gdb) info thread
Id Target Id Frame
5 Thread 0x7f3af34d2700 (LWP 103177) "deadlock" 0x00007f3af38a442d in __lll_lock_wait () from /lib64/libpthread.so.0
4 Thread 0x7f3af2cd1700 (LWP 103178) "deadlock" 0x00007f3af38a442d in __lll_lock_wait () from /lib64/libpthread.so.0
3 Thread 0x7f3af24d0700 (LWP 103179) "deadlock" 0x00007f3af35921ad in nanosleep () from /lib64/libc.so.6
2 Thread 0x7f3af1ccf700 (LWP 103180) "deadlock" 0x00007f3af35921ad in nanosleep () from /lib64/libc.so.6
* 1 Thread 0x7f3af3cbd740 (LWP 103176) "deadlock" 0x00007f3af389ef57 in pthread_join () from /lib64/libpthread.so.0
(gdb)
查看线程 4 和线程 5 的输出
(gdb) thread 5
[Switching to thread 5 (Thread 0x7f3af34d2700 (LWP 103177))]
#0 0x00007f3af38a442d in __lll_lock_wait () from /lib64/libpthread.so.0
(gdb) where
#0 0x00007f3af38a442d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f3af389fdcb in _L_lock_812 () from /lib64/libpthread.so.0
#2 0x00007f3af389fc98 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x00000000004008ee in func1 () at deadlock.c:18
#4 0x000000000040098b in start_routine1 (arg=0x0) at deadlock.c:43
#5 0x00007f3af389de25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f3af35cb34d in clone () from /lib64/libc.so.6
(gdb) f 3
#3 0x00000000004008ee in func1 () at deadlock.c:18
18 pthread_mutex_lock(&mutexB);
(gdb) thread 4
[Switching to thread 4 (Thread 0x7f3af2cd1700 (LWP 103178))]
#0 0x00007f3af38a442d in __lll_lock_wait () from /lib64/libpthread.so.0
(gdb) where
#0 0x00007f3af38a442d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f3af389fdcb in _L_lock_812 () from /lib64/libpthread.so.0
#2 0x00007f3af389fc98 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x000000000040094a in func2 () at deadlock.c:31
#4 0x00000000004009b9 in start_routine2 (arg=0x0) at deadlock.c:56
#5 0x00007f3af389de25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f3af35cb34d in clone () from /lib64/libc.so.6
(gdb) f 3
#3 0x000000000040094a in func2 () at deadlock.c:31
31 pthread_mutex_lock(&mutexA);
(gdb) p mutexA
$1 = {__data = {__lock = 2, __count = 0, __owner = 103177, __nusers = 1, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000\t\223\001\000\001", '\000' <repeats 26 times>, __align = 2}
(gdb) p mutexB
$2 = {__data = {__lock = 2, __count = 0, __owner = 103178, __nusers = 1, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000\n\223\001\000\001", '\000' <repeats 26 times>, __align = 2}
(gdb) p mutexC
$3 = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = '\000' <repeats 39 times>, __align = 0}
(gdb)
从上面可以发现,线程 4 正试图获得锁 mutexA,但是锁 mutexA 已经被 LWP 为 103177 的线程得到(__owner = 103177),线程 5 正试图获得锁 mutexB,但是锁 mutexB 已经被 LWP 为 103178 的线程得到(__owner = 103178),从 pstack 的输出可以发现,LWP 103177 与线程 5 是对应的,LWP 103178 与线程 4 是对应的。所以我们可以得出,线程 4 和线程 5 发生了交叉持锁的死锁现象。查看线程的源代码发现,线程 4 和线程 5 同时使用 mutexA 和 mutexB,且申请顺序不合理。
5 利用core文件分析
运行 ./deadlock(编译时要加调试选项 -g),程序会因死锁而阻塞:
[hadoop@spark ~]$ ulimit -c unlimited
[hadoop@spark ~]$ ./deadlock
[hadoop@spark ~]$ ps -ef |grep deadlock
hadoop 33397 32933 0 08:49 pts/0 00:00:00 ./deadlock
hadoop 33472 102129 0 08:50 pts/2 00:00:00 grep --color=auto deadlock
[hadoop@spark ~]$ pstack 33397
Thread 5 (Thread 0x7f71f6f11700 (LWP 33398)):
#0 0x00007f71f72e342d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f71f72dedcb in _L_lock_812 () from /lib64/libpthread.so.0
#2 0x00007f71f72dec98 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x00000000004008ee in func1 () at deadlock.c:18
#4 0x000000000040098b in start_routine1 (arg=0x0) at deadlock.c:43
#5 0x00007f71f72dce25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f71f700a34d in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x7f71f6710700 (LWP 33399)):
#0 0x00007f71f72e342d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f71f72dedcb in _L_lock_812 () from /lib64/libpthread.so.0
#2 0x00007f71f72dec98 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x000000000040094a in func2 () at deadlock.c:31
#4 0x00000000004009b9 in start_routine2 (arg=0x0) at deadlock.c:56
#5 0x00007f71f72dce25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f71f700a34d in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x7f71f5f0f700 (LWP 33400)):
#0 0x00007f71f6fd11ad in nanosleep () from /lib64/libc.so.6
#1 0x00007f71f6fd1044 in sleep () from /lib64/libc.so.6
#2 0x00000000004009ed in start_routine (arg=0x400be0) at deadlock.c:69
#3 0x00007f71f72dce25 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f71f700a34d in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7f71f570e700 (LWP 33401)):
#0 0x00007f71f6fd11ad in nanosleep () from /lib64/libc.so.6
#1 0x00007f71f6fd1044 in sleep () from /lib64/libc.so.6
#2 0x00000000004009ed in start_routine (arg=0x400be0) at deadlock.c:69
#3 0x00007f71f72dce25 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f71f700a34d in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f71f76fc740 (LWP 33397)):
#0 0x00007f71f72ddf57 in pthread_join () from /lib64/libpthread.so.0
#1 0x0000000000400aee in main () at deadlock.c:99
在运行 ./deadlock 的终端中按 Ctrl+\(发送 SIGQUIT 信号),使进程退出并产生 core dump:
[hadoop@spark ~]$ ./deadlock
^\Quit (core dumped)
[hadoop@spark ~]$ ls core.*
core.33397
[hadoop@spark ~]$ gdb deadlock core.33397
GNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-100.el7
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from /home/hadoop/deadlock...done.
[New LWP 33397]
[New LWP 33398]
[New LWP 33399]
[New LWP 33400]
[New LWP 33401]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Core was generated by `./deadlock'.
Program terminated with signal 3, Quit.
#0 0x00007f71f72ddf57 in pthread_join () from /lib64/libpthread.so.0
Missing separate debuginfos, use: debuginfo-install glibc-2.17-196.el7_4.2.x86_64
(gdb) thread apply all bt
Thread 5 (Thread 0x7f71f570e700 (LWP 33401)):
#0 0x00007f71f6fd11ad in nanosleep () from /lib64/libc.so.6
#1 0x00007f71f6fd1044 in sleep () from /lib64/libc.so.6
#2 0x00000000004009ed in start_routine (arg=0x400be0) at deadlock.c:69
#3 0x00007f71f72dce25 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f71f700a34d in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x7f71f5f0f700 (LWP 33400)):
#0 0x00007f71f6fd11ad in nanosleep () from /lib64/libc.so.6
#1 0x00007f71f6fd1044 in sleep () from /lib64/libc.so.6
#2 0x00000000004009ed in start_routine (arg=0x400be0) at deadlock.c:69
#3 0x00007f71f72dce25 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f71f700a34d in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x7f71f6710700 (LWP 33399)):
#0 0x00007f71f72e342d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f71f72dedcb in _L_lock_812 () from /lib64/libpthread.so.0
#2 0x00007f71f72dec98 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x000000000040094a in func2 () at deadlock.c:31
#4 0x00000000004009b9 in start_routine2 (arg=0x0) at deadlock.c:56
#5 0x00007f71f72dce25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f71f700a34d in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7f71f6f11700 (LWP 33398)):
#0 0x00007f71f72e342d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f71f72dedcb in _L_lock_812 () from /lib64/libpthread.so.0
#2 0x00007f71f72dec98 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x00000000004008ee in func1 () at deadlock.c:18
#4 0x000000000040098b in start_routine1 (arg=0x0) at deadlock.c:43
#5 0x00007f71f72dce25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f71f700a34d in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f71f76fc740 (LWP 33397)):
#0 0x00007f71f72ddf57 in pthread_join () from /lib64/libpthread.so.0
#1 0x0000000000400aee in main () at deadlock.c:99