semaphore互斥失败导致出core
先看堆栈
(gdb) bt
#0 bnet_neigh_event_thread (dummy=dummy@entry=0x0) at /vob/jenkins/workspace/_build_8.8.3/sdk/src/customer_smm/l3.c:1303
#1 0x0000000002172cb0 in thread_boot (ti_void=0x1c99dc10) at /vob_yukon/xzhou_streams/smm_88x/sdk/src/sal/core/unix/thread.c:177
#2 0x0000ffffb7166f78 in ?? ()
#3 0x0000ffffc7a49e30 in ?? ()
bnet_neigh_event_thread()
的源码片段
1283 while (1) {
1284 if ((dumdum = casa_sem_wait(&bnet_neigh_sem))) {
1285 SWMGRLOG(" **%s: casa_sem_wait=%d?\n", __FUNCTION__, dumdum);
1286 continue;
1287 }
1288
1289 if (bnet_neigh_ring_bit_hi[bnet_neigh_read_hi])
1290 {
1291 kev = bnet_neigh_ring_hi[bnet_neigh_read_hi];
1292 high_priority_event = 1;
1293 }
1294 else if (bnet_neigh_ring_bit[bnet_neigh_read])
1295 {
1296 kev = bnet_neigh_ring[bnet_neigh_read];
1297 high_priority_event = 0;
1298 }
1299 else
1300 continue;
1301
1302 // get pointer out of ring
1303 switch (kev->event_type) { // 此处出core
1304 case CASA_EVENT_IP4_FIB:
1305 net_fib_event_handler(kev->event,
1306 &kev->u.casa_rt);
1307 break;
1308 case CASA_EVENT_IP6_FIB:
"l3.c" 2667L, 79013C
bnet_neigh_ring_hi
和bnet_neigh_ring
是进程初始化时预分配的缓冲池
gdb查看一下这几个值:
(gdb) p kev
$1 = (kernel_event_t *) 0x0
(gdb) p high_priority_event
$2 = 1
(gdb) p bnet_neigh_ring_bit_hi[bnet_neigh_read_hi]
$3 = 1 '\001'
(gdb) p bnet_neigh_ring_hi[bnet_neigh_read_hi]
$4 = (kernel_event_t *) 0xffff640749e8
(gdb)
从gdb打印出的内容来看,kev
不应该为NULL(出core时环形缓冲区对应槽位里的指针是非空的)
但是注意到,在源码的1284行有个casa_sem_wait
,说明是有锁保护的
那么很有可能是因为这个锁没有保护齐全,在1289行和1291行之间被调度到了其他线程
事实上,跟踪一下其他使用bnet_neigh_ring_bit_hi
和bnet_neigh_ring_hi
的相关代码发现,这两块缓存的保护锁是bcm_neighbor_ev_ring
,而在这里并没有上锁;并且,这个锁也应该保护bnet_neigh_read_hi
和bnet_neigh_read
这两个下标,而实际上也没有保护到
改完后:
while (1) {
if ((dumdum = casa_sem_wait(&bnet_neigh_sem))) {
SWMGRLOG(" **%s: casa_sem_wait=%d?\n", __FUNCTION__, dumdum);
continue;
}
// Bug 171516 : protect neigh ring with semaphor `bcm_neighbor_ev_ring`;
casa_sem_wait(&bcm_neighbor_ev_ring);
if (bnet_neigh_ring_bit_hi[bnet_neigh_read_hi])
{
kev = bnet_neigh_ring_hi[bnet_neigh_read_hi];
high_priority_event = 1;
}
else if (bnet_neigh_ring_bit[bnet_neigh_read])
{
kev = bnet_neigh_ring[bnet_neigh_read];
high_priority_event = 0;
}
else
{
sem_post(&bcm_neighbor_ev_ring);
continue;
}
sem_post(&bcm_neighbor_ev_ring);
// get pointer out of ring
switch (kev->event_type) {
case CASA_EVENT_IP4_FIB:
net_fib_event_handler(kev->event,
&kev->u.casa_rt);
break;
case CASA_EVENT_IP6_FIB:
net_ip6_fib_proc( kev->event,
&kev->u.casa_rt);
break;