调试没有core文件的coredump
在不保留core文件的情况下,如何获取程序崩溃时候的上下文信息(主要是函数调用栈)
1.coredump原理
当程序发生内存越界访问等行为时,会触发OS的保护机制,此时OS会产生一个信号(signal)发送给对应的进程。当进程从内核态到用户态切换时,该进程会处理这个信号。此类信号(比如SEGV)的默认处理行为是终止进程并生成一个coredump文件。
这里会涉及以下几个问题:
1. 保存的core文件在什么地方?
2. core文件,具体会把进程地址空间的哪些内容保存下来?
3. 如何控制core文件的大小?
4. 如果在处理信号的时候,又产生了新的同类信号,该如何处理?
5. 处理信号的代码,是运行在用户态还是内核态?
6. 在一个多线程的程序中,是由哪个线程在处理这个信号?
`/proc/sys/kernel/core_uses_pid` 取值是0或者1,表示是否在core文件名字后面加上进程号
`/proc/$pid/coredump_filter` 设置哪些内存会被dump出来
- (bit 0) anonymous private memory
- (bit 1) anonymous shared memory
- (bit 2) file-backed private memory
- (bit 3) file-backed shared memory
- (bit 4) ELF header pages in file-backed private memory areas (it is effective only if the bit 2 is cleared)
- (bit 5) hugetlb private memory
- (bit 6) hugetlb shared memory
- (bit 7) DAX private memory
- (bit 8) DAX shared memory
2.自定义信号处理函数
需要在自定义的信号处理函数中打印出程序崩溃时候的活跃函数堆栈信息。
这里我们有两种方式:
1.使用backtrace等方法,读取进程堆栈上的信息;
2.在函数调用的同时,用户自己维护一套数据结构,用于保存函数调用链,在信号处理函数中,将这个函数调用链打印出来

/*
 * Crash-report signal handling sketch (article pseudo-code).
 *
 * NOTE(review): this excerpt is redacted and does not compile as written:
 * buffer sizes are shown as "xxxx", part of the frame-walk loop is replaced by
 * a run of dashes, and several names used here (signal_func, logpath,
 * buglogpath, logfile, bug_process_name, stackinfo, rname_index, fd,
 * foreach_stack_rang, save_stacktrace, bug_signal_cb, bug_signal_inore) are
 * not defined in the visible text.  The code below is kept exactly as
 * published; because it is collapsed onto two physical lines, each "//"
 * comment also swallows the remainder of its line, so it has to be re-split
 * into one statement per line before it can be used.
 *
 * bugsignal_register(sig)
 *     Zeroes a struct sigaction, sets sa_sigaction to signal_core_bugreport and
 *     sa_flags to SA_SIGINFO, and installs it with sigaction().  Returns the
 *     result of (-1 != sigaction(...)): 1 on success, 0 on failure.
 *
 * signal_core_bugreport(sig, info, ptr)
 *     SIGSEGV / SIGABRT / SIGFPE / SIGILL / SIGBUS: switches the handler to
 *     signal_func, calls write_stack_msg(..., "txt"), then restores SIG_DFL
 *     and re-sends the signal with kill(getpid(), sig).  The inline comment
 *     there reads "let it generate the core file".
 *     SIGTERM / SIGINT / SIGQUIT: calls bugreport_def_return(sig).
 *     NOTE(review): that call is missing its terminating semicolon.
 *     SIGUSR1 / SIGUSR2: calls write_stack_msg() with the suffix "sigusr1" /
 *     "sigusr2".  Any other signal is ignored.
 *
 * bugreport_def_return(sig)
 *     Formats an exit message, resets SIGTERM to SIG_DFL and calls exit(0).
 *     The "|| true" makes the SIGTERM test always true.
 *     NOTE(review): snprintf() is given stderr as its destination buffer;
 *     presumably fprintf(stderr, ...) was intended.
 *
 * write_stack_msg(sig, info, ptr, logfile_suffix)
 *     Casts ptr to ucontext_t *, formats si_errno / si_code / si_pid / si_addr
 *     into stackinfo (adding a name from si_codes[] when 0 <= si_code < 3) and
 *     writes it to fd.  It then prints every unsigned-long-sized word of
 *     uc_mcontext labelled by rname_index[], walks frames starting from arm_pc /
 *     arm_fp resolving each ip with dladdr(), calls save_stacktrace() with
 *     arm_sp, closes fd and returns 0.  The arm_* field names make this
 *     ARM-specific.  logfile_suffix, stack_start and stack_end are not used by
 *     any statement that survived the redaction.
 *     NOTE(review): visible problems to confirm against the real source:
 *       - fd is never declared or opened here ("int f = 0" is the frame counter);
 *       - fprintf() is called with fd although fd is used as a descriptor by
 *         write()/close(); dprintf() may have been intended;
 *       - "bugreporteak;" looks like a mangled "break;";
 *       - the loop as shown never updates bp or ip (that code sits in the
 *         dashed elision);
 *       - the "Stack trace" write() takes its length from a string without the
 *         leading newline, so the last byte of the literal is not written;
 *       - the "Segmentation Fault!" header and the SEGV_* code names are used
 *         for every signal, not only SIGSEGV.
 *
 * signal_bugreport_setup()
 *     Calls bug_signal_cb() for SIGSEGV, SIGABRT, SIGFPE, SIGINT, SIGBUS,
 *     SIGILL, SIGQUIT and SIGTERM, and bug_signal_inore()() for SIGHUP and
 *     SIGPIPE, then returns 0.  The inline comment reads: "ignoring SIGCHLD
 *     would invalidate the return value of system(), so it must not be ignored".
 *
 * bugreport_def_term(sig)
 *     Formats "Exit Normally, pid, sig" into a 256-byte local buffer, prints
 *     it, resets SIGTERM to SIG_DFL when sig is SIGTERM, and calls exit(0).
 *
 * dump_trace(Signal)
 *     Captures up to 200 return addresses with backtrace(), converts them with
 *     backtrace_symbols(), prints one line per frame, frees the array and
 *     calls exit(0).
 *     NOTE(review): the "::" scope prefix is C++ syntax, not C.
 */
/**/ int bugsignal_register(const int sig) { struct sigaction action; memset(&action, 0, sizeof(action)); action.sa_sigaction = signal_core_bugreport; action.sa_flags = SA_SIGINFO; return (-1 != sigaction(sig, &action, NULL)); } inline static void signal_core_bugreport(const int sig, siginfo_t * info, void * ptr) { switch (sig) { case SIGSEGV: case SIGABRT: case SIGFPE: case SIGILL: case SIGBUS: { signal(sig, signal_func); write_stack_msg(sig, info, ptr,"txt"); signal(sig, SIG_DFL); kill(getpid(),sig);//让其生成core文件 } break; case SIGTERM: case SIGINT: case SIGQUIT: { bugreport_def_return(sig) } break; case SIGUSR1: { write_stack_msg(sig, info, ptr,"sigusr1"); break; } case SIGUSR2: { write_stack_msg(sig, info, ptr,"sigusr2"); break; } default: break; } } static inline void bugreport_def_return(const int sig) { snprintf(stderr, 255, " normally exit , pid:%d, sig:%d\n", getpid(), sig); if (SIGTERM == sig || true) { signal(SIGTERM, SIG_DFL); } exit(0); } static inline int write_stack_msg(const int sig, siginfo_t * info, void * ptr,const char* logfile_suffix) { static const char * si_codes[3] = {"", "SEGV_MAPERR", "SEGV_ACCERR"}; size_t i = 0; ucontext_t * ucontext = (ucontext_t *)ptr; unsigned long stack_start = 0; unsigned long stack_end = 0; snprintf(logpath, "xxxxxxxxxxxxxxxx", buglogpath); foreach_stack_rang(gettid(), "/proc/getpid()/maps",&stack_start, &stack_end); umask(0); snprintf(logfile, xxxx, "%s/%s_time().txt", buglogpath, bug_process_name); int f = 0; Dl_info dl_info; void ** bp = 0; void * ip = 0; if (info->si_code >= 0 && info->si_code < 3) { snprintf(stackinfo, xxxxxx, "Segmentation Fault!\n" "info.si_signo = %d\n" "info.si_errno = %d\n" "info.si_code = %d (%s)\n" "info.si_pid = %d\n" "info.si_addr = %p\n", sig, info->si_errno, info->si_code, si_codes[info->si_code], info->si_pid, info->si_addr ); } else { snprintf(stackinfo,xxxxx, "Segmentation Fault!\n" "info.si_signo = %d\n" "info.si_errno = %d\n" "info.si_code = %d\n" "info.si_pid = %d\n" 
"info.si_addr = %p\n", sig, info->si_errno, info->si_code, info->si_pid, info->si_addr ); } write(fd, stackinfo, strlen(stackinfo)); ip = (void *)ucontext->uc_mcontext.arm_pc; bp = (void **)ucontext->uc_mcontext.arm_fp; write(fd, "REG:\n", strlen("REG:\n")); for (i = 0; i < sizeof(ucontext->uc_mcontext)/sizeof(unsigned long); i++) { fprintf(fd, "\t%s: 0x%08lx", rname_index[i], ((unsigned long*)&ucontext->uc_mcontext)[i]); } write(fd, "\nStack trace:\n\n", strlen("Stack trace:\n\n")); while (bp && ip) { if (!dladdr(ip, &dl_info)) { bugreporteak; } const char * symname = dl_info.dli_sname; fprintf(fd, "stack #%02d: bp:%p %s [%p->%p] <%s+%ld>\n", ++f,bp, dl_info.dli_fname, ip, (void*)((intptr_t)ip - (intptr_t)dl_info.dli_fbase), symname, (intptr_t)ip - (intptr_t)dl_info.dli_saddr ); ------------------------- } write(fd, "End of stack trace\n", strlen("End of stack trace\n")); save_stacktrace(fd, ucontext->uc_mcontext.arm_sp); close(fd); return 0; } int signal_bugreport_setup() { bug_signal_cb(SIGSEGV); bug_signal_cb(SIGABRT); bug_signal_cb(SIGFPE); bug_signal_cb(SIGINT); bug_signal_cb(SIGBUS); bug_signal_cb(SIGILL); bug_signal_cb(SIGQUIT); bug_signal_cb(SIGTERM); bug_signal_inore()(SIGHUP); bug_signal_inore()(SIGPIPE); //忽略sigchld 会导致system函数返回值失效不能忽略 return 0; } static inline void bugreport_def_term(const int sig) { char log[256]; snprintf(log, 255, "Exit Normally, pid:%d, sig:%d\n", getpid(), sig); printf("%s", log); if (SIGTERM == sig) { signal(SIGTERM, SIG_DFL); } exit(0); } void dump_trace(int Signal) { const int len = 200; void* buffer[len]; printf("dump_trace\n"); int nptrs = ::backtrace(buffer, len); printf("backtrace\n"); char** buffer_array = ::backtrace_symbols(buffer, nptrs); printf("sig:%d nptrs:%d\n", Signal, nptrs); if (buffer_array) { for (int i = 0; i < nptrs; ++i) { printf("frame=%d||trace_back=%s||\n", i, buffer_array[i]); } free(buffer_array); } exit(0); }
https://www.man7.org/linux/man-pages/man2/sigaction.2.html
The siginfo_t argument to a SA_SIGINFO handler

When the SA_SIGINFO flag is specified in act.sa_flags, the signal handler address is passed via the act.sa_sigaction field. This handler takes three arguments, as follows:

    void handler(int sig, siginfo_t *info, void *ucontext) { ... }

These three arguments are as follows:

sig: The number of the signal that caused invocation of the handler.

info: A pointer to a siginfo_t, which is a structure containing further information about the signal, as described below.

ucontext: This is a pointer to a ucontext_t structure, cast to void *. The structure pointed to by this field contains signal context information that was saved on the user-space stack by the kernel; for details, see sigreturn(2). Further information about the ucontext_t structure can be found in getcontext(3). Commonly, the handler function doesn't make any use of the third argument.

The siginfo_t data type is a structure with the following fields:

    siginfo_t {
        int      si_signo;     /* Signal number */
        int      si_errno;     /* An errno value */
        int      si_code;      /* Signal code */
        int      si_trapno;    /* Trap number that caused hardware-generated signal (unused on most architectures) */
        pid_t    si_pid;       /* Sending process ID */
        uid_t    si_uid;       /* Real user ID of sending process */
        int      si_status;    /* Exit value or signal */
        clock_t  si_utime;     /* User time consumed */
        clock_t  si_stime;     /* System time consumed */
        sigval_t si_value;     /* Signal value */
        int      si_int;       /* POSIX.1b signal */
        void    *si_ptr;       /* POSIX.1b signal */
        int      si_overrun;   /* Timer overrun count; POSIX.1b timers */
        int      si_timerid;   /* Timer ID; POSIX.1b timers */
        void    *si_addr;      /* Memory location which caused fault */
        long     si_band;      /* Band event (was int in glibc 2.3.2 and earlier) */
        int      si_fd;        /* File descriptor */
        short    si_addr_lsb;  /* Least significant bit of address (since Linux 2.6.32) */
        void    *si_lower;     /* Lower bound when address violation occurred (since Linux 3.19) */
        void    *si_upper;     /* Upper bound when address violation occurred (since Linux 3.19) */
        int      si_pkey;      /* Protection key on PTE that caused fault (since Linux 4.6) */
        void    *si_call_addr; /* Address of system call instruction (since Linux 3.5) */
        int      si_syscall;   /* Number of attempted system call (since Linux 3.5) */
        unsigned int si_arch;  /* Architecture of attempted system call (since Linux 3.5) */
    }

si_signo, si_errno and si_code are defined for all signals. (si_errno is generally unused on Linux.) The rest of the struct may be a union, so that one should read only the fields that are meaningful for the given signal:
#include <execinfo.h>
int backtrace(void **buffer, int size);
char **backtrace_symbols(void *const *buffer, int size);
void backtrace_symbols_fd(void *const *buffer, int size, int fd);
backtrace函数通过指针数组buffer返回调用程序的回溯信息,也就是所谓的函数调用栈。buffer数组中的元素是void*类型,也就是栈中保存的返回地址。
size参数指定buffer中可以保存的地址的最大个数。如果实际的回溯信息大于size,则只返回最近的size个地址。
backtrace函数返回buffer中保存的地址个数,返回值不会大于size。如果返回值小于size,则说明所有的回溯信息都已经返回了,如果等于size,则有可能被截断了。
backtrace函数在buffer数组中返回的都是一些虚拟地址,不适于分析。backtrace_symbols函数可以将backtrace返回的buffer中的地址,根据符号表中的信息,转换为字符串(函数名+偏移地址)。size参数指明了buffer中的地址个数。
backtrace_symbols返回字符串数组的首地址,该字符串是在backtrace_symbols中通过malloc分配的,因此,调用者必须使用free释放内存。如果发生了错误,则backtrace_symbols返回NULL
backtrace_symbols_fd类似于backtrace_symbols,只不过它是把字符串信息写到文件描述符fd所表示的文件中。backtrace_symbols_fd不会调用malloc函数
来自网上转载的
/*
 * Demo: print a symbolic backtrace from a SIGSEGV handler installed with
 * SA_SIGINFO.
 */

/* siginfo_t and the mcontext register array are hidden by glibc in strict
 * ISO C mode (-std=c11) unless a feature-test macro is defined first. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <signal.h>
#include <execinfo.h>
#include <stdio.h>
#include <stdlib.h>
#include <ucontext.h>

/* Maximum number of return addresses captured by backtrace(). */
#define BTSIZE 100

/*
 * Return the program counter saved in the signal context `uc`, i.e. the
 * address of the instruction that was executing when the signal was raised.
 *
 * Returns NULL on platforms/architectures this function has no mapping for.
 */
static void *getMcontextEip(ucontext_t *uc)
{
#if defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
    /* OSX < 10.6 */
  #if defined(__x86_64__)
    return (void *) uc->uc_mcontext->__ss.__rip;
  #elif defined(__i386__)
    return (void *) uc->uc_mcontext->__ss.__eip;
  #else
    return (void *) uc->uc_mcontext->__ss.__srr0;
  #endif
#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
    /* OSX >= 10.6 */
  #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
    return (void *) uc->uc_mcontext->__ss.__rip;
  #else
    return (void *) uc->uc_mcontext->__ss.__eip;
  #endif
#elif defined(__linux__)
    /* Linux */
  #if defined(__i386__)
    return (void *) uc->uc_mcontext.gregs[14];  /* Linux 32: index of EIP */
  #elif defined(__X86_64__) || defined(__x86_64__)
    return (void *) uc->uc_mcontext.gregs[16];  /* Linux 64: index of RIP */
  #elif defined(__ia64__)
    return (void *) uc->uc_mcontext.sc_ip;      /* Linux IA64 */
  #else
    /* Other Linux architectures (ARM, AArch64, ...): previously the function
     * fell off its end here without returning a value. */
    (void) uc;
    return NULL;
  #endif
#else
    (void) uc;
    return NULL;
#endif
}

/*
 * SA_SIGINFO handler.
 *
 * sig    - signal number being delivered.
 * info   - kernel-supplied details (si_signo, si_addr, ...).
 * secret - the ucontext_t saved by the kernel, passed as void *.
 *
 * Always prints the signal number and faulting address.  For SIGSEGV it also
 * prints a symbolic backtrace and terminates the process with exit(0); for
 * any other signal it simply returns.
 *
 * NOTE: printf(), backtrace_symbols() (which allocates with malloc) and
 * exit() are not async-signal-safe.  That is tolerable for a demo; production
 * code should use backtrace_symbols_fd() and write() instead.
 */
static void sig_handler(int sig, siginfo_t *info, void *secret)
{
    ucontext_t *uc = (ucontext_t *) secret;
    void *buffer[BTSIZE];
    char **strings;
    void *fault_pc;
    int nptrs = 0;
    int j;

    printf("in sig_handler\n");
    printf("sig is %d, SIGSEGV is %d\n", sig, SIGSEGV);
    printf("info.si_signo is %d, info.si_addr is %p\n",
           info->si_signo, info->si_addr);

    if (sig != SIGSEGV) {
        return;
    }

    nptrs = backtrace(buffer, BTSIZE);
    printf("backtrace() returned %d addresses\n", nptrs);

    /* Frame 1 is the kernel's signal trampoline; replace it with the faulting
     * instruction so the trace shows where the crash happened.  Only do so
     * when that slot was actually filled in by backtrace(). */
    fault_pc = getMcontextEip(uc);
    if (fault_pc != NULL && nptrs > 1) {
        buffer[1] = fault_pc;
    }

    strings = backtrace_symbols(buffer, nptrs);
    if (strings == NULL) {
        perror("backtrace_symbols");
        exit(EXIT_FAILURE);
    }

    printf("backtrace: \n");
    for (j = 0; j < nptrs; j++) {
        printf("[%d]%s\n", j, strings[j]);
    }

    /* Only the array itself is freed; the strings live inside it. */
    free(strings);
    exit(0);
}
#ifdef CONFIG_ARM_UNWIND static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { unwind_backtrace(regs, tsk); } #else static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { unsigned int fp, mode; int ok = 1; printk("Backtrace: "); if (!tsk) tsk = current; if (regs) { fp = frame_pointer(regs); mode = processor_mode(regs); } else if (tsk != current) { fp = thread_saved_fp(tsk); mode = 0x10; } else { asm("mov %0, fp" : "=r" (fp) : : "cc"); mode = 0x10; } if (!fp) { pr_cont("no frame pointer"); ok = 0; } else if (verify_stack(fp)) { pr_cont("invalid frame pointer 0x%08x", fp); ok = 0; } else if (fp < (unsigned long)end_of_stack(tsk)) pr_cont("frame pointer underflow"); pr_cont("\n"); if (ok) c_backtrace(fp, mode); }
注意,编译器的优化策略,可能导致得到的回溯信息不准确。而且,对于GNU编译器而言,必须使用-rdynamic链接选项(-rdynamic可用来通知链接器将所有符号添加到动态符号表中),才能正确解析出符号名。此时可以使用unwind方法回溯。
coredump文件本身主要的格式也是ELF格式,因此,我们可以通过readelf命令进行判断。
get_signal 中会判断该信号是否需要触发 core dump,如果需要,则调用 do_coredump
最后会调用elf_core_dump。下面以内核代码elf_core_dump函数为入口,分析core文件是怎么生成的:
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 一个奇形怪状的面试题:Bean中的CHM要不要加volatile?
· [.NET]调用本地 Deepseek 模型
· 一个费力不讨好的项目,让我损失了近一半的绩效!
· 没有源码,如何修改代码逻辑?
· PowerShell开发游戏 · 打蜜蜂
· 在鹅厂做java开发是什么体验
· 百万级群聊的设计实践
· WPF到Web的无缝过渡:英雄联盟客户端的OpenSilver迁移实战