gdb catch syscall的内核支持

intro

通常使用gdb调试时,如果想知道某个系统调用的发生时机,直接在对应的系统调用函数上打断点即可。这里隐含的假设是:程序通过glibc库提供的封装函数来发起系统调用。但是go生成的可执行文件是一个独立的、静态链接的文件,用gdb调试go生成的文件时,并没有这样可以直接打断点来监测系统调用的函数。

我想在go中大概率有对特定系统调用打断点的方法,这里讨论的是不依赖go的(未知)知识,只是使用gdb/kernel这些(已知)功能来达到这个效果。

gdb

熟悉gdb的读者可能会有印象,gdb有一个catch syscall的功能。在通常由gcc生成的程序中,系统调用一般通过glibc完成,直接通过函数名打断点轻松加愉快,这个功能的意义就不明显了,以至于该功能常常被忽略。

(gdb) help catch syscall 
Catch system calls by their names, groups and/or numbers.
Arguments say which system calls to catch.  If no arguments are given,
every system call will be caught.  Arguments, if given, should be one
or more system call names (if your system supports that), system call
groups or system call numbers.
(gdb) 

syscall

MSR

虽然这个函数的名字包含了idt,但是实现中已经不是通过int 0x80这种传统的系统调用触发方式来完成了,而是通过写入MSR(Model Specific Register,模型特定寄存器)来设置入口地址,并配合x86的syscall指令来完成。

/*
 * Program the system-call related MSRs.
 *
 * MSR_LSTAR always receives the address of entry_SYSCALL_64.  The compat
 * SYSCALL target (written through wrmsrl_cstar()) and the three SYSENTER
 * MSRs are given real handlers only when ia32_enabled() returns true;
 * otherwise they are set to entry_SYSCALL32_ignore and to invalid/zero
 * values.  Finally MSR_SYSCALL_MASK is loaded with the set of EFLAGS bits
 * to clear on syscall entry.
 */
static inline void idt_syscall_init(void)
{
	/* 64-bit SYSCALL entry point. */
	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);

	if (ia32_enabled()) {
		/* 32-bit support is on: install the compat entry points. */
		wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
		/*
		 * This only works on Intel CPUs.
		 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
		 * This does not cause SYSENTER to jump to the wrong location, because
		 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
		 */
		wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
		wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
			    (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
		wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
	} else {
		/*
		 * 32-bit support is off: point the compat SYSCALL target at
		 * entry_SYSCALL32_ignore and load the SYSENTER MSRs with an
		 * invalid segment and zero stack/instruction pointers.
		 */
		wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore);
		wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
		wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
		wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
	}

	/*
	 * Flags to clear on syscall; clear as much as possible
	 * to minimize user space-kernel interference.
	 */
	wrmsrl(MSR_SYSCALL_MASK,
	       X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
	       X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
	       X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
	       X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
	       X86_EFLAGS_AC|X86_EFLAGS_ID);
}

写MSR的底层实现使用了gcc的内联汇编扩展语法

/*
 * Execute the WRMSR instruction.
 *
 * @msr:  MSR index, passed in ECX (the "c" constraint).
 * @low:  low 32 bits of the value, passed in EAX (the "a" constraint).
 * @high: high 32 bits of the value, passed in EDX (the "d" constraint).
 *
 * The asm has no outputs and declares a "memory" clobber.
 */
static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high)
{
	/*
	 * Label 1 marks the WRMSR instruction and label 2 the instruction
	 * after it; the pair is registered with _ASM_EXTABLE_TYPE as
	 * EX_TYPE_WRMSR.
	 * NOTE(review): presumably this lets a faulting WRMSR resume at
	 * label 2 instead of crashing -- confirm against the extable code.
	 */
	asm volatile("1: wrmsr\n"
		     "2:\n"
		     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
		     : : "c" (msr), "a"(low), "d" (high) : "memory");
}

intel处理器中关于该指令的说明

Write the value in EDX:EAX to MSR specified by ECX

syscall指令也有需要使用wrmsr来设置handler的说明

SYSCALL—Fast System Call
SYSCALL invokes an OS system-call handler at privilege level 0. It does so by loading RIP from the IA32_LSTAR
MSR (after saving the address of the instruction following SYSCALL into RCX). (The WRMSR instruction ensures
that the IA32_LSTAR MSR always contain a canonical address.)

API

当application通过syscall指令进入内核之后(这里略过了gdb通过ptrace设置SYSCALL_WORK_SYSCALL_TRACE的路径),会通过下面列出的调用链触发ptrace_report_syscall_entry。ptrace_report_syscall_entry函数会让被调试进程以SIGTRAP的形式停下来并通知调试器,调试器捕捉到该事件后再做后续处理。

也就是说:gdb实现该功能,主要是依赖内核提供的功能。

do_syscall_64>>syscall_enter_from_user_mode>>syscall_enter_from_user_mode_work==>>syscall_trace_enter

/*
 * Run the syscall-entry work selected by the bits in @work.
 *
 * @regs:    user register state of the task entering the syscall.
 * @syscall: syscall number as seen by the caller; it is re-read from @regs
 *           after ptrace/seccomp and again after the tracepoint.
 * @work:    SYSCALL_WORK_* bit mask selecting which steps run.
 *
 * Order of steps: syscall user dispatch, ptrace reporting, seccomp,
 * sys_enter tracepoint, audit.
 *
 * Returns -1L when user dispatch handled the call, when the ptrace report
 * returned non-zero or SYSCALL_WORK_SYSCALL_EMU is set, or when seccomp
 * returned -1.  Otherwise returns ret if it is non-zero, else the
 * (possibly updated) syscall number.
 */
long syscall_trace_enter(struct pt_regs *regs, long syscall,
				unsigned long work)
{
	long ret = 0;

	/*
	 * Handle Syscall User Dispatch.  This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/*
	 * Handle ptrace: this is the step the article attributes gdb's
	 * "catch syscall" to.  A non-zero report result, or emulation mode,
	 * ends the entry work with -1.
	 */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing(NULL);
		/* Only -1 ends the entry work here; other values fall through. */
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
		trace_sys_enter(regs, syscall);
		/*
		 * Probes or BPF hooks in the tracepoint may have changed the
		 * system call number as well.
		 */
		syscall = syscall_get_nr(current, regs);
	}

	syscall_enter_audit(regs, syscall);

	/* GNU "?:" extension: yields ret when it is non-zero, else syscall. */
	return ret ? : syscall;
}

int 0x80

我对系统调用触发方式的印象还停留在int 0x80,现在才发现这种方式已经不再是默认的方式,取而代之的是更新、更高效的syscall指令。

那么在x86_64体系下,int 0x80这种方式还能继续使用吗?

SO的答案回答了该问题:

int 0x80 works as long as all arguments (including pointers) fit in the low 32 of a register. This is the case for static code and data in the default code model ("small") in the x86-64 SysV ABI. (Section 3.5.1 : all symbols are known to be located in the virtual addresses in the range 0x00000000 to 0x7effffff, so you can do stuff like mov edi, hello (AT&T mov $hello, %edi) to get a pointer into a register with a 5 byte instruction).

回复中信息很多,可能也有些过时,简言之就是不要这么用。

在新的内核(v6.9-rc7)版本中,是否初始化int 0x80的处理表是通过ia32_enabled()函数控制的。

/// @file: arch/x86/kernel/idt.c
/*
 * IDT entries for the legacy IA32 system-call vector.  The handler placed
 * on IA32_SYSCALL_VECTOR depends on the build configuration; when neither
 * option is defined the table is empty.
 */
static const struct idt_data ia32_idt[] __initconst = {
#if defined(CONFIG_IA32_EMULATION)
	SYSG(IA32_SYSCALL_VECTOR,	asm_int80_emulation),
#elif defined(CONFIG_X86_32)
	SYSG(IA32_SYSCALL_VECTOR,	entry_INT80_32),
#endif
};

/**
 * idt_setup_traps - Initialize the idt table with default traps
 *
 * Installs the entries from def_idts unconditionally, then installs the
 * entries from ia32_idt only when ia32_enabled() returns true.
 */
void __init idt_setup_traps(void)
{
	idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);

	/* The IA32 system-call vector is installed only when IA32 support is on. */
	if (ia32_enabled())
		idt_setup_from_table(idt_table, ia32_idt, ARRAY_SIZE(ia32_idt), true);
}

/*
 * ia32_enabled() / ia32_disable(): query and switch off IA32 support.
 *
 * With CONFIG_IA32_EMULATION the state lives in the __ia32_enabled flag.
 * Without it the answer is fixed by whether CONFIG_X86_32 is enabled, and
 * ia32_disable() does nothing.
 */
#ifdef CONFIG_IA32_EMULATION

/* Defined elsewhere; only declared here. */
extern bool __ia32_enabled;

/* Returns the current value of the __ia32_enabled flag. */
static __always_inline bool ia32_enabled(void)
{
	return __ia32_enabled;
}

/* Clears the __ia32_enabled flag. */
static inline void ia32_disable(void)
{
	__ia32_enabled = false;
}

#else /* !CONFIG_IA32_EMULATION */

/* Constant answer: true exactly when CONFIG_X86_32 is enabled. */
static __always_inline bool ia32_enabled(void)
{
	return IS_ENABLED(CONFIG_X86_32);
}

/* Nothing to disable in this configuration. */
static inline void ia32_disable(void) {}

#endif

#ifdef CONFIG_IA32_EMULATION
/*
 * Initial state of IA32 emulation: true unless
 * CONFIG_IA32_EMULATION_DEFAULT_DISABLED is enabled.
 */
bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);

/*
 * Handler for the "ia32_emulation" early parameter: parses @arg as a
 * boolean with kstrtobool(), stores the result in __ia32_enabled, and
 * returns kstrtobool()'s result.
 */
static int ia32_emulation_override_cmdline(char *arg)
{
	return kstrtobool(arg, &__ia32_enabled);
}
early_param("ia32_emulation", ia32_emulation_override_cmdline);
#endif

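相应的缺省值定义在arch/x86/Kconfig文件中:IA32_EMULATION_DEFAULT_DISABLED的默认值为n,因此只要编译时打开了IA32_EMULATION(其帮助文本也建议打开),__ia32_enabled的初始值就是true,也就是说在x86_64环境下该功能大概率默认是打开的。当然也可以在启动时给内核传递ia32_emulation参数(参数值由kstrtobool解析为布尔值)来关掉该功能。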

///@file: arch/x86/Kconfig
config IA32_EMULATION
    bool "IA32 Emulation"
    depends on X86_64
    select ARCH_WANT_OLD_COMPAT_IPC
    select BINFMT_ELF
    select COMPAT_OLD_SIGACTION
    help 
      Include code to run legacy 32-bit programs under a
      64-bit kernel. You should likely turn this on, unless you're
      100% sure that you don't have any 32-bit programs left.

config IA32_EMULATION_DEFAULT_DISABLED
    bool "IA32 emulation disabled by default"
    default n
    depends on IA32_EMULATION
    help 
      Make IA32 emulation disabled by default. This prevents loading 32-bit
      processes and access to 32-bit syscalls. If unsure, leave it to its
      default value.

glibc

glibc的实现已经毫无意外地改为使用syscall指令(而不是int 0x80)了。从下面的反汇编可以看到,open64把系统调用号0x101(即257,对应x86_64上的openat)放入eax之后,直接执行了syscall指令。

tsecer@harry: gdb -quiet /lib64/libc.so.6 -ex 'disassemble open'
Reading symbols from /lib64/libc.so.6...
(No debugging symbols found in /lib64/libc.so.6)
Dump of assembler code for function open64:
   0x000000000011f680 <+0>:     endbr64 
   0x000000000011f684 <+4>:     sub    $0x68,%rsp
   0x000000000011f688 <+8>:     mov    %esi,%r10d
   0x000000000011f68b <+11>:    mov    %rdx,0x40(%rsp)
   0x000000000011f690 <+16>:    mov    %fs:0x28,%rax
   0x000000000011f699 <+25>:    mov    %rax,0x28(%rsp)
   0x000000000011f69e <+30>:    xor    %eax,%eax
   0x000000000011f6a0 <+32>:    and    $0x40,%r10d
   0x000000000011f6a4 <+36>:    jne    0x11f700 <open64+128>
   0x000000000011f6a6 <+38>:    mov    %esi,%eax
   0x000000000011f6a8 <+40>:    and    $0x410000,%eax
   0x000000000011f6ad <+45>:    cmp    $0x410000,%eax
   0x000000000011f6b2 <+50>:    je     0x11f700 <open64+128>
   0x000000000011f6b4 <+52>:    lea    0x2a5075(%rip),%rax        # 0x3c4730 <__libc_multiple_threads>
   0x000000000011f6bb <+59>:    mov    (%rax),%eax
   0x000000000011f6bd <+61>:    test   %eax,%eax
   0x000000000011f6bf <+63>:    jne    0x11f72e <open64+174>
   0x000000000011f6c1 <+65>:    mov    %esi,%edx
   0x000000000011f6c3 <+67>:    mov    $0x101,%eax
   0x000000000011f6c8 <+72>:    mov    %rdi,%rsi
   0x000000000011f6cb <+75>:    mov    $0xffffff9c,%edi
   0x000000000011f6d0 <+80>:    syscall 
   0x000000000011f6d2 <+82>:    cmp    $0xfffffffffffff000,%rax

outro

这个功能也是另一个常用系统工具strace的实现基础。

结合之前关于ABI寄存器使用约定,x86的32bits和64bits下还是有一些微妙的差别的。

虽然内核的基本原理没啥变化,但是“And yet it moves”。

posted on 2024-06-15 18:40  tsecer  阅读(8)  评论(0)  编辑  收藏  举报

导航