深入理解系统调用

　　本文将基于Linux内核通过调试跟踪，深入理解Linux的系统调用过程。本人学号04结尾，在arch/x86/entry/syscalls/syscall_64.tbl 可以找到04号为stat系统调⽤，因此以系统调用stat为例进行展开。

一、实验目的

1.找一个系统调用，系统调用号为学号最后2位相同的系统调用，本人学号最后两位为04，即要测试的系统调用号为04

2.通过汇编指令触发该系统调用

3.通过gdb跟踪该系统调用的内核处理过程

4.重点阅读分析系统调用入口的保存现场、恢复现场和系统调用返回，以及重点关注系统调用过程中内核堆栈状态的变化

二、环境准备

安装开发工具：

sudo apt install build-essential
sudo apt install qemu # install QEMU
sudo apt install libncurses5-dev bison ﬂex libssl-dev libelf-dev

下载内核源码：

sudo apt install axel
axel -n 20 https://mirrors.edge.kernel.org/pub/linux/kernel/v5.x/ linux-5.4.34.tar.xz 
xz -d linux-5.4.34.tar.xz 
tar -xvf linux-5.4.34.tar 
cd linux-5.4.34

配置内核选项：

make defconfig #Default configuration is based on 'x86_64_defconfig'
make menuconfig

Kernel hacking > Compile-time check and compiler options > Provide GDB scripts for kernel debugging 打开
返回上一页勾选Kernel debugging
返回首页，选中Processor type and features ---->Randomize the address of the kernel image (KASLR) 关闭

编译和运行内核：

make -j$(nproc)
qemu-system-x86_64 -kernel arch/x86/boot/bzImage

制作根文件系统：

axel -n 20 https://busybox.net/downloads/busybox-1.31.1.tar.bz2 
tar -jxvf busybox-1.31.1.tar.bz2 
cd busybox-1.31.1

make menuconﬁg 
#记得要编译成静态链接，不⽤动态链接库。
Settings  --->
    [*] Build static binary (no shared libs) 
#然后编译安装，默认会安装到源码⽬录下的 _install ⽬录中。 
make -j$(nproc) && make install

制作内存根文件系统镜像：

#返回到~/目录下，创建根文件目录
mkdir rootfs
cd rootfs
cp ../busybox-1.31.1/_install/* ./ -rf
mkdir dev proc sys home 
sudo cp -a /dev/{null,console,tty,tty1,tty2,tty3,tty4} dev/

在rootfs目录下制作init脚本：

touch init
vim init
 #!/bin/sh
 mount -t proc none /proc 
 mount -t sysfs none /sys
 echo "Wellcome MyOS!"
 echo "--------------------" 
 cd home
 /bin/sh

给init脚本添加可执行权限：

chmod +x init

打包成内存根文件系统镜像：

find . -print0 | cpio --null -ov --format=newc | gzip -9 > ../rootfs.cpio.gz

测试挂载根文件系统，看内核启动完成后是否执行init脚本：

qemu-system-x86_64 -kernel linux-5.4.34/arch/x86/boot/bzImage -initrd rootfs.cpio.gz

运行结果如下所示：

三、查看系统调用并编写调用汇编代码

打开/linux-5.4.34/arch/x86/entry/syscalls/syscall_64.tbl

函数声明：int stat(const char *file_name, struct stat *buf)

函数功能说明: 通过文件名filename获取文件信息，并保存在buf所指的结构体stat中

返回值: 执行成功则返回0，失败返回-1，错误代码存于errno

编写测试程序

/* file stat example */  
  
#include <stdio.h>  
#include <unistd.h>  
#include <sys/stat.h>  
#include <sys/types.h>  
  
#include <stdlib.h>  
#include <time.h>  
  
int main(int argc, char **argv){  
  
  struct stat st;  
  
  if(argc != 2){  
    fprintf(stderr, "Usage: %s <file_pathname> \n", argv[0]);  
        exit(EXIT_FAILURE);  
    }  
  
    if(stat(argv[1], &st) == -1){  
        perror("stat");  
        exit(EXIT_SUCCESS);  
    }  
  
    printf("File type:                ");  
    switch(st.st_mode & S_IFMT){  
      case S_IFBLK:  printf("block device\n");            break;  
      case S_IFCHR:  printf("character device\n");        break;  
      case S_IFDIR:  printf("directory\n");               break;  
      case S_IFIFO:  printf("FIFO/pipe\n");               break;  
      case S_IFLNK:  printf("symlink\n");                 break;  
      case S_IFREG:  printf("regular file\n");            break;  
      case S_IFSOCK: printf("socket\n");                  break;  
      default:       printf("unknown?\n");                break;  
  }  
    
  printf("I-node number:            %ld\n", (long) st.st_ino);  
  printf("Mode:                     %lo (octal)\n", (unsigned long) st.st_mode);  
  printf("Link count:               %ld\n", (long) st.st_nlink);  
  printf("Ownership:                UID=%ld   GID=%ld\n", (long) st.st_uid, (long) st.st_gid);  
  printf("device containing file id:%ld \n", (long) st.st_dev);  
  printf("device id:                %ld \n", (long) st.st_rdev);  
  printf("File size:                %lld bytes\n", (long long) st.st_size);  
  printf("Preferred I/O block size: %ld bytes\n", (long) st.st_blksize);  
  printf("Blocks allocated:         %lld\n", (long long) st.st_blocks);  
  printf("Last status change:       %s", ctime(&st.st_ctime));  
  printf("Last file access:         %s", ctime(&st.st_atime));  
  printf("Last file modification:   %s", ctime(&st.st_mtime));  
  
  exit(EXIT_SUCCESS);  
}

然后静态编译，并且反汇编存入文件example.S中。

gcc -o example example.c -static 
objdump -S example > example.S

打开openfile.S文件，找到main函数，顺着函数调用可以找到syscall指令。

通过调试程序发现调用stat()函数后，会跳转到 ../sysdeps/unix/sysv/linux/wordsize-64/fxstat.c 文件里的__fxstat()函数，最终调用 INLINE_SYSCALL() 函数，其中会调用syscall指令触发系统调用中断。

#include <errno.h>
#include <stddef.h>
#include <sys/stat.h>

#include <sysdep.h>
#include <sys/syscall.h>

/* Get information about the file FD in BUF.  */
int
__fxstat (int vers, int fd, struct stat *buf)
{
  if (vers == _STAT_VER_KERNEL || vers == _STAT_VER_LINUX)
    return INLINE_SYSCALL (fstat, 2, fd, buf);

  __set_errno (EINVAL);
  return -1;
}

hidden_def (__fxstat)
weak_alias (__fxstat, _fxstat);
#undef __fxstat64
strong_alias (__fxstat, __fxstat64);
hidden_ver (__fxstat, __fxstat64)

四、系统调用初始化

　　系统启动时会调用arch/x86/kernel/cpu/common.c 的 syscall_init()函数进行初始化。可以看到 MSR_STAR 的第 32-47 位设置为 kernel mode 的 cs，48-63位设置为 user mode 的 cs。而 IA32_LSTAR 被设置为函数 entry_SYSCALL_64 的起始地址（需要注意intel64位增加了MSR寄存器，用于快速找到系统调用的处理函数，这与32位机不同）。于是 syscall 时，跳转到 entry_SYSCALL_64 开始执行，其定义在 arch/x86/entry/entry_64.S中。

void syscall_init(void)
{
    wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
    wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);

#ifdef CONFIG_IA32_EMULATION
    wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
    /*
     * This only works on Intel CPUs.
     * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
     * This does not cause SYSENTER to jump to the wrong location, because
     * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
     */
    wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
    wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
    wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
    wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
    wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
    wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
    wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif

    /* Flags to clear on syscall */
    wrmsrl(MSR_SYSCALL_MASK,
           X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
           X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}

五、entry_SYSCALL_64

　　entry_SYSCALL_64作为系统调用（属于软中断）的处理函数，它的源代码如下：

ENTRY(entry_SYSCALL_64)
    /*
     * Interrupts are off on entry.
     * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
     * it is too small to ever cause noticeable irq latency.
     */
    SWAPGS_UNSAFE_STACK
    // KAISER 进内核态需要切到内核页表
    SWITCH_KERNEL_CR3_NO_STACK
    /*
     * A hypervisor implementation might want to use a label
     * after the swapgs, so that it can do the swapgs
     * for the guest and jump here on syscall.
     */
GLOBAL(entry_SYSCALL_64_after_swapgs)
    // 将用户栈偏移保存到 per-cpu 变量 rsp_scratch 中
    movq    %rsp, PER_CPU_VAR(rsp_scratch)
    // 加载内核栈偏移
    movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp

    TRACE_IRQS_OFF

    /* Construct struct pt_regs on stack */
    pushq   $__USER_DS          /* pt_regs->ss */
    pushq   PER_CPU_VAR(rsp_scratch)    /* pt_regs->sp */
    pushq   %r11                /* pt_regs->flags */
    pushq   $__USER_CS          /* pt_regs->cs */
    pushq   %rcx                /* pt_regs->ip */
    pushq   %rax                /* pt_regs->orig_ax */
    pushq   %rdi                /* pt_regs->di */
    pushq   %rsi                /* pt_regs->si */
    pushq   %rdx                /* pt_regs->dx */
    pushq   %rcx                /* pt_regs->cx */
    pushq   $-ENOSYS            /* pt_regs->ax */
    pushq   %r8             /* pt_regs->r8 */
    pushq   %r9             /* pt_regs->r9 */
    pushq   %r10                /* pt_regs->r10 */
    pushq   %r11                /* pt_regs->r11 */
    // 为r12-r15, rbp, rbx保留位置
    sub $(6*8), %rsp            /* pt_regs->bp, bx, r12-15 not saved */

    /*
     * If we need to do entry work or if we guess we'll need to do
     * exit work, go straight to the slow path.
     */
    movq    PER_CPU_VAR(current_task), %r11
    testl   $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
    jnz entry_SYSCALL64_slow_path

entry_SYSCALL_64_fastpath:
    /*
     * Easy case: enable interrupts and issue the syscall.  If the syscall
     * needs pt_regs, we'll call a stub that disables interrupts again
     * and jumps to the slow path.
     */
    TRACE_IRQS_ON
    ENABLE_INTERRUPTS(CLBR_NONE)
#if __SYSCALL_MASK == ~0
    // 确保系统调用号没超过最大值，超过了则跳转到后面的符号 1 处进行返回
    cmpq    $__NR_syscall_max, %rax
#else
    andl    $__SYSCALL_MASK, %eax
    cmpl    $__NR_syscall_max, %eax
#endif
    ja  1f              /* return -ENOSYS (already in pt_regs->ax) */
    // 除系统调用外的其他调用都通过 rcx 来传第四个参数，因此将 r10 的内容设置到 rcx
    movq    %r10, %rcx

    /*
     * This call instruction is handled specially in stub_ptregs_64.
     * It might end up jumping to the slow path.  If it jumps, RAX
     * and all argument registers are clobbered.
     */
    // 调用系统调用表中对应的函数
    call    *sys_call_table(, %rax, 8)
.Lentry_SYSCALL_64_after_fastpath_call:
    // 将函数返回值压到栈中，返回时弹出
    movq    %rax, RAX(%rsp)
1:

    /*
     * If we get here, then we know that pt_regs is clean for SYSRET64.
     * If we see that no exit work is required (which we are required
     * to check with IRQs off), then we can go straight to SYSRET64.
     */
    DISABLE_INTERRUPTS(CLBR_NONE)
    TRACE_IRQS_OFF
    movq    PER_CPU_VAR(current_task), %r11
    testl   $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
    jnz 1f

    LOCKDEP_SYS_EXIT
    TRACE_IRQS_ON       /* user mode is traced as IRQs on */
    movq    RIP(%rsp), %rcx
    movq    EFLAGS(%rsp), %r11
    RESTORE_C_REGS_EXCEPT_RCX_R11
    /*
     * This opens a window where we have a user CR3, but are
     * running in the kernel.  This makes using the CS
     * register useless for telling whether or not we need to
     * switch CR3 in NMIs.  Normal interrupts are OK because
     * they are off here.
     */
    SWITCH_USER_CR3
    movq    RSP(%rsp), %rsp
    USERGS_SYSRET64

1:
    /*
     * The fast path looked good when we started, but something changed
     * along the way and we need to switch to the slow path.  Calling
     * raise(3) will trigger this, for example.  IRQs are off.
     */
    TRACE_IRQS_ON
    ENABLE_INTERRUPTS(CLBR_NONE)
    SAVE_EXTRA_REGS
    movq    %rsp, %rdi
    call    syscall_return_slowpath /* returns with IRQs disabled */
    jmp return_from_SYSCALL_64

entry_SYSCALL64_slow_path:
    /* IRQs are off. */
    SAVE_EXTRA_REGS
    movq    %rsp, %rdi
    call    do_syscall_64       /* returns with IRQs disabled */

return_from_SYSCALL_64:
    RESTORE_EXTRA_REGS
    TRACE_IRQS_IRETQ        /* we're about to change IF */

    /*
     * Try to use SYSRET instead of IRET if we're returning to
     * a completely clean 64-bit userspace context.
     */
    movq    RCX(%rsp), %rcx
    movq    RIP(%rsp), %r11
    cmpq    %rcx, %r11          /* RCX == RIP */
    jne opportunistic_sysret_failed

    /*
     * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
     * in kernel space.  This essentially lets the user take over
     * the kernel, since userspace controls RSP.
     *
     * If width of "canonical tail" ever becomes variable, this will need
     * to be updated to remain correct on both old and new CPUs.
     */
    .ifne __VIRTUAL_MASK_SHIFT - 47
    .error "virtual address width changed -- SYSRET checks need update"
    .endif

    /* Change top 16 bits to be the sign-extension of 47th bit */
    shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
    sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx

    /* If this changed %rcx, it was not canonical */
    cmpq    %rcx, %r11
    jne opportunistic_sysret_failed

    cmpq    $__USER_CS, CS(%rsp)        /* CS must match SYSRET */
    jne opportunistic_sysret_failed

    movq    R11(%rsp), %r11
    cmpq    %r11, EFLAGS(%rsp)      /* R11 == RFLAGS */
    jne opportunistic_sysret_failed

    /*
     * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
     * restore RF properly. If the slowpath sets it for whatever reason, we
     * need to restore it correctly.
     *
     * SYSRET can restore TF, but unlike IRET, restoring TF results in a
     * trap from userspace immediately after SYSRET.  This would cause an
     * infinite loop whenever #DB happens with register state that satisfies
     * the opportunistic SYSRET conditions.  For example, single-stepping
     * this user code:
     *
     *           movq   $stuck_here, %rcx
     *           pushfq
     *           popq %r11
     *   stuck_here:
     *
     * would never get past 'stuck_here'.
     */
    testq   $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
    jnz opportunistic_sysret_failed

    /* nothing to check for RSP */

    cmpq    $__USER_DS, SS(%rsp)        /* SS must match SYSRET */
    jne opportunistic_sysret_failed

    /*
     * We win! This label is here just for ease of understanding
     * perf profiles. Nothing jumps here.
     */
syscall_return_via_sysret:
    /* rcx and r11 are already restored (see code above) */
    RESTORE_C_REGS_EXCEPT_RCX_R11
    /*
     * This opens a window where we have a user CR3, but are
     * running in the kernel.  This makes using the CS
     * register useless for telling whether or not we need to
     * switch CR3 in NMIs.  Normal interrupts are OK because
     * they are off here.
     */
    // KAISER 返回用户态需要切回用户页表
    SWITCH_USER_CR3
    /* 根据压栈的内容，恢复 rsp 为用户态的栈顶 */
    movq    RSP(%rsp), %rsp
    USERGS_SYSRET64

    // 无法快速返回，只能退化到 iret
opportunistic_sysret_failed:
    /*
     * This opens a window where we have a user CR3, but are
     * running in the kernel.  This makes using the CS
     * register useless for telling whether or not we need to
     * switch CR3 in NMIs.  Normal interrupts are OK because
     * they are off here.
     */
    SWITCH_USER_CR3
    SWAPGS
    jmp restore_c_regs_and_iret
END(entry_SYSCALL_64)

首先将当前用户态栈偏移 rsp 存到 per-cpu 变量 rsp_scratch 中，然后将 per-cpu 变量 cpu_current_top_of_stack ，即内核态的栈偏移加载到 rsp。随后将各寄存器中的值压入内核态的栈中，包括rax、rcx、r11、rdi、rsi、rdx、r10、r8、r9。接着根据系统调用号从系统调用表(sys_call_table) 中找到相应的处理函数并执行，最终通过 USERGS_SYSRET64 ，即 sysretq 返回。

posted @ 2020-05-27 22:55 LemonTreeKjy 阅读(747) 评论(0) 编辑收藏举报

刷新页面返回顶部

LemonTreeKjy

深入理解系统调用

深入理解系统调用

一、实验目的

二、环境准备

三、查看系统调用并编写调用汇编代码

四、系统调用初始化

五、entry_SYSCALL_64

公告