深入理解系统调用

1. 实验环境

实验采用阿里云主机完成,操作系统是Ubuntu18.04,下载好linux-5.4.34.tar.xz。

相关配置:

编译源代码:

xz -d linux-5.4.34.tar.xz
tar -xvf linux-5.4.34.tar
cd linux-5.4.34
make defconfig # Default configuration is based on 'x86_64_defconfig'
make menuconfig
#打开debug相关选项
Kernel hacking  ---> 
    Compile-time checks and compiler options  ---> 
       [*] Compile the kernel with debug info 
       [*]   Provide GDB scripts for kernel debugging
[*] Kernel debugging 
#关闭KASLR,否则会导致打断点失败
Processor type and features ----> 
   [] Randomize the address of the kernel image (KASLR)
make -j4
qemu-system-x86_64 -kernel arch/x86/boot/bzImage

制作根⽂件系统

axel -n 20 https://busybox.net/downloads/busybox-1.31.1.tar.bz2 
tar -jxvf busybox-1.31.1.tar.bz2 
cd busybox-1.31.1

make menuconfig 
#记得要编译成静态链接,不⽤动态链接库。
Settings  --->
    [*] Build static binary (no shared libs) 
#然后编译安装,默认会安装到源码⽬录下的 _install ⽬录中。 
make -j$(nproc) && make install
cd ..   # 回到 busybox-1.31.1 的上一级目录,保证后面的 ../busybox-1.31.1 路径有效
mkdir rootfs
cd rootfs
cp ../busybox-1.31.1/_install/* ./ -rf
mkdir dev proc sys home
sudo cp -a /dev/{null,console,tty,tty1,tty2,tty3,tty4} dev/

运行测试

find . -print0 | cpio --null -ov --format=newc | gzip -9 > ../rootfs.cpio.gz
qemu-system-x86_64 -kernel linux-5.4.34/arch/x86/boot/bzImage -initrd rootfs.cpio.gz

2. 系统调用程序

我的学号后两位是06,在linux-5.4.34/arch/x86/entry/syscalls目录下查阅syscall_64.tbl文件,06号系统调用为__x64_sys_newlstat,对应的函数是lstat。

0       common  read                    __x64_sys_read
1       common  write                   __x64_sys_write
2       common  open                    __x64_sys_open
3       common  close                   __x64_sys_close
4       common  stat                    __x64_sys_newstat
5       common  fstat                   __x64_sys_newfstat
6       common  lstat                   __x64_sys_newlstat
7       common  poll                    __x64_sys_poll
8       common  lseek                   __x64_sys_lseek
9       common  mmap                    __x64_sys_mmap
10      common  mprotect                __x64_sys_mprotect

查看man lstat:

NAME
       stat, fstat, lstat, fstatat - get file status

SYNOPSIS
       #include <sys/types.h>
       #include <sys/stat.h>
       #include <unistd.h>

       int stat(const char *pathname, struct stat *statbuf);
       int fstat(int fd, struct stat *statbuf);
       int lstat(const char *pathname, struct stat *statbuf);

       #include <fcntl.h>           /* Definition of AT_* constants */
       #include <sys/stat.h>

       int fstatat(int dirfd, const char *pathname, struct stat *statbuf,
                   int flags);

   Feature Test Macro Requirements for glibc (see feature_test_macros(7)):

       lstat():
           /* glibc 2.19 and earlier */ _BSD_SOURCE
               || /* Since glibc 2.20 */ _DEFAULT_SOURCE
               || _XOPEN_SOURCE >= 500
               || /* Since glibc 2.10: */ _POSIX_C_SOURCE >= 200112L

       fstatat():
           Since glibc 2.10:
               _POSIX_C_SOURCE >= 200809L
           Before glibc 2.10:
               _ATFILE_SOURCE

DESCRIPTION
       These  functions  return information about a file, in the buffer pointed to by statbuf.  No permissions are required on the file itself, but—in the case of stat(), fstatat(), and
       lstat()—execute (search) permission is required on all of the directories in pathname that lead to the file.

       stat() and fstatat() retrieve information about the file pointed to by pathname; the differences for fstatat() are described below.

       lstat() is identical to stat(), except that if pathname is a symbolic link, then it returns information about the link itself, not the file that it refers to.

       fstat() is identical to stat(), except that the file about which information is to be retrieved is specified by the file descriptor fd.

c代码调用:

/*
 * Print the file metadata that lstat(2) reports for the path in argv[1].
 *
 * Usage: prog <pathname>
 *
 * Exits with EXIT_FAILURE when the argument count is wrong or lstat() fails,
 * and with EXIT_SUCCESS once every field has been printed.
 */
int main(int argc, char *argv[]) {
  struct stat sb;

  if (argc != 2) {
    fprintf(stderr, "Usage: %s <pathname>\n", argv[0]);
    exit(EXIT_FAILURE);
  }

  /* lstat() describes a symbolic link itself rather than its target. */
  if (lstat(argv[1], &sb) == -1) {
    perror("lstat");
    exit(EXIT_FAILURE);
  }

  /* %lx takes an unsigned long, so cast to the matching unsigned type. */
  printf("ID of containing device:  [%lx,%lx]\n",
         (unsigned long) major(sb.st_dev), (unsigned long) minor(sb.st_dev));

  printf("File type:                ");

  /* S_IFMT masks out the permission bits and leaves only the file type. */
  switch (sb.st_mode & S_IFMT) {
    case S_IFBLK:
      printf("block device\n");
      break;
    case S_IFCHR:
      printf("character device\n");
      break;
    case S_IFDIR:
      printf("directory\n");
      break;
    case S_IFIFO:
      printf("FIFO/pipe\n");
      break;
    case S_IFLNK:
      printf("symlink\n");
      break;
    case S_IFREG:
      printf("regular file\n");
      break;
    case S_IFSOCK:
      printf("socket\n");
      break;
    default:
      printf("unknown?\n");
      break;
  }

  printf("I-node number:            %ld\n", (long) sb.st_ino);

  printf("Mode:                     %lo (octal)\n",
         (unsigned long) sb.st_mode);
  printf("Link count:               %ld\n", (long) sb.st_nlink);
  printf("Ownership:                UID=%ld   GID=%ld\n",
         (long) sb.st_uid, (long) sb.st_gid);

  printf("Preferred I/O block size: %ld bytes\n",
         (long) sb.st_blksize);
  printf("File size:                %lld bytes\n",
         (long long) sb.st_size);
  printf("Blocks allocated:         %lld\n",
         (long long) sb.st_blocks);

  /* ctime() output already ends with '\n', so the formats add none. */
  printf("Last status change:       %s", ctime(&sb.st_ctime));
  printf("Last file access:         %s", ctime(&sb.st_atime));
  printf("Last file modification:   %s", ctime(&sb.st_mtime));

  exit(EXIT_SUCCESS);
}

汇编代码调用:

/*
 * Same program as the library version, but lstat is invoked directly with
 * the x86-64 `syscall` instruction instead of through the C library wrapper.
 *
 * Usage: prog <pathname>
 *
 * Exits with EXIT_FAILURE when the argument count is wrong or the system
 * call fails, and with EXIT_SUCCESS once every field has been printed.
 */
int main(int argc, char *argv[]) {
  struct stat sb;

  if (argc != 2) {
    fprintf(stderr, "Usage: %s <pathname>\n", argv[0]);
    exit(EXIT_FAILURE);
  }

  /*
   * x86-64 system call convention: the call number goes in rax, the first
   * two arguments in rdi and rsi, and the result comes back in rax.  The
   * `syscall` instruction itself overwrites rcx and r11, and the kernel
   * writes into sb, hence the clobber list.
   *
   * `int $0x80` must not be used here: in a 64-bit process it enters the
   * 32-bit compatibility entry, where number 6 is close() rather than
   * lstat() and pointer arguments are truncated to 32 bits.
   */
  long res; /* full 64-bit register value: 0 on success, -errno on failure */
  __asm__ volatile(
    "syscall"
    : "=a"(res)
    : "a"(6L),        /* 6 = lstat in syscall_64.tbl */
      "D"(argv[1]),   /* first argument: pathname */
      "S"(&sb)        /* second argument: result buffer */
    : "rcx", "r11", "memory"
  );

  /*
   * A raw system call reports failure as a negative errno value in rax and
   * never touches the C library's errno, so perror() would be misleading.
   */
  if (res < 0) {
    fprintf(stderr, "lstat: system call failed, errno %ld\n", -res);
    exit(EXIT_FAILURE);
  }

  /* %lx takes an unsigned long, so cast to the matching unsigned type. */
  printf("ID of containing device:  [%lx,%lx]\n",
         (unsigned long) major(sb.st_dev), (unsigned long) minor(sb.st_dev));

  printf("File type:                ");

  /* S_IFMT masks out the permission bits and leaves only the file type. */
  switch (sb.st_mode & S_IFMT) {
    case S_IFBLK:
      printf("block device\n");
      break;
    case S_IFCHR:
      printf("character device\n");
      break;
    case S_IFDIR:
      printf("directory\n");
      break;
    case S_IFIFO:
      printf("FIFO/pipe\n");
      break;
    case S_IFLNK:
      printf("symlink\n");
      break;
    case S_IFREG:
      printf("regular file\n");
      break;
    case S_IFSOCK:
      printf("socket\n");
      break;
    default:
      printf("unknown?\n");
      break;
  }

  printf("I-node number:            %ld\n", (long) sb.st_ino);

  printf("Mode:                     %lo (octal)\n",
         (unsigned long) sb.st_mode);
  printf("Link count:               %ld\n", (long) sb.st_nlink);
  printf("Ownership:                UID=%ld   GID=%ld\n",
         (long) sb.st_uid, (long) sb.st_gid);

  printf("Preferred I/O block size: %ld bytes\n",
         (long) sb.st_blksize);
  printf("File size:                %lld bytes\n",
         (long long) sb.st_size);
  printf("Blocks allocated:         %lld\n",
         (long long) sb.st_blocks);

  /* ctime() output already ends with '\n', so the formats add none. */
  printf("Last status change:       %s", ctime(&sb.st_ctime));
  printf("Last file access:         %s", ctime(&sb.st_atime));
  printf("Last file modification:   %s", ctime(&sb.st_mtime));

  exit(EXIT_SUCCESS);
}

3.GDB跟踪系统调用的内核处理过程

启动虚拟机:

qemu-system-x86_64 -kernel linux-5.4.34/arch/x86/boot/bzImage -initrd rootfs.cpio.gz -S -s

在另一个终端调试:

cd linux-5.4.34/
gdb vmlinux
(gdb) target remote:1234
(gdb) b __x64_sys_newlstat

首先断点定位到fs/stat.c:

/*
 * newlstat system call (entry 6, "lstat", in syscall_64.tbl; the generated
 * entry point is the __x64_sys_newlstat symbol used for the breakpoint).
 *
 * filename: user-space pointer to the path to examine.
 * statbuf:  user-space pointer to the struct stat that receives the result.
 *
 * Returns the error from vfs_lstat() when it is non-zero, otherwise the
 * return value of cp_new_stat().
 */
SYSCALL_DEFINE2(newlstat, const char __user *, filename,
		struct stat __user *, statbuf)
{
	struct kstat stat;	/* kernel-side result, filled in by vfs_lstat() */
	int error;

	error = vfs_lstat(filename, &stat);
	if (error)
		return error;

	/* NOTE(review): presumably converts the kstat and copies it out to
	 * the user buffer -- confirm against cp_new_stat() in fs/stat.c. */
	return cp_new_stat(&stat, statbuf);
}

执行完这个函数,发现回到了函数堆栈上一层的do_syscall_64()中,接下来要执行的syscall_return_slowpath()函数要为恢复现场做准备。函数的调用顺序为:entry_SYSCALL_64() —> do_syscall_64() —> __x64_sys_newlstat()

(gdb) n
do_syscall_64 (nr=140730116323824, regs=0xffffc900001b7f58)
    at arch/x86/entry/common.c:300
300        syscall_return_slowpath(regs);
(gdb) n
301    }
(gdb) n
entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:184
184        movq    RCX(%rsp), %rcx
(gdb) n
185        movq    RIP(%rsp), %r11
(gdb) n
187        cmpq    %rcx, %r11    /* SYSRET requires RCX == RIP */
(gdb) n
188        jne    swapgs_restore_regs_and_return_to_usermode
(gdb) n
205        shl    $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
(gdb) n
206        sar    $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
(gdb) n
210        cmpq    %rcx, %r11
(gdb) n
211        jne    swapgs_restore_regs_and_return_to_usermode
(gdb) n
213        cmpq    $__USER_CS, CS(%rsp)        /* CS must match SYSRET */
(gdb) n
214        jne    swapgs_restore_regs_and_return_to_usermode
(gdb) n
216        movq    R11(%rsp), %r11
(gdb) n
217        cmpq    %r11, EFLAGS(%rsp)        /* R11 == RFLAGS */
(gdb) n
218        jne    swapgs_restore_regs_and_return_to_usermode
(gdb) n
238        testq    $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
(gdb) n
239        jnz    swapgs_restore_regs_and_return_to_usermode
(gdb) n
243        cmpq    $__USER_DS, SS(%rsp)        /* SS must match SYSRET */
(gdb) n
244        jne    swapgs_restore_regs_and_return_to_usermode
(gdb) n
253        POP_REGS pop_rdi=0 skip_r11rcx=1
(gdb) n
entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:259
259        movq    %rsp, %rdi
(gdb) n
260        movq    PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
(gdb) n
entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:262
262        pushq    RSP-RDI(%rdi)    /* RSP */
(gdb) n
entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:263
263        pushq    (%rdi)        /* RDI */
(gdb) n
entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:271
271        SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
(gdb) n
273        popq    %rdi
(gdb) n
entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:274
274        popq    %rsp
(gdb) n
entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:275
275        USERGS_SYSRET64
(gdb) n
0x0000000000448b97 in ?? ()
(gdb) c
Continuing.
(gdb) 

4.总结

系统调⽤的初始化,也就是将系统调⽤处理⼊⼝地址告诉CPU到哪⾥找:

int $0x80是放到和其他中断⼀样按中断向量依次存放

sysenter和syscall都借助CPU内部的MSR寄存器来存放,所以查找 系统调⽤处理⼊⼝地址会更快,因此也称为快速系统调⽤

系统调⽤的执⾏,也就是⽤户程序触发系统调⽤之后,CPU及内核执⾏系统调⽤的过程:

int $0x80是CPU压栈一些关键寄存器,接着内核负责保存现场,系统调用内核函数处理完后恢复现场,最后通过iret出栈那些CPU压栈的关键寄存器。

sysenter和syscall都借助CPU内部的MSR寄存器来查找系统调⽤处理⼊⼝,可 以快速切换CPU的指令指针(eip/rip)到系统调⽤处理⼊⼝,但本质上还是中 断处理的思路,压栈关键寄存器、保存现场、恢复现场,最后系统调⽤返回。

x86-64引入了swapgs指令,类似快照的方式将保存现场和恢复现场时的CPU寄存器也通过CPU内部的存储器快速保存和恢复,进一步加快了系统调用。

从系统调用的整个过程来看,主要有以下几个阶段:

(1)用户态程序,发生syscall,触发系统调用;

(2)进入内核态,完成内核初始化后,调用entry_SYSCALL_64 ()

(3)完成现场的保存,将关键寄存器压栈,并从CPU内部的MSR寄存器来查找系统调⽤处理⼊⼝,更改CPU的指令指针(eip/rip)到系统调⽤处理⼊⼝ ,调用do_syscall_64()

(4)do_syscall_64()函数中得到系统调用号,调用对应的内核处理函数,本实验中即__x64_sys_newlstat()

(5)调用结束后,保存现场和恢复现场时的CPU寄存器也通过CPU内部的存储器快速保存和恢复

(6)系统调用返回,回到用户态程序

posted on 2020-05-26 12:05  galvinchan  阅读(532)  评论(0编辑  收藏  举报

导航