kernel源码(十一)system_call.s
1 系统调用流程图
2 源码
/*
 *  linux/kernel/system_call.s
 *
 *  (C) 1991  Linus Torvalds
 */

/*
 *  system_call.s  contains the system-call low-level handling routines.
 * This also contains the timer-interrupt handler, as some of the code is
 * the same. The hd- and floppy-interrupts are also here.
 *
 * NOTE: This code handles signal-recognition, which happens every time
 * after a timer-interrupt and after each system call. Ordinary interrupts
 * don't handle signal-recognition, as that would clutter them up totally
 * unnecessarily.
 *
 * Stack layout in 'ret_from_sys_call':
 *
 *	 0(%esp) - %eax
 *	 4(%esp) - %ebx
 *	 8(%esp) - %ecx
 *	 C(%esp) - %edx
 *	10(%esp) - %fs
 *	14(%esp) - %es
 *	18(%esp) - %ds
 *	1C(%esp) - %eip
 *	20(%esp) - %cs
 *	24(%esp) - %eflags
 *	28(%esp) - %oldesp
 *	2C(%esp) - %oldss
 */

SIG_CHLD	= 17

/* Byte offsets of the saved registers on the stack (see layout above). */
EAX		= 0x00
EBX		= 0x04
ECX		= 0x08
EDX		= 0x0C
FS		= 0x10
ES		= 0x14
DS		= 0x18
EIP		= 0x1C
CS		= 0x20
EFLAGS		= 0x24
OLDESP		= 0x28
OLDSS		= 0x2C

state	= 0		# these are offsets into the task-struct.
counter	= 4
priority = 8
signal	= 12
sigaction = 16		# MUST be 16 (=len of sigaction)
blocked = (33*16)

# offsets within sigaction
sa_handler = 0
sa_mask = 4
sa_flags = 8
sa_restorer = 12

# Number of entries accepted by _system_call: valid numbers in %eax are
# 0 .. nr_system_calls-1.
nr_system_calls = 72

/*
 * Ok, I get parallel printer interrupts while using the floppy for some
 * strange reason. Urgel. Now I just ignore them.
 */
.globl _system_call,_sys_fork,_timer_interrupt,_sys_execve
.globl _hd_interrupt,_floppy_interrupt,_parallel_interrupt
.globl _device_not_available, _coprocessor_error

# bad_sys_call: taken from _system_call when %eax is out of range.
# Nothing has been pushed yet at that point, so it returns -1 in %eax
# directly with iret.
.align 2
bad_sys_call:
	movl $-1,%eax
	iret

# reschedule: push ret_from_sys_call as the return address and jump to
# _schedule, so execution continues at ret_from_sys_call once _schedule
# returns.
.align 2
reschedule:
	pushl $ret_from_sys_call
	jmp _schedule

# _system_call: system-call entry point.
# In:  %eax = system-call number, %ebx/%ecx/%edx = arguments.
# Saves the segment registers and argument registers in the layout shown
# at the top of the file, dispatches through _sys_call_table, pushes the
# result, and falls through to ret_from_sys_call (via reschedule when the
# current task's state is non-zero or its counter is zero).
.align 2
_system_call:
	cmpl $nr_system_calls-1,%eax	# unsigned range check on the number
	ja bad_sys_call
	push %ds
	push %es
	push %fs
	pushl %edx
	pushl %ecx		# push %ebx,%ecx,%edx as parameters
	pushl %ebx		# to the system call
	movl $0x10,%edx		# set up ds,es to kernel space
	mov %dx,%ds
	mov %dx,%es
	movl $0x17,%edx		# fs points to local data space
	mov %dx,%fs
	call _sys_call_table(,%eax,4)	# entry %eax of a table of 4-byte pointers
	pushl %eax		# result becomes the EAX slot of the frame
	movl _current,%eax
	cmpl $0,state(%eax)		# state
	jne reschedule
	cmpl $0,counter(%eax)		# counter
	je reschedule
# ret_from_sys_call: common exit path. Unless the current task is _task's
# first entry, or the saved CS/SS are not 0x0f/0x17, it picks the
# lowest-numbered signal that is pending and not blocked, clears it in the
# task's signal word and calls _do_signal with the 1-based signal number.
# Then it restores the saved registers and returns with iret.
ret_from_sys_call:
	movl _current,%eax		# task[0] cannot have signals
	cmpl _task,%eax
	je 3f
	cmpw $0x0f,CS(%esp)		# was old code segment supervisor ?
	jne 3f
	cmpw $0x17,OLDSS(%esp)		# was stack segment = 0x17 ?
	jne 3f
	movl signal(%eax),%ebx		# %ebx = pending-signal bitmap
	movl blocked(%eax),%ecx		# %ecx = blocked-signal bitmap
	notl %ecx
	andl %ebx,%ecx			# %ecx = pending & ~blocked
	bsfl %ecx,%ecx			# index of lowest set bit; ZF set if none
	je 3f
	btrl %ecx,%ebx			# clear that bit in the pending copy
	movl %ebx,signal(%eax)		# and store the bitmap back
	incl %ecx			# bit index 0..31 -> signal number 1..32
	pushl %ecx
	call _do_signal
	popl %eax			# drop the signal-number argument
3:	popl %eax
	popl %ebx
	popl %ecx
	popl %edx
	pop %fs
	pop %es
	pop %ds
	iret

# _coprocessor_error: builds the same register frame as _system_call,
# then jumps to _math_error with ret_from_sys_call pushed as the return
# address.
.align 2
_coprocessor_error:
	push %ds
	push %es
	push %fs
	pushl %edx
	pushl %ecx
	pushl %ebx
	pushl %eax
	movl $0x10,%eax
	mov %ax,%ds
	mov %ax,%es
	movl $0x17,%eax
	mov %ax,%fs
	pushl $ret_from_sys_call
	jmp _math_error

# _device_not_available: builds the register frame, pushes
# ret_from_sys_call as the return address, clears TS, and then either
# jumps to _math_state_restore (EM bit of %cr0 clear) or calls
# _math_emulate (EM bit set) with %ebp/%esi/%edi saved around the call.
.align 2
_device_not_available:
	push %ds
	push %es
	push %fs
	pushl %edx
	pushl %ecx
	pushl %ebx
	pushl %eax
	movl $0x10,%eax
	mov %ax,%ds
	mov %ax,%es
	movl $0x17,%eax
	mov %ax,%fs
	pushl $ret_from_sys_call
	clts				# clear TS so that we can use math
	movl %cr0,%eax
	testl $0x4,%eax			# EM (math emulation bit)
	je _math_state_restore
	pushl %ebp
	pushl %esi
	pushl %edi
	call _math_emulate
	popl %edi
	popl %esi
	popl %ebp
	ret				# returns to ret_from_sys_call pushed above

# _timer_interrupt: builds the register frame, increments _jiffies, sends
# EOI to port 0x20, calls _do_timer with the privilege level taken from
# the low two bits of the saved CS, and leaves through ret_from_sys_call.
.align 2
_timer_interrupt:
	push %ds		# save ds,es and put kernel data space
	push %es		# into them. %fs is used by _system_call
	push %fs
	pushl %edx		# we save %eax,%ecx,%edx as gcc doesn't
	pushl %ecx		# save those across function calls. %ebx
	pushl %ebx		# is saved as we use that in ret_sys_call
	pushl %eax
	movl $0x10,%eax
	mov %ax,%ds
	mov %ax,%es
	movl $0x17,%eax
	mov %ax,%fs
	incl _jiffies
	movb $0x20,%al		# EOI to interrupt controller #1
	outb %al,$0x20
	movl CS(%esp),%eax
	andl $3,%eax		# %eax is CPL (0 or 3, 0=supervisor)
	pushl %eax
	call _do_timer		# 'do_timer(long CPL)' does everything from
	addl $4,%esp		# task switching to accounting ...
	jmp ret_from_sys_call

# _sys_execve: passes _do_execve the address EIP(%esp) as its stack
# argument, then removes that argument again.
# NOTE(review): when entered via the call in _system_call, the return
# address occupies 0(%esp), so EIP(%esp) should be the saved user EIP
# slot - confirm against _do_execve.
.align 2
_sys_execve:
	lea EIP(%esp),%eax
	pushl %eax
	call _do_execve
	addl $4,%esp
	ret

# _sys_fork: calls _find_empty_process; if the result is negative it
# returns it unchanged. Otherwise it pushes %gs, %esi, %edi, %ebp and the
# result as arguments for _copy_process and returns whatever
# _copy_process leaves in %eax.
.align 2
_sys_fork:
	call _find_empty_process
	testl %eax,%eax
	js 1f
	push %gs
	pushl %esi
	pushl %edi
	pushl %ebp
	pushl %eax
	call _copy_process
	addl $20,%esp		# drop the five 4-byte arguments
1:	ret

# _hd_interrupt: saves registers, sends EOI to ports 0xA0 and 0x20, and
# calls the handler stored in _do_hd (zeroing that variable), or
# _unexpected_hd_interrupt when _do_hd was zero.
_hd_interrupt:
	pushl %eax
	pushl %ecx
	pushl %edx
	push %ds
	push %es
	push %fs
	movl $0x10,%eax
	mov %ax,%ds
	mov %ax,%es
	movl $0x17,%eax
	mov %ax,%fs
	movb $0x20,%al
	outb %al,$0xA0		# EOI to port 0xA0. NOTE(review): original
				# comment says "controller #1", but 0x20 is
				# the port labelled #1 elsewhere in this file;
				# 0xA0 is presumably the second controller.
	jmp 1f			# give port chance to breathe
1:	jmp 1f
1:	xorl %edx,%edx
	xchgl _do_hd,%edx	# %edx = handler, _do_hd = 0
	testl %edx,%edx
	jne 1f
	movl $_unexpected_hd_interrupt,%edx
1:	outb %al,$0x20		# EOI to port 0x20 as well
	call *%edx		# "interesting" way of handling intr.
	pop %fs
	pop %es
	pop %ds
	popl %edx
	popl %ecx
	popl %eax
	iret

# _floppy_interrupt: saves registers, sends EOI to port 0x20, and calls
# the handler stored in _do_floppy (zeroing that variable), or
# _unexpected_floppy_interrupt when _do_floppy was zero.
_floppy_interrupt:
	pushl %eax
	pushl %ecx
	pushl %edx
	push %ds
	push %es
	push %fs
	movl $0x10,%eax
	mov %ax,%ds
	mov %ax,%es
	movl $0x17,%eax
	mov %ax,%fs
	movb $0x20,%al
	outb %al,$0x20		# EOI to interrupt controller #1
	xorl %eax,%eax
	xchgl _do_floppy,%eax	# %eax = handler, _do_floppy = 0
	testl %eax,%eax
	jne 1f
	movl $_unexpected_floppy_interrupt,%eax
1:	call *%eax		# "interesting" way of handling intr.
	pop %fs
	pop %es
	pop %ds
	popl %edx
	popl %ecx
	popl %eax
	iret

# _parallel_interrupt: ignored; only sends EOI to port 0x20 and returns.
_parallel_interrupt:
	pushl %eax
	movb $0x20,%al
	outb %al,$0x20
	popl %eax
	iret
int 0x80是一个软中断,内核使用这个软中断实现用户空间访问内核空间的相关功能,为什么int 0x80是用户程序访问内核的方式而不是其他的软中断呢?https://www.cnblogs.com/zhenjingcool/p/15999035.html中已经介绍过
int指令是软中断指令。从用户态执行int指令时,CPU会自动向内核栈压入5个值:用户态原ss、原esp、eflags、cs和eip,其中eip就是int指令之后下一条指令的地址(即返回地址),这与上面源码注释中的栈布局(%eip、%cs、%eflags、%oldesp、%oldss)一致。参考:https://blog.csdn.net/tfnmdmx/article/details/119424519
# Number of system calls accepted by _system_call; valid call numbers
# are 0 .. nr_system_calls-1.
nr_system_calls = 72

# _system_call: system-call entry point.
# In:  %eax = system-call number, %ebx/%ecx/%edx = arguments.
# Saves ds/es/fs and the three argument registers, switches ds/es and fs
# to selectors 0x10 and 0x17, dispatches through _sys_call_table and
# pushes the result. It jumps to reschedule when the current task's
# state is non-zero or its counter is zero; otherwise it continues with
# the code that follows (ret_from_sys_call in the original file).
_system_call:
	cmpl $nr_system_calls-1,%eax	# call number above the last valid one?
	ja bad_sys_call			# unsigned compare: yes -> bad_sys_call
	push %ds			# save the data-segment registers ds, es, fs
	push %es
	push %fs
	pushl %edx
	pushl %ecx		# push %ebx,%ecx,%edx as parameters
	pushl %ebx		# to the system call
	movl $0x10,%edx		# set up ds,es to kernel space
	mov %dx,%ds		# selector 0x10: GDT index 2, RPL 0
	mov %dx,%es		# (see https://www.cnblogs.com/zhenjingcool/p/15972761.html)
	movl $0x17,%edx		# fs points to local data space
	mov %dx,%fs		# selector 0x17: LDT index 2, RPL 3; lets kernel
				# code reach the caller's data through %fs
				# (the article notes signal.c relies on this)
	call _sys_call_table(,%eax,4)	# call entry number %eax of the table
					# (4-byte entries, so the byte offset is
					# %eax*4); e.g. number 1 is sys_exit.
					# The table is in include/linux/sys.h.
	pushl %eax			# push the system call's return value
	movl _current,%eax		# %eax = pointer to the current task
	cmpl $0,state(%eax)		# state: the field at offset `state`
					# (0) of the task; presumably 0 means
					# runnable - confirm in sched.h
	jne reschedule			# non-zero state -> reschedule
	cmpl $0,counter(%eax)		# counter: presumably the remaining
					# time slice - confirm in sched.h
	je reschedule			# counter is zero -> reschedule
上面的_system_call有很多代码,但是最核心的就是这一行:call _sys_call_table(,%eax,4),此处是真正调用系统调用处理函数的地方。调用返回后先把返回值压栈,并检查当前进程的state和counter(必要时经reschedule重新调度),然后执行ret_from_sys_call。这个标签也非常重要,我们可以把它看做系统调用的后处理(post process)过程:在后处理过程中,会从该进程未被屏蔽的待处理信号中取出编号最小的一个,并调用do_signal进行信号处理。
错误的系统调用,返回-1
# bad_sys_call: reached from _system_call when the system-call number in
# %eax is above nr_system_calls-1. _system_call has not pushed anything at
# that point, so this routine only loads -1 into %eax as the result and
# returns from the interrupt.
bad_sys_call:
	movl $-1,%eax		# result reported to the caller
	iret
# reschedule: run the scheduler, then continue at ret_from_sys_call.
# _schedule is entered with jmp rather than call, so the address of
# ret_from_sys_call is pushed first to act as its return address: when
# _schedule returns, execution resumes at ret_from_sys_call.
reschedule:
	pushl $ret_from_sys_call	# push the label's address as return address
	jmp _schedule			# run the scheduler
上面程序中用到的include/linux/sys.h代码
/*
 * Dispatch table of system-call handlers. The position of an entry is its
 * system-call number: system_call.s calls the entry selected by %eax via
 * `call _sys_call_table(,%eax,4)`. The table has 72 entries (0..71),
 * matching nr_system_calls = 72 in system_call.s, so adding an entry here
 * requires updating that constant as well.
 * fn_ptr is declared elsewhere (presumably a pointer-to-function typedef).
 */
fn_ptr sys_call_table[] = {
	/*  0- 7 */ sys_setup, sys_exit, sys_fork, sys_read, sys_write, sys_open, sys_close, sys_waitpid,
	/*  8-15 */ sys_creat, sys_link, sys_unlink, sys_execve, sys_chdir, sys_time, sys_mknod, sys_chmod,
	/* 16-23 */ sys_chown, sys_break, sys_stat, sys_lseek, sys_getpid, sys_mount, sys_umount, sys_setuid,
	/* 24-31 */ sys_getuid, sys_stime, sys_ptrace, sys_alarm, sys_fstat, sys_pause, sys_utime, sys_stty,
	/* 32-39 */ sys_gtty, sys_access, sys_nice, sys_ftime, sys_sync, sys_kill, sys_rename, sys_mkdir,
	/* 40-47 */ sys_rmdir, sys_dup, sys_pipe, sys_times, sys_prof, sys_brk, sys_setgid, sys_getgid,
	/* 48-55 */ sys_signal, sys_geteuid, sys_getegid, sys_acct, sys_phys, sys_lock, sys_ioctl, sys_fcntl,
	/* 56-63 */ sys_mpx, sys_setpgid, sys_ulimit, sys_uname, sys_umask, sys_chroot, sys_ustat, sys_dup2,
	/* 64-71 */ sys_getppid, sys_getpgrp, sys_setsid, sys_sigaction, sys_sgetmask, sys_ssetmask, sys_setreuid,sys_setregid };
注:在局部描述符表(LDT)中,索引1的项是代码段(对应选择子0x0f),索引2的项是数据段(对应选择子0x17),阅读下面的代码时要注意这一点
# ret_from_sys_call: common exit path after a system call (also used by
# the timer interrupt and the math handlers in the original file).
# Expects the register frame described at the top of system_call.s:
# saved eax/ebx/ecx/edx/fs/es/ds followed by the CPU-pushed
# eip/cs/eflags/oldesp/oldss.
# Signals are looked at only when the current task is not the first entry
# of _task, the saved CS is 0x0f and the saved SS is 0x17. In that case the
# lowest-numbered signal that is pending and not blocked is cleared from
# the task's signal word and passed (numbered from 1) to _do_signal.
# Finally the saved registers are restored and iret returns.
# For reference, the article quotes the start of task_struct as:
#   long state; long counter; long priority; long signal;
#   struct sigaction sigaction[32]; long blocked; ...
# See also https://blog.csdn.net/finzale/article/details/44648301?utm_source=blogkpcl14
ret_from_sys_call:
	movl _current,%eax		# task[0] cannot have signals
	cmpl _task,%eax			# _task without an index is its first
					# entry: is the current task task[0]?
	je 3f				# yes -> skip signal handling
	cmpw $0x0f,CS(%esp)		# was old code segment supervisor ?
					# CS is the stack offset (0x20) of the
					# saved cs. 0x0f = 0000 1111b: index 1,
					# TI=1 (LDT), RPL=3, i.e. the user-mode
					# code segment.
	jne 3f				# interrupted code was not running in
					# the user code segment (i.e. kernel
					# mode) -> skip signal handling
	cmpw $0x17,OLDSS(%esp)		# was stack segment = 0x17 ?
					# 0x17 = 0001 0111b: index 2, TI=1
					# (LDT), RPL=3, the user data segment.
	jne 3f				# old stack not in the user data
					# segment -> skip signal handling
	movl signal(%eax),%ebx		# %ebx = 4 bytes at offset `signal`
					# (12) of the task: pending signals
	movl blocked(%eax),%ecx		# %ecx = blocked-signal bitmap
	notl %ecx			# invert every bit: allowed signals
	andl %ebx,%ecx			# %ecx = pending and not blocked
	bsfl %ecx,%ecx			# scan from bit 0 upwards; %ecx = index
					# of the first set bit, ZF set if none
	je 3f				# nothing deliverable -> exit
	btrl %ecx,%ebx			# clear that signal (%ebx holds the
					# original pending bitmap)
	movl %ebx,signal(%eax)		# store the updated pending bitmap
	incl %ecx			# convert bit index to signal 1..32
	pushl %ecx			# signal number: argument for do_signal
	call _do_signal			# C signal-handling routine
	popl %eax			# discard the pushed signal number
3:	popl %eax			# restore the saved registers in
	popl %ebx			# reverse order of the pushes
	popl %ecx
	popl %edx
	pop %fs
	pop %es
	pop %ds
	iret
总结:_system_call标号实现系统调用,其真正发起系统调用的地方是 call _sys_call_table(,%eax,4) ,调用结束后会接着执行 ret_from_sys_call 标号,在这个标号里对系统调用C函数返回后该进程待处理的信号(signal,注意不是信号量)做识别处理。
下面是协处理器异常的处理程序:首先保存寄存器,然后把ret_from_sys_call的地址压栈作为返回地址,再用jmp跳转到math_error函数;math_error返回(ret)时就会转到ret_from_sys_call处继续执行
# _coprocessor_error: coprocessor-error handler.
# Pushes ds/es/fs and edx/ecx/ebx/eax in the same order as _system_call
# (giving the frame ret_from_sys_call expects), loads ds/es with selector
# 0x10 and fs with 0x17, then jumps to _math_error with ret_from_sys_call
# pushed as the return address, so _math_error returns into
# ret_from_sys_call.
_coprocessor_error:
	push %ds
	push %es
	push %fs
	pushl %edx
	pushl %ecx
	pushl %ebx
	pushl %eax
	movl $0x10,%eax			# ds, es = selector 0x10
	mov %ax,%ds
	mov %ax,%es
	movl $0x17,%eax			# fs = selector 0x17
	mov %ax,%fs
	pushl $ret_from_sys_call	# return address for _math_error
	jmp _math_error
下面是创建新进程的函数,find_empty_process和copy_process在fork.c中定义
# _sys_fork: fork system call.
# Calls _find_empty_process first; a negative result is returned to the
# caller unchanged (the article describes this as "no free process
# number"). Otherwise %gs, %esi, %edi, %ebp and that result are pushed as
# arguments for _copy_process, whose return value is left in %eax.
# find_empty_process and copy_process are defined in fork.c.
_sys_fork:
	call _find_empty_process	# look for a free process slot
	testl %eax,%eax
	js 1f				# negative result: return it as is
	push %gs
	pushl %esi
	pushl %edi
	pushl %ebp
	pushl %eax			# result of _find_empty_process
	call _copy_process		# copy the current process
	addl $20,%esp			# drop the five 4-byte arguments
1:	ret
硬盘中断处理程序
# _hd_interrupt: hard-disk interrupt handler.
# Saves eax/ecx/edx and ds/es/fs, loads ds/es with selector 0x10 and fs
# with 0x17, writes the EOI byte 0x20 to ports 0xA0 and 0x20, and calls
# the handler whose address is stored in _do_hd. _do_hd is set to zero in
# the same exchange; if it was already zero, _unexpected_hd_interrupt is
# called instead. Registers are then restored and iret returns.
_hd_interrupt:
	pushl %eax
	pushl %ecx
	pushl %edx
	push %ds
	push %es
	push %fs
	movl $0x10,%eax
	mov %ax,%ds
	mov %ax,%es
	movl $0x17,%eax
	mov %ax,%fs
	movb $0x20,%al
	outb %al,$0xA0		# EOI to port 0xA0. NOTE(review): the original
				# comment says "interrupt controller #1", but
				# 0xA0 is conventionally the second (slave)
				# 8259A on PC/AT hardware - confirm.
	jmp 1f			# give port chance to breathe
1:	jmp 1f
1:	xorl %edx,%edx
	xchgl _do_hd,%edx	# %edx = stored handler, _do_hd = 0
	testl %edx,%edx
	jne 1f			# a handler was registered
	movl $_unexpected_hd_interrupt,%edx
1:	outb %al,$0x20		# EOI to port 0x20 as well (%al is still 0x20)
	call *%edx		# "interesting" way of handling intr.
	pop %fs
	pop %es
	pop %ds
	popl %edx
	popl %ecx
	popl %eax
	iret
软盘中断处理程序
# _floppy_interrupt: floppy-disk interrupt handler.
# Saves eax/ecx/edx and ds/es/fs, loads ds/es with selector 0x10 and fs
# with 0x17, writes the EOI byte 0x20 to port 0x20, and calls the handler
# whose address is stored in _do_floppy. _do_floppy is set to zero in the
# same exchange; if it was already zero, _unexpected_floppy_interrupt is
# called instead. Registers are then restored and iret returns.
_floppy_interrupt:
	pushl %eax
	pushl %ecx
	pushl %edx
	push %ds
	push %es
	push %fs
	movl $0x10,%eax
	mov %ax,%ds
	mov %ax,%es
	movl $0x17,%eax
	mov %ax,%fs
	movb $0x20,%al
	outb %al,$0x20		# EOI to interrupt controller #1
	xorl %eax,%eax
	xchgl _do_floppy,%eax	# %eax = stored handler, _do_floppy = 0
	testl %eax,%eax
	jne 1f			# a handler was registered
	movl $_unexpected_floppy_interrupt,%eax
1:	call *%eax		# "interesting" way of handling intr.
	pop %fs
	pop %es
	pop %ds
	popl %edx
	popl %ecx
	popl %eax
	iret