gvisor entersyscall exitsyscall

 

 

The Sentry runs in both GR0 and HR3. The bluepill code is responsible for transparently bouncing the Sentry between these two modes, with the following transition events:

In HR3:
CLI (disable interrupts) => switch to GR0
 
In GR0:
Fault => switch to HR3
System call => switch to HR3
 
syscall() ---> sysenter() ---> kernelSyscall() (note: this path covers a syscall issued from the ring0 kernel) ---> "HLT" instruction ---> kvm_emulate_halt() in KVM (trap to KVM) ---> bluepillHandler() handles _KVM_EXIT_HLT ---> c.notify() ---> syscall.RawSyscall6(syscall.SYS_FUTEX, ...)
 
I also checked the asm code for syscall.RawSyscall6() in https://golang.org/src/syscall/asm_darwin_amd64.s?h=RawSyscall
 
//go:nosplit
func (c *vCPU) notify() {
    _, _, errno := syscall.RawSyscall6(
        syscall.SYS_FUTEX,
        uintptr(unsafe.Pointer(&c.state)),
        linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG,
        math.MaxInt32, // Number of waiters.
        0, 0, 0)
    if errno != 0 {
        throw("futex wake error")
    }
}

 

The HLT triggers a VM exit, which manifests as a return from the KVM_RUN ioctl on the host side. This happens in the host signal handler. The state is then copied into the signal frame, and signal_return is called.

 

entersyscall 和exitsyscall 是golang runtime的entersyscall和 exitsyscall

        entersyscall()
        bluepill(c)
        vector = c.CPU.SwitchToUser(switchOpts)
        exitsyscall()

        switch vector {
        case ring0.Syscall, ring0.SyscallInt80:
                // Fast path: system call executed.
                return usermem.NoAccess, nil

        case ring0.PageFault:
                return c.fault(int32(syscall.SIGSEGV), info)

        case ring0.Debug, ring0.Breakpoint:
                *info = arch.SignalInfo{
                        Signo: int32(syscall.SIGTRAP),
                        Code:  1, // TRAP_BRKPT (breakpoint).
                }
                info.SetAddr(switchOpts.Registers.Rip) // Include address.
                return usermem.AccessType{}, platform.ErrContextSignal

 

 

entry_impl_amd64.s:44:#define SyscallInt80               0x80
entry_impl_amd64.s:45:#define Syscall                    0x100

 

 

func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
        userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID)
        c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID))

        // Sanitize registers.
        regs := switchOpts.Registers
        regs.Eflags &= ^uint64(UserFlagsClear)
        regs.Eflags |= UserFlagsSet
        regs.Cs = uint64(Ucode64) // Required for iret.
        regs.Ss = uint64(Udata)   // Ditto.

        // Perform the switch.
        swapgs()                                         // GS will be swapped on return.
        WriteFS(uintptr(regs.Fs_base))                   // escapes: no. Set application FS.
        WriteGS(uintptr(regs.Gs_base))                   // escapes: no. Set application GS.
        LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point.
        if switchOpts.FullRestore {
                vector = iret(c, regs, uintptr(userCR3))   //sysenter
 } else 
{
vector
= sysret(c, regs, uintptr(userCR3))
}
SaveFloatingPoint(switchOpts.FloatingPointState)

// escapes: no. Copy out floating point. WriteFS(uintptr(c.registers.Fs_base))

// escapes: no. Restore kernel FS.

return

}

guest 执行代码

 // Set the entrypoint for the kernel.
        kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer())           // Start address of the code to run (ring0.Start).
        kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer())
        kernelUserRegs.RSP = c.StackTop()
        kernelUserRegs.RFLAGS = ring0.KernelFlagsSet

 

// start performs the per-CPU setup shown below: it writes GS and FS via
// WriteGS/WriteFS, initializes floating point state and XCR0, and programs
// the syscall-related MSRs so that syscall entry is directed at sysenter.
//
// NOTE(review): presumably reached from the exported Start assembly stub with
// c pointing at the current CPU -- confirm against entry_amd64.go.
//
//go:nosplit
func start(c *CPU) {
    // Save per-cpu & FS segment.
    WriteGS(kernelAddr(c.kernelEntry))
    WriteFS(uintptr(c.registers.Fs_base))

    // Initialize floating point.
    //
    // Note that on skylake, the valid XCR0 mask reported seems to be 0xff.
    // This breaks down as:
    //
    //    bit0   - x87
    //    bit1   - SSE
    //    bit2   - AVX
    //    bit3-4 - MPX
    //    bit5-7 - AVX512
    //
    // For some reason, enabling MPX & AVX512 on platforms that report them
    // seems to cause a general protection fault. (Maybe there are some
    // virtualization issues and these aren't exported to the guest cpuid.)
    // This needs further investigation, but we can limit the floating
    // point operations to x87, SSE & AVX for now.
    fninit()
    xsetbv(0, validXCR0Mask&0x7) // Mask 0x7 keeps only bits 0-2 (x87, SSE, AVX).

    // Set the syscall target.
    wrmsr(_MSR_LSTAR, kernelFunc(sysenter))
    wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF)

    // NOTE: This depends on having the 64-bit segments immediately
    // following the 32-bit user segments. This is simply the way the
    // sysret instruction is designed to work (it assumes they follow).
    wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48))
    // CSTAR is pointed at the same sysenter handler as LSTAR.
    wrmsr(_MSR_CSTAR, kernelFunc(sysenter))
}

 

sysenter

// Set the syscall target.
        wrmsr(_MSR_LSTAR, kernelFunc(sysenter)) // sysenter
// See entry_amd64.go.
//
// sysenter is the syscall entry point. It tests the IOPL0 bit of the flags
// value held in R11 to decide whether the syscall came from user mode
// (bit set, falls through to "user") or from kernel mode (bit clear, jumps
// to "kernel").
TEXT ·sysenter(SB),NOSPLIT,$0
        // _RFLAGS_IOPL0 is always set in the user mode and it is never set in
        // the kernel mode. See the comment of UserFlagsSet for more details.
        TESTL $_RFLAGS_IOPL0, R11
        JZ kernel
user:
        // User-mode path: switch to the kernel GS and page tables, save the
        // user register state, then return into the kernel frame with the
        // Syscall vector as the result.
        SWAP_GS()
        MOVQ AX, ENTRY_SCRATCH0(GS)            // Save user AX on scratch.
        MOVQ ENTRY_KERNEL_CR3(GS), AX          // Get kernel cr3 on AX.
        WRITE_CR3()                            // Switch to kernel cr3.

        // NOTE(review): the pointer to the user register set appears to be
        // read from the RAX slot of the CPU's own registers -- confirm.
        MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.
        MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX  // Get user regs.
        REGISTERS_SAVE(AX, 0)                  // Save all except IP, FLAGS, SP, AX.
        MOVQ CX,  PTRACE_RIP(AX)               // CX is stored as the user RIP.
        MOVQ R11, PTRACE_FLAGS(AX)             // R11 is stored as the user flags.
        MOVQ SP,  PTRACE_RSP(AX)
        MOVQ ENTRY_SCRATCH0(GS), CX            // Load saved user AX value.
        MOVQ CX,  PTRACE_RAX(AX)               // Save everything else.
        MOVQ CX,  PTRACE_ORIGRAX(AX)

        MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.
        MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP  // Get stacks.
        MOVQ $0, CPU_ERROR_CODE(AX)            // Clear error code.
        MOVQ $1, CPU_ERROR_TYPE(AX)            // Set error type to user.

        // Return to the kernel, where the frame is:
        //
        //      vector      (sp+32)
        //      userCR3     (sp+24)
        //      regs        (sp+16)
        //      cpu         (sp+8)
        //      vcpu.Switch (sp+0)
        //
        MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
        MOVQ $Syscall, 32(SP)                 // Output vector.
        RET

 

 

 

 

// Kernel-mode path of sysenter, taken when the IOPL0 bit is clear in R11.
// Registers are saved directly into the vCPU's CPU_REGISTERS area, then
// kernelSyscall is called with the vCPU as its argument and execution
// continues at resume.
kernel:
        // We can't restore the original stack, but we can access the registers
        // in the CPU state directly. No need for temporary juggling.
        MOVQ AX,  ENTRY_SCRATCH0(GS)                // Stash AX so it can hold the vCPU pointer.
        MOVQ ENTRY_CPU_SELF(GS), AX                 // Load vCPU.
        REGISTERS_SAVE(AX, CPU_REGISTERS)
        MOVQ CX,  CPU_REGISTERS+PTRACE_RIP(AX)      // CX is stored as the saved RIP.
        MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(AX)    // R11 is stored as the saved flags.
        MOVQ SP,  CPU_REGISTERS+PTRACE_RSP(AX)
        MOVQ ENTRY_SCRATCH0(GS), BX                 // Reload the stashed AX value.
        MOVQ BX,  CPU_REGISTERS+PTRACE_ORIGRAX(AX)
        MOVQ BX,  CPU_REGISTERS+PTRACE_RAX(AX)
        MOVQ $0,  CPU_ERROR_CODE(AX)                // Clear error code.
        MOVQ $0,  CPU_ERROR_TYPE(AX)                // Set error type to kernel.

        // Call the syscall trampoline.
        LOAD_KERNEL_STACK(GS)
        PUSHQ AX                // First argument (vCPU).
        CALL ·kernelSyscall(SB) // Call the trampoline.
        POPQ AX                 // Pop vCPU.
        JMP ·resume(SB)

 

linux 系统调用实现

 注册系统调用

/*
 * register_syscall programs the three MSRs used by the syscall instruction.
 * Each wrmsr writes EDX:EAX into the MSR selected by ECX.
 * NOTE(review): no ret is visible in this excerpt; the routine may continue
 * past the shown lines.
 */
register_syscall:
  /* STAR: low dword 0, high dword 0x00200008 (segment selector fields). */
  xor rax, rax
  mov rdx, 0x00200008
  mov ecx, 0xc0000081 /* MSR_STAR */
  wrmsr

  /* SYSCALL_MASK: low dword 0x3f7fd5, high dword 0. */
  mov eax, 0x3f7fd5
  xor rdx, rdx
  mov ecx, 0xc0000084 /* MSR_SYSCALL_MASK */
  wrmsr

  /* LSTAR: address of syscall_handler, split into low (EAX) and high (EDX) halves. */
  lea rdi, [rip + syscall_handler]
  mov eax, edi
  mov rdx, rdi
  shr rdx, 32
  mov ecx, 0xc0000082 /* MSR_LSTAR */
  wrmsr

 

 

.globl syscall_handler, kernel_stack
.extern do_handle_syscall
.intel_syntax noprefix

kernel_stack: .quad 0 /* initialize it before the first time switching into user-mode */
user_stack: .quad 0   /* holds the user rsp while the handler runs */

/*
 * syscall_handler is the target installed in MSR_LSTAR. It switches from the
 * user stack to kernel_stack, saves the argument/scratch registers, calls
 * do_handle_syscall, restores the registers and the user stack, and returns
 * with a 64-bit sysret.
 * NOTE(review): user_stack is a single global slot, so this looks
 * non-reentrant and single-CPU only -- confirm.
 * NOTE(review): rax is not saved or restored here; presumably it carries the
 * return value of do_handle_syscall back to the caller -- confirm.
 */
syscall_handler:
  mov [rip + user_stack], rsp
  mov rsp, [rip + kernel_stack]
  /* save non-callee-saved registers */
  push rdi
  push rsi
  push rdx
  push rcx
  push r8
  push r9
  push r10
  push r11

  /* the fourth argument: copied from r10 into rcx before the call */
  mov rcx, r10
  call do_handle_syscall

  /* restore in reverse order of the pushes above */
  pop r11
  pop r10
  pop r9
  pop r8
  pop rcx
  pop rdx
  pop rsi
  pop rdi

  mov rsp, [rip + user_stack]
  .byte 0x48 /* REX.W prefix, to indicate sysret is a 64-bit instruction */
  sysret

 

posted on 2021-01-20 19:48  tycoon3  阅读(225)  评论(0编辑  收藏  举报

导航