Notes on the kvmclock code

Linux source version: 5.3.0

The kvmclock driver in the guest OS

kvmclock_init() mainly does the following:

  1. Determine the MSRs each vCPU will use

  2. Write, via wrmsr, the physical address of each vCPU's pvclock_vsyscall_time_info (pvti) structure, the data structure kvmclock actually works with, into that vCPU's MSR (its layout is quoted below for reference)

  3. Register the 1 GHz kvmclock as a clocksource with the system
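For reference, the pvti structure referred to throughout this note has the following layout (quoted from memory of arch/x86/include/asm/pvclock-abi.h and pvclock.h; worth double-checking against the 5.3 headers):

struct pvclock_vcpu_time_info {
	u32   version;
	u32   pad0;
	u64   tsc_timestamp;
	u64   system_time;
	u32   tsc_to_system_mul;
	s8    tsc_shift;
	u8    flags;
	u8    pad[2];
} __attribute__((__packed__));	/* 32 bytes, part of the guest/host ABI */

struct pvclock_vsyscall_time_info {
	struct pvclock_vcpu_time_info pvti;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));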

void __init kvmclock_init(void)
{
	u8 flags;

	if (!kvm_para_available() || !kvmclock)		// return immediately if kvmclock is not supported
		return;

	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {	// check whether the new kvmclock MSRs are supported
		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
	} else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
		// if the old kvmclock MSRs are not supported either, return
		// (otherwise fall through and use the old MSRs)
		return;
	}

	if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
			      kvmclock_setup_percpu, NULL) < 0) {
		// hp: hotplug, BP: bootstrap; dynamic "prepare" hotplug state on the boot CPU
		return;
	}

	pr_info("kvm-clock: Using msrs %x and %x",
		msr_kvm_system_time, msr_kvm_wall_clock);
	// msr_kvm_system_time: the MSR used for the kvm system time
	// msr_kvm_wall_clock:  the MSR used for the kvm wall clock

	this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);
	// store the (virtual) address of the first element of hv_clock_boot
	// into the per-cpu variable hv_clock_per_cpu

	/* write the physical address of vcpu0's hv_clock_per_cpu into the MSR pointed to by msr_kvm_system_time */
	kvm_register_clock("primary cpu clock");
	pvclock_set_pvti_cpu0_va(hv_clock_boot);
	// store the address of the hv_clock_boot array into pvti_cpu0_va, a pointer of pvti type

	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
		// KVM_FEATURE_CLOCKSOURCE_STABLE_BIT: clocksource-stable indication bit
		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
		// paravirt clock "TSC stable" indication bit; what exactly is it used for?

	flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
	// read the flags of vcpu0's pvti structure
	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
	// initialize sched_clock(); it is used for scheduling and timestamps, and provides
	// an accurate delay clock source based on the hardware counter

	/* register the various callbacks */
	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
	// tsc_khz is the host pTSCfreq, i.e. the host TSC frequency
	x86_platform.calibrate_cpu = kvm_get_tsc_khz;
	x86_platform.get_wallclock = kvm_get_wallclock;
	// wallclock: seconds and nanoseconds at system boot (absolute time since 1970)
	x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
	x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
	// register the clocks of all vCPUs other than vcpu0: write each vCPU's
	// pvti (hv_clock_per_cpu) into its own msr_kvm_system_time MSR
#endif
	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
	// save the sched_clock state; in fact it does nothing
	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
	machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC_CORE
	machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
	kvm_get_preset_lpj();	// lpj: loops_per_jiffy

	/*
	 * X86_FEATURE_NONSTOP_TSC is TSC runs at constant rate
	 * with P/T states and does not stop in deep C-states.
	 *
	 * Invariant TSC exposed by host means kvmclock is not necessary:
	 * can use TSC as clocksource.
	 */
	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
	    boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
	    !check_tsc_unstable())
		kvm_clock.rating = 299;

	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);	// register the 1 GHz kvmclock clocksource
	pv_info.name = "KVM";
}

Three functions are registered as x86_platform callbacks: kvm_get_tsc_khz and kvm_get/set_wallclock.

kvm_get_wallclock()

/*
 * The wallclock is the time of day when we booted. Since then, some time may
 * have elapsed since the hypervisor wrote the data. So we try to account for
 * that with system time
 */
static void kvm_get_wallclock(struct timespec64 *now)
{
	wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
	// write the physical address of wall_clock (struct pvclock_wall_clock) into the corresponding MSR
	preempt_disable();
	// the wallclock time is stored into now
	pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
	// read back the wallclock that the host has written
	preempt_enable();
}

/* the wallclock content has already been written by the host before this function runs */
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
			    struct pvclock_vcpu_time_info *vcpu_time,
			    struct timespec64 *ts)
{
	u32 version;
	u64 delta;
	struct timespec64 now;

	/* get wallclock at system boot */
	do {
		version = wall_clock->version;
		rmb();		/* fetch version before time */
		/*
		 * Note: wall_clock->sec is a u32 value, so it can
		 * only store dates between 1970 and 2106. To allow
		 * times beyond that, we need to create a new hypercall
		 * interface with an extended pvclock_wall_clock structure
		 * like ARM has.
		 */
		now.tv_sec = wall_clock->sec;	// read the wallclock; this is not yet the complete time,
		now.tv_nsec = wall_clock->nsec;	// the delta below (time the VM has been running) still has to be added
		rmb();		/* fetch time before checking version */
	} while ((wall_clock->version & 1) || (version != wall_clock->version));

	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
	delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec);
}

struct pvclock_wall_clock {
	u32 version;
	u32 sec;
	u32 nsec;
} __attribute__((__packed__));

kvm_get_tsc_khz()

// obtain the TSC frequency in kHz
static unsigned long kvm_get_tsc_khz(void)
{
	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
	return pvclock_tsc_khz(this_cpu_pvti());
}

// derive pv_tsc_khz from the local cpu's pvti fields tsc_shift and tsc_to_system_mul
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
{
	u64 pv_tsc_khz = 1000000ULL << 32;

	do_div(pv_tsc_khz, src->tsc_to_system_mul);
	if (src->tsc_shift < 0)
		pv_tsc_khz <<= -src->tsc_shift;
	else
		pv_tsc_khz >>= src->tsc_shift;
	return pv_tsc_khz;
}
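For context, here is how a guest turns a raw TSC delta into nanoseconds using these two fields. This is a simplified sketch of the scaling documented for the kvmclock MSR ABI, not the verbatim kernel code; pvclock_tsc_khz() above simply inverts this factor (a frequency in kHz is the number of TSC cycles per 10^6 ns):

/* sketch: convert a TSC delta to nanoseconds with the pvti scaling fields */
static inline u64 pvclock_scale_delta_sketch(u64 delta, u32 tsc_to_system_mul,
					     s8 tsc_shift)
{
	if (tsc_shift >= 0)
		delta <<= tsc_shift;
	else
		delta >>= -tsc_shift;
	/* 32.32 fixed-point multiply; the real code uses a wide multiply to avoid overflow */
	return (u64)(((unsigned __int128)delta * tsc_to_system_mul) >> 32);
}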

Code framework for time setup in the guest

How the kernel obtains the wallclock

static struct pvclock_wall_clock wall_clock __bss_decrypted;
// static global variable wall_clock, stored in the .bss section

static void kvm_get_wallclock(struct timespec64 *now)
{
	wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
	// write the physical address of wall_clock into MSR_KVM_WALL_CLOCK, triggering a wrmsr VM exit
	preempt_disable();
	pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
	preempt_enable();
}

The very first statement of kvm_get_wallclock() triggers a wrmsr VM exit, which then goes through the following call chain:

handle_wrmsr=>kvm_set_msr=>kvm_x86_ops->set_msr=>vmx_set_msr=>kvm_set_msr_common

int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	u64 data = msr_info->data;
	...
	case MSR_KVM_WALL_CLOCK_NEW:
	case MSR_KVM_WALL_CLOCK:
		vcpu->kvm->arch.wall_clock = data;
		kvm_write_wall_clock(vcpu->kvm, data);
		break;
	...
}

static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
	...
	// the boot time here is the host's boot time
	getboottime64(&boot);	// read the host boot time into boot

	if (kvm->arch.kvmclock_offset) {	// adjust the host boot time a little
		struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
		boot = timespec64_sub(boot, ts);
	}
	wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
	wc.nsec = boot.tv_nsec;
	wc.version = version;

	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
	// update the guest's wallclock: write wc, i.e. the host boot time, into it
	...
}

After kvm_write_wall_clock() finishes, the guest can find the host's boot time in the global variable wall_clock.

How the kernel updates the wallclock

void getboottime64(struct timespec64 *ts)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);	// compute tk->offs_real - tk->offs_boot
	*ts = ktime_to_timespec64(t);				// convert the time format
}
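Why offs_real - offs_boot is the wall-clock time at boot (a short derivation from the timekeeper offsets; offs_real is the monotonic-to-realtime offset and offs_boot the monotonic-to-boottime offset):

	CLOCK_REALTIME = CLOCK_MONOTONIC + offs_real
	CLOCK_BOOTTIME = CLOCK_MONOTONIC + offs_boot
	wall time at boot = CLOCK_REALTIME - CLOCK_BOOTTIME = offs_real - offs_boot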

The important statement is struct timekeeper *tk = &tk_core.timekeeper; and tk_core is defined as:

/*
 * The most important data for readout fits into a single 64 byte
 * cache line.
 */
static struct {
	seqcount_t		seq;
	struct timekeeper	timekeeper;
} tk_core ____cacheline_aligned = {
	.seq = SEQCNT_ZERO(tk_core.seq),
};

So tk_core is a statically allocated structure variable; its seqcount keeps readers consistent with concurrent updates of the embedded timekeeper.

getboottime64() reads offs_real and offs_boot from tk_core.timekeeper. So where is tk_core.timekeeper set? The kernel code that updates tk->offs_boot is:

static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
	tk->offs_boot = ktime_add(tk->offs_boot, delta);	// tk->offs_boot += delta
	/*
	 * Timespec representation for VDSO update to avoid 64bit division
	 * on every update.
	 *
	 * VDSO: some syscalls are called very frequently, but every syscall requires a
	 * user-to-kernel transition, which is expensive, so the results of such syscalls
	 * are kept in a shared area and read directly from there, reducing the overhead.
	 * VDSO stands for: virtual dynamic shared object
	 */
	tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot);
	// monotonic_to_boot is the VDSO form of offs_boot, kept to speed up access to offs_boot
}

The core statement of tk_update_sleep_time() is tk->offs_boot = ktime_add(tk->offs_boot, delta);. What is delta and where does it come from? We need to find the callers of tk_update_sleep_time() in the kernel and the meaning of its argument.

static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
					   const struct timespec64 *delta)
{
	...
	// set the CLOCK_REALTIME time in tk and record the readout remainder
	tk_xtime_add(tk, delta);
	// the body of that function is:
	//	tk->xtime_sec += delta->tv_sec;
	//	tk->tkr_mono.xtime_nsec += delta->tv_nsec << tk->tkr_mono.shift;
	// xtime_sec: the current CLOCK_REALTIME time in seconds
	// tkr: timekeeping read, the structure used for reading out the time
	// tkr_mono.xtime_nsec: the ns-level part of the readout (shifted fixed-point remainder)
	...
	tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
	...
}

We still cannot see where delta comes from in __timekeeping_inject_sleeptime(), so keep looking.

void timekeeping_resume(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	struct clocksource *clock = tk->tkr_mono.clock;
	unsigned long flags;
	struct timespec64 ts_new, ts_delta;
	u64 cycle_now, nsec;
	bool inject_sleeptime = false;

	read_persistent_clock64(&ts_new);	// read the new wallclock into ts_new

	clockevents_resume();	// resume clock events
	clocksource_resume();	// resume the clocksource

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	write_seqcount_begin(&tk_core.seq);

	/*
	 * After system resumes, we need to calculate the suspended time and
	 * compensate it for the OS time. There are 3 sources that could be
	 * used: Nonstop clocksource during suspend, persistent clock and rtc
	 * device.
	 *
	 * One specific platform may have 1 or 2 or all of them, and the
	 * preference will be:
	 *	suspend-nonstop clocksource -> persistent clock -> rtc
	 * The less preferred source will only be tried if there is no better
	 * usable source. The rtc part is handled separately in rtc core code.
	 */
	cycle_now = tk_clock_read(&tk->tkr_mono);	// read the current cycle count
	nsec = clocksource_stop_suspend_timing(clock, cycle_now);	// total suspend time
	if (nsec > 0) {
		ts_delta = ns_to_timespec64(nsec);
		inject_sleeptime = true;
	} else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
		ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
		inject_sleeptime = true;
	}

	if (inject_sleeptime) {
		suspend_timing_needed = false;
		__timekeeping_inject_sleeptime(tk, &ts_delta);	// so delta is the suspend duration
	}
}

So the delta we have been tracing is the suspend duration; traced bottom-up, the chain is tk_update_sleep_time <= __timekeeping_inject_sleeptime <= timekeeping_resume.

How the kernel initializes the wallclock

We now know how the kernel obtains the wallclock (through x86_platform.get_wallclock) and how it updates the wallclock (after a suspend, before resume completes, __timekeeping_inject_sleeptime() modifies tk_core and therefore the wallclock).

But the wallclock must also be set during system initialization. How is it initialized? Presumably somewhere around timekeeping_init.

void __init timekeeping_init(void)
{
	...
	read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
	// read the wall time and the boot offset into these two variables
	...
}

void __weak __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
							struct timespec64 *boot_offset)
{
	read_persistent_clock64(wall_time);	// read the wall time
	*boot_offset = ns_to_timespec64(local_clock());
}

/* not static: needed by APM */
void read_persistent_clock64(struct timespec64 *ts)
{
	x86_platform.get_wallclock(ts);
}

Looks familiar: x86_platform.get_wallclock(ts). In a KVM guest, x86_platform.get_wallclock = kvm_get_wallclock; on the host, x86_platform.get_wallclock = vrtc_get_time.

void vrtc_get_time(struct timespec64 *now)
{
	u8 sec, min, hour, mday, mon;
	unsigned long flags;
	u32 year;

	spin_lock_irqsave(&rtc_lock, flags);

	while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP))
		cpu_relax();

	sec = vrtc_cmos_read(RTC_SECONDS);
	min = vrtc_cmos_read(RTC_MINUTES);
	hour = vrtc_cmos_read(RTC_HOURS);
	mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
	mon = vrtc_cmos_read(RTC_MONTH);
	year = vrtc_cmos_read(RTC_YEAR);

	spin_unlock_irqrestore(&rtc_lock, flags);

	/* vRTC YEAR reg contains the offset to 1972 */
	year += 1972;

	pr_info("vRTC: sec: %d min: %d hour: %d day: %d "
		"mon: %d year: %d\n", sec, min, hour, mday, mon, year);

	now->tv_sec = mktime64(year, mon, mday, hour, min, sec);
	now->tv_nsec = 0;
}

As can be seen, the wall time contains year, month, day, hour, minute and second, read from the CMOS RTC.

So the conclusion: the wallclock part of the guest's kvmclock comes from the RTC, and this wallclock is shared by all vCPUs. A vCPU that wants the wallclock writes its own msr_wall_clock; whenever the wallclock content is updated, all vCPUs can read the latest wallclock, not only the vCPU that wrote msr_wall_clock.

Initialization of system time

From the kvmclock driver's perspective, kvmclock_init() writes the physical address of the pvclock_vsyscall_time_info (pvti) structure of vcpu0, and of every other vCPU, into each vCPU's system_time MSR using wrmsr.

static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);

#define HVC_BOOT_ARRAY_SIZE \
	(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
	// how many pvti structures fit into one page

static struct pvclock_vsyscall_time_info
			hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
			// array of pvti structures

void __init kvmclock_init(void)
{
	...
	// determine msr_system_time and msr_wall_clock
	msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
	msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
	...
	this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);
	// point hv_clock_per_cpu at the first element of the pvti array
	kvm_register_clock("primary cpu clock");
	// write the physical address of hv_clock_per_cpu into the corresponding system_time MSR
	pvclock_set_pvti_cpu0_va(hv_clock_boot);
	// pvti_cpu0_va = hv_clock_boot, i.e. use the address of hv_clock_boot as cpu0's pvti address
#ifdef CONFIG_X86_LOCAL_APIC
	x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
	// during SMP bring-up, kvm_register_clock is called to initialize the clocks of all CPUs
	// other than cpu0, which also writes each CPU's hv_clock_per_cpu physical address into its
	// own system_time MSR
#endif
	...
	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);	// register the 1 GHz kvmclock as a clocksource
}

static void kvm_setup_secondary_clock(void)
{
	kvm_register_clock("secondary cpu clock");
}

static void kvm_register_clock(char *txt)
{
	struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
	u64 pa;

	if (!src)
		return;

	pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;	// make sure bit 0 of the pvti address is 1
	wrmsrl(msr_kvm_system_time, pa);
	// write the physical address of this CPU's pvti structure into its msr_kvm_system_time MSR
	pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
}
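Once a CPU's pvti is registered this way, the host keeps refreshing it and the guest's kvm_clock clocksource read path only ever reads it. A hedged sketch of that read, modeled on pvclock_clocksource_read()/__pvclock_read_cycles() in arch/x86/kernel/pvclock.c (simplified, not the verbatim 5.3 code; the version field acts as a seqcount against concurrent host updates):

static u64 kvmclock_read_sketch(const struct pvclock_vcpu_time_info *pvti)
{
	u32 version;
	u64 delta, ns;

	do {
		version = pvti->version;	/* an odd version means the host is mid-update */
		rmb();				/* read the fields only after the version */
		delta = rdtsc_ordered() - pvti->tsc_timestamp;
		if (pvti->tsc_shift >= 0)
			delta <<= pvti->tsc_shift;
		else
			delta >>= -pvti->tsc_shift;
		ns = pvti->system_time +
		     (u64)(((unsigned __int128)delta * pvti->tsc_to_system_mul) >> 32);
		rmb();				/* re-check the version after reading */
	} while ((pvti->version & 1) || version != pvti->version);

	return ns;	/* kvmclock time in ns: system_time base plus scaled TSC delta */
}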

So kvmclock_init() writes, via wrmsr, the physical address of each CPU's pvti structure into that CPU's system_time MSR. From here there are two threads to follow:

One thread goes upward: find out when kvmclock_init() is called, and thereby when cpu0's pvti physical address is written into its MSR, and when x86_cpuinit.early_percpu_clock_init is called to write the other CPUs' pvti physical addresses into their MSRs.

The other thread goes downward: when the guest kernel writes the MSR, a wrmsr VM exit occurs; we need to study how that VM exit handles the MSR.
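For the upward thread, the boot-time call path looks roughly like this (reconstructed from memory of the 5.x sources; the exact file split is worth verifying against 5.3):

start_kernel()
  => setup_arch()                        // arch/x86/kernel/setup.c
    => init_hypervisor_platform()        // arch/x86/kernel/cpu/hypervisor.c
      => x86_init.hyper.init_platform()  // == kvm_init_platform() for a KVM guest, arch/x86/kernel/kvm.c
        => kvmclock_init()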

We can see that kvmclock_init() is called while the guest kernel boots: it writes the physical address of vcpu0's pvti into its MSR and registers kvm_setup_secondary_clock as the callback that will write the remaining vCPUs' pvti physical addresses into their MSRs.

Next, when is x86_cpuinit.early_percpu_clock_init called?
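A hedged answer, reconstructed from arch/x86/kernel/smpboot.c of the same era (verify against the exact 5.3 tree): the hook runs early in secondary-CPU bring-up, before the CPU starts scheduling.

static void notrace start_secondary(void *unused)
{
	...
	cpu_init();
	x86_cpuinit.early_percpu_clock_init();
	// == kvm_setup_secondary_clock => kvm_register_clock("secondary cpu clock")
	smp_callin();
	...
}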

At this point the path by which every vCPU's pvti physical address reaches its MSR is clear. Now for the other thread: when the MSR write happens and triggers a VM exit, how does handle_wrmsr deal with system_time? As with the wallclock, it goes through the following call chain.

handle_wrmsr=>kvm_set_msr=>kvm_x86_ops->set_msr=>vmx_set_msr=>kvm_set_msr_common

int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	...
	case MSR_KVM_SYSTEM_TIME_NEW:
	case MSR_KVM_SYSTEM_TIME: {
		struct kvm_arch *ka = &vcpu->kvm->arch;

		kvmclock_reset(vcpu);	// set this vcpu's pv_time_enabled flag to false

		// for vcpu0, tmp records whether the old kvmclock MSR is being used
		if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
			// host_initiated is false on the handle_wrmsr path
			bool tmp = (msr == MSR_KVM_SYSTEM_TIME);

			// if the old/new choice changed, issue a KVM_REQ_MASTERCLOCK_UPDATE request
			if (ka->boot_vcpu_runs_old_kvmclock != tmp)
				kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

			ka->boot_vcpu_runs_old_kvmclock = tmp;
		}

		// store the physical address of this vcpu's pvti in arch.time
		// and issue a KVM_REQ_GLOBAL_CLOCK_UPDATE request
		vcpu->arch.time = data;
		kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);

		/* we verify if the enable bit is set... */
		if (!(data & 1))	// if bit 0 of the pvti address is 0, kvmclock will not be used
			break;

		if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
		     &vcpu->arch.pv_time, data & ~1ULL,
		     sizeof(struct pvclock_vcpu_time_info)))
			vcpu->arch.pv_time_enabled = false;
		else
			vcpu->arch.pv_time_enabled = true;

		break;
	}
}

That is: if this is vcpu0 and the old/new kvmclock MSR being used does not match the current boot_vcpu_runs_old_kvmclock flag, something has changed and the master clock needs recalibration, so a KVM_REQ_MASTERCLOCK_UPDATE request is issued. Then the common-vCPU handling follows.

Common-vCPU handling: for any vCPU, the physical address of its pvti is stored in vcpu->arch.time and a KVM_REQ_GLOBAL_CLOCK_UPDATE request is issued (which means vcpu0 may issue two requests in a row). Afterwards, depending on the result of kvm_gfn_to_hva_cache_init, pv_time_enabled is set to true or false. Let's look at kvm_gfn_to_hva_cache_init.

int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa, unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);	// look up the memslots for vcpu->arch.pv_time

	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
	// initialize the vcpu->arch.pv_time cache, fill it in and check its validity
}

static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, struct gfn_to_hva_cache *ghc,
					 gpa_t gpa, unsigned long len)
{
	int offset = offset_in_page(gpa);			// offset of the pvti physical address within its page
	gfn_t start_gfn = gpa >> PAGE_SHIFT;			// first guest frame number of the pvti physical address
	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;		// last guest frame number of the pvti physical address
	gfn_t nr_pages_needed = end_gfn - start_gfn + 1;	// number of pages needed
	gfn_t nr_pages_avail;
	int r = start_gfn <= end_gfn ? 0 : -EINVAL;		// r == 0 in the normal case

	ghc->gpa = gpa;
	ghc->generation = slots->generation;	// generation of the slots, used to tell which generation the content belongs to
	ghc->len = len;
	ghc->hva = KVM_HVA_ERR_BAD;

	/*
	 * If the requested region crosses two memslots, we still
	 * verify that the entire region is valid here.
	 */
	while (!r && start_gfn <= end_gfn) {	// make sure the requested slots are valid
		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
					   &nr_pages_avail);
		// find the host virtual address backing the cache that holds the pvti
		if (kvm_is_error_hva(ghc->hva))
			r = -EFAULT;
		start_gfn += nr_pages_avail;
	}

	/* Use the slow path for cross page reads and writes. */
	if (!r && nr_pages_needed == 1)
		ghc->hva += offset;
	else
		ghc->memslot = NULL;

	return r;	// returns 0 in the normal case
}

As can be seen, kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info)) sets up a cache for vcpu->arch.pv_time (resolving the corresponding host virtual address) and stores the pvti physical address in the cache's gpa field.

So, as long as resolving the host virtual address succeeds, vcpu->arch.pv_time_enabled ends up true.

To sum up: after each vCPU starts, the physical address of its pvti structure is written into its msr_system_time, and a cache is set up that connects host and guest.

Each vCPU's pvti needs only one wrmsr to become associated with a host virtual address; after that no further wrmsr is needed, and the host writes fresh pvti values from time to time.

The guest reads system time from the pvti structure

The pvti contents that the guest reads system time from are refreshed in response to the three kinds of request mentioned above:

KVM_REQ_MASTERCLOCK_UPDATE

KVM_REQ_GLOBAL_CLOCK_UPDATE

KVM_REQ_CLOCK_UPDATE

So when are these three REQUESTs issued in the KVM code? Let's go through them one by one.

Trigger points of the three time-update requests

KVM_REQ_MASTERCLOCK_UPDATE

  1. Whenever the masterclock is enabled, a KVM_REQ_MASTERCLOCK_UPDATE request is issued so that the masterclock gets updated. The code for this case is in kvm_track_tsc_matching.

When can the masterclock be enabled:

  • the host clocksource must be TSC
  • the vCPUs must have matched TSCs, i.e. the vCPUs' virtual TSC frequency must be consistent with the host TSC

There are two call paths in total.

The first (traced from the bottom-level function upward):

kvm_track_tsc_matching => kvm_write_tsc => kvm_set_msr_common writing MSR_IA32_TSC

That is, while the guest OS is running, if kvm_set_msr_common(MSR_IA32_TSC) occurs, the masterclock enable conditions hold and the masterclock is enabled, a KVM_REQ_MASTERCLOCK_UPDATE request is issued.

The second (traced from the bottom-level function upward):

kvm_track_tsc_matching => kvm_write_tsc => kvm_arch_vcpu_postcreate => kvm_vm_ioctl_create_vcpu

That is, when a vCPU is created, if the masterclock enable conditions hold and the masterclock is enabled, a KVM_REQ_MASTERCLOCK_UPDATE request is issued.

  2. When MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW is written and vcpu0's choice of old vs. new kvmclock MSR changes (e.g. it now writes MSR_KVM_SYSTEM_TIME_NEW), a KVM_REQ_MASTERCLOCK_UPDATE is issued. This is the step seen during system time initialization.

  3. pvclock_gtod_update_fn issues KVM_REQ_MASTERCLOCK_UPDATE to all vCPUs. The call path leading to pvclock_gtod_update_fn is:

static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);

/*
 * Notification about pvclock gtod data update.
 */
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
			       void *priv)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	struct timekeeper *tk = priv;

	update_pvclock_gtod(tk);

	/* disable master clock if host does not trust, or does not
	 * use, TSC based clocksource.
	 */
	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
	    atomic_read(&kvm_guest_has_master_clock) != 0)
		// the host clocksource changed from TSC to something else
		queue_work(system_long_wq, &pvclock_gtod_work);
		// queue pvclock_gtod_work on the work queue

	return 0;
}

static struct notifier_block pvclock_gtod_notifier = {
	.notifier_call = pvclock_gtod_notify,
};

int kvm_arch_init(void *opaque)
{
	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
	// register pvclock_gtod_notifier as a time-update listener: whenever the host updates
	// its time, the notifier chain runs and pvclock_gtod_notify is called
}

That is, when the host updates its time and KVM finds that the host clocksource has switched from TSC to something else, a KVM_REQ_MASTERCLOCK_UPDATE request is issued.

  4. In kvm_arch_hardware_enable, if a backwards guest TSC is observed, KVM_REQ_MASTERCLOCK_UPDATE is issued to all vCPUs.

KVM_REQ_GLOBAL_CLOCK_UPDATE

  1. During kvmclock driver initialization, kvm_register_clock in kvmclock_init() triggers a wrmsr, which calls kvm_set_msr_common to write MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW, issuing a KVM_REQ_GLOBAL_CLOCK_UPDATE request.

  2. When a vCPU is migrated to a different pCPU (physical CPU), if the guest TSC is not consistent, a KVM_REQ_GLOBAL_CLOCK_UPDATE request has to be issued.

KVM_REQ_CLOCK_UPDATE

  1. kvm_gen_update_masterclock issues KVM_REQ_CLOCK_UPDATE to all vCPUs; kvm_gen_update_masterclock is the handler for KVM_REQ_MASTERCLOCK_UPDATE.

  2. kvmclock_update_fn issues KVM_REQ_CLOCK_UPDATE to all vCPUs; kvmclock_update_fn is reached as follows:

    kvm_arch_init_vm()
    {
    	// set up a delayed work item: register kvmclock_update_fn as the callback of kvm->arch.kvmclock_update_work
    	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
    	// set up a delayed work item: register kvmclock_sync_fn as the callback of kvm->arch.kvmclock_sync_work
    	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
    }

    static void kvmclock_sync_fn(struct work_struct *work)
    {
    	// immediately run kvmclock_update_work -> kvmclock_update_fn
    	schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
    	// re-run kvmclock_sync_work -> kvmclock_sync_fn after 300 s
    	schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD);
    }

    static void kvmclock_update_fn(struct work_struct *work)
    {
    	kvm_for_each_vcpu(i, vcpu, kvm) {
    		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);	// handled by kvm_guest_time_update()
    		kvm_vcpu_kick(vcpu);	// wake the vcpu if it sleeps, or force a guest-mode vcpu back into host kernel mode
    	}
    }

    That is, the kvmclock sync function schedules an immediate work item (update kvmclock) and a delayed one (sync kvmclock again). In other words, once KVM has called the sync function for the first time, kvmclock is updated and re-synced every 300 s, and every update issues KVM_REQ_CLOCK_UPDATE to all vCPUs (the interval constants are quoted after this list).

  3. kvm_gen_kvmclock_update issues KVM_REQ_CLOCK_UPDATE to the current vCPU and, 100 ms later, runs the kvmclock update function kvmclock_update_fn, which issues KVM_REQ_CLOCK_UPDATE to all vCPUs. kvm_gen_kvmclock_update is the handler for KVM_REQ_GLOBAL_CLOCK_UPDATE.

    static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
    {
    	struct kvm *kvm = v->kvm;

    	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);	// issue KVM_REQ_CLOCK_UPDATE immediately
    	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
    			      KVMCLOCK_UPDATE_DELAY);	// trigger kvmclock_update_fn after 100 ms
    }
  4. In kvm_arch_vcpu_load, if an external tsc_offset_adjustment is detected, KVM_REQ_CLOCK_UPDATE is issued. That is, when switching to a particular vCPU, a check decides whether a KVM_REQ_CLOCK_UPDATE is needed.

  5. kvm_set_guest_paused issues KVM_REQ_CLOCK_UPDATE. kvm_set_guest_paused tells the guest kernel that it has been stopped by KVM, so KVM_REQ_CLOCK_UPDATE is issued when the guest kernel is paused.

  6. When QEMU issues the KVM_SET_CLOCK ioctl, KVM_REQ_CLOCK_UPDATE is issued to all vCPUs. When QEMU sets the clock, updating the guest clock is the natural thing to do.

  7. __kvmclock_cpufreq_notifier issues KVM_REQ_CLOCK_UPDATE to all vCPUs. It is the callback invoked when the CPU frequency changes: when the host CPU frequency changes, the guest time should be recomputed.

  8. On a VM exit, if the guest TSC keeps catching up with the host TSC, the guest TSC frequency is higher than the host's and the guest time must be recalibrated, so KVM_REQ_CLOCK_UPDATE is issued to the current vCPU.

  9. kvm_arch_hardware_enable issues KVM_REQ_CLOCK_UPDATE to all vCPUs if the host TSC is unstable. The call path of kvm_arch_hardware_enable is:

    kvm_arch_hardware_enable => hardware_enable_nolock => kvm_starting_cpu
                                                       => kvm_resume

    That is, KVM needs to issue KVM_REQ_CLOCK_UPDATE to adjust the time both when it starts a vCPU and when it resumes one.
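The 100 ms and 300 s intervals referred to in items 2 and 3 come from constants in arch/x86/kvm/x86.c; quoted from memory, so worth verifying against the 5.3 source:

#define KVMCLOCK_UPDATE_DELAY	msecs_to_jiffies(100)
#define KVMCLOCK_SYNC_PERIOD	(300 * HZ)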

Handling of the three requests

Having identified the trigger points of the time-update requests, let's look at what their handlers actually do with each request.

All three requests are handled in vcpu_enter_guest(), i.e. just before entering non-root mode.

KVM_REQ_MASTERCLOCK_UPDATE

static void kvm_gen_update_masterclock(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
	int i;
	struct kvm_vcpu *vcpu;
	struct kvm_arch *ka = &kvm->arch;

	spin_lock(&ka->pvclock_gtod_sync_lock);
	kvm_make_mclock_inprogress_request(kvm);
	// issue KVM_REQ_MCLOCK_INPROGRESS so that no vcpu can enter the guest

	/* no guest entries from this point */
	pvclock_update_vm_gtod_copy(kvm);
	// decide whether the guest can use the master clock (used for time synchronization between vcpus)

	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
		// issue KVM_REQ_CLOCK_UPDATE to every vcpu

	/* guest entries allowed */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
		// clear KVM_REQ_MCLOCK_INPROGRESS so the vcpus can enter the guest again

	spin_unlock(&ka->pvclock_gtod_sync_lock);
#endif
}

static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
	struct kvm_arch *ka = &kvm->arch;
	int vclock_mode;
	bool host_tsc_clocksource, vcpus_matched;

	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
			 atomic_read(&kvm->online_vcpus));
			// whether the vcpus' TSC frequencies match

	/*
	 * If the host uses TSC clock, then passthrough TSC as stable
	 * to the guest.
	 */
	host_tsc_clocksource = kvm_get_time_and_clockread(
					&ka->master_kernel_ns,
					&ka->master_cycle_now);
	// host_tsc_clocksource is true if the host uses TSC
	// master_kernel_ns is the master copy of the time since host boot
	// master_cycle_now is the master copy of the host's current TSC value

	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
				&& !ka->backwards_tsc_observed
				&& !ka->boot_vcpu_runs_old_kvmclock;
	// backwards_tsc_observed: whether a TSC rollback has been observed
	// boot_vcpu_runs_old_kvmclock: whether kvmclock uses the old MSRs

	if (ka->use_master_clock)
		atomic_set(&kvm_guest_has_master_clock, 1);
		// if use_master_clock is 1, set kvm_guest_has_master_clock to 1

	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
					vcpus_matched);
#endif
}

So for KVM_REQ_MASTERCLOCK_UPDATE, KVM does two things: it decides whether the guest can use the master clock (used for time synchronization between vCPUs), and it issues the more basic request, KVM_REQ_CLOCK_UPDATE, to all vCPUs (described under the handling of KVM_REQ_CLOCK_UPDATE below).

KVM_REQ_GLOBAL_CLOCK_UPDATE

static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
{
	struct kvm *kvm = v->kvm;

	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);	// issue KVM_REQ_CLOCK_UPDATE immediately
	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
			      KVMCLOCK_UPDATE_DELAY);	// trigger kvmclock_update_fn after 100 ms
}

For KVM_REQ_GLOBAL_CLOCK_UPDATE, KVM first issues the more basic KVM_REQ_CLOCK_UPDATE request to the current vCPU; 100 ms after that, kvmclock_update_fn runs and issues KVM_REQ_CLOCK_UPDATE to all vCPUs.

In other words, KVM_REQ_GLOBAL_CLOCK_UPDATE is handled by:

  1. issuing KVM_REQ_CLOCK_UPDATE to the current vCPU
  2. issuing KVM_REQ_CLOCK_UPDATE to all vCPUs and kicking them all.

KVM_REQ_CLOCK_UPDATE

As seen above, both of the other requests ultimately rely on the basic KVM_REQ_CLOCK_UPDATE request, so its handling is the most important part.

static int kvm_guest_time_update(struct kvm_vcpu *v)
{
	unsigned long flags, tgt_tsc_khz;
	struct kvm_vcpu_arch *vcpu = &v->arch;
	struct kvm_arch *ka = &v->kvm->arch;
	s64 kernel_ns;
	u64 tsc_timestamp, host_tsc;
	u8 pvclock_flags;
	bool use_master_clock;

	kernel_ns = 0;
	host_tsc = 0;

	/*
	 * If the host uses TSC clock, then passthrough TSC as stable
	 * to the guest.
	 */
	spin_lock(&ka->pvclock_gtod_sync_lock);
	use_master_clock = ka->use_master_clock;
	if (use_master_clock) {		// if the host uses the TSC clocksource, the TSC is simply passed through
		host_tsc = ka->master_cycle_now;	// take the master cycle_now as host_tsc (a TSC count)
		kernel_ns = ka->master_kernel_ns;	// take the master kernel_ns as kernel_ns (ns since host boot)
	}
	spin_unlock(&ka->pvclock_gtod_sync_lock);

	/* Keep irq disabled to prevent changes to the clock */
	local_irq_save(flags);
	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);	// read the TSC frequency of the current cpu
	if (unlikely(tgt_tsc_khz == 0)) {		// if it is invalid, re-issue KVM_REQ_CLOCK_UPDATE and retry later
		local_irq_restore(flags);
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
		return 1;
	}
	if (!use_master_clock) {	// if the host does not use the TSC clocksource,
		host_tsc = rdtsc();	// read the host TSC by hand
		kernel_ns = ktime_get_boottime_ns();	// and get the ns since host kernel boot
	}

	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
	// apply the guest's TSC scale and offset to host_tsc to get the timestamp of this update

	/*
	 * We may have to catch up the TSC to match elapsed wall clock
	 * time for two reasons, even if kvmclock is used.
	 * 1) CPU could have been running below the maximum TSC rate
	 * 2) Broken TSC compensation resets the base at each VCPU
	 *    entry to avoid unknown leaps of TSC even when running
	 *    again on the same CPU.  This may cause apparent elapsed
	 *    time to disappear, and the guest to stand still or run
	 *    very slowly.
	 */
	if (vcpu->tsc_catchup) {
		u64 tsc = compute_guest_tsc(v, kernel_ns);
		// compute the theoretical guest TSC from the time since host boot
		if (tsc > tsc_timestamp) {	// if the theoretical guest TSC is larger than the adjusted host TSC
			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
			// adjust the offset to old offset + (tsc - tsc_timestamp)
			tsc_timestamp = tsc;	// use the theoretical value as the timestamp so they stay consistent
		}
	}

	local_irq_restore(flags);

	/* With all the info we got, fill in the values */

	if (kvm_has_tsc_control)	// if TSC scaling is supported
		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);	// scale the current cpu's TSC frequency

	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
		// if the target frequency differs from the recorded hardware TSC frequency,
		// recompute the scale coefficients
		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
				   &vcpu->hv_clock.tsc_shift,
				   &vcpu->hv_clock.tsc_to_system_mul);
		vcpu->hw_tsc_khz = tgt_tsc_khz;
	}

	// fill in hv_clock (pvti type)
	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
	vcpu->last_guest_tsc = tsc_timestamp;

	/* If the host uses TSC clocksource, then it is stable */
	pvclock_flags = 0;
	if (use_master_clock)
		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;

	vcpu->hv_clock.flags = pvclock_flags;

	if (vcpu->pv_time_enabled)	// if paravirtualized time (kvmclock) is in use,
		kvm_setup_pvclock_page(v);	// copy the hv_clock contents into the guest's pvti
	if (v == kvm_get_vcpu(v->kvm, 0))
		kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
	return 0;
}

kvm_guest_time_update() does the following:

  1. Obtain the host TSC value and the ns elapsed since host kernel boot
  2. Read the TSC frequency of the current CPU
  3. After a series of calibrations, store the final time in vcpu->hv_clock
  4. If paravirtualized time is enabled for the vCPU, call kvm_setup_pvclock_page

Now look at kvm_setup_pvclock_page.

static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
{
	...
	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
			       &vcpu->hv_clock,
			       sizeof(vcpu->hv_clock));
	// copy the contents of hv_clock into pv_time
	...
}

The pv_time here is the per-vCPU pvti structure mentioned earlier, so copying hv_clock into pv_time writes the latest time into that vCPU's pvti structure.

This is how system time gets updated in the pvti structure.
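Schematically, putting the host-side update and the guest-side read together (not verbatim kernel code; scale() stands for the tsc_shift/tsc_to_system_mul conversion shown earlier):

	guest kvmclock time = hv_clock.system_time + scale(guest rdtsc - hv_clock.tsc_timestamp)
	                    = (kernel_ns + kvmclock_offset) + scale(guest rdtsc - tsc_timestamp)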

How the host writes system time

The host generally writes system time in two ways: synchronously and asynchronously.

Synchronous writes periodically update the system time in the guest so that it stays consistent with the host time.

Asynchronous writes update the system time in the guest when special events happen (e.g. when the guest is suspended), to keep the guest time from going wrong.

Synchronous writes of system time by the host

KVM registers a callback, pvclock_gtod_notify, with the timekeeper layer via pvclock_gtod_register_notifier (mentioned above among the trigger points of the three requests). Whenever the host kernel clock is updated (i.e. whenever timekeeping_update is called), pvclock_gtod_notify is called.

static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
			       void *priv)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	struct timekeeper *tk = priv;

	update_pvclock_gtod(tk);	// update the pvclock copy of the timekeeper data

	/* disable master clock if host does not trust, or does not
	 * use, TSC based clocksource.
	 */
	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
	    atomic_read(&kvm_guest_has_master_clock) != 0)
		// the clocksource changed from TSC to something else
		queue_work(system_long_wq, &pvclock_gtod_work);

	return 0;
}

static struct pvclock_gtod_data pvclock_gtod_data;

static void update_pvclock_gtod(struct timekeeper *tk)
{
	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
	u64 boot_ns;

	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));

	write_seqcount_begin(&vdata->seq);

	/* copy pvclock gtod data */
	vdata->clock.vclock_mode	= tk->tkr_mono.clock->archdata.vclock_mode;
	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last;	// clocksource counter value at this host time update
	vdata->clock.mask		= tk->tkr_mono.mask;
	vdata->clock.mult		= tk->tkr_mono.mult;
	vdata->clock.shift		= tk->tkr_mono.shift;

	vdata->boot_ns			= boot_ns;	// boot-based base time (in ns) at this host time update
	vdata->nsec_base		= tk->tkr_mono.xtime_nsec;	// ns part of the wallclock at this host time update

	vdata->wall_time_sec		= tk->xtime_sec;	// seconds part of the wallclock at this host time update

	write_seqcount_end(&vdata->seq);
}

pvclock_gtod_notify() does two things:

  1. Calls update_pvclock_gtod to refresh pvclock_gtod_data
  2. Checks whether the host clocksource has changed to something other than TSC; if so, queues the pvclock_gtod_work work item
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);

static void pvclock_gtod_update_fn(struct work_struct *work)
{
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
	atomic_set(&kvm_guest_has_master_clock, 0);
	mutex_unlock(&kvm_lock);
}

So the function behind pvclock_gtod_work, pvclock_gtod_update_fn, does the following:

it issues KVM_REQ_MASTERCLOCK_UPDATE to all vCPUs, which, through the chain of calls described above, updates the time data in every vCPU's pvti structure.

In other words, whenever the host kernel clock is updated and the master clock is in use, KVM updates every vCPU's pvti time. The kernel reads the current clocksource counter with tk_clock_read, but no restriction on which CPU performs that read was found in the surrounding context.

Asynchronous writes of system time by the host

Asynchronous writes of system time are driven by QEMU, which interacts with KVM through kvm_vm_ioctl(KVM_SET_CLOCK).

In KVM, the KVM_SET_CLOCK ioctl is implemented as follows:

case KVM_SET_CLOCK: {
	struct kvm_clock_data user_ns;
	u64 now_ns;

	r = -EFAULT;
	if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
		goto out;

	r = -EINVAL;
	if (user_ns.flags)
		goto out;

	r = 0;
	/*
	 * TODO: userspace has to take care of races with VCPU_RUN, so
	 * kvm_gen_update_masterclock() can be cut down to locked
	 * pvclock_update_vm_gtod_copy().
	 */
	kvm_gen_update_masterclock(kvm);
	// decide whether the guest can use the masterclock, and issue time-update requests to all vcpus
	now_ns = get_kvmclock_ns(kvm);
	// read the current kvmclock time
	kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
	// compute the offset between the current time and the time passed in by qemu
	kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
	// update all vcpus' time using the new offset
	break;
}

So kvm_vm_ioctl(KVM_SET_CLOCK) does the following:

  1. Decide whether the guest can use the masterclock, and issue time-update requests to all vCPUs
  2. Read the current kvmclock time (how the time is read depends on whether the masterclock is in use)
  3. Compute the offset between the current time and the time passed in by QEMU
  4. Update all vCPUs' time using the new offset

Asynchronous writes of system time by the host therefore rely on the QEMU/KVM interaction kvm_vm_ioctl(KVM_SET_CLOCK).

masterclock: since kvmclock depends on two quantities, the host boot time and the host TSC, even if the host TSCs are synchronized and the guest TSCs are synchronized, sampling the two on pCPU0 and on pCPU1 can give differences that are not equal (and either can be the larger one), which may violate kvmclock's monotonicity. Therefore only one master copy, the master clock, is used, which avoids the problem.
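A small made-up example of the problem (all numbers are purely illustrative): assume a 1 GHz TSC, so one tick equals one nanosecond, and kvmclock_offset = 0. Suppose vcpu0's pvti was filled on pCPU0 with (kernel_ns = 1000, tsc_timestamp = 5000) while vcpu1's pvti was filled on pCPU1 with (kernel_ns = 1003, tsc_timestamp = 5001), because sampling the boot-based time and the TSC is not one atomic operation. Later, both read TSC = 6000:

	vcpu0: 1000 + (6000 - 5000) = 2000 ns
	vcpu1: 1003 + (6000 - 5001) = 2002 ns

A task that reads the clock on vcpu1 and then migrates to vcpu0 sees time go backwards by 2 ns. With a single master copy (master_kernel_ns, master_cycle_now) used for every vCPU's pvti, both compute the same value.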

---update on 6.1 2020-----

The relationship between updates to pvclock_gtod_data and updates to the per-vCPU pvti structures was not entirely clear to me, so this section records that investigation.

Allocating the host virtual address corresponding to each vCPU's pvti structure

During kvmclock driver initialization, kvm_register_clock in kvmclock_init() triggers a wrmsr, which ends up in kvm_set_msr_common writing MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW.

The key statement in kvm_set_msr_common() is:

if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
     &vcpu->arch.pv_time, data & ~1ULL,
     sizeof(struct pvclock_vcpu_time_info)))
	// compute the hva that corresponds to the gpa held in data;
	// store the hva, the gpa, the length of the region, the address of the matching
	// memslot and that memslot's generation into arch.pv_time
	vcpu->arch.pv_time_enabled = false;
else
	vcpu->arch.pv_time_enabled = true;

The prototype of kvm_gfn_to_hva_cache_init is:

int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len);

Here ghc is of type gfn_to_hva_cache, a cache for translating a guest frame number into a host virtual address. It is defined as:

struct gfn_to_hva_cache {
	u64 generation;				// generation of the cache
	gpa_t gpa;				// guest physical address
	unsigned long hva;			// host virtual address
	unsigned long len;			// size of the cache
	struct kvm_memory_slot *memslot;	// address of the kvm memslot backing this cache
};

kvm_gfn_to_hva_cache_init() is implemented as follows:

int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa, unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);	// slots is the address of the whole set of kvm memslots

	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
}

static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, struct gfn_to_hva_cache *ghc,
					 gpa_t gpa, unsigned long len)
{
	int offset = offset_in_page(gpa);	// offset within the page
	/* gpa is a single address, but len bytes are needed starting from it, so there is a start gfn and an end gfn */
	gfn_t start_gfn = gpa >> PAGE_SHIFT;			// first gfn (guest frame number) of gpa
	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;		// last gfn of gpa
	gfn_t nr_pages_needed = end_gfn - start_gfn + 1;	// number of pages needed
	gfn_t nr_pages_avail;
	int r = start_gfn <= end_gfn ? 0 : -EINVAL;		// r checks that the start and end gfn are valid

	/* store the guest physical address, the memslot generation and the required length into the gfn_to_hva_cache */
	ghc->gpa = gpa;
	ghc->generation = slots->generation;
	ghc->len = len;
	ghc->hva = KVM_HVA_ERR_BAD;	// mark the host virtual address as invalid for now

	//----------------------------- now find the hva for the gpa and store it in the gfn_to_hva_cache
	/*
	 * If the requested region crosses two memslots, we still
	 * verify that the entire region is valid here.
	 */
	while (!r && start_gfn <= end_gfn) {	// if the requested region crosses two memslots, verify that it is valid
		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
					   &nr_pages_avail);
		if (kvm_is_error_hva(ghc->hva))
			r = -EFAULT;
		start_gfn += nr_pages_avail;
	}

	/* Use the slow path for cross page reads and writes. */
	if (!r && nr_pages_needed == 1)
		ghc->hva += offset;
	else
		ghc->memslot = NULL;

	return r;
}

From the analysis above, the statement below requests a cache the size of one pvclock_vcpu_time_info structure from the kvm memslots; the cache refers to the guest physical address stored in data, and hva is the host virtual address that backs it.

kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))

In the kvm_set_msr_common() triggered when the kvmclock driver's initialization writes MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW, data is the address of each vCPU's pvti structure, so the pvti now has a host virtual address.

Writing the time to the host virtual address corresponding to each vCPU's pvti structure

Leaving aside where the time comes from for now, a time definitely gets written into the pvti structure.

static int kvm_guest_time_update(struct kvm_vcpu *v)
{
	...
	if (vcpu->pv_time_enabled)	// enabled in kvm_set_msr_common when the hva for the pvti's gpa was set up
		kvm_setup_pvclock_page(v);
}

static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
{
	...
	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
			       &vcpu->hv_clock,
			       sizeof(vcpu->hv_clock));
	// write hv_clock (a pvti-typed structure) to the hva stored in the vcpu's
	// gfn_to_hva_cache, i.e. into the vcpu's pvti
}

int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			   void *data, unsigned long len)
{
	return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
}

int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
				  void *data, unsigned int offset,
				  unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	int r;
	gpa_t gpa = ghc->gpa + offset;

	BUG_ON(len + offset > ghc->len);

	if (slots->generation != ghc->generation)
		__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);

	if (unlikely(!ghc->memslot))
		return kvm_write_guest(kvm, gpa, data, len);

	if (kvm_is_error_hva(ghc->hva))
		return -EFAULT;

	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
	// write the data to the hva recorded in the gfn_to_hva_cache
	if (r)
		return -EFAULT;
	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);

	return 0;
}

In summary, the statement below means:

write the vCPU's hv_clock data (pvti-typed) into the host virtual address that corresponds to this vCPU's pvti.

kvm_write_guest_cached(v->kvm, &vcpu->pv_time, &vcpu->hv_clock, sizeof(vcpu->hv_clock));

pvclock_gtod_data

pvclock_gtod_data is a global variable; KVM updates its contents on every host tick (time update).

struct pvclock_gtod_data {
	seqcount_t	seq;

	struct { /* extract of a clocksource struct */
		int	vclock_mode;
		u64	cycle_last;
		u64	mask;
		u32	mult;
		u32	shift;
	} clock;

	u64		boot_ns;
	u64		nsec_base;
	u64		wall_time_sec;
};

How is it updated?

KVM registers the callback pvclock_gtod_notify with the timekeeper layer via pvclock_gtod_register_notifier. Whenever the host kernel clock is updated (i.e. whenever timekeeping_update is called), pvclock_gtod_notify is called, which in turn calls update_pvclock_gtod to refresh pvclock_gtod_data.

static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
			       void *priv)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	struct timekeeper *tk = priv;

	update_pvclock_gtod(tk);	// refresh the contents of pvclock_gtod_data

	/* disable master clock if host does not trust, or does not
	 * use, TSC based clocksource.
	 */
	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
	    atomic_read(&kvm_guest_has_master_clock) != 0)
		// the clocksource changed from TSC to something else
		queue_work(system_long_wq, &pvclock_gtod_work);

	return 0;
}

static struct pvclock_gtod_data pvclock_gtod_data;

static void update_pvclock_gtod(struct timekeeper *tk)
{
	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
	u64 boot_ns;

	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));

	write_seqcount_begin(&vdata->seq);

	/* copy pvclock gtod data */
	vdata->clock.vclock_mode	= tk->tkr_mono.clock->archdata.vclock_mode;	// clocksource type
	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last;	// clocksource counter value at this host time update
	vdata->clock.mask		= tk->tkr_mono.mask;
	vdata->clock.mult		= tk->tkr_mono.mult;
	vdata->clock.shift		= tk->tkr_mono.shift;

	vdata->boot_ns			= boot_ns;	// base time (in ns) used to derive the current time at this update
	vdata->nsec_base		= tk->tkr_mono.xtime_nsec;	// ns part of the wallclock at this host time update

	vdata->wall_time_sec		= tk->xtime_sec;	// seconds part of the wallclock at this host time update

	write_seqcount_end(&vdata->seq);
}

The data source of the pvti structure: vcpu->hv_clock

What we know so far:

  1. When KVM updates each vCPU's pvti in kvm_guest_time_update(), it puts the current time into the local hv_clock and then writes the contents of hv_clock into the pvti structure.
  2. On every host tick, i.e. every time the host updates its time, KVM copies the time-related variables into the global variable pvclock_gtod_data

So hv_clock must be related to pvclock_gtod_data in some way. Let's find the connection.

First, kvm_guest_time_update() checks whether the master clock is used, i.e. the value of use_master_clock; depending on that boolean, the current time is obtained in different ways.

When KVM decides the value of use_master_clock is not discussed here; it is enough to know that when use_master_clock is 1, KVM keeps only a single copy of the host TSC and guest TSC, which the other vCPUs copy.

use_master_clock is true

If use_master_clock is true, then:

host_tsc = ka->master_cycle_now;
kernel_ns = ka->master_kernel_ns;

So what do ka->master_cycle_now and ka->master_kernel_ns mean, and when are they assigned?

pvclock_update_vm_gtod_copy() assigns master_cycle_now and master_kernel_ns via kvm_get_time_and_clockread(), which uses do_monotonic_boot() together with the values in pvclock_gtod_data to obtain master_kernel_ns, the number of ns since host boot.

do_monotonic_boot obtains master_cycle_now via vgettsc => read_tsc. read_tsc compares the result of the rdtsc instruction with pvclock_gtod_data.clock.cycle_last to decide whether the TSC value has gone backwards: if it has, it returns pvclock_gtod_data.clock.cycle_last, i.e. the TSC value of the previous read, otherwise it returns the rdtsc result. In short, master_cycle_now is a non-regressing TSC value on the current pCPU.

// the function in which kvm updates master_kernel_ns and master_cycle_now
static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
	...
	host_tsc_clocksource = kvm_get_time_and_clockread(
					&ka->master_kernel_ns,
					&ka->master_cycle_now);
}

static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
{
	...
	return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
						      tsc_timestamp));
	// use do_monotonic_boot to obtain master_kernel_ns and master_cycle_now
}

static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	unsigned long seq;
	int mode;
	u64 ns;

	do {
		seq = read_seqcount_begin(&gtod->seq);
		ns = gtod->nsec_base;
		ns += vgettsc(tsc_timestamp, &mode);
		ns >>= gtod->clock.shift;
		ns += gtod->boot_ns;
	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
	*t = ns;

	return mode;
}

// our hosts basically all use TSC, so the VCLOCK_TSC case is the one taken
static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
{
	...
	case VCLOCK_TSC:
		*mode = VCLOCK_TSC;
		*tsc_timestamp = read_tsc();
		v = (*tsc_timestamp - gtod->clock.cycle_last) &
			gtod->clock.mask;
		break;
}

static u64 read_tsc(void)
{
	u64 ret = (u64)rdtsc_ordered();
	u64 last = pvclock_gtod_data.clock.cycle_last;

	if (likely(ret >= last))
		return ret;

	/*
	 * GCC likes to generate cmov here, but this branch is extremely
	 * predictable (it's just a function of time and the likely is
	 * very likely) and there's a data dependence, so force GCC
	 * to generate a branch instead.  I don't barrier() because
	 * we don't actually need a barrier, and if this function
	 * ever gets inlined it will generate worse code.
	 */
	asm volatile ("");
	return last;
}
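Pulling the pieces of do_monotonic_boot() together (vgettsc() returns the masked TSC delta multiplied by clock.mult, a detail the excerpt above omits), the master copy is computed roughly as:

	master_cycle_now = read_tsc()	/* non-regressing TSC on the current pCPU */
	master_kernel_ns = ((nsec_base + (master_cycle_now - cycle_last) * mult) >> shift) + boot_ns

i.e. the host's boot-based time, extrapolated from the last timekeeper update using the TSC.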

pvclock_update_vm_gtod_copy() is referenced in only two places in the KVM code: kvm_arch_init_vm() and kvm_gen_update_masterclock(). The latter is referenced in two places: kvm_arch_vm_ioctl(KVM_SET_CLOCK), and the KVM_REQ_MASTERCLOCK_UPDATE check on every vcpu_enter_guest().

In other words, ka->master_kernel_ns and ka->master_cycle_now are updated at three points while KVM runs:

  1. when the virtual machine (guest) is initialized
  2. whenever a KVM_REQ_MASTERCLOCK_UPDATE request is found before entering non-root mode
  3. when userspace (e.g. QEMU) actively requests a time update

Conclusion: when the master clock is used, host_tsc is the current pCPU's non-regressing TSC value and kernel_ns is the ns since host boot. Also, only when use_master_clock is true does the pvclock_gtod_data maintained by KVM actually come into play.

use_master_clock is false

If use_master_clock is false, then:

host_tsc = rdtsc();
kernel_ns = ktime_get_boottime_ns();

rdtsc() reads the current pCPU's TSC directly.

ktime_get_boottime_ns() obtains the ns since host boot (using the host kernel's timekeeping structures).

Conclusion: when the master clock is not used, host_tsc is the current pCPU's TSC value (with no guarantee against regression) and kernel_ns is the ns since host boot.

Adjusting the TSC calibration coefficients and the final assignment to the pvti cache

After host_tsc and kernel_ns have been obtained, kvm_read_l1_tsc applies the arch-level tsc_offset and tsc_scale to host_tsc and assigns the result to tsc_timestamp, whose meaning is therefore obvious: the TSC timestamp of this timing update.

tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
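Roughly, this conversion is (a sketch, assuming kvm_read_l1_tsc() = kvm_scale_tsc() + tsc_offset as in the x86 KVM code; the number of fractional bits of the scaling ratio is hardware dependent, e.g. 48 on VMX):

	tsc_timestamp = ((host_tsc * vcpu->arch.tsc_scaling_ratio) >> kvm_tsc_scaling_ratio_frac_bits)
	                + vcpu->arch.tsc_offset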

Next, depending on the value of vcpu->tsc_catchup, KVM decides whether the arch-level TSC offset needs correcting. If so, it computes, from the kernel_ns above, what the guest TSC theoretically should be at this moment; if that theoretical value is larger than the TSC value actually read, the guest's tsc_offset can no longer adjust host_tsc correctly and must be fixed, so the difference between the theoretical and the current TSC is used to correct the guest's tsc_offset, and the theoretical TSC value becomes the TSC timestamp of this update.

if (vcpu->tsc_catchup) {
	u64 tsc = compute_guest_tsc(v, kernel_ns);
	if (tsc > tsc_timestamp) {
		adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
		tsc_timestamp = tsc;
	}
}

Next, the function does the following:

  1. If the TSC_SCALING feature is supported, use it to adjust the local vCPU's target TSC frequency:

if (kvm_has_tsc_control)
	tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);

  2. If the local vCPU's TSC frequency differs from the target TSC frequency, recompute hv_clock's shift and mult coefficients so that the local vCPU's TSC frequency matches the target:

if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
	kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
			   &vcpu->hv_clock.tsc_shift,
			   &vcpu->hv_clock.tsc_to_system_mul);
	vcpu->hw_tsc_khz = tgt_tsc_khz;
}

Then the TSC timestamp of this update and the correct system time (to get the absolute time, the wallclock time still has to be added when it is actually used) are stored in hv_clock, and the timestamp of this update is saved in the vCPU's last_guest_tsc.

vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;

Then kvm_setup_pvclock_page() writes the values in hv_clock into the pv_time cache. That cache was given space and a host virtual address along the path kvmclock_init => WRITE MSR => handle_wrmsr => kvm_set_msr_common => kvm_gfn_to_hva_cache_init; every vCPU has its own pv_time cache, whose gpa points to that vCPU's pvti structure.

if (vcpu->pv_time_enabled)
	kvm_setup_pvclock_page(v);	// copy the values in hv_clock into the pv_time cache

With that, the relationship between pvclock_gtod_data and the updates of each vCPU's pvti structure is basically sorted out.

The basic flow is:

  1. Set up a cache for the pvti structure
  2. Refresh the contents of that cache every time KVM updates the time

Author: EwanHai. Original post: https://www.cnblogs.com/haiyonghao/p/14440035.html