kvmclock代码学习

Linux源码版本: 5.3.0

guest os中的kvmclock驱动

kvmclock_init()函数主要做了以下几件事:

  1. 确定了各vcpu要使用的MSR

  2. 将各vcpu在kvmclock中实际使用的数据结构pvclock_vsyscall_time_info的物理地址利用write_msr写到属于每个vcpu的MSR

  3. 将1GHz的kvmclock作为clocksource注册到系统clocksource中

void __init kvmclock_init(void)
{
	u8 flags;

	if (!kvm_para_available() || !kvmclock) // 若不支持kvmclock则直接return
		return;

	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { // 查询是否支持新的kvmclock msr
		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
	} else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { // 如果只支持旧的kvmclock msr,则直接return
		return;
	}

	if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
			      kvmclock_setup_percpu, NULL) < 0) { // hp:hotplug, BP:bootstrap启动 启动热插拔主核的动态准备
		return;
	}

	pr_info("kvm-clock: Using msrs %x and %x",
		msr_kvm_system_time, msr_kvm_wall_clock); // msr_kvm_system_time:kvm_system_time使用的msr
												  // msr_kvm_wall_clock:kvm_wall_clock使用的msr

	this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]); // 将hv_clock_boot数组第一个元素的地址(虚拟地址)赋值
														 //	给local cpu变量hv_clock_per_cpu
	/* 将vcpu0的hv_clock_per_cpu物理地址写入msr_kvm_system_time指向的msr中 */
	kvm_register_clock("primary cpu clock");
	pvclock_set_pvti_cpu0_va(hv_clock_boot); // 将hv_clock_boot的数组地址写入pvti_cpu0_va中,pvti_cpu0_va是一个pvti类型的指针

	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) // KVM_FEATURE_CLOCKSOURCE_STABLE_BIT时钟源稳定指示bit
		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); // 半虚拟化时钟TSC是否稳定指示bit,什么作用呢?

	flags = pvclock_read_flags(&hv_clock_boot[0].pvti); // 确定vcpu0的pvti结构的flags
	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); // 进行时钟调度初始化,sched_clock()用于时钟调度、时间戳,及利用硬件计数器
														  // 的延时以提供一个精确的延迟时钟源

	/* 注册各种回调函数 */
	x86_platform.calibrate_tsc = kvm_get_tsc_khz; // tsc_khz为Host的pTSCfreq,指向host TSC频率的指针
	x86_platform.calibrate_cpu = kvm_get_tsc_khz;
	x86_platform.get_wallclock = kvm_get_wallclock; // wallclock:获得系统boot时的秒数和纳秒数(绝对时间,自1970)
	x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
	x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; // 注册各非vcpu0的时钟,将各vCPU0的vptihv_clock_per_cpu写入各
																	 // 自的msr_kvm_system_time指向的msr中
#endif
	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; // 保存sched_clock的状态,事实什么都没做
	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
	machine_ops.shutdown  = kvm_shutdown;
#ifdef CONFIG_KEXEC_CORE
	machine_ops.crash_shutdown  = kvm_crash_shutdown;
#endif
	kvm_get_preset_lpj(); // lpj:loops_per_jiffy

	/*
	 * X86_FEATURE_NONSTOP_TSC is TSC runs at constant rate
	 * with P/T states and does not stop in deep C-states.
	 *
	 * Invariant TSC exposed by host means kvmclock is not necessary:
	 * can use TSC as clocksource.
	 *
	 */
	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
	    boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
	    !check_tsc_unstable())
		kvm_clock.rating = 299;

	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); // 1Ghz的kvmclock source的注册
	pv_info.name = "KVM";
}

注册为x86_platform.xxxx的函数有3个,分别为kvm_get_tsc_khz,kvm_get/set_wallclock。

kvm_get_wallclock()

/*
 * The wallclock is the time of day when we booted. Since then, some time may
 * have elapsed since the hypervisor wrote the data. So we try to account for
 * that with system time
 */
static void kvm_get_wallclock(struct timespec64 *now)
{
	wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); // 将wall_clock(pvclock_wall_clock类型)对应的物理地址写入对应msr
	preempt_disable();
    // wallclock的时间存于now
	pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now); // 将host写入的wallclock读出来
	preempt_enable();
}

/* wallclock的内容在该函数之前已经被host写入
 *  
 * 
 */
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
			    struct pvclock_vcpu_time_info *vcpu_time,
			    struct timespec64 *ts)
{
	u32 version;
	u64 delta;
	struct timespec64 now;

	/* get wallclock at system boot */
	do {
		version = wall_clock->version;
		rmb();		/* fetch version before time */
		/*
		 * Note: wall_clock->sec is a u32 value, so it can
		 * only store dates between 1970 and 2106. To allow
		 * times beyond that, we need to create a new hypercall
		 * interface with an extended pvclock_wall_clock structure
		 * like ARM has.
		 */
		now.tv_sec  = wall_clock->sec; // 读取wallclock时间,该时间不是完整时间
		now.tv_nsec = wall_clock->nsec; // 还需要加上后面的delta, 即vm过去的时间
		rmb();		/* fetch time before checking version */
	} while ((wall_clock->version & 1) || (version != wall_clock->version));

	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
	delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec);
}



struct pvclock_wall_clock {
	u32   version;
	u32   sec;
	u32   nsec;
} __attribute__((__packed__));

kvm_get_tsc_khz()

// 以kHz为基础获得tsc count
static unsigned long kvm_get_tsc_khz(void)
{
	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
	return pvclock_tsc_khz(this_cpu_pvti());
}

// 将pv_tsc_khz根据local cpu的pvti的tsc_shift和tsc_to_system_mul做校准
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
{
	u64 pv_tsc_khz = 1000000ULL << 32;

	do_div(pv_tsc_khz, src->tsc_to_system_mul);
	if (src->tsc_shift < 0)
		pv_tsc_khz <<= -src->tsc_shift;
	else
		pv_tsc_khz >>= src->tsc_shift;
	return pv_tsc_khz;
}

guest中设置时间的代码框架

内核获取wallclock

static struct pvclock_wall_clock wall_clock __bss_decrypted; // 静态全局变量wall_clock,存储于bss段

static void kvm_get_wallclock(struct timespec64 *now)
{
	wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); // 将wall_clock的物理地址写入到MSR_KVM_WALL_CLOCK, 触发wrmsr_vmexit
	preempt_disable();
	pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
	preempt_enable();
}

kvm_get_wallclock函数的第一个语句就会触发wrmsr_vmexit, 进而经过一系列的调用:

handle_wrmsr=>kvm_set_msr=>kvm_x86_ops->set_msr=>vmx_set_msr=>kvm_set_msr_common

int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
    u64 data = msr_info->data;
    ...
        case MSR_KVM_WALL_CLOCK_NEW:
		case MSR_KVM_WALL_CLOCK:
			vcpu->kvm->arch.wall_clock = data;
			kvm_write_wall_clock(vcpu->kvm, data);
			break;
    ...
}
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
    ...
        // 这里的boot time为host系统启动时间
        getboottime64(&boot); // 获取host的boot time并写入boot变量
    	if (kvm->arch.kvmclock_offset) {//对host boot time做一些调整
			struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
			boot = timespec64_sub(boot, ts);
	} 
		wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
		wc.nsec = boot.tv_nsec;
		wc.version = version;
		 // 更新guest的wallclock
		kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); // 将wc即host boot time的内容写入guest的wallclock中
    ...
}

kvm_write_wall_clock()函数执行完毕之后,guest可以在全局变量wall_clock中找到host系统的boot time.

内核更新wallclock

void getboottime64(struct timespec64 *ts)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); // 做时间校准 tk->offs_real- tk->offs_boot

	*ts = ktime_to_timespec64(t); // 转换一下时间格式
}

重要语句为struct timekeeper *tk = &tk_core.timekeeper;而tk_core的定义为:

/*
 * The most important data for readout fits into a single 64 byte
 * cache line.
 */
static struct {
	seqcount_t		seq;
	struct timekeeper	timekeeper;
} tk_core ____cacheline_aligned = {
	.seq = SEQCNT_ZERO(tk_core.seq),
};

所以tk_core是一个结构体变量, 存储于静态区,且同时只能有一个cpu访问该变量.

getboottime64()读取tk_core.timekeeper的offs_real和offs_boot内容,那么tk_core.timekeeper的内容在哪里设置的呢?在内核代码中找到了更新tk->offs_boot内容的代码:

static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
	tk->offs_boot = ktime_add(tk->offs_boot, delta); // tk->offs_boot += delta
	/*
	 * Timespec representation for VDSO update to avoid 64bit division
	 * on every update.
	 * VDSO: 有些syscall使用很频繁,但每次syscall都要进行从用户态到内核态的切换,开销很大,就将此类syscall的结果存储在
	 * 一个共享区域中,每次syscall直接读取结果即可,降低了开销
	 * VDSO的全称为: virtual dynamic share object
	 */
	tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); // monotonic_to_boot是offs_boot的VDSO形式
    																																	// 是为了加速访问offs_boot
}

tk_update_sleep_time()的核心语句为tk->offs_boot = ktime_add(tk->offs_boot, delta);,delta是什么,从哪里来,需要在内核代码中查找tk_update_sleep_time()的调用位置和参数意义.

static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
					   const struct timespec64 *delta)
{
    ...
    // 设置tk中的CLOCK_REALTIME时间,并记录误差
    tk_xtime_add(tk, delta); // 该函数内容为: tk->xtime_sec += delta->tv_sec;
    												   //                              tk->tkr_mono.xtime_nsec += delta->tv_nsec << tk->tkr_mono.shift;
    												  // xtime_sec:  以秒为单位的当前CLOCK_REALTIME时间
    												 // tkr: timekeeping read, 用于读出时间的结构体.
    												// tkr_mono.xtime_nsec 读出时间肯定存在误差, xtime_nsec是读出时间的ns级误差
    ...	
    tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
    ...
}

在__timekeeping_inject_sleeptime()中还是看不到delta的取值,继续找.

void timekeeping_resume(void)
{
    struct timekeeper *tk = &tk_core.timekeeper;
	struct clocksource *clock = tk->tkr_mono.clock;
	unsigned long flags;
	struct timespec64 ts_new, ts_delta;
	u64 cycle_now, nsec;
	bool inject_sleeptime = false;

	read_persistent_clock64(&ts_new); // 读取新的wallclock到ts_new

	clockevents_resume(); // 继续时钟事件
	clocksource_resume(); // 继续时钟源

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	write_seqcount_begin(&tk_core.seq);

	/*
	 * After system resumes, we need to calculate the suspended time and
	 * compensate it for the OS time. There are 3 sources that could be
	 * used: Nonstop clocksource during suspend, persistent clock and rtc
	 * device.
	 *
	 * One specific platform may have 1 or 2 or all of them, and the
	 * preference will be:
	 *	suspend-nonstop clocksource -> persistent clock -> rtc
	 * The less preferred source will only be tried if there is no better
	 * usable source. The rtc part is handled separately in rtc core code.
	 */
	cycle_now = tk_clock_read(&tk->tkr_mono); // 获得当前时间
	nsec = clocksource_stop_suspend_timing(clock, cycle_now); // 获得suspend的总时间
	if (nsec > 0) {
		ts_delta = ns_to_timespec64(nsec);
		inject_sleeptime = true;
	} else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
		ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
		inject_sleeptime = true;
	}

	if (inject_sleeptime) {
		suspend_timing_needed = false;
		__timekeeping_inject_sleeptime(tk, &ts_delta); // 原来delta指的是suspend的时间
	}
}

我们一路追溯的delta原来是suspend的时间,画出回溯图:

内核初始化wallclock

现在我们知道,内核如何获取wallclock,是靠x86_platform.get_wallclock,我们也知道,内核如何更新wallclock,是在系统suspend之后,resume之前,利用__timekeeping_inject_sleeptime()修改tk_core的内容,修改了wallclock.

但是,系统初始化时,wallclock肯定就被设置了,那么wallclock是如何被初始化的呢?猜测在timekeeping_init相关的函数中.

void __init timekeeping_init(void)
{
    ...
        read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); // 读取walltime和bootOffset到这俩变量
    ...
}
void __weak __init
read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
				     struct timespec64 *boot_offset)
{
	read_persistent_clock64(wall_time); // 读取walltime
	*boot_offset = ns_to_timespec64(local_clock());
}
/* not static: needed by APM */
void read_persistent_clock64(struct timespec64 *ts)
{
	x86_platform.get_wallclock(ts);
}

是不是很熟悉,x86_platform.get_wallclock(ts);在kvm中,x86_platform.get_wallclock = kvm_get_wallclock,在host上,x86_platform.get_wallclock = vrtc_get_time.

void vrtc_get_time(struct timespec64 *now)
{
    u8 sec, min, hour, mday, mon;
	unsigned long flags;
	u32 year;

	spin_lock_irqsave(&rtc_lock, flags);

	while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP))
		cpu_relax();

	sec = vrtc_cmos_read(RTC_SECONDS);
	min = vrtc_cmos_read(RTC_MINUTES);
	hour = vrtc_cmos_read(RTC_HOURS);
	mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
	mon = vrtc_cmos_read(RTC_MONTH);
	year = vrtc_cmos_read(RTC_YEAR);

	spin_unlock_irqrestore(&rtc_lock, flags);

	/* vRTC YEAR reg contains the offset to 1972 */
	year += 1972;

	pr_info("vRTC: sec: %d min: %d hour: %d day: %d "
		"mon: %d year: %d\n", sec, min, hour, mday, mon, year);

	now->tv_sec = mktime64(year, mon, mday, hour, min, sec);
	now->tv_nsec = 0;
}

可以看到,walltime的值包含了年月日时分秒,读取自rtc_cmos时钟中.

那么,结论就来了,guest的kvmclock的wallclock来自于RTC时钟, 且该wallclock由所有vcpu共享,如果vcpu想获得wallclock,就得写属于自己的msr_wall_clock.每当wallclock的内容更新,所有vcpu都能读到最新wallclock,而不是只有写msr_wall_clock的那个vcpu可以读到.

systemTime的初始化

从kvmclock驱动角度来看,在kvmclock_init()中,,就将vcpu0和其余vcpu的vpti结构的物理地址,通过write msr写到了各自的system_time_msr中.

static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
#define HVC_BOOT_ARRAY_SIZE \
	(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) // 表示一个page能放多少个pvti结构
static struct pvclock_vsyscall_time_info
			hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE); // pvti结构数组

void __init kvmclock_init(void)
{
    ...
        // 获得msr_system_time和msr_wall_clock
    	msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
    
    ...
        this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]); // 将pvti结构数组中的第一个元素地址给hv_clock_per_cpu
    	kvm_register_clock("primary cpu clock");  // 将hv_clock_per_cpu的物理地址写入对应system_time_msr
  	    pvclock_set_pvti_cpu0_va(hv_clock_boot); // pvti_cpu0_va = hv_clock_boot, 将hv_clock_boot地址作为cpu0的pvti地址
#ifdef CONFIG_X86_LOCAL_APIC
	x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; // 在smp_init中,调用kvm_register_clock初始化除cpu0以外的cpu时钟, 也因此将各自的hv_clock_per_cpu的物理地址传入了对应的system_time_msr
#endif
    
    ...
        
        clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);// 注册1GHz的kvmclock作为一个clocksource
    	
}

static void kvm_setup_secondary_clock(void)
{
	kvm_register_clock("secondary cpu clock");
}

static void kvm_register_clock(char *txt)
{
	struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
	u64 pa;

	if (!src)
		return;

	pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; // 确保pvti结构的bit0为1
	wrmsrl(msr_kvm_system_time, pa); // 将该cpu的pvti结构的物理地址通过写msr的方式写入对应msr_kvm_system_time
    
	pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
}

现在我们知道了,在kvmclock_init()中,会用写msr的方式,将各cpu的pvti结构的物理地址写入各自对应的system_time_msr, 这里要追溯两条线索:

一条线索向上,找出在何时调用kvmclock_init()进而将cpu0的pvti结构的物理地址写入对应msr,以及何时调用x86_cpuinit.early_percpu_clock_init,将其余cpu的pvti结构的物理地址写入对应msr.

另一条线索向下,当guest kernel中发生写msr时,会导致wrmsr_vmexit,研究在该vmexit中,会怎样处理对应msr.

可以看到,在guest启动内核时就调用了kvmclock_init(),将vcpu0的pvti结构的物理地址写入了对应msr, 并注册了将其余vcpu的pvti结构的物理地址写入对应msr的回调函数kvm_setup_secondary_clock.

接下来看何时调用x86_cpuinit.early_percpu_clock_init.

至此,所有vcpu的pvti的物理地址写入msr路径已经搞清楚,接下来看另一条线索,即当写msr动作发生时,触发vmexit,在handle_wrmsr中如何处理system_time. 与wallclock类似,也经历了以下调用过程.

handle_wrmsr=>kvm_set_msr=>kvm_x86_ops->set_msr=>vmx_set_msr=>kvm_set_msr_common

int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	...
        case MSR_KVM_SYSTEM_TIME_NEW:
		case MSR_KVM_SYSTEM_TIME: {
			struct kvm_arch *ka = &vcpu->kvm->arch;

			kvmclock_reset(vcpu); // 将该vcpu的pv_time_enabled标志置为false

            // 如果是vcpu0, 那么就将tmp设置为表示是否使用的旧的kvmclock
			if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { // host_initiated在handle_wrmsr()中会被置false
			bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
			
            // 如果没有使用旧的kvmclock,则发出KVM_REQ_MASTERCLOCK_UPDATE请求
			if (ka->boot_vcpu_runs_old_kvmclock != tmp)
				kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

			ka->boot_vcpu_runs_old_kvmclock = tmp;
		}
			// 将该vcpu的pvti的物理地址值赋值给该vpcu的arch.time,并发出KVM_REQ_GLOBAL_CLOCK_UPDATE请求
			vcpu->arch.time = data;
			kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);

			/* we verify if the enable bit is set... */
			if (!(data & 1)) // 确保pvti的bit0不为0,如果为0,将不使用kvmclock
				break;

			if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
		  	   &vcpu->arch.pv_time, data & ~1ULL,
		  	   sizeof(struct pvclock_vcpu_time_info)))
				vcpu->arch.pv_time_enabled = false;
			else
				vcpu->arch.pv_time_enabled = true;

			break;
	}
}

即,如果运行的是vcpu0,且是否使用旧的kvmclock msr与当前的boot_vcpu_runs_old_kvmclock标志不一致,那么一定是出了一些什么问题,需要校准MASTERCLOCK,发出KVM_REQ_MASTERCLOCK_UPDATE请求.然后进行普通vcpu的操作.

普通vcpu的操作: 如果运行的是其它vcpu,那么只需要将该vcpu的pvti的物理地址值赋值给该vcpu的arch.time,并发出KVM_REQ_GLOBAL_CLOCK_UPDATE请求(也就是说,vcpu0有可能连续发出两个REQUEST).之后根据kvm_gfn_to_hva_cache_init的结果将pv_time_enabled置为true或false.看一下kvm_gfn_to_hva_cache_init函数.

int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa, unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm); //为vcpu->arch.pv_time从memory中分配一些slots
	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); // 将vcpu->arch.pv_time的cache初始化,赋值,检查有效性
}

static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
				       struct gfn_to_hva_cache *ghc,
				       gpa_t gpa, unsigned long len)
{
	int offset = offset_in_page(gpa); // 获取pvti的物理地址在page中的offset
	gfn_t start_gfn = gpa >> PAGE_SHIFT; // 获取pvti的物理地址的起始guest frame number
	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; // 获取pvti的物理地址的终止guest frame number
	gfn_t nr_pages_needed = end_gfn - start_gfn + 1;  // 所需的page数量
	gfn_t nr_pages_avail;
	int r = start_gfn <= end_gfn ? 0 : -EINVAL; // 正常情况下r==0

	ghc->gpa = gpa;
	ghc->generation = slots->generation; // slots的代数,用于分辨存储的内容为第几代
	ghc->len = len;
	ghc->hva = KVM_HVA_ERR_BAD;

	/*
	 * If the requested region crosses two memslots, we still
	 * verify that the entire region is valid here.
	 */
	while (!r && start_gfn <= end_gfn) { // 确保申请的slots有效
		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
					   &nr_pages_avail); // 为存放pvti的缓存分配对应的host 虚拟地址
		if (kvm_is_error_hva(ghc->hva))
			r = -EFAULT;
		start_gfn += nr_pages_avail;
	}

	/* Use the slow path for cross page reads and writes. */
	if (!r && nr_pages_needed == 1)
		ghc->hva += offset;
	else
		ghc->memslot = NULL;

	return r; // 正常情况下return0
}

可以看出,kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.pv_time, data & ~1ULL,
sizeof(struct pvclock_vcpu_time_info)) 为vcpu->arch.pv_time申请了cache空间(对应host的虚拟地址),将pvti的物理地址写入了该cache的gpa字段.

所以,在分配host虚拟地址成功的情况下,vcpu->arch.pv_time_enabled肯定为true.

综上,在各vcpu启动后,将各vcpu的pvti结构的物理地址写入msr_system_time_i,并开辟缓存空间,用于打通host和guest.

各vcpu的pvti结构只需要一次wrmsr便可与host虚拟地址关联,之后无需wrmsr,host不定期写入pvti的最新值.

guest从pvti结构读取system time

guest从pvti结构读取system time的触发点为上面提到的3种request:

KVM_REQ_MASTERCLOCK_UPDATE

KVM_REQ_GLOBAL_CLOCK_UPDATE

KVM_REQ_CLOCK_UPDATE

那么, guest kernel中什么时候发出这3种REQUEST呢?逐个来看.

三大更新时间请求的触发点

KVM_REQ_MASTERCLOCK_UPDATE

  1. 当masterclock被使能,就一直发出KVM_REQ_MASTERCLOCK_UPDATE请求,以更新masterclock. 这样情况的代码在kvm_track_tsc_matching中.

masterclock何时可以被使能:

  • host clocksource必须为tsc
  • vcpus必须有matched tsc,即vcpus的v_tsc必须与host_tsc频率一直

调用路径一共有2条:

第一条为:(由底层函数向顶层函数追溯)

kvm_track_tsc_matching => kvm_write_tsc => kvm_set_msr_common写MSR_IA32_TSC

即在guest os运行过程中,如果出现kvm_set_msr_common(MSR_IA32_TSC), 且满足masterclock使能条件,且masterclock使能,则发出KVM_REQ_MASTERCLOCK_UPDATE请求

第二条为:(由底层函数向顶层函数追溯)

kvm_track_tsc_matching => kvm_write_tsc => kvm_arch_vcpu_postcreate => kvm_vm_ioctl_create_vcpu

即在创建vcpu时,满足masterclock使能条件,且masterclock使能,则发出KVM_REQ_MASTERCLOCK_UPDATE请求.

  1. 写MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW时,如果使用的是新版kvmclock,即写的是MSR_KVM_SYSTEM_TIME_NEW, 则发出KVM_REQ_MASTERCLOCK_UPDATE.这是systemTime的初始化期间的一段.

  2. 在pvclock_gtod_update_fn中,对所有vcpu发出了KVM_REQ_MASTERCLOCK_UPDATE.而pvclock_gtod_update_fn的调用路径为:

static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);

/*
 * Notification about pvclock gtod data update.
 */
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
			       void *priv)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	struct timekeeper *tk = priv;

	update_pvclock_gtod(tk);

	/* disable master clock if host does not trust, or does not
	 * use, TSC based clocksource.
	 */
	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
	    atomic_read(&kvm_guest_has_master_clock) != 0) // guest clocksource从TSC变为了非TSC时
		queue_work(system_long_wq, &pvclock_gtod_work); // 将pvclock_gtod_work放入工作队列

	return 0;
}


static struct notifier_block pvclock_gtod_notifier = {
	.notifier_call = pvclock_gtod_notify,
};

int kvm_arch_init(void *opaque)
{
    pvclock_gtod_register_notifier(&pvclock_gtod_notifier); // 将pvclock_gtod_notifier注册为一个时间更新listener,每当host更新时间, 就会调用pvclock_gtod_notifier进而调用pvclock_gtod_notify
}

即当host更新时间,且kvm发现guest的clocksource从TSC变为非TSC时,发出KVM_REQ_MASTERCLOCK_UPDATE请求.

4.在kvm_arch_hardware_enable中,发现guest tsc发生了倒退,那么向所有vcpu发出KVM_REQ_MASTERCLOCK_UPDATE请求.

KVM_REQ_GLOBAL_CLOCK_UPDATE

  1. 在kvmclock驱动初始化时,kvmclock_init()中的kvm_register_clock触发wrmsr进而调用kvm_set_msr_common写MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW, 发出KVM_REQ_GLOBAL_CLOCK_UPDATE请求

  2. 在做从vcpu到pcpu(物理cpu)的迁移时,如果guest的tsc不一致,则需要发KVM_REQ_GLOBAL_CLOCK_UPDATE请求.

KVM_REQ_CLOCK_UPDATE

  1. kvm_gen_update_masterclock中,对所有vcpu发出KVM_REQ_CLOCK_UPDATE请求.而kvm_gen_update_masterclock为KVM_REQ_MASTERCLOCK_UPDATE请求的handler.

  2. 在kvmclock_update_fn函数中对所有vcpu发出KVM_REQ_CLOCK_UPDATE请求,kvmclock_update_fn的调用顺序为:

    kvm_arch_init_vm()
    {
        // 初始化延时作业, 将kvmclock_update_fn注册为kvm->arch.kvmclock_update_work的回调函数
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
        // 初始化延时作业,将kvmclock_sync_fn注册为kvm->arch.kvmclock_sync_work的回调函数
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
    }
    
    static void kvmclock_sync_fn(struct work_struct *work)
    {
        // 立即调用kvmclock_update_work->kvmclock_update_fn
        schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); 
        schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
    					KVMCLOCK_SYNC_PERIOD); // 300 s后重新调用kvmclock_sync_work->kvmclock_sync_fn
    }
    
    static void kvmclock_update_fn(struct work_struct *work)
    {
      	kvm_for_each_vcpu(i, vcpu, kvm) {
    		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); // kvm_guest_time_update()
    		kvm_vcpu_kick(vcpu); // kick a vcpu to sleep. or make a guest mode vcpu into host kernel mode.
    	}  
    }
    

    即在kvmclock的同步函数中定义了立即作业(更新kvmclock),和延时作业(同步kvmclock).也就是说,kvm第一次调用同步kvmclock函数后,每300s更新和同步一次kvmclock,每次更新kvmclock时都发出KVM_REQ_CLOCK_UPDATE请求.

  3. kvm_gen_kvmclock_update中,对当前vcpu发出KVM_REQ_CLOCK_UPDATE请求,100ms后调用更新kvmclock函数kvmclock_update_fn,后者对所有vcpu发出KVM_REQ_CLOCK_UPDATE请求.kvm_gen_kvmclock_update是KVM_REQ_GLOBAL_CLOCK_UPDATE请求的handler.

    static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
    {
    	struct kvm *kvm = v->kvm;
    
    	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); // 立即发送KVM_REQ_CLOCK_UPDATE请求
    	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
    					KVMCLOCK_UPDATE_DELAY); // 100ms后触发kvmclock_update_fn
    }
    
  4. kvm_arch_vcpu_load中,如果检测到了外部tsc_offset_adjustment,就发出KVM_REQ_CLOCK_UPDATE请求.即在切换到特定vcpu时,做检测并决定是否发出KVM_REQ_CLOCK_UPDATE请求.

  5. kvm_set_guest_paused中,会发出KVM_REQ_CLOCK_UPDATE请求,kvm_set_guest_paused告诉guest kernel,该guest kernel已经被kvm停止了.即在guest kernel pause时,发出KVM_REQ_CLOCK_UPDATE请求.

  6. 在qemu发出KVM_SET_CLOCK的ioctl时,向所有vcpu发出KVM_REQ_CLOCK_UPDATE请求.qemu设置时钟时,更新guest时钟是理所应当的事情.

  7. 在__kvmclock_cpufreq_notifier中,对所有vcpu发出了KVM_REQ_CLOCK_UPDATE.因为该函数为cpu频率变化时的回调函数,当host cpu频率变化时,应该重新设置guest的时间.

  8. 在vmexit时,如果guest的tsc总是追上host的tsc,说明guest的tsc频率高于host的tsc频率,需要重新校准guest的时间.因此向当前vcpu发出KVM_REQ_CLOCK_UPDATE.

  9. kvm_arch_hardware_enable,如果host tsc不稳定,就对所有vcpu发出KVM_REQ_CLOCK_UPDATE请求.而kvm_arch_hardware_enable的调用路径为:

    kvm_arch_hardware_enable => hardware_enable_nolock => kvm_starting_cpu

    ​ => kvm_resume

    也就是说,在kvm启动vcpu和恢复vcpu的运行时,都需要发出KVM_REQ_CLOCK_UPDATE以调整时间.

三大请求的处理

在确定了各更新时间的请求的triger点之后,接下来看一下这些请求的handler究竟针对请求做了哪些处理.

3种请求均在vcpu_enter_guest(),即进入non-root之前做处理.

KVM_REQ_MASTERCLOCK_UPDATE

static void kvm_gen_update_masterclock(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
	int i;
	struct kvm_vcpu *vcpu;
	struct kvm_arch *ka = &kvm->arch;

	spin_lock(&ka->pvclock_gtod_sync_lock);
	kvm_make_mclock_inprogress_request(kvm); // 发出KVM_REQ_MCLOCK_INPROGRESS请求,让所有vcpu无法进入guest
	/* no guest entries from this point */
	pvclock_update_vm_gtod_copy(kvm);//确认guest能否使用master_clock(用于vcpu之间的时间同步)

	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); // 向所有vcpu发出KVM_REQ_CLOCK_UPDATE请求

	/* guest entries allowed */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); // 清除KVM_REQ_MCLOCK_INPROGRESS请求,让所有vcpu进入guest

	spin_unlock(&ka->pvclock_gtod_sync_lock);
#endif
}

static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
	struct kvm_arch *ka = &kvm->arch;
	int vclock_mode;
	bool host_tsc_clocksource, vcpus_matched;

	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
			atomic_read(&kvm->online_vcpus)); // 表示vcpus的tsc频率是否match

	/*
	 * If the host uses TSC clock, then passthrough TSC as stable
	 * to the guest.
	 */
	host_tsc_clocksource = kvm_get_time_and_clockread(
					&ka->master_kernel_ns,
					&ka->master_cycle_now); // 如果host使用tsc,host_tsc_clocksource为true
    																	  // master_kernel_ns为master中记录的host boot以来的时间
    																	 // master_cycle_now为master中记录的host的当前tsc取值

	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
				&& !ka->backwards_tsc_observed
				&& !ka->boot_vcpu_runs_old_kvmclock; // backwards_tsc_observed表示是否观察到tsc倒退现象
    																								 // boot_vcpu_runs_old_kvmclock表示kvmclock使用旧的MSR

	if (ka->use_master_clock)
		atomic_set(&kvm_guest_has_master_clock, 1); // 如果use_master_clock为1,就将kvm_guest_has_master_clock设为1

	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
					vcpus_matched);
#endif
}

可以看到,对于KVM_REQ_MASTERCLOCK_UPDATE请求,kvm做了两件事情,一件事情是确认guest能否使用master_clock(用于vcpu之间的时间同步),另一件事情是对所有vcpu发出了更基本的请求,即KVM_REQ_CLOCK_UPDATE请求(在KVM_REQ_CLOCK_UPDATE的处理中说明).

KVM_REQ_GLOBAL_CLOCK_UPDATE

static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
{
	struct kvm *kvm = v->kvm;

	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); // 立即发送KVM_REQ_CLOCK_UPDATE请求
	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
					KVMCLOCK_UPDATE_DELAY); // 100ms后触发kvmclock_update_fn
}

对于KVM_REQ_GLOBAL_CLOCK_UPDATE请求, kvm首先对当前vcpu发送了更基本请求,即KVM_REQ_CLOCK_UPDATE请求,在发出请求后100ms,调用kvmclock_update_fn,kvmclock_update_fn的作用是对所有vcpu发出KVM_REQ_CLOCK_UPDATE请求.

也就是说,KVM_REQ_GLOBAL_CLOCK_UPDATE的处理为:

  1. 向当前vcpu发送KVM_REQ_CLOCK_UPDATE请求
  2. 向所有vcpu发送KVM_REQ_CLOCK_UPDATE请求,并kick所有vcpu.

KVM_REQ_CLOCK_UPDATE

从上面的两种请求的处理可以看到,上面两种请求都以来基础请求KVM_REQ_CLOCK_UPDATE,因此KVM_REQ_CLOCK_UPDATE的处理非常重要.

static int kvm_guest_time_update(struct kvm_vcpu *v)
{
	unsigned long flags, tgt_tsc_khz;
	struct kvm_vcpu_arch *vcpu = &v->arch;
	struct kvm_arch *ka = &v->kvm->arch;
	s64 kernel_ns;
	u64 tsc_timestamp, host_tsc;
	u8 pvclock_flags;
	bool use_master_clock;

	kernel_ns = 0;
	host_tsc = 0;

	/*
	 * If the host uses TSC clock, then passthrough TSC as stable
	 * to the guest.
	 */
	spin_lock(&ka->pvclock_gtod_sync_lock);
	use_master_clock = ka->use_master_clock;
	if (use_master_clock) { // 如果host使用tsc clock,直接将tsc传递给guest即可
		host_tsc = ka->master_cycle_now; // 将master的cycle_now记为host_tsc (tsc数)
		kernel_ns = ka->master_kernel_ns; // 将master的kernel_ns记为kernel_ns (master的kernel 纳秒数) host boot以来的时间
	}
	spin_unlock(&ka->pvclock_gtod_sync_lock);

	/* Keep irq disabled to prevent changes to the clock */
	local_irq_save(flags);
	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); // 读取当前vcpu的tsc值
	if (unlikely(tgt_tsc_khz == 0)) { // 如果当前vcpu的tsc值无效,则发出KVM_REQ_CLOCK_UPDATE请求
		local_irq_restore(flags);
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
		return 1;
	}
	if (!use_master_clock) {//如果host不使用tsc clock,
		host_tsc = rdtsc(); // 通过手动读取tsc的值获得host的tsc值
		kernel_ns = ktime_get_boottime_ns(); // 获得host kernel boot以来的时间
	}

	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); // 将host tsc的值通过scale和offset得到当前时间戳

	/*
	 * We may have to catch up the TSC to match elapsed wall clock
	 * time for two reasons, even if kvmclock is used.
	 *   1) CPU could have been running below the maximum TSC rate
	 *   2) Broken TSC compensation resets the base at each VCPU
	 *      entry to avoid unknown leaps of TSC even when running
	 *      again on the same CPU.  This may cause apparent elapsed
	 *      time to disappear, and the guest to stand still or run
	 *	very slowly.
	 */
	if (vcpu->tsc_catchup) {
		u64 tsc = compute_guest_tsc(v, kernel_ns); // 通过host boot以来的时间,计算理论guest tsc
		if (tsc > tsc_timestamp) { // 如果理论guest tsc比经过调整的host tsc大
			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);//将offset调整为原offset+tsc-tsc_timestap
			tsc_timestamp = tsc; // 将tsc赋值给当前时间戳,使其保持一致
		}
	}

	local_irq_restore(flags);

	/* With all the info we got, fill in the values */

	if (kvm_has_tsc_control) // 如果支持tsc scaling
		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); // 将当前vcpu的tsc值做scale

	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { // 如果当前vcpu的tsc与vcpu记录的硬件tsc值不同,则调整tsc scale值
		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
				   &vcpu->hv_clock.tsc_shift,
				   &vcpu->hv_clock.tsc_to_system_mul);
		vcpu->hw_tsc_khz = tgt_tsc_khz;
	}

    // 向pvti类型的hv_clock赋值
	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
	vcpu->last_guest_tsc = tsc_timestamp;

	/* If the host uses TSC clocksource, then it is stable */
	pvclock_flags = 0;
	if (use_master_clock)
		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;

	vcpu->hv_clock.flags = pvclock_flags;

	if (vcpu->pv_time_enabled) // 如果使用半虚拟化(kvmclock)
		kvm_setup_pvclock_page(v); // 就将pv_clock的内容拷贝到pv_clock
	if (v == kvm_get_vcpu(v->kvm, 0))
		kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
	return 0;
}

kvm_guest_time_update()做了以下几件事情:

  1. 获取host的tsc value和host kernel boot以来的ns数
  2. 读取当前vcpu的tsc value
  3. 经过一系列的校准,将最终时间赋值给vcpu->hv_clock
  4. 如果vcpu使能了半虚拟化,就调用kvm_setup_pvclock_page

来看kvm_setup_pvclock_page.

static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
{
    ...
        	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
				&vcpu->hv_clock,
				sizeof(vcpu->hv_clock)); // 将hv_clock的内容赋值到pv_time中去
    ...
}

这里的pv_time就是之前我们提到的每个vcpu都有1个的pvti结构.将将hv_clock的内容赋值到pv_time中去,即将最新时间更新到vcpu的pvti结构中去.

system time就这样在pvti结构中被更新了.

host对system time的写入

host对system time的写入一般来说有2种情况,同步写入和异步写入.

同步写入指的是周期性更新guest中system time的值,以和host时间保持一致.

异步写入指的是在特殊事件发生时(如guest suspend时),更新guest中system time的值,防止guest中的时间出错.

host对system time的同步写入

kvm通过pvclock_gtod_register_notifier向timekeeper层注册了一个回调pvclock_gtod_notify(在上面的三大请求trigger点的介绍中有提到),每当Host Kernel时钟更新时(即timekeeping_update被调用时),就会调用pvclock_gtod_notify.

static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
			       void *priv)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	struct timekeeper *tk = priv;

	update_pvclock_gtod(tk); // 更新pvclock的内容

	/* disable master clock if host does not trust, or does not
	 * use, TSC based clocksource.
	 */
	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
	    atomic_read(&kvm_guest_has_master_clock) != 0) // clocksource从TSC变为了非TSC时
		queue_work(system_long_wq, &pvclock_gtod_work);

	return 0;
}

static struct pvclock_gtod_data pvclock_gtod_data;

static void update_pvclock_gtod(struct timekeeper *tk)
{
	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
	u64 boot_ns;

	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));

	write_seqcount_begin(&vdata->seq);

	/* copy pvclock gtod data */
	vdata->clock.vclock_mode	= tk->tkr_mono.clock->archdata.vclock_mode;
	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last; //host时间更新时的clocksource的counter读数
	vdata->clock.mask		= tk->tkr_mono.mask;
	vdata->clock.mult		= tk->tkr_mono.mult;
	vdata->clock.shift		= tk->tkr_mono.shift;

	vdata->boot_ns			= boot_ns; // host时间更新时的s部分,以ns记
	vdata->nsec_base		= tk->tkr_mono.xtime_nsec; // host时间更新时的wallclock的ns部分

	vdata->wall_time_sec            = tk->xtime_sec;// host时间更新时的wallclock的s部分

	write_seqcount_end(&vdata->seq);
}

pvclock_gtod_notify()完成了2件事情:

  1. 调用update_pvclock_gtod更新了pvclock_gtod_data
  2. 检测host的clocksource是否变为了非tsc,如果变了则将作业pvclock_gtod_work入队
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
static void pvclock_gtod_update_fn(struct work_struct *work)
{
	struct kvm *kvm;

	struct kvm_vcpu *vcpu;
	int i;

	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
	atomic_set(&kvm_guest_has_master_clock, 0);
	mutex_unlock(&kvm_lock);
}

可以看到pvclock_gtod_work的实际函数pvclock_gtod_update_fn的作用为:

向所有vcpu发出KVM_REQ_MASTERCLOCK_UPDATE,而后者经过层层调用,更新每个vcpu的pvti结构中的时间数据.

也就是说,每当Host Kernel时钟更新时,如果使用master_clock,kvm会更新每个vcpu的pvti时间.内核的代码中使用tk_clock_read读取clocksource当前counter,但是没有发现上下文中有对读取时间的cpu的限制.

host对system time的异步写入

host对system time的异步写入通过qemu实现,利用kvm_vm_ioctl(KVM_SET_CLOCK),与kvm发生交互.

而kvm中,KVM_SET_CLOCK的ioctl的定义如下:

case KVM_SET_CLOCK: {
		struct kvm_clock_data user_ns;
		u64 now_ns;

		r = -EFAULT;
		if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
			goto out;

		r = -EINVAL;
		if (user_ns.flags)
			goto out;

		r = 0;
		/*
		 * TODO: userspace has to take care of races with VCPU_RUN, so
		 * kvm_gen_update_masterclock() can be cut down to locked
		 * pvclock_update_vm_gtod_copy().
		 */
		kvm_gen_update_masterclock(kvm); // 确认guest能否使用masterclock,并向所有vcpu发出时间更新请求
		now_ns = get_kvmclock_ns(kvm); // 读取当前cpu的时间
		kvm->arch.kvmclock_offset += user_ns.clock - now_ns; // 确认当前cpu和qemu传入的cpu时间的offset
		kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); // 利用新的offset对所有vcpu的时间进行更新
		break;
	}

可以看到,kvm_vm_ioctl(KVM_SET_CLOCK)做了以下几件事情:

  1. 确认guest能否使用masterclock,并向所有vcpu发出时间更新请求
  2. 读取当前cpu的时间(根据是否使用masterclock,读取时间的方式不同)
  3. 计算当前cpu和qemu传入的cpu时间的offset
  4. 利用新的offset对所有vcpu的时间进行更新

host对system time的异步写入依赖qemu和kvm的交互kvm_vm_ioctl(KVM_SET_CLOCK).

masterclock: 由于我们的kvmclock依赖于Host Boot Time和Host TSC两个量,即使Host TSC同步且Guest TSC同步,在pCPU0和pCPU1分别取两者,前者的差值和后者的差值也可能不相等,并且谁大谁小都有可能,从而可能违反kvmclock的单调性。因此,我们通过只使用一份Master Copy,即Master Clock,来解决这个问题。

---update on 6.1 2020-----

由于对pvclock_gtod_data和各vcpu的pvti结构的更新之间的关系不太清楚,特此研究记录.

与各vcpu的pvti结构对应的host虚拟地址的申请

在kvmclock驱动初始化时,kvmclock_init()中的kvm_register_clock触发wrmsr进而调用kvm_set_msr_common写MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW.

在kvm_set_msr_common()中最关键的一句话为:

if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
		     &vcpu->arch.pv_time, data & ~1ULL,
		     sizeof(struct pvclock_vcpu_time_info))) // 计算出写入gpa为data时,对应的hva.
			 																					// 将hva,gpa,内存区域长度,对应的memslot地址,该memslot对应的generation都存入arch.pv_time中
			vcpu->arch.pv_time_enabled = false;
		else
			vcpu->arch.pv_time_enabled = true;

kvm_gfn_to_hva_cache_init的函数原型为:

int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa, unsigned long len);

其中,ghc为gfn_to_hva_cache结构体类型,意义为将guest frame number转化成host virtual address的cache.定义为:

struct gfn_to_hva_cache {
	u64 generation; // cache的代数
	gpa_t gpa; // guest物理地址
	unsigned long hva; // host虚拟地址
	unsigned long len; //该cache的大小
	struct kvm_memory_slot *memslot; // 该cache对应的kvm memslot地址
};

kvm_gfn_to_hva_cache_init()函数的实现为:

int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa, unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm); // slots为整个kvm memslots的地址
	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
}

static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
				       struct gfn_to_hva_cache *ghc,
				       gpa_t gpa, unsigned long len)
{
	int offset = offset_in_page(gpa); // 页内offset
    /* 虽然gpa只是一个地址,但由于从gpa开始,需要len长度的空间,所以存在起始gfn和终点gfn */
	gfn_t start_gfn = gpa >> PAGE_SHIFT; // gpa对应的起始gfn(guest frame number)
	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; // gpa对应的终点gfn
	gfn_t nr_pages_needed = end_gfn - start_gfn + 1; // 所需页的数量
	gfn_t nr_pages_avail;
	int r = start_gfn <= end_gfn ? 0 : -EINVAL; // r判断起始gfn和终点gfn的有效性

    /*  将guest物理地址,memslot的代数,所需存储空间的长度,赋值给gfn_to_hva_cache结构 */
	ghc->gpa = gpa;
	ghc->generation = slots->generation;
	ghc->len = len;
    
	ghc->hva = KVM_HVA_ERR_BAD; // 先将host虚拟地址赋值为无效
    
    
    //-----------------------------下面开始为gpa找对应的hva,存储到赋值给gfn_to_hva_cache结构中去

	/*
	 * If the requested region crosses two memslots, we still
	 * verify that the entire region is valid here.
	 */
	while (!r && start_gfn <= end_gfn) { // 如果请求的空间大小横跨2个memslot,需要确定请求空间的有效性
		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
					   &nr_pages_avail);
		if (kvm_is_error_hva(ghc->hva))
			r = -EFAULT;
		start_gfn += nr_pages_avail;
	}

	/* Use the slow path for cross page reads and writes. */
	if (!r && nr_pages_needed == 1)
		ghc->hva += offset; 
	else
		ghc->memslot = NULL;

	return r;
}

从上面的分析可以看出,以下这段语句的作用为:从kvm_memslots中申请大小为pvclock_vcpu_time_info结构大小的缓存空间,该缓存空间缓存的是物理地址为data中所存地址中指向的内容,该缓存空间对应的host虚拟地址为hva.

kvm_gfn_to_hva_cache_init(vcpu->kvm,
		     &vcpu->arch.pv_time, data & ~1ULL,
		     sizeof(struct pvclock_vcpu_time_info))

在kvmclock驱动初始化写MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW时导致的kvm_set_msr_common()中,data就是每个vcpu都有的pvti结构.让pvti有了一个host虚拟地址.

向各vcpu的pvti结构对应的host虚拟地址写入时间

先不论时间从哪里来,肯定会向pvti结构写入时间.

static int kvm_guest_time_update(struct kvm_vcpu *v)
{
    ...
    if (vcpu->pv_time_enabled) // 在kvm_set_msr_common中建立pvti的gpa对应的hva时就使能了
		kvm_setup_pvclock_page(v);
}

static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
{
    ...
    	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
				&vcpu->hv_clock,
				sizeof(vcpu->hv_clock)); // 将hv_clock(pvti结构体类型)写到vcpu的gfn_to_hva_cache结构的hva去,即写到了vcpu的pvti中去
}

int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			   void *data, unsigned long len)
{
	return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
}

int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
				  void *data, unsigned int offset,
				  unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	int r;
	gpa_t gpa = ghc->gpa + offset;

	BUG_ON(len + offset > ghc->len);

	if (slots->generation != ghc->generation)
		__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);

	if (unlikely(!ghc->memslot))
		return kvm_write_guest(kvm, gpa, data, len);

	if (kvm_is_error_hva(ghc->hva))
		return -EFAULT;

	r = __copy_to_user((void __user *)ghc->hva + offset, data, len); // 将数据写入gfn_to_hva_cache的hva地址中去
	if (r)
		return -EFAULT;
	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);

	return 0;
}

综上,下面这段语句的意义为:

将vcpu的hv_clock(pvti结构类型)数据写入属于该vcpu的pvti对应的host虚拟地址中去.

kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
				&vcpu->hv_clock,
				sizeof(vcpu->hv_clock)); 

pv_clock_gtod_data

pv_clock_gtod_data是一个全局变量, kvm会在每个host的tick时更新该变量的内容.

struct pvclock_gtod_data {
	seqcount_t	seq;

	struct { /* extract of a clocksource struct */
		int vclock_mode;
		u64	cycle_last;
		u64	mask;
		u32	mult;
		u32	shift;
	} clock;

	u64		boot_ns;
	u64		nsec_base;
	u64		wall_time_sec;
};

如何更新的呢?

kvm通过pvclock_gtod_register_notifier向timekeeper层注册了一个回调pvclock_gtod_notify,每当Host Kernel时钟更新时(即timekeeping_update被调用时),就会调用pvclock_gtod_notify,进而调用update_pvclock_gtod更新pvclock_gtod_data的值.

static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
			       void *priv)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	struct timekeeper *tk = priv;

	update_pvclock_gtod(tk); // 更新pvclock_gtod_data的内容

	/* disable master clock if host does not trust, or does not
	 * use, TSC based clocksource.
	 */
	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
	    atomic_read(&kvm_guest_has_master_clock) != 0) // clocksource从TSC变为了非TSC时
		queue_work(system_long_wq, &pvclock_gtod_work);

	return 0;
}

static struct pvclock_gtod_data pvclock_gtod_data;

static void update_pvclock_gtod(struct timekeeper *tk)
{
	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
	u64 boot_ns;

	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));

	write_seqcount_begin(&vdata->seq);

	/* copy pvclock gtod data */
	vdata->clock.vclock_mode	= tk->tkr_mono.clock->archdata.vclock_mode; // 时钟源类型
	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last; //host时间更新时的clocksource的counter读数
	vdata->clock.mask		= tk->tkr_mono.mask;
	vdata->clock.mult		= tk->tkr_mono.mult;
	vdata->clock.shift		= tk->tkr_mono.shift;

	vdata			= boot_ns; // host时间更新时,用来获得当前时间的基础时间,以ns记
	vdata->nsec_base		= tk->tkr_mono.xtime_nsec; // host时间更新时的wallclock的ns部分

	vdata->wall_time_sec            = tk->xtime_sec;// host时间更新时的wallclock的s部分

	write_seqcount_end(&vdata->seq);
}

pvti结构的数据来源vcpu->hv_clock

目前已知:

  1. kvm在kvm_guest_time_update()中更新各vcpu的pvti结构时,是将当前时间赋值给该函数中的hv_clock,然后将hv_clock的内容写入到pvti结构中去.
  2. kvm会在host的每个tick,即每次host更新时间时,将时间相关变量更新到全局变量pvclock_gtod_data中

由此推断,hv_clock肯定跟pvclock_gtod_data有一定的关系.下面寻找他们之间的联系.

首先,在kvm_guest_time_update()中会检查是否使用master_clock即use_master_clock的值,根据该bool值的取值,当前时间的获取方式也不同.

kvm中何时决定use_master_clock的值?暂时不做讨论,这里只需要知道use_master_clock为1时,kvm只使用一份host tsc和guest tsc,其它vcpu复制之.

use_master_clock为True

如果use_master_clock为真,则让:

host_tsc = ka->master_cycle_now;
kernel_ns = ka->master_kernel_ns;

那么 ka->master_cycle_now和ka->master_kernel_ns的意义是什么,何时被赋值的?

在pvclock_update_vm_gtod_copy()中,有对master_cycle_now和master_kernel_ns的赋值,以kvm_get_time_and_clockread()形式呈现,kvm_get_time_and_clockread通过do_monotonic_boot()和pvclock_gtod_data中的值来获得master_kernel_ns的值,意义为自host boot 以来的ns数.

而do_monotonic_boot通过vgettsc=>read_tsc,获得master_cycle_now的值,read_tsc通过对比rdtsc指令和pv_clock_gtod_data->clock.cycle_last的返回值,确定tsc value是否后退,如果后退,则返回pv_clock_gtod_data->clock.cycle_last的值,即上一次读取tsc时的值,如果没有后退,则返回rdtsc指令的结果.总之,master_cycle_now代表当前PCPU上的没有后退的tsc值.

// kvm更新master_kernel_ns和master_cycle_now的函数
static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
    ...
        host_tsc_clocksource = kvm_get_time_and_clockread(
					&ka->master_kernel_ns,
					&ka->master_cycle_now);
}

static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
{
    ...
        return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
						      tsc_timestamp)); // 用do_monotonic_boot获得master_kernel_ns,master_cycle_now.
}

static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	unsigned long seq;
	int mode;
	u64 ns;

	do {
		seq = read_seqcount_begin(&gtod->seq);
		ns = gtod->nsec_base;
		ns += vgettsc(tsc_timestamp, &mode);
		ns >>= gtod->clock.shift;
		ns += gtod->boot_ns;
	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
	*t = ns;

	return mode;
}

// 因为我们的host基本都用tsc,所以走的case是VCLOCK_TSC
static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
{
    ...
        case VCLOCK_TSC:
		*mode = VCLOCK_TSC;
		*tsc_timestamp = read_tsc();
		v = (*tsc_timestamp - gtod->clock.cycle_last) &
			gtod->clock.mask;
		break;
}

static u64 read_tsc(void)
{
	u64 ret = (u64)rdtsc_ordered();
	u64 last = pvclock_gtod_data.clock.cycle_last;

	if (likely(ret >= last))
		return ret;

	/*
	 * GCC likes to generate cmov here, but this branch is extremely
	 * predictable (it's just a function of time and the likely is
	 * very likely) and there's a data dependence, so force GCC
	 * to generate a branch instead.  I don't barrier() because
	 * we don't actually need a barrier, and if this function
	 * ever gets inlined it will generate worse code.
	 */
	asm volatile ("");
	return last;
}

pvclock_update_vm_gtod_copy()只在2个kvm代码的2个地方有引用,1是kvm_arch_init_vm(),2是kvm_gen_update_masterclock().后者在2个地方有引用,1是kvm_arch_vm_ioctl(KVM_SET_CLOCK),2是在每次vcpu_enter_guest()时检查到KVM_REQ_MASTERCLOCK_UPDATE请求时.

也就是说,ka->master_kernel_ns和ka->master_cycle_now会在kvm运行的3个地方更新:

  1. 初始化虚拟机(guest)时
  2. 每次进入non-root mode检测到KVM_REQ_MASTERCLOCK_UPDATE请求时
  3. 在userspace(如qemu)主动发起更新时间的请求时

结论: 如果使用master_clock, host_tsc表示当前PCPU上的无回退的TSC值, kenel_ns表示自host启动以来的ns数.也只有use_master_clock为真时,kvm维护的pvclock_gtod_data的内容才会起作用.

use_master_clock为False

如果use_master_clock为假,则让:

host_tsc = rdtsc();
kernel_ns = ktime_get_boottime_ns();

rdtsc()会直接读取当前PCPU的TSC值

ktime_get_boottime_ns()获取自host boot以来的ns数(利用的是host kernel中的timekeeping结构).

结论: 如果不使用master_clock, host_tsc表示当前PCPU(不保证是否回退)的TSC值,kernel_ns表示自host启动以来的ns数.

TSC校准系数的调整及pvti cache的最终赋值

在获得host_tsc和kernel_ns后,利用kvm_read_l1_tsc获得arch层面的tsc_offset和tsc_scale, 并利用这二者对host_tsc进行调整,赋值给tsc_timestamp,那么tsc_timestamp的意义就非常明显了,即"本次计时的TSC时间戳".

tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);

之后将根据vcpu->tsc_catchup的取值,决定是否对arch层面的tsc_offset和tsc_scale进行调整.如果需要调整,根据上面的host_tsc计算此时tsc的理论值是多少,如果理论值比读到的tsc值大,说明guest的tsc_offset和tsc_scale已经无法正确调整host_tsc的值了,需要修正.进而利用理论tsc和当前tsc进行的差值修正guest的tsc_offset和tsc_scale,并将理论tsc值赋值给本次计时的TSC时间戳.

if (vcpu->tsc_catchup) {
    u64 tsc = compute_guest_tsc(v, kernel_ns);
    if (tsc > tsc_timestamp) {
        adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
        tsc_timestamp = tsc;
    }
}

接下来做了:

  1. 如果支持TSC_SCALLING Feature, 就利用该feature调整本地vcpu的目标TSC频率.
if (kvm_has_tsc_control)
		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
  1. 如果本地vcpu的tsc频率与目标tsc频率不同,则重新调整hv_clock的shift和multi系数,以确保本地vcpu的tsc频率与目标tsc频率相等.
	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
				   &vcpu->hv_clock.tsc_shift,
				   &vcpu->hv_clock.tsc_to_system_mul);
		vcpu->hw_tsc_khz = tgt_tsc_khz;
	}

之后将本次计时的TSC时间戳,正确的system time(在实际使用时要加上wallclock时间才是标准时间)赋值给hv_clock结构,并将本次计时的时间戳保存在该vcpu的last_guest_tsc变量中.

vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;

然后在kvm_setup_pvclock_page()中将hv_clock中的值更新到pv_time cache中去,该cache在kvmclock_init => WRITE MSR => handle_wrmsr => kvm_set_msr_common => kvm_gfn_to_hva_cache_init 流程中被分配空间及host虚拟地址,每个vcpu都有一个pv_time cache,其中的gpa指向每个vcpu的pvti结构.

if (vcpu->pv_time_enabled)
		kvm_setup_pvclock_page(v); // 将hv_clock中的值赋值到pv_time cache中

以上, 对pvclock_gtod_data和各vcpu的pvti结构的更新之间的关系梳理基本完成.

基本思路遵循:

  1. 建立pvti结构的cache
  2. 在每次kvm更新时间时更新cache中的内容
posted @ 2021-02-24 10:51  EwanHai  阅读(1487)  评论(0编辑  收藏  举报