随笔-处理器微架构-测量cache load latency
measure cache hit latency
主要设计说明:
t1=rdtscp
load data from cache #measure the cost cycles of this instruction
t2=rdtscp
- 计算耗时即t2-t1,其中包含rdtscp自身执行所需的开销,所以需要单独测量出该开销(t_rdtscp)并将其减去
- 需要使用lfence,避免load指令与rdtscp并行执行
- 需要使用rdtscp:rdtscp会等到它之前的所有指令都执行完成后才读取时间戳
m1_rdtscp.c
view
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
/*
 * qsort() comparator that orders two uint64_t values ascending.
 *
 * a, b: pointers to the two uint64_t elements being compared.
 * Returns a negative value, zero, or a positive value when *a is less than,
 * equal to, or greater than *b.
 *
 * The result is built from comparisons rather than from the 64-bit
 * difference: narrowing a uint64_t difference to int drops the high bits, so
 * values that differ by a multiple of 2^32 would compare as equal and large
 * differences could come out with the wrong sign.
 */
static int compare(const void *a, const void *b) {
    const uint64_t lhs = *(const uint64_t *)a;
    const uint64_t rhs = *(const uint64_t *)b;

    return (lhs > rhs) - (lhs < rhs);
}
/*
 * Sort the samples in place with qsort()/compare() and report three summary
 * values taken from the sorted array.
 *
 * arr:    samples; reordered by this call.
 * n:      number of samples in arr.
 * min:    receives the first element of the sorted array.
 * max:    receives the last element of the sorted array.
 * median: receives the middle element, or the integer mean of the two middle
 *         elements when n is even.
 */
static void find_min_max_median(uint64_t *arr, size_t n, uint64_t *min, uint64_t *max, uint64_t *median) {
    const size_t mid = n / 2;

    qsort(arr, n, sizeof(uint64_t), compare);

    *min = arr[0];
    *max = arr[n - 1];

    if (n % 2 == 0) {
        *median = (arr[mid - 1] + arr[mid]) / 2;
    } else {
        *median = arr[mid];
    }
}
/*
 * Measure the cost of the timing harness itself.
 *
 * Runs rdtscp / lfence / nop / rdtscp and returns the difference between the
 * two timestamp readings. This is the baseline that has to be subtracted
 * from measurements that put real work between the two reads.
 *
 * Returns: TSC ticks elapsed between the first and the second rdtscp.
 */
uint64_t measure_rdtsc_latency(void) {
    uint64_t start_hi, start_lo, end_hi, end_lo;

    // rdtscp returns the TSC in EDX:EAX; in x86-64 mode it also clears the
    // upper 32 bits of RDX and RAX. It additionally writes ECX, which is why
    // rcx is listed as clobbered even though the value is not used.
    // __asm__/__volatile__ are used instead of asm/volatile so the file also
    // builds in strict ISO modes such as -std=c11.
    __asm__ __volatile__ (
        "rdtscp\n\t"
        "mov %%rdx, %[start_hi]\n\t"
        "mov %%rax, %[start_lo]\n\t"
        "lfence\n\t"                 // keep the timed region behind the first read
        "nop\n\t"                    // placeholder for the instruction under test
        "rdtscp\n\t"
        "mov %%rdx, %[end_hi]\n\t"
        "mov %%rax, %[end_lo]\n\t"
        : [start_hi] "=r" (start_hi), [start_lo] "=r" (start_lo),
          [end_hi] "=r" (end_hi), [end_lo] "=r" (end_lo)
        :
        // "memory" stops the compiler from moving memory accesses across the
        // timed region.
        : "rax", "rdx", "rcx", "memory"
    );

    return ((end_hi << 32) | end_lo) - ((start_hi << 32) | start_lo);
}
// Number of baseline samples collected per run.
#define M_COUNT 1000

/*
 * Collect M_COUNT samples of measure_rdtsc_latency() and print their
 * minimum, maximum and median.
 */
int main() {
    uint64_t samples[M_COUNT] = {0};
    uint64_t lowest = 0, highest = 0, middle = 0;

    for (int k = 0; k != M_COUNT; k++) {
        samples[k] = measure_rdtsc_latency();
    }

    find_min_max_median(samples, M_COUNT, &lowest, &highest, &middle);
    printf("rdtsc lat: min: %lu, max: %lu, median: %lu\n", lowest, highest, middle);

    return 0;
}
$ gcc -O0 m1_rdtscp.c -o m1_rdtscp
$ while true; do sleep 0.5; sudo taskset -c 0 ./m1_rdtscp; done
rdtsc lat: min: 39, max: 47, median: 43
rdtsc lat: min: 39, max: 45, median: 42
rdtsc lat: min: 41, max: 47, median: 44
...
m1.c
点击查看代码
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
/*
 * qsort() comparator that orders two uint64_t values ascending.
 *
 * a, b: pointers to the two uint64_t elements being compared.
 * Returns a negative value, zero, or a positive value when *a is less than,
 * equal to, or greater than *b.
 *
 * Subtracting the two values and returning the result as int would truncate
 * the 64-bit difference (a difference of exactly 2^32 becomes 0), so the
 * ordering is computed from explicit comparisons instead.
 */
static int compare(const void *a, const void *b) {
    const uint64_t lhs = *(const uint64_t *)a;
    const uint64_t rhs = *(const uint64_t *)b;

    return (lhs > rhs) - (lhs < rhs);
}
/*
 * Summarise a set of samples: sort them in place with qsort()/compare(),
 * then read the extremes and the median out of the sorted array.
 *
 * arr:    samples; reordered by this call.
 * n:      number of samples in arr.
 * min:    receives arr[0] after sorting.
 * max:    receives arr[n - 1] after sorting.
 * median: receives the middle element for odd n, or the integer mean of the
 *         two middle elements for even n.
 */
static void find_min_max_median(uint64_t *arr, size_t n, uint64_t *min, uint64_t *max, uint64_t *median) {
    size_t upper_mid;

    qsort(arr, n, sizeof(uint64_t), compare);
    upper_mid = n / 2;

    *min = arr[0];
    *max = arr[n - 1];

    if (n % 2 != 0) {
        *median = arr[upper_mid];
        return;
    }
    *median = (arr[upper_mid - 1] + arr[upper_mid]) / 2;
}
/*
 * Time a single 8-byte load from `array`, bracketed by two rdtscp reads.
 *
 * array: address to load from; at least 8 bytes must be readable there.
 * Returns: TSC ticks elapsed between the two rdtscp reads. The value still
 *          includes the cost of the timing instructions themselves.
 */
static uint64_t measure_cache_load_lat(void *array) {
    uint64_t start_hi, start_lo, end_hi, end_lo;

    // rdtscp returns the TSC in EDX:EAX; in x86-64 mode it also clears the
    // upper 32 bits of RDX and RAX. It additionally writes ECX, hence the rcx
    // clobber.
    __asm__ __volatile__ (
        "rdtscp\n\t"
        "mov %%rdx, %[start_hi]\n\t"
        "mov %%rax, %[start_lo]\n\t"
        "lfence\n\t"                     // keep the load behind the first read
        // The loaded value is discarded: rdx is overwritten by the rdtscp
        // that follows, only the time the load takes matters.
        "mov (%[array]), %%rdx\n\t"
        "rdtscp\n\t"
        "mov %%rdx, %[end_hi]\n\t"
        "mov %%rax, %[end_lo]\n\t"
        // start_hi/start_lo are written before [array] is read, so they must
        // be early-clobber ("&"); otherwise the compiler is allowed to place
        // one of them in the register that holds the address.
        : [start_hi] "=&r" (start_hi), [start_lo] "=&r" (start_lo),
          [end_hi] "=r" (end_hi), [end_lo] "=r" (end_lo)
        : [array] "r" (array)
        // "memory": the asm reads memory the compiler cannot see through the
        // operands, and nothing may be moved across the timed region.
        : "rax", "rdx", "rcx", "memory"
    );

    return ((end_hi << 32) | end_lo) - ((start_hi << 32) | start_lo);
}
// Number of load-latency samples collected per run.
#define M_COUNT 5000

/*
 * Allocate one 4 KiB-aligned 4 KiB buffer, time M_COUNT loads from its first
 * word and print the minimum, maximum and median of the samples.
 *
 * Returns 0 on success, -1 if the buffer cannot be allocated.
 */
int main() {
    uint64_t rdtsc_lat[M_COUNT] = {0}, min = 0, max = 0, median = 0;
    void *array = NULL;
    int i;

    // posix_memalign() reports failure through its return value and does not
    // promise to store anything in the pointer, so the return value is what
    // has to be checked.
    if (posix_memalign(&array, 4096, 4096) != 0 || array == NULL) { // 4 KiB aligned, 4 KiB total
        printf("memory alloc fail\n");
        return -1;
    }

    for (i = 0; i < M_COUNT; ++i) {
        rdtsc_lat[i] = measure_cache_load_lat(array);
    }

    find_min_max_median(rdtsc_lat, M_COUNT, &min, &max, &median);
    printf("lat: min: %lu, max: %lu, median: %lu\n", min, max, median);

    free(array);
    return 0;
}
$ gcc -O0 m1.c -o m1
$ while true; do sleep 0.5; sudo taskset -c 0 ./m1; done
lat: min: 41, max: 2479, median: 45
lat: min: 43, max: 2397, median: 46
lat: min: 43, max: 2264, median: 46
lat: min: 41, max: 2501, median: 46
lat: min: 41, max: 2614, median: 44
^C
减去rdtscp自身的开销后(中位数约45 − 约43),一次命中cache的load大概耗时2~3个周期
m1_repeat.c
重复load动作,弱化其他因素带来的影响
m1_repeat.c
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
/*
 * qsort() comparator that orders two uint64_t values ascending.
 *
 * a, b: pointers to the two uint64_t elements being compared.
 * Returns a negative value, zero, or a positive value when *a is less than,
 * equal to, or greater than *b.
 *
 * The sign is computed from comparisons: returning the uint64_t difference
 * as int keeps only its low 32 bits, which yields 0 or the wrong sign once
 * the samples are far enough apart.
 */
static int compare(const void *a, const void *b) {
    const uint64_t lhs = *(const uint64_t *)a;
    const uint64_t rhs = *(const uint64_t *)b;

    return (lhs > rhs) - (lhs < rhs);
}
/*
 * Sort `arr` in place with qsort()/compare() and extract summary values.
 *
 * arr:    samples; reordered by this call.
 * n:      number of samples in arr.
 * min:    receives the first element of the sorted array.
 * max:    receives the last element of the sorted array.
 * median: receives the middle element (odd n) or the integer mean of the two
 *         middle elements (even n).
 */
static void find_min_max_median(uint64_t *arr, size_t n, uint64_t *min, uint64_t *max, uint64_t *median) {
    const size_t half = n / 2;
    const int even_count = (n % 2 == 0);

    qsort(arr, n, sizeof(uint64_t), compare);

    *min = arr[0];
    *max = arr[n - 1];
    *median = even_count ? (arr[half - 1] + arr[half]) / 2 : arr[half];
}
/*
 * Time `repeat` back-to-back 8-byte loads from `array`, bracketed by two
 * rdtscp reads.
 *
 * array:  address to load from; at least 8 bytes must be readable there.
 * repeat: number of loads to execute inside the timed region. The loop is
 *         count-down-to-zero and always runs its body once, so values below
 *         1 are treated as 1 instead of wrapping around.
 * Returns: TSC ticks elapsed between the two rdtscp reads, covering all
 *          iterations plus the cost of the timing instructions themselves.
 */
static uint64_t measure_cache_load_lat(void *array, int repeat) {
    uint64_t start_hi, start_lo, end_hi, end_lo;

    if (repeat < 1) {
        repeat = 1;
    }

    // rdtscp returns the TSC in EDX:EAX; in x86-64 mode it also clears the
    // upper 32 bits of RDX and RAX. It additionally writes ECX, hence the rcx
    // clobber.
    __asm__ __volatile__ (
        "rdtscp\n\t"
        "mov %%rdx, %[start_hi]\n\t"
        "mov %%rax, %[start_lo]\n\t"
        "lfence\n\t"                     // keep the loop behind the first read
        "1:\n\t"
        "mov (%[array]), %%rbx\n\t"      // loaded value is discarded
        "dec %[repeat]\n\t"
        "jnz 1b\n\t"
        "rdtscp\n\t"
        "mov %%rdx, %[end_hi]\n\t"
        "mov %%rax, %[end_lo]\n\t"
        // start_hi/start_lo are written before [array] and [repeat] are
        // read, so they are early-clobber ("&"). [repeat] is decremented by
        // the asm, so it must be a read-write ("+r") operand rather than an
        // input.
        : [start_hi] "=&r" (start_hi), [start_lo] "=&r" (start_lo),
          [end_hi] "=r" (end_hi), [end_lo] "=r" (end_lo),
          [repeat] "+r" (repeat)
        : [array] "r" (array)
        // "cc": dec changes the flags. "memory": the asm reads memory the
        // compiler cannot see through the operands, and nothing may be moved
        // across the timed region.
        : "rax", "rdx", "rcx", "rbx", "cc", "memory"
    );

    return ((end_hi << 32) | end_lo) - ((start_hi << 32) | start_lo);
}
// Number of samples collected per run.
#define M_COUNT 5000

/*
 * Allocate one 4 KiB-aligned 4 KiB buffer, take M_COUNT samples of 1000
 * repeated loads from its first word and print the minimum, maximum and
 * median of the samples.
 *
 * Returns 0 on success, -1 if the buffer cannot be allocated.
 */
int main() {
    uint64_t rdtsc_lat[M_COUNT] = {0}, min = 0, max = 0, median = 0;
    const int loads_per_sample = 1000;
    void *array = NULL;
    int i;

    // posix_memalign() reports failure through its return value and does not
    // promise to store anything in the pointer, so the return value is what
    // has to be checked.
    if (posix_memalign(&array, 4096, 4096) != 0 || array == NULL) { // 4 KiB aligned, 4 KiB total
        printf("memory alloc fail\n");
        return -1;
    }

    for (i = 0; i < M_COUNT; ++i) {
        rdtsc_lat[i] = measure_cache_load_lat(array, loads_per_sample);
    }

    find_min_max_median(rdtsc_lat, M_COUNT, &min, &max, &median);
    printf("lat: min: %lu, max: %lu, median: %lu\n", min, max, median);

    free(array);
    return 0;
}
$ gcc -O0 m1_repeat.c -o m1_r
$ while true; do sleep 0.5; sudo taskset -c 0 ./m1_r; done
lat: min: 1055, max: 18022, median: 1062
lat: min: 1055, max: 17679, median: 1062
lat: min: 1055, max: 18247, median: 1062
lat: min: 1055, max: 19433, median: 1063
lat: min: 1055, max: 17137, median: 1063
中位数约1062个周期 / 1000次load,即平均每次load约1个周期?(注意:循环中各次load之间没有数据依赖,这个数字更接近load的吞吐,而不是load-to-use延迟。)
本文来自博客园,作者:LiYanbin,转载请注明原文链接:https://www.cnblogs.com/stellar-liyanbin/p/18687376
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· 三行代码完成国际化适配,妙~啊~
· .NET Core 中如何实现缓存的预热?
· 如何调用 DeepSeek 的自然语言处理 API 接口并集成到在线客服系统