随笔-处理器微架构-测量cache load latency

measure cache hit latency

主要设计说明:

t1=rdtscp
load data from cache #measure the cost cycles of this instruction 
t2=rdtscp
  1. 耗时即 t2-t1,其中包含 rdtscp 本身的执行开销,所以需要先单独测量出该开销 t_rdtscp,再从结果中减去
  2. 需要使用lfence,避免load指令与前面的rdtscp并行执行
  3. 需要使用rdtscp:rdtscp ensures that all previous instructions have completed before it executes

m1_rdtscp.c

view
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <memory.h>

// qsort() comparator ordering uint64_t elements ascending.
//
// a, b: pointers to the two uint64_t elements being compared.
// Returns a negative value, zero, or a positive value when *a is less than,
// equal to, or greater than *b.
//
// The result is built from two comparisons rather than from a subtraction:
// a 64-bit difference narrowed to int loses its sign (or collapses to 0)
// whenever the operands differ by 2^31 or more.
static int compare(const void *a, const void *b) {
    const uint64_t lhs = *(const uint64_t *)a;
    const uint64_t rhs = *(const uint64_t *)b;

    return (lhs > rhs) - (lhs < rhs);
}

// Sort the samples in place and report their minimum, maximum and median.
//
// arr:    sample buffer; it is reordered (sorted ascending) by this call.
// n:      number of elements in arr.
// min, max, median: output locations, always written.
//
// For an even n the median is the average of the two middle elements,
// rounded down. When arr is NULL or n is 0 there is nothing to inspect, so
// all three outputs are set to 0 instead of reading arr[n - 1].
static void find_min_max_median(uint64_t *arr, size_t n, uint64_t *min, uint64_t *max, uint64_t *median) {
    if (arr == NULL || n == 0) {
        *min = 0;
        *max = 0;
        *median = 0;
        return;
    }

    qsort(arr, n, sizeof *arr, compare);

    const size_t mid = n / 2;

    *min = arr[0];
    *max = arr[n - 1];

    if (n % 2 == 0) {
        const uint64_t left = arr[mid - 1];
        const uint64_t right = arr[mid];

        // Same value as (left + right) / 2, but cannot overflow 64 bits.
        *median = left / 2 + right / 2 + (left & right & 1);
    } else {
        *median = arr[mid];
    }
}

// Measure the cost of the timing harness itself: two RDTSCP reads with only
// an LFENCE and a single NOP between them.
//
// Returns the second timestamp minus the first, in TSC ticks. This is the
// baseline to subtract from measurements that put real work between the two
// reads.
uint64_t measure_rdtsc_latency(void) {
    register uint64_t start_hi, start_lo, end_hi, end_lo;

    // RDTSCP returns the TSC in EDX:EAX; in x86-64 mode it also clears the
    // upper 32 bits of RDX and RAX, so each half can be copied as a full
    // 64-bit register. RDTSCP also writes RCX, hence the "rcx" clobber.
    // __asm__/__volatile__ are used instead of asm/volatile so the code also
    // builds in strict ISO modes such as -std=c11.
    __asm__ __volatile__ (
        "rdtscp\n\t"
        "mov %%rdx, %[start_hi]\n\t"
        "mov %%rax, %[start_lo]\n\t"

        // Keep the instruction under test from starting before the first
        // timestamp read has completed.
        "lfence\n\t"
        "nop\n\t"

        "rdtscp\n\t"
        "mov %%rdx, %[end_hi]\n\t"
        "mov %%rax, %[end_lo]\n\t"
        : [start_hi] "=r" (start_hi), [start_lo] "=r" (start_lo),
          [end_hi] "=r" (end_hi), [end_lo] "=r" (end_lo)
        :
        : "rax", "rdx", "rcx"
    );

    return ((end_hi << 32) | end_lo) - ((start_hi << 32) | start_lo);
}

// Number of samples collected per run.
#define M_COUNT 1000

// Collect M_COUNT samples of the RDTSCP harness overhead and print their
// minimum, maximum and median.
int main(void) {
    uint64_t rdtsc_lat[M_COUNT] = {0};
    uint64_t min = 0, max = 0, median = 0;

    for (size_t i = 0; i < M_COUNT; ++i) {
        rdtsc_lat[i] = measure_rdtsc_latency();
    }

    find_min_max_median(rdtsc_lat, M_COUNT, &min, &max, &median);

    // uint64_t is not unsigned long on every x86-64 ABI, so convert
    // explicitly to a type with a fixed printf conversion.
    printf("rdtsc lat: min: %llu, max: %llu, median: %llu\n",
           (unsigned long long)min, (unsigned long long)max,
           (unsigned long long)median);
    return 0;
}
$ gcc -O0 m1_rdtscp.c -o m1_rdtscp
$ while true; do sleep 0.5; sudo taskset -c 0 ./m1_rdtscp; done
rdtsc lat: min: 39, max: 47, median: 43
rdtsc lat: min: 39, max: 45, median: 42
rdtsc lat: min: 41, max: 47, median: 44
...

m1.c

点击查看代码
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <memory.h>

// qsort() comparator ordering uint64_t elements ascending.
//
// a, b: pointers to the two uint64_t elements being compared.
// Returns a negative value, zero, or a positive value when *a is less than,
// equal to, or greater than *b.
//
// The result is built from two comparisons rather than from a subtraction:
// a 64-bit difference narrowed to int loses its sign (or collapses to 0)
// whenever the operands differ by 2^31 or more. The latency samples sorted
// here contain outliers, so large gaps between elements are expected.
static int compare(const void *a, const void *b) {
    const uint64_t lhs = *(const uint64_t *)a;
    const uint64_t rhs = *(const uint64_t *)b;

    return (lhs > rhs) - (lhs < rhs);
}

// Sort the samples in place and report their minimum, maximum and median.
//
// arr:    sample buffer; it is reordered (sorted ascending) by this call.
// n:      number of elements in arr.
// min, max, median: output locations, always written.
//
// For an even n the median is the average of the two middle elements,
// rounded down. When arr is NULL or n is 0 there is nothing to inspect, so
// all three outputs are set to 0 instead of reading arr[n - 1].
static void find_min_max_median(uint64_t *arr, size_t n, uint64_t *min, uint64_t *max, uint64_t *median) {
    if (arr == NULL || n == 0) {
        *min = 0;
        *max = 0;
        *median = 0;
        return;
    }

    qsort(arr, n, sizeof *arr, compare);

    const size_t mid = n / 2;

    *min = arr[0];
    *max = arr[n - 1];

    if (n % 2 == 0) {
        const uint64_t left = arr[mid - 1];
        const uint64_t right = arr[mid];

        // Same value as (left + right) / 2, but cannot overflow 64 bits.
        *median = left / 2 + right / 2 + (left & right & 1);
    } else {
        *median = arr[mid];
    }
}

// Time a single 8-byte load from `array` between two RDTSCP reads.
//
// array: address to load from; at least 8 bytes must be readable there.
// Returns the second timestamp minus the first, in TSC ticks. The value
// still includes the cost of the RDTSCP/LFENCE harness itself.
static uint64_t measure_cache_load_lat(void *array) {
    register uint64_t start_hi, start_lo, end_hi, end_lo;

    // RDTSCP returns the TSC in EDX:EAX; in x86-64 mode it also clears the
    // upper 32 bits of RDX and RAX, so each half can be copied as a full
    // 64-bit register. RDTSCP also writes RCX, hence the "rcx" clobber.
    //
    // The outputs are early-clobber ("=&r") because start_hi/start_lo are
    // written before the load reads %[array]; without '&' the compiler may
    // place an output in the same register as the pointer.
    // The "memory" clobber tells the compiler that the asm reads memory.
    __asm__ __volatile__ (
        "rdtscp\n\t"
        "mov %%rdx, %[start_hi]\n\t"
        "mov %%rax, %[start_lo]\n\t"

        // Keep the load from starting before the first timestamp read has
        // completed.
        "lfence\n\t"
        "mov (%[array]), %%rdx\n\t"

        "rdtscp\n\t"
        "mov %%rdx, %[end_hi]\n\t"
        "mov %%rax, %[end_lo]\n\t"
        : [start_hi] "=&r" (start_hi), [start_lo] "=&r" (start_lo),
          [end_hi] "=&r" (end_hi), [end_lo] "=&r" (end_lo)
        : [array] "r" (array)
        : "rax", "rdx", "rcx", "memory"
    );

    return ((end_hi << 32) | end_lo) - ((start_hi << 32) | start_lo);
}

// Number of samples collected per run.
#define M_COUNT 5000

// Allocate a page-aligned 4 KiB buffer, time M_COUNT single loads from its
// first word, and print the minimum, maximum and median sample.
// Returns 0 on success, -1 when the buffer cannot be allocated.
int main(void) {
    uint64_t rdtsc_lat[M_COUNT] = {0};
    uint64_t min = 0, max = 0, median = 0;
    void *array = NULL;

    // 4 KiB aligned, 4 KiB in size. posix_memalign() reports failure through
    // its return value, so that is what must be checked; the pointer is also
    // pre-initialised so it is never read while indeterminate.
    if (posix_memalign(&array, 4096, 4096) != 0 || array == NULL) {
        printf("memory alloc fail\n");
        return -1;
    }

    for (size_t i = 0; i < M_COUNT; ++i) {
        rdtsc_lat[i] = measure_cache_load_lat(array);
    }

    find_min_max_median(rdtsc_lat, M_COUNT, &min, &max, &median);

    // uint64_t is not unsigned long on every x86-64 ABI, so convert
    // explicitly to a type with a fixed printf conversion.
    printf("lat: min: %llu, max: %llu, median: %llu\n",
           (unsigned long long)min, (unsigned long long)max,
           (unsigned long long)median);

    free(array);
    return 0;
}
$ gcc -O0 m1.c -o m1
$ while true; do sleep 0.5; sudo taskset -c 0 ./m1; done
lat: min: 41, max: 2479, median: 45
lat: min: 43, max: 2397, median: 46
lat: min: 43, max: 2264, median: 46
lat: min: 41, max: 2501, median: 46
lat: min: 41, max: 2614, median: 44
^C

扣除上面测得的 rdtscp 自身开销(median 约 42~44)后,load 命中 cache 的耗时大概为 2~3 个周期

m1_repeat.c

重复load动作,弱化其他因素带来的影响

m1_repeat.c
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <memory.h>

// qsort() comparator ordering uint64_t elements ascending.
//
// a, b: pointers to the two uint64_t elements being compared.
// Returns a negative value, zero, or a positive value when *a is less than,
// equal to, or greater than *b.
//
// The result is built from two comparisons rather than from a subtraction:
// a 64-bit difference narrowed to int loses its sign (or collapses to 0)
// whenever the operands differ by 2^31 or more. The latency samples sorted
// here contain outliers, so large gaps between elements are expected.
static int compare(const void *a, const void *b) {
    const uint64_t lhs = *(const uint64_t *)a;
    const uint64_t rhs = *(const uint64_t *)b;

    return (lhs > rhs) - (lhs < rhs);
}

// Sort the samples in place and report their minimum, maximum and median.
//
// arr:    sample buffer; it is reordered (sorted ascending) by this call.
// n:      number of elements in arr.
// min, max, median: output locations, always written.
//
// For an even n the median is the average of the two middle elements,
// rounded down. When arr is NULL or n is 0 there is nothing to inspect, so
// all three outputs are set to 0 instead of reading arr[n - 1].
static void find_min_max_median(uint64_t *arr, size_t n, uint64_t *min, uint64_t *max, uint64_t *median) {
    if (arr == NULL || n == 0) {
        *min = 0;
        *max = 0;
        *median = 0;
        return;
    }

    qsort(arr, n, sizeof *arr, compare);

    const size_t mid = n / 2;

    *min = arr[0];
    *max = arr[n - 1];

    if (n % 2 == 0) {
        const uint64_t left = arr[mid - 1];
        const uint64_t right = arr[mid];

        // Same value as (left + right) / 2, but cannot overflow 64 bits.
        *median = left / 2 + right / 2 + (left & right & 1);
    } else {
        *median = arr[mid];
    }
}

// Time `repeat` back-to-back 8-byte loads from `array` between two RDTSCP
// reads.
//
// array:  address to load from; at least 8 bytes must be readable there.
// repeat: number of loop iterations; must be >= 1. The loop decrements
//         before testing, so 0 or a negative count keeps iterating until
//         the 32-bit counter wraps around to zero.
// Returns the second timestamp minus the first, in TSC ticks, for the whole
// loop. The value still includes the RDTSCP/LFENCE harness and the
// dec/jnz loop overhead.
static uint64_t measure_cache_load_lat(void *array, int repeat) {
    register uint64_t start_hi, start_lo, end_hi, end_lo;
    // The asm decrements the counter, so it operates on a local copy that is
    // declared as a read-write operand instead of modifying an input.
    int remaining = repeat;

    // RDTSCP returns the TSC in EDX:EAX; in x86-64 mode it also clears the
    // upper 32 bits of RDX and RAX, so each half can be copied as a full
    // 64-bit register. RDTSCP also writes RCX, hence the "rcx" clobber.
    //
    // The outputs are early-clobber ("=&r") because start_hi/start_lo are
    // written before the loop reads %[array]; without '&' the compiler may
    // place an output in the same register as the pointer.
    // "cc" covers the flags written by dec; "memory" tells the compiler that
    // the asm reads memory.
    __asm__ __volatile__ (
        "rdtscp\n\t"
        "mov %%rdx, %[start_hi]\n\t"
        "mov %%rax, %[start_lo]\n\t"

        // Keep the first load from starting before the first timestamp read
        // has completed.
        "lfence\n\t"
        "1:\n\t"
        "mov (%[array]), %%rbx\n\t"
        "dec %[remaining]\n\t"
        "jnz 1b\n\t"

        "rdtscp\n\t"
        "mov %%rdx, %[end_hi]\n\t"
        "mov %%rax, %[end_lo]\n\t"
        : [start_hi] "=&r" (start_hi), [start_lo] "=&r" (start_lo),
          [end_hi] "=&r" (end_hi), [end_lo] "=&r" (end_lo),
          [remaining] "+r" (remaining)
        : [array] "r" (array)
        : "rax", "rdx", "rcx", "rbx", "cc", "memory"
    );

    return ((end_hi << 32) | end_lo) - ((start_hi << 32) | start_lo);
}

// Number of samples collected per run.
#define M_COUNT 5000

// Allocate a page-aligned 4 KiB buffer, take M_COUNT samples of a
// 1000-iteration load loop on its first word, and print the minimum,
// maximum and median sample.
// Returns 0 on success, -1 when the buffer cannot be allocated.
int main(void) {
    uint64_t rdtsc_lat[M_COUNT] = {0};
    uint64_t min = 0, max = 0, median = 0;
    void *array = NULL;

    // 4 KiB aligned, 4 KiB in size. posix_memalign() reports failure through
    // its return value, so that is what must be checked; the pointer is also
    // pre-initialised so it is never read while indeterminate.
    if (posix_memalign(&array, 4096, 4096) != 0 || array == NULL) {
        printf("memory alloc fail\n");
        return -1;
    }

    for (size_t i = 0; i < M_COUNT; ++i) {
        rdtsc_lat[i] = measure_cache_load_lat(array, 1000);
    }

    find_min_max_median(rdtsc_lat, M_COUNT, &min, &max, &median);

    // uint64_t is not unsigned long on every x86-64 ABI, so convert
    // explicitly to a type with a fixed printf conversion.
    printf("lat: min: %llu, max: %llu, median: %llu\n",
           (unsigned long long)min, (unsigned long long)max,
           (unsigned long long)median);

    free(array);
    return 0;
}
$ gcc -O0 m1_repeat.c -o m1_r 
$ while true; do sleep 0.5; sudo taskset -c 0 ./m1_r; done
lat: min: 1055, max: 18022, median: 1062
lat: min: 1055, max: 17679, median: 1062
lat: min: 1055, max: 18247, median: 1062
lat: min: 1055, max: 19433, median: 1063
lat: min: 1055, max: 17137, median: 1063

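(1062 - 43) / 1000 ≈ 1 cycle/次?注意:循环中各次 load 之间没有数据依赖,且每次迭代还包含 dec/jnz,因此这个数值反映的更可能是循环(load)的吞吐,而不是单次 load 的延迟。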

posted @   LiYanbin  阅读(2)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· 三行代码完成国际化适配,妙~啊~
· .NET Core 中如何实现缓存的预热?
· 如何调用 DeepSeek 的自然语言处理 API 接口并集成到在线客服系统
点击右上角即可分享
微信分享提示