cpu bench
g++ -O3 bench.cpp
// https://stackoverflow.com/questions/40950254/speed-up-random-memory-access-using-prefetch
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <sys/time.h>
// Number of unsigned int elements in each benchmark buffer (4096 * 100000 = 409,600,000).
#define BUFFER_SIZE ((unsigned long) 4096 * 100000)
// Returns the next value from rand(), widened to unsigned int.
unsigned int randomUint()
{
    // The modulo is evaluated in unsigned arithmetic; because rand() never
    // exceeds RAND_MAX (which is below UINT_MAX) it leaves the value unchanged.
    const int reduced = static_cast<int>(rand() % UINT_MAX);
    return static_cast<unsigned int>(reduced);
}
// Allocates BUFFER_SIZE unsigned ints and fills every slot with randomUint().
// Returns a malloc'ed buffer that the caller must release with free().
// Terminates the process with an error message if the allocation fails.
unsigned int * createValueBuffer()
{
    unsigned int * valueBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
    if (valueBuffer == NULL)
    {
        // The request is large enough that failure is realistic; writing
        // through a null pointer below would crash without any explanation.
        fprintf(stderr, "createValueBuffer: failed to allocate %lu bytes\n",
                (unsigned long) (BUFFER_SIZE * sizeof(unsigned int)));
        exit(EXIT_FAILURE);
    }
    for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
    {
        valueBuffer[i] = randomUint();
    }
    return (valueBuffer);
}
// Allocates BUFFER_SIZE unsigned ints and fills slot i with the value i, so
// that walking the buffer yields indices in ascending order.
// Returns a malloc'ed buffer that the caller must release with free().
// Terminates the process with an error message if the allocation fails.
unsigned int * createIndexBuffer()
{
    unsigned int * indexBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
    if (indexBuffer == NULL)
    {
        // The request is large enough that failure is realistic; writing
        // through a null pointer below would crash without any explanation.
        fprintf(stderr, "createIndexBuffer: failed to allocate %lu bytes\n",
                (unsigned long) (BUFFER_SIZE * sizeof(unsigned int)));
        exit(EXIT_FAILURE);
    }
    for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
    {
        // The counter is unsigned long while the element type is unsigned int;
        // the narrowing is made explicit here.
        indexBuffer[i] = (unsigned int) i;
        // Alternative for a random access pattern:
        // indexBuffer[i] = rand() % BUFFER_SIZE;
    }
    return (indexBuffer);
}
// Sums valueBuffer[indexBuffer[i]] for i in [0, BUFFER_SIZE).
//
// indexBuffer: BUFFER_SIZE entries, each used as a position in valueBuffer.
// valueBuffer: the values being gathered.
// Returns the accumulated sum as an unsigned long long.
unsigned long long computeSum(unsigned int * indexBuffer, unsigned int * valueBuffer)
{
    unsigned long long sum = 0;

    // Disabled experiment: a separate pass that prefetches every target first.
    // for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
    //     __builtin_prefetch((char *) & valueBuffer[indexBuffer[i]], 0, 3);

    // The counter uses unsigned long, the same type as BUFFER_SIZE, so the loop
    // still terminates if BUFFER_SIZE is ever raised above UINT_MAX.
    for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
    {
        // Disabled look-ahead prefetch experiments. NOTE: as written, the
        // i + 40, i + 4 and i + 1 variants touch indexBuffer past its last
        // element during the final iterations; guard them before enabling.
        //__builtin_prefetch((char *) & valueBuffer[indexBuffer[i + 40]], 0, 3);
        // __builtin_prefetch((char *) & valueBuffer[indexBuffer[i + 4]], 0, 3);
        // __builtin_prefetch((char *) &indexBuffer[i + 1], 0, 0);
        unsigned int index = indexBuffer[i];
        sum += valueBuffer[index];
    }
    return (sum);
}
// Builds the value and index buffers, times a single computeSum() call with
// gettimeofday(), prints the resulting sum, frees both buffers and returns the
// elapsed wall-clock time in microseconds (converted to unsigned int).
unsigned int computeTimeInMicroSeconds()
{
    unsigned int * values = createValueBuffer();
    unsigned int * indices = createIndexBuffer();

    struct timeval begin;
    struct timeval finish;

    // Only the gather itself sits between the two timestamps.
    gettimeofday(&begin, NULL);
    unsigned long long total = computeSum(indices, values);
    gettimeofday(&finish, NULL);

    printf("Sum = %llu\n", total);

    free(indices);
    free(values);

    // Whole seconds are scaled to microseconds, then the sub-second part is added.
    time_t secondsDelta = finish.tv_sec - begin.tv_sec;
    suseconds_t microsDelta = finish.tv_usec - begin.tv_usec;
    return (secondsDelta * 1000 * 1000) + microsDelta;
}
// Entry point of the memory-access benchmark: prints the size of each buffer,
// runs the timed gather once and reports the elapsed time.
int main()
{
    // The size expression is unsigned, so it is printed with %lu via an
    // explicit unsigned long value rather than the signed %ld conversion.
    const unsigned long bufferMegabytes = (unsigned long) (BUFFER_SIZE * sizeof(unsigned int) / (1024 * 1024));
    printf("sizeof buffers = %luMb\n", bufferMegabytes);
    unsigned int timeInMicroSeconds = computeTimeInMicroSeconds();
    printf("Time: %u micro-seconds = %.3f seconds\n", timeInMicroSeconds, (double) timeInMicroSeconds / (1000 * 1000));
    return 0;
}
Linear Memory Access
Kunpeng-920: 0.868
Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz: 0.697
Hygon C86 7185 32-core Processor: 0.402
AMD Ryzen 7 5800H with Radeon Graphics: 0.137
12th Gen Intel(R) Core(TM) i7-12700KF: 0.147
Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz: 0.219
Intel(R) Xeon(R) Silver 4208 CPU @ 2.10GHz: 0.316
Numa: numactl --cpubind=0 --membind=0
Kunpeng-920: 0.854
Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz: 0.316
CPU Bench Run
sysbench --test=cpu --time=20 run
Kunpeng-920: 67409
Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz: 21290
AMD Ryzen 7 5800H with Radeon Graphics: 95467
SIMD ADD
#include <iostream>
#include <vector>
#include <cstddef>
#include <cstdint>
#include <ctime>
// Total number of rows the benchmark is sized for.
constexpr size_t total_process_rows = 3972873;
// Number of rows in one batch.
constexpr size_t chunk_size = 4096;
// Number of batches to run; the integer division drops any final partial chunk.
constexpr size_t loops = total_process_rows / chunk_size;
// constexpr size_t loops = 10000;
// Accumulator state passed to SumProcesser through its opaque data pointer.
struct Summer
{
    // Running total. The default member initializer guarantees it starts at
    // zero even when a Summer is default-initialized (`Summer s;`).
    int64_t result = 0;
};
// Interface for a processing step that is applied to one batch of rows.
struct Processer
{
    // Virtual so that an implementation can be safely destroyed through a
    // Processer pointer.
    virtual ~Processer() = default;

    // Processes one batch.
    //   vecs: the batch; implementations may read or modify it.
    //   data: opaque, implementation-specific state. Callers in this file pass
    //         either a Summer* or nullptr.
    virtual void update_batch(std::vector<int> &vecs, void *data) = 0;
};
// Processing step that adds every element of the batch to a Summer.
struct SumProcesser : Processer
{
    // Adds each element of vecs to the `result` field of the Summer that
    // `data` points to.
    //   vecs: the batch to sum; it is not modified.
    //   data: must point to a valid Summer; it is dereferenced unconditionally.
    void update_batch(std::vector<int> &vecs, void *data) override
    {
        // static_cast is the appropriate named cast for void* -> object pointer.
        Summer *summer = static_cast<Summer *>(data);
        // Range-for avoids comparing a signed int index with the unsigned size().
        for (const int value : vecs)
        {
            summer->result += value;
        }
    }
};
// Processing step that prepares a batch by sizing it to chunk_size elements.
struct FillDataProcesser : Processer
{
    // Resizes vecs to exactly chunk_size elements; elements added by the
    // resize are value-initialized (zero). The data pointer is not used.
    void update_batch(std::vector<int> &vecs, void * /*data*/) override
    {
        const size_t wanted_rows = chunk_size;
        vecs.resize(wanted_rows);
    }
};
static void BenchMark()
{
Processer *filler = new FillDataProcesser();
Processer *summer = new SumProcesser();
std::vector<int> column;
column.resize(chunk_size);
Summer res{};
uint64_t tocal_costs = {};
timespec spec_st = {};
timespec spec_ed = {};
for (size_t i = 0; i < loops; ++i)
{
filler->update_batch(column, nullptr);
clock_gettime(CLOCK_MONOTONIC, &spec_st);
summer->update_batch(column, &res);
clock_gettime(CLOCK_MONOTONIC, &spec_ed);
tocal_costs += (spec_ed.tv_sec - spec_st.tv_sec) * 1000L * 1000L * 1000L + (spec_ed.tv_nsec - spec_st.tv_nsec);
}
std::cout << "Result:" << res.result << ", costs:" << tocal_costs/1000.0/1000.0;
}
// Entry point of the summation benchmark: runs it once and exits successfully.
int main()
{
    BenchMark();
    // Reaching the end of main is equivalent to `return 0;`.
}
sysbench --test=memory --memory-block-size=1M --memory-total-size=10G --num-threads=1 run