cpu bench

g++ -O3 bench.cpp

// https://stackoverflow.com/questions/40950254/speed-up-random-memory-access-using-prefetch
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>

#define BUFFER_SIZE ((unsigned long) 4096 * 100000)


// Returns the next value from rand() as an unsigned int.
// rand() is promoted to unsigned for the modulo; since rand() never exceeds
// RAND_MAX (which is at most INT_MAX), the reduction by UINT_MAX leaves the
// value unchanged.
unsigned int randomUint()
{
  const unsigned int reduced = rand() % UINT_MAX;
  return reduced;
}


// Allocates BUFFER_SIZE unsigned ints with malloc() and fills every slot with
// a value from randomUint().
// Returns the buffer; the caller releases it with free().
// If the allocation fails the process prints a message to stderr and exits,
// because the benchmark cannot run without the buffer.
unsigned int * createValueBuffer()
{
  const size_t byteCount = BUFFER_SIZE * sizeof(unsigned int);
  unsigned int * valueBuffer = (unsigned int *) malloc(byteCount);
  if (valueBuffer == NULL)
  {
    fprintf(stderr, "createValueBuffer: failed to allocate %zu bytes\n", byteCount);
    exit(EXIT_FAILURE);
  }

  for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
  {
    valueBuffer[i] = randomUint();
  }

  return (valueBuffer);
}


// Allocates BUFFER_SIZE unsigned ints with malloc() and fills slot i with i,
// so walking the buffer yields a strictly sequential access pattern.
// Swap in the commented-out line below to get random indices instead.
// Returns the buffer; the caller releases it with free().
// If the allocation fails the process prints a message to stderr and exits,
// because the benchmark cannot run without the buffer.
unsigned int * createIndexBuffer()
{
  const size_t byteCount = BUFFER_SIZE * sizeof(unsigned int);
  unsigned int * indexBuffer = (unsigned int *) malloc(byteCount);
  if (indexBuffer == NULL)
  {
    fprintf(stderr, "createIndexBuffer: failed to allocate %zu bytes\n", byteCount);
    exit(EXIT_FAILURE);
  }

  for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
  {
    indexBuffer[i] = i;
    // indexBuffer[i] = rand() % BUFFER_SIZE;
  }

  return (indexBuffer);
}


// Sums valueBuffer[indexBuffer[i]] for i in [0, BUFFER_SIZE) and returns the
// total. This is the loop the benchmark times: the access pattern into
// valueBuffer is dictated entirely by the contents of indexBuffer.
//
// The commented-out __builtin_prefetch lines are alternative experiments from
// the linked Stack Overflow question. NOTE(review): if one of the look-ahead
// variants is re-enabled, indexBuffer[i + k] reads past the last entry during
// the final k iterations (assuming indexBuffer holds exactly BUFFER_SIZE
// entries), so the loop bound or the prefetch needs a guard.
unsigned long long computeSum(unsigned int * indexBuffer, unsigned int * valueBuffer)
{
  unsigned long long sum = 0;

  // The counter uses the same type as BUFFER_SIZE so it cannot wrap before
  // reaching the bound.
  for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
  {
    //__builtin_prefetch((char *) & valueBuffer[indexBuffer[i + 40]], 0, 3);
    // __builtin_prefetch((char *) & valueBuffer[indexBuffer[i + 4]], 0, 3);
    // __builtin_prefetch((char *) &indexBuffer[i + 1], 0, 0);
    unsigned int index = indexBuffer[i];
    sum += valueBuffer[index];
  }

  return (sum);
}


// Builds the value and index buffers, times a single computeSum() pass over
// them, prints the resulting sum and frees both buffers.
// Returns the elapsed time of the computeSum() call in microseconds; buffer
// creation and teardown are outside the timed region.
unsigned int computeTimeInMicroSeconds()
{
  unsigned int * valueBuffer = createValueBuffer();
  unsigned int * indexBuffer = createIndexBuffer();

  // CLOCK_MONOTONIC is used instead of gettimeofday() so that wall-clock
  // adjustments during the run cannot distort (or negate) the interval.
  struct timespec startTime, endTime;
  clock_gettime(CLOCK_MONOTONIC, &startTime);

  unsigned long long sum = computeSum(indexBuffer, valueBuffer);

  clock_gettime(CLOCK_MONOTONIC, &endTime);

  // Printing the sum keeps the result observable, so the summation cannot be
  // discarded as unused.
  printf("Sum = %llu\n", sum);
  free(indexBuffer);
  free(valueBuffer);

  // Combine seconds and nanoseconds in 64-bit arithmetic first; the
  // nanosecond difference alone may be negative.
  const long long elapsedNanoSeconds =
      (long long) (endTime.tv_sec - startTime.tv_sec) * 1000LL * 1000LL * 1000LL
      + (long long) (endTime.tv_nsec - startTime.tv_nsec);

  return (unsigned int) (elapsedNanoSeconds / 1000LL);
}


// Entry point of the memory-access benchmark: prints the size of each buffer
// in units of 1024 * 1024 bytes, runs the timed pass, and reports the elapsed
// time in microseconds and seconds.
int main()
{
  // The size expression is unsigned, so it is printed with %zu rather than
  // the signed %ld.
  const size_t bufferMegabytes = BUFFER_SIZE * sizeof(unsigned int) / (1024 * 1024);
  printf("sizeof buffers = %zuMb\n", bufferMegabytes);
  unsigned int timeInMicroSeconds = computeTimeInMicroSeconds();
  printf("Time: %u micro-seconds = %.3f seconds\n", timeInMicroSeconds, (double) timeInMicroSeconds / (1000 * 1000));
}

Linear Memory Access (elapsed time in seconds)

Kunpeng-920: 0.868
Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz: 0.697
Hygon C86 7185 32-core Processor: 0.402
AMD Ryzen 7 5800H with Radeon Graphics: 0.137
12th Gen Intel(R) Core(TM) i7-12700KF: 0.147
Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz: 0.219
Intel(R) Xeon(R) Silver 4208 CPU @ 2.10GHz: 0.316

NUMA: numactl --cpubind=0 --membind=0

Kunpeng-920: 0.854
Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz: 0.316

CPU Bench Run

sysbench --test=cpu --time=20 run
Kunpeng-920: 67409
Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz: 21290
AMD Ryzen 7 5800H with Radeon Graphics: 95467

SIMD ADD

#include <iostream>
#include <vector>
#include <cstddef>
#include <cstdint>
#include <ctime>

constexpr size_t total_process_rows = 3972873;
constexpr size_t chunk_size = 4096;
constexpr size_t loops = total_process_rows / chunk_size;
// constexpr size_t loops = 10000;

// Accumulator state for the sum benchmark: a single 64-bit signed running
// total.
struct Summer
{
    std::int64_t result;
};

// Abstract interface for one benchmark stage operating on a column of ints.
struct Processer
{
    // Virtual so that destroying a derived processer through a Processer*
    // is well-defined and runs the derived destructor.
    virtual ~Processer() = default;

    // Processes one batch.
    //   vecs: the column the stage reads from or writes to.
    //   data: opaque per-stage state; each implementation decides how (and
    //         whether) to interpret it.
    virtual void update_batch(std::vector<int> &vecs, void *data) = 0;
};

// Stage that adds every element of the column to a Summer accumulator.
struct SumProcesser : Processer
{
    // vecs: the values to add up (not modified).
    // data: must point to a Summer; its `result` is increased by the sum of
    //       all elements of vecs.
    void update_batch(std::vector<int> &vecs, void *data) override
    {
        // void* -> object pointer is what static_cast is for.
        Summer *summer = static_cast<Summer *>(data);

        // Accumulate into a local and store once at the end. Iterating the
        // vector directly also avoids comparing a signed int index against
        // the unsigned size.
        int64_t batch_total = 0;
        for (const int value : vecs)
        {
            batch_total += value;
        }
        summer->result += batch_total;
    }
};

// Stage that prepares the column for the next batch by sizing it to
// chunk_size elements. It writes no values of its own: resize() keeps the
// elements that are already there and value-initializes any it adds.
struct FillDataProcesser : Processer
{
    // vecs: column to resize to chunk_size elements.
    // The second argument (per-stage state) is ignored by this stage.
    void update_batch(std::vector<int> &vecs, void * /*data*/) override
    {
        const size_t target_rows = chunk_size;
        vecs.resize(target_rows);
    }
};

static void BenchMark()
{
    Processer *filler = new FillDataProcesser();
    Processer *summer = new SumProcesser();

    std::vector<int> column;
    column.resize(chunk_size);
    Summer res{};
    uint64_t tocal_costs = {};
    timespec spec_st = {};
    timespec spec_ed = {};

    for (size_t i = 0; i < loops; ++i)
    {
        filler->update_batch(column, nullptr);
        clock_gettime(CLOCK_MONOTONIC, &spec_st);
        summer->update_batch(column, &res);
        clock_gettime(CLOCK_MONOTONIC, &spec_ed);
        tocal_costs += (spec_ed.tv_sec - spec_st.tv_sec) * 1000L * 1000L * 1000L + (spec_ed.tv_nsec - spec_st.tv_nsec);
    }
    std::cout << "Result:" << res.result << ", costs:" << tocal_costs/1000.0/1000.0;
}

// Entry point of the sum benchmark: runs it once and exits with status 0.
int main()
{
    BenchMark();
    // Falling off the end of main is equivalent to `return 0;`.
}
  sysbench --test=memory --memory-block-size=1M --memory-total-size=10G --num-threads=1 run
posted @ 2022-07-31 20:44  stdpain  阅读(435)  评论(0编辑  收藏  举报