Here is the code:
#include <stdio.h> #include <xmmintrin.h> #include <windows.h> typedef __m128 Vec; typedef unsigned long long value_t; __forceinline value_t now() { LARGE_INTEGER n; QueryPerformanceCounter(&n); return n.QuadPart; } inline void img_transpose( Vec *dst_img, Vec *src_img, const int src_w, const int src_h) { #pragma omp parallel for for (int j = 0; j < src_w; ++j) { for (int i = 0; i < src_h; ++i) { dst_img[j * src_h + i] = src_img[i * src_w + j]; } } } inline void img_transpose_block( Vec *dst_img, Vec *src_img, const int src_w, const int src_h) { #pragma omp parallel for for (int j = 0; j < src_w; j += 8) { for (int i = 0; i < src_h; i += 8) { const int nsize = min(j + 8, src_w); const int msize = min(i + 8, src_h); for (int n = j; n < nsize; ++n) { for (int m = i; m < msize; ++m) { dst_img[n * src_h + m] = src_img[m * src_w + n]; } } } } } int main(int argc, char *argv[]) { //// performance benchmark //// const int w = 1280; const int h = 720; Vec *a = new Vec [w * h]; Vec *b = new Vec [w * h]; value_t start_time, end_time; LARGE_INTEGER freq; QueryPerformanceFrequency(&freq); double ms_per_tick = 1000.0 / (double)freq.QuadPart; start_time = now(); for (int t = 0; t < 50; ++t) { img_transpose(b, a, w, h); img_transpose(a, b, h, w); } end_time = now(); printf("img_transpose: %f ms\n", (double)(end_time - start_time) * ms_per_tick); start_time = now(); for (int t = 0; t < 50; ++t) { img_transpose_block(b, a, w, h); img_transpose_block(a, b, h, w); } end_time = now(); printf("img_transpose_block: %f ms\n", (double)(end_time - start_time) * ms_per_tick); delete [] a; delete [] b; //// algorithm validation //// const int width = 1080; const int height = 1920; Vec *src_img = new Vec [width * height]; Vec *dst_img = new Vec [height * width]; for (int j = 0; j < height; ++j) { for (int i = 0; i < width; ++i) { src_img[j * width + i].m128_i32[0] = i; src_img[j * width + i].m128_i32[1] = j; } } img_transpose_block(dst_img, src_img, width, height); for (int j = 0; j < width; ++j) { for (int i = 0; i < height; ++i) { int pi = dst_img[j * height + i].m128_i32[0]; int pj = dst_img[j * height + i].m128_i32[1]; if (pi != j || pj != i) { printf("Algorithm is wrong!!!\n"); goto END_OF_PROGRAM; } } } END_OF_PROGRAM: printf("All done\n"); return 0; }