代码优化程序性能
参考自:《深入理解计算机系统》第5章
优化方式:
代码移动(code motion):识别要执行多次(例如在循环里)但是计算结果不会改变的计算。
减少过程调用:减少循环中的判断。
消除不必要的存储器引用:在临时变量中存放结果,消除每次循环迭代中从存储器中读出并将更新值写回的需要。
循环展开:通过每次迭代计算的元素的数量,减少循环的迭代次数。
提高并行性:利用单元的流水线能力。
重新结合变换:减小计算中关键路径上操作的数量,通过更好地利用功能单元的流水线能力得到更好地性能
vec.h
typedef int data_t; //typedef double data_t; /* Create abstract data type for vector */ typedef struct{ long int len; data_t *data; }vec_rec, *vec_ptr; #define IDENT 0 #define OP + vec_ptr new_vec(long int len); void free_vec(vec_ptr v); int get_vec_element(vec_ptr v, long int index, data_t *dest); long int vec_length(vec_ptr v); void combine1(vec_ptr v, data_t *dest); void combine2(vec_ptr v, data_t *dest); void combine3(vec_ptr v, data_t *dest); void combine4(vec_ptr v, data_t *dest); void combine5(vec_ptr v, data_t *dest); void combine6(vec_ptr v, data_t *dest); void combine7(vec_ptr v, data_t *dest);
vec.cpp
#include "vec.h" #include <stdlib.h> /* Create vector of specified length */ vec_ptr new_vec(long int len) { /* Allocate header structure */ vec_ptr result = (vec_ptr)malloc(sizeof(vec_rec)); if (!result) { return NULL; /* Couldn't allocate storage */ } result->len = len; /* Allocate array */ if (len > 0) { data_t *data = (data_t *)calloc(len, sizeof(data_t)); if (!data) { free((void *)result); return NULL; } result->data = data; } else { result->data = NULL; } return result; } void free_vec(vec_ptr v) { if (v) { if (v->data) { free((void*)v->data); } free(v); } } /* * Retrieve vector element and store at dest. * Return 0 (out of bounds) or 1 (successful) */ int get_vec_element(vec_ptr v, long int index, data_t *dest) { if (index < 0 || index >= v->len) { return 0; } *dest = v->data[index]; return 1; } /* Return length of vector */ long int vec_length(vec_ptr v) { return v->len; } /* Implementation with maximum use of data abstraction */ void combine1(vec_ptr v, data_t *dest) { long int i; *dest = IDENT; for (i = 0; i < vec_length(v); i++) { data_t val; get_vec_element(v, i, &val); *dest = *dest OP val; } } // 代码移动(code motion) /* Move call to vec_length out of loop */ void combine2(vec_ptr v, data_t *dest) { long int i; long int length = vec_length(v); *dest = IDENT; for (i = 0; i < length; i++) { data_t val; get_vec_element(v, i, &val); *dest = *dest OP val; } } data_t *get_vec_start(vec_ptr v) { return v->data; } // 减少过程调用 /* Direct access to vector data */ void combine3(vec_ptr v, data_t *dest) { long int i; long int length = vec_length(v); data_t *data = get_vec_start(v); *dest = IDENT; for (i = 0; i < length; i++) { *dest = *dest OP data[i]; } } // 消除不必要的存储器引用 /* Accumulate result in local variable */ void combine4(vec_ptr v, data_t *dest) { long int i; long int length = vec_length(v); data_t *data = get_vec_start(v); data_t acc = IDENT; for (i = 0; i < length; i++) { acc = acc OP data[i]; } *dest = acc; } // 循环展开(两次循环展开) /* Unroll loop by 2 */ void combine5(vec_ptr v, data_t *dest) { long int i; long int length = vec_length(v); long int limit = length - 1; data_t *data = get_vec_start(v); data_t acc = IDENT; /* Combine 2 elements at a time */ for (i = 0; i < limit; i += 2) { acc = (acc OP data[i]) OP data[i + 1]; } /* Finish any remaining elements */ for (; i < length; i++) { acc = acc OP data[i]; } *dest = acc; } // 提高并行性(两路并行) /* Unroll loop by 2, 2-way parallelism */ void combine6(vec_ptr v, data_t *dest) { long int i; long int length = vec_length(v); long int limit = length - 1; data_t *data = get_vec_start(v); data_t acc0 = IDENT; data_t acc1 = IDENT; /* Combine 2 elements at a time */ for (i = 0; i < limit; i += 2) { acc0 = acc0 OP data[i]; acc1 = acc1 OP data[i + 1]; } /* Finish any remaining elements */ for (; i < length; i++) { acc0 = acc0 OP data[i]; } *dest = acc0 OP acc1; } // 重新结合变换 /* Change associativity of combining operation */ void combine7(vec_ptr v, data_t *dest) { long int i; long int length = vec_length(v); long int limit = length - 1; data_t *data = get_vec_start(v); data_t acc = IDENT; /* Combine2 elements at a time */ for (i = 0; i < limit; i += 2) { acc = acc OP(data[i] OP data[i + 1]); } /* Finish any remaining elements */ for (; i < length; i++) { acc = acc OP data[i]; } *dest = acc; }
main.cpp
#include <stdio.h> #include <time.h> #include "vec.h" int main() { vec_ptr pt = new_vec(10000); for (int i = 0; i < pt->len; i++) { *(pt->data + i) = i + 1; } data_t dest; clock_t start, end; start = clock(); for (int i = 0; i < 1000; i++) { combine1(pt, &dest); } end = clock(); printf("1--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest); start = clock(); for (int i = 0; i < 1000; i++) { combine2(pt, &dest); } end = clock(); printf("2--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest); start = clock(); for (int i = 0; i < 1000; i++) { combine3(pt, &dest); } end = clock(); printf("3--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest); start = clock(); for (int i = 0; i < 1000; i++) { combine4(pt, &dest); } end = clock(); printf("4--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest); start = clock(); for (int i = 0; i < 1000; i++) { combine5(pt, &dest); } end = clock(); printf("5--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest); start = clock(); for (int i = 0; i < 1000; i++) { combine6(pt, &dest); } end = clock(); printf("6--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest); start = clock(); for (int i = 0; i < 1000; i++) { combine7(pt, &dest); } end = clock(); printf("7--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest); free_vec(pt); return 0; }
运行结果