代码优化程序性能

参考自:《深入理解计算机系统》第5章

优化方式:

代码移动(code motion):识别要执行多次(例如在循环里)但是计算结果不会改变的计算。

减少过程调用:避免在循环中反复调用函数(例如每次迭代都要做边界检查的 get_vec_element),改为直接访问数据。

消除不必要的存储器引用:在临时变量中存放结果,消除每次循环迭代中从存储器中读出并将更新值写回的需要。

循环展开:通过增加每次迭代计算的元素的数量,减少循环的迭代次数。

提高并行性:使用多个累积变量,更好地利用功能单元的流水线能力。

重新结合变换:减少计算中关键路径上操作的数量,通过更好地利用功能单元的流水线能力得到更好的性能。

 

vec.h

#ifndef VEC_H
#define VEC_H

/*
 * Element type stored in a vector.
 * To experiment with floating point, use `typedef double data_t;` instead
 * (code that prints elements with an integer format must then be adjusted).
 */
typedef int data_t;


/* Abstract data type for a vector: an element count plus its element array. */
typedef struct{
	long int len;	/* number of elements */
	data_t *data;	/* element array; new_vec leaves it NULL when len <= 0 */
}vec_rec, *vec_ptr;

/*
 * Combining operation applied by combine1..combine7 and its identity element.
 * With IDENT 0 and OP + the combine functions compute the sum of all elements
 * (IDENT 1 with OP * would compute the product instead).
 */
#define IDENT 0
#define OP +

/*
 * Create a vector of len elements, all initialised to zero.
 * Returns NULL if memory could not be allocated.
 * The caller releases the vector with free_vec.
 */
vec_ptr new_vec(long int len);

/* Release a vector and its element array; a NULL v is ignored. */
void free_vec(vec_ptr v);

/*
 * Copy element `index` of v into *dest.
 * Returns 1 on success, 0 if index is out of bounds (*dest is then untouched).
 */
int get_vec_element(vec_ptr v, long int index, data_t *dest);

/* Return the number of elements in v. */
long int vec_length(vec_ptr v);

/*
 * combine1..combine7 all store in *dest the result of combining every element
 * of v with OP, starting from IDENT. They differ only in how the loop is
 * written.
 */

/* Baseline: calls vec_length and get_vec_element on every iteration. */
void combine1(vec_ptr v, data_t *dest);

/* Code motion: vec_length is called once, before the loop. */
void combine2(vec_ptr v, data_t *dest);

/* Fewer procedure calls: indexes the element array directly. */
void combine3(vec_ptr v, data_t *dest);

/* Accumulates in a local variable and writes *dest once at the end. */
void combine4(vec_ptr v, data_t *dest);

/* Loop unrolled by 2. */
void combine5(vec_ptr v, data_t *dest);

/* Loop unrolled by 2 with two separate accumulators. */
void combine6(vec_ptr v, data_t *dest);

/* Loop unrolled by 2 with the pair of elements combined first. */
void combine7(vec_ptr v, data_t *dest);

#endif /* VEC_H */

  

vec.cpp

#include "vec.h"
#include <stdlib.h>

/* Create vector of specified length */
vec_ptr new_vec(long int len)
{
	/* Allocate header structure */
	vec_ptr result = (vec_ptr)malloc(sizeof(vec_rec));
	if (!result)
	{
		return NULL; /* Couldn't allocate storage */
	}
	result->len = len;

	/* Allocate array */
	if (len > 0)
	{
		data_t *data = (data_t *)calloc(len, sizeof(data_t));
		if (!data)
		{
			free((void *)result);
			return NULL;
		}
		result->data = data;
	}
	else
	{
		result->data = NULL;
	}

	return result;
}

/*
 * Release a vector: first its element array, then the header itself.
 * Passing NULL is allowed and does nothing.
 */
void free_vec(vec_ptr v)
{
	if (v == NULL)
	{
		return;
	}

	/* free(NULL) is a no-op, so an absent element array needs no special case. */
	free(v->data);
	free(v);
}

/*
 * Bounds-checked element read.
 * Copies v->data[index] into *dest and returns 1 when 0 <= index < v->len;
 * otherwise returns 0 and leaves *dest unmodified.
 */
int get_vec_element(vec_ptr v, long int index, data_t *dest)
{
	int in_range = (index >= 0 && index < v->len);

	if (in_range)
	{
		*dest = v->data[index];
	}

	return in_range;
}

/* Report how many elements the vector holds (the len field of its header). */
long int vec_length(vec_ptr v)
{
	long int count = v->len;

	return count;
}

/*
 * Baseline version built entirely on the vector's accessor functions.
 * Stores in *dest the result of combining all elements of v with OP,
 * starting from IDENT.
 */
void combine1(vec_ptr v, data_t *dest)
{
	long int idx = 0;

	*dest = IDENT;
	/* vec_length(v) is evaluated again before every iteration. */
	while (idx < vec_length(v))
	{
		data_t elem;

		get_vec_element(v, idx, &elem);
		*dest = *dest OP elem;
		idx++;
	}
}

// Code motion
/*
 * Same result as combine1, but the vector length is fetched a single time
 * before the loop instead of on every iteration.
 */
void combine2(vec_ptr v, data_t *dest)
{
	long int count = vec_length(v);
	long int idx = 0;

	*dest = IDENT;
	while (idx < count)
	{
		data_t elem;

		get_vec_element(v, idx, &elem);
		*dest = *dest OP elem;
		idx++;
	}
}

/* Return the address of the vector's element array (the data field of its header). */
data_t *get_vec_start(vec_ptr v)
{
	data_t *first = v->data;

	return first;
}

// Reduce procedure calls
/*
 * Reads elements straight from the array returned by get_vec_start rather
 * than going through get_vec_element for each one.
 * The running result is still kept in *dest throughout the loop.
 */
void combine3(vec_ptr v, data_t *dest)
{
	long int count = vec_length(v);
	data_t *elems = get_vec_start(v);
	long int pos = 0;

	*dest = IDENT;
	while (pos < count)
	{
		*dest = *dest OP elems[pos];
		pos++;
	}
}

// Eliminate unneeded memory references
/*
 * Keeps the running result in a local variable and stores it to *dest
 * only once, after the loop has finished.
 */
void combine4(vec_ptr v, data_t *dest)
{
	long int count = vec_length(v);
	data_t *elems = get_vec_start(v);
	data_t result = IDENT;
	long int pos = 0;

	while (pos < count)
	{
		result = result OP elems[pos];
		pos++;
	}

	*dest = result;
}

// Loop unrolling (unrolled twice)
/*
 * Processes two elements per iteration of the main loop, folding them into
 * the accumulator left to right, then handles a possible leftover element.
 */
void combine5(vec_ptr v, data_t *dest)
{
	long int count = vec_length(v);
	long int pair_end = count - 1;	/* last index at which a full pair can start */
	data_t *elems = get_vec_start(v);
	data_t result = IDENT;
	long int pos = 0;

	/* Main loop: elements pos and pos + 1 */
	while (pos < pair_end)
	{
		result = (result OP elems[pos]) OP elems[pos + 1];
		pos += 2;
	}

	/* Tail: whatever the pairwise loop did not reach */
	while (pos < count)
	{
		result = result OP elems[pos];
		pos++;
	}

	*dest = result;
}

// Increase parallelism (two-way)
/*
 * Unrolled by 2 with two separate accumulators: elements at even indexes
 * are folded into one, elements at odd indexes into the other, and the two
 * are combined at the end. A leftover element goes into the first one.
 * Note: this changes the order in which elements are combined, which can
 * matter for element types whose OP is not associative (e.g. floating point).
 */
void combine6(vec_ptr v, data_t *dest)
{
	long int count = vec_length(v);
	long int pair_end = count - 1;	/* last index at which a full pair can start */
	data_t *elems = get_vec_start(v);
	data_t even_part = IDENT;
	data_t odd_part = IDENT;
	long int pos = 0;

	/* Main loop: elements pos and pos + 1 */
	while (pos < pair_end)
	{
		even_part = even_part OP elems[pos];
		odd_part = odd_part OP elems[pos + 1];
		pos += 2;
	}

	/* Tail: whatever the pairwise loop did not reach */
	while (pos < count)
	{
		even_part = even_part OP elems[pos];
		pos++;
	}

	*dest = even_part OP odd_part;
}

// Reassociation transformation
/*
 * Unrolled by 2 like combine5, but each pair of elements is combined with
 * each other first and only then folded into the accumulator.
 * Note: this regrouping can change the result for element types whose OP is
 * not associative (e.g. floating point).
 */
void combine7(vec_ptr v, data_t *dest)
{
	long int count = vec_length(v);
	long int pair_end = count - 1;	/* last index at which a full pair can start */
	data_t *elems = get_vec_start(v);
	data_t result = IDENT;
	long int pos = 0;

	/* Main loop: elements pos and pos + 1 */
	while (pos < pair_end)
	{
		result = result OP (elems[pos] OP elems[pos + 1]);
		pos += 2;
	}

	/* Tail: whatever the pairwise loop did not reach */
	while (pos < count)
	{
		result = result OP elems[pos];
		pos++;
	}

	*dest = result;
}

  

main.cpp

#include <stdio.h>
#include <time.h>

#include "vec.h"

int main()
{
	vec_ptr pt = new_vec(10000);

	for (int i = 0; i < pt->len; i++)
	{
		*(pt->data + i) = i + 1;
	}

	data_t dest;

	clock_t start, end;
	start = clock();
	for (int i = 0; i < 1000; i++)
	{
		combine1(pt, &dest);
	}
	end = clock();
	printf("1--time:%f----%d\n", (double)(end - start) / CLK_TCK,  dest);

	start = clock();
	for (int i = 0; i < 1000; i++)
	{
		combine2(pt, &dest);
	}
	end = clock();
	printf("2--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest);

	start = clock();
	for (int i = 0; i < 1000; i++)
	{
		combine3(pt, &dest);
	}
	end = clock();
	printf("3--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest);

	start = clock();
	for (int i = 0; i < 1000; i++)
	{
		combine4(pt, &dest);
	}
	end = clock();
	printf("4--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest);

	start = clock();
	for (int i = 0; i < 1000; i++)
	{
		combine5(pt, &dest);
	}
	end = clock();
	printf("5--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest);

	start = clock();
	for (int i = 0; i < 1000; i++)
	{
		combine6(pt, &dest);
	}
	end = clock();
	printf("6--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest);

	start = clock();
	for (int i = 0; i < 1000; i++)
	{
		combine7(pt, &dest);
	}
	end = clock();
	printf("7--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest);
	
	free_vec(pt);

	return 0;
}

  

运行结果

posted @ 2015-06-03 10:57  -学以致用-  阅读(277)  评论(0)  编辑  收藏  举报