剩余块用switch处理

在做循环展开时,处理完能被展开因子整除的部分后,还需要处理剩余块。这里做了个对比实验:用 switch(利用 case 贯穿)来加速剩余块的处理。

// switch0.c

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>

/*
 * switch0: time a fall-through switch that sums the first `res` elements of a
 * small array, addressing each element by a constant index from the start of
 * the array.
 *
 * Usage: switch0 <remainder-size>, where remainder-size is in 0..16.
 *
 * Output: the accumulated sum on the first line, then the elapsed clock()
 * ticks on the second. Returns EXIT_FAILURE on a missing or invalid argument.
 */
int main(int argc, char *argv[])
{
	/* Largest case label the switch below handles. */
	enum { MAX_REMAINDER = 16 };

	if (argc < 2)
	{
		fprintf(stderr, "usage: %s <remainder-size 0..%d>\n",
				(argc > 0 && argv[0]) ? argv[0] : "switch0", MAX_REMAINDER);
		return EXIT_FAILURE;
	}

	/* strtol instead of atoi so that garbage and out-of-range input is detected. */
	char *parse_end = NULL;
	long parsed = strtol(argv[1], &parse_end, 10);
	if (parse_end == argv[1] || *parse_end != '\0' || parsed < 0 || parsed > MAX_REMAINDER)
	{
		fprintf(stderr, "remainder size must be an integer in 0..%d, got \"%s\"\n",
				MAX_REMAINDER, argv[1]);
		return EXIT_FAILURE;
	}
	int res = (int)parsed;

	/* Fixed-size buffer: avoids a VLA whose length comes from user input. */
	int vec[MAX_REMAINDER] = {0};
	for (int i = 0; i < res; i++)
	{
		vec[i] = rand();
	}

	uint32_t accum = 0;
	clock_t start = clock();

	for (uint32_t i = UINT32_MAX; i; i--)
	{
		/* Jump to the case matching the remainder size and fall through to case 1. */
		switch (res)
		{
		case 16:
			accum += vec[16 - 1];
			/* fallthrough */
		case 15:
			accum += vec[15 - 1];
			/* fallthrough */
		case 14:
			accum += vec[14 - 1];
			/* fallthrough */
		case 13:
			accum += vec[13 - 1];
			/* fallthrough */
		case 12:
			accum += vec[12 - 1];
			/* fallthrough */
		case 11:
			accum += vec[11 - 1];
			/* fallthrough */
		case 10:
			accum += vec[10 - 1];
			/* fallthrough */
		case 9:
			accum += vec[9 - 1];
			/* fallthrough */
		case 8:
			accum += vec[8 - 1];
			/* fallthrough */
		case 7:
			accum += vec[7 - 1];
			/* fallthrough */
		case 6:
			accum += vec[6 - 1];
			/* fallthrough */
		case 5:
			accum += vec[5 - 1];
			/* fallthrough */
		case 4:
			accum += vec[4 - 1];
			/* fallthrough */
		case 3:
			accum += vec[3 - 1];
			/* fallthrough */
		case 2:
			accum += vec[2 - 1];
			/* fallthrough */
		case 1:
			accum += vec[1 - 1];
			break;
		default:
			/* res == 0: nothing to accumulate. */
			break;
		}
	}

	clock_t end = clock();
	printf("%u\n", accum);
	/* clock_t has no printf specifier of its own; convert explicitly for %lu. */
	printf("%lu\n", (unsigned long)(end - start));
	return EXIT_SUCCESS;
}

// switch1.c

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>

/*
 * switch1: time a fall-through switch that sums the last `res` elements before
 * `vec_end`, addressing each element by a constant negative offset from the
 * end pointer.
 *
 * Usage: switch1 <remainder-size>, where remainder-size is in 0..16.
 *
 * Output: the accumulated sum on the first line, then the elapsed clock()
 * ticks on the second. Returns EXIT_FAILURE on a missing or invalid argument.
 */
int main(int argc, char *argv[])
{
	/* Largest case label the switch below handles. */
	enum { MAX_REMAINDER = 16 };

	if (argc < 2)
	{
		fprintf(stderr, "usage: %s <remainder-size 0..%d>\n",
				(argc > 0 && argv[0]) ? argv[0] : "switch1", MAX_REMAINDER);
		return EXIT_FAILURE;
	}

	/* strtol instead of atoi so that garbage and out-of-range input is detected. */
	char *parse_end = NULL;
	long parsed = strtol(argv[1], &parse_end, 10);
	if (parse_end == argv[1] || *parse_end != '\0' || parsed < 0 || parsed > MAX_REMAINDER)
	{
		fprintf(stderr, "remainder size must be an integer in 0..%d, got \"%s\"\n",
				MAX_REMAINDER, argv[1]);
		return EXIT_FAILURE;
	}
	int res = (int)parsed;

	/* Fixed-size buffer: avoids a VLA whose length comes from user input. */
	int vec[MAX_REMAINDER] = {0};
	for (int i = 0; i < res; i++)
	{
		vec[i] = rand();
	}
	/* One past the last filled element; vec_end[-k] is valid for 1 <= k <= res. */
	const int *vec_end = vec + res;

	uint32_t accum = 0;
	clock_t start = clock();

	for (uint32_t i = UINT32_MAX; i; i--)
	{
		/* Jump to the case matching the remainder size and fall through to case 1. */
		switch (res)
		{
		case 16:
			accum += vec_end[-16];
			/* fallthrough */
		case 15:
			accum += vec_end[-15];
			/* fallthrough */
		case 14:
			accum += vec_end[-14];
			/* fallthrough */
		case 13:
			accum += vec_end[-13];
			/* fallthrough */
		case 12:
			accum += vec_end[-12];
			/* fallthrough */
		case 11:
			accum += vec_end[-11];
			/* fallthrough */
		case 10:
			accum += vec_end[-10];
			/* fallthrough */
		case 9:
			accum += vec_end[-9];
			/* fallthrough */
		case 8:
			accum += vec_end[-8];
			/* fallthrough */
		case 7:
			accum += vec_end[-7];
			/* fallthrough */
		case 6:
			accum += vec_end[-6];
			/* fallthrough */
		case 5:
			accum += vec_end[-5];
			/* fallthrough */
		case 4:
			accum += vec_end[-4];
			/* fallthrough */
		case 3:
			accum += vec_end[-3];
			/* fallthrough */
		case 2:
			accum += vec_end[-2];
			/* fallthrough */
		case 1:
			accum += vec_end[-1];
			break;
		default:
			/* res == 0: nothing to accumulate. */
			break;
		}
	}

	clock_t end = clock();
	printf("%u\n", accum);
	/* clock_t has no printf specifier of its own; convert explicitly for %lu. */
	printf("%lu\n", (unsigned long)(end - start));
	return EXIT_SUCCESS;
}

filename 剩余块大小为7 剩余块大小为14
switch0.c 9265104 18538175
switch1.c 9250006 18597986

好像性能差不多(我原以为第二种写法会快一些)。

另外,在 datasketches-cpp/common/include/MurmurHash3.h 里看到了类似于第一种的写法。或许还可以用 accum 数组代替单个 accum 变量来进一步加速。

一般 Intel CPU 的 cache line 大小为 64 字节。

posted @ 2020-12-05 17:14  Tifa_Best  阅读(94)  评论(0编辑  收藏  举报