cuda原子操作

如果不使用原子操作,多个线程并行计算直方图时,会因同时对同一地址进行"读-改-写"而发生数据竞争(计算冲突),导致部分累加结果丢失

d_b[i] 表示 h_a 中数字 i 出现的次数

下面的代码将 h_a 的 N(=10)个元素全部赋值为 3,期望 d_b[3] 为 10,但由于各线程的自增操作不是原子的,d_b[3] 的结果却为 1

#include<iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N 10

// Histogram kernel WITHOUT atomics (intentionally racy demonstration).
//
// Each thread derives one flat index from its block/thread coordinates, reads
// the value stored at a[index] and bumps the matching counter in b.
//   a : input values; each value is used directly as an index into b
//   b : counters, incremented in place
// There is no bounds check on the index or on the value read from a, so the
// launch configuration and the data must keep both accesses in range.
__global__ void f(int* a , int *b)
{
	const int idx = threadIdx.x + blockDim.x * blockIdx.x;
	const int bin = a[idx];

	// Plain load / add / store: this is not atomic, so threads that target the
	// same bin concurrently can overwrite each other's increment.
	b[bin] = b[bin] + 1;
}

// Prints a diagnostic to stderr when a CUDA runtime call failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
	if (status == cudaSuccess) return true;
	std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
	return false;
}

// Host driver: fills h_a with the value 3, runs kernel f with N blocks of one
// thread each, then prints the N counters copied back into h_b.
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
	int h_a[N] , h_b[N]; // h_b[i] receives the count of value i in h_a
	int* d_a = nullptr, * d_b = nullptr;
	const size_t bytes = N * sizeof(int);

	for (int i = 0; i < N; i++) h_a[i] = 3;

	// cudaMalloc does not clear memory, so the counters must be zeroed
	// explicitly before the kernel starts incrementing them.
	bool ok = cudaOk(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)")
		&& cudaOk(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)")
		&& cudaOk(cudaMemset(d_b, 0, bytes), "cudaMemset(d_b)")
		&& cudaOk(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(h_a -> d_a)");

	if (ok)
	{
		f<<<N, 1>>>(d_a, d_b);
		// A kernel launch returns no status itself; launch errors are read
		// back with cudaGetLastError, execution errors surface at the
		// blocking cudaMemcpy that follows.
		ok = cudaOk(cudaGetLastError(), "kernel launch")
			&& cudaOk(cudaMemcpy(h_b, d_b, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_b -> h_b)");
	}

	if (ok)
	{
		for (int i = 0; i < N; i++) printf("%d ", h_b[i]);
	}

	// Release device buffers on every path; cudaFree accepts a null pointer.
	cudaFree(d_a);
	cudaFree(d_b);

	return ok ? 0 : 1;
}

使用共享内存原子操作:

#include<iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N 10

// Histogram kernel using shared-memory atomics.
//
// Each block accumulates a private histogram in shared memory and then merges
// it into the global result with one atomic add per non-empty bin.
//   a : input values; N elements are read (N is the file-level macro, used
//       here both as the element count and as the number of bins)
//   b : global counters with N entries; the kernel ADDS to them, so the
//       caller is expected to zero b before the launch
// Works for any 1D grid/block shape: threads whose global index is >= N do
// no counting, and values outside [0, N) are ignored.
__global__ void f(int* a , int *b)
{
	__shared__ unsigned int temp[N];

	// Clear the whole per-block histogram. The loop strides by blockDim.x so
	// every bin is zeroed even when the block has fewer than N threads.
	for (int bin = threadIdx.x; bin < N; bin += blockDim.x)
		temp[bin] = 0;
	__syncthreads();

	// Count this thread's element in the block-local histogram.
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	if (x < N)
	{
		int value = a[x];
		if (value >= 0 && value < N)
			atomicAdd(&temp[value], 1u);
	}
	// Outside the conditional so that every thread of the block reaches it.
	__syncthreads();

	// Merge the block-local counts into the global histogram.
	for (int bin = threadIdx.x; bin < N; bin += blockDim.x)
	{
		if (temp[bin] != 0)
			atomicAdd(&b[bin], (int)temp[bin]);
	}
}

// Prints a diagnostic to stderr when a CUDA runtime call failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
	if (status == cudaSuccess) return true;
	std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
	return false;
}

// Host driver for the atomic histogram: fills h_a with the value 3, runs
// kernel f with N blocks of one thread each, then prints the N counters
// copied back into h_b.
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
	int h_a[N] , h_b[N]; // h_b[i] receives the count of value i in h_a
	int* d_a = nullptr, * d_b = nullptr;
	const size_t bytes = N * sizeof(int);

	for (int i = 0; i < N; i++) h_a[i] = 3;

	// The counters are accumulated with atomicAdd, so they have to start at
	// zero; cudaMalloc returns uninitialised memory.
	bool ok = cudaOk(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)")
		&& cudaOk(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)")
		&& cudaOk(cudaMemset(d_b, 0, bytes), "cudaMemset(d_b)")
		&& cudaOk(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(h_a -> d_a)");

	if (ok)
	{
		f<<<N, 1>>>(d_a, d_b);
		// A kernel launch returns no status itself; launch errors are read
		// back with cudaGetLastError, execution errors surface at the
		// blocking cudaMemcpy that follows.
		ok = cudaOk(cudaGetLastError(), "kernel launch")
			&& cudaOk(cudaMemcpy(h_b, d_b, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_b -> h_b)");
	}

	if (ok)
	{
		for (int i = 0; i < N; i++) printf("%d ", h_b[i]);
	}

	// Release device buffers on every path; cudaFree accepts a null pointer.
	cudaFree(d_a);
	cudaFree(d_b);

	return ok ? 0 : 1;
}
posted @ 2024-03-28 16:10  拾墨、  阅读(17)  评论(0编辑  收藏  举报