CUDA 实例练习(一)

题目:启动 1000000 个线程,对一个含 10 个元素的数组进行原子累加(atomicAdd)。

 

#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "gputimer.h"
// Total number of GPU threads launched by main (block count * threads per block).
#define num_threads 1000000
// Threads per block used in the kernel launch; main launches num_threads / block_width blocks.
#define block_width 1000
// Number of int counters in the array that the kernel increments.
#define array_size 10
// Prints `size` ints from `array` to stdout, each followed by a space (defined below).
void print_array(int * array, int size);
/*
 * Kernel: every thread atomically adds 1 to one counter in g.
 *
 * The thread's flat 1D global index is folded into [0, array_size) with a
 * modulo, so many threads target the same counter; atomicAdd keeps those
 * concurrent increments from being lost.
 *
 * g: counters to increment; must hold at least array_size ints (main passes
 *    a buffer obtained from cudaMalloc).
 *
 * Expects a 1D grid of 1D blocks (only the .x components are read).
 */
__global__ void increment_atomic(int * g)
{
    const int global_id = threadIdx.x + blockDim.x * blockIdx.x;
    int * const slot = g + (global_id % array_size);
    atomicAdd(slot, 1);
}

/*
 * Writes the first `size` elements of `array` to stdout in order, each one
 * followed by a single space. No newline is emitted; nothing is printed
 * when size <= 0.
 */
void print_array(int * array, int size)
{
    int idx = 0;
    while (idx < size) {
        printf("%d ", array[idx]);
        ++idx;
    }
}

/*
 * Reports a failed CUDA runtime call on stderr.
 *
 * status: value returned by the CUDA runtime call.
 * what:   short label for the call, used in the error message.
 * Returns true when status is cudaSuccess, false otherwise.
 */
static bool cuda_ok(cudaError_t status, const char * what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Launches num_threads threads that atomically increment an array of
 * array_size counters on the device, then prints the counters and the
 * elapsed time reported by GpuTimer.
 *
 * Every CUDA call is checked: on failure a message is written to stderr,
 * the device buffer is still released, and the process exits with status 1
 * instead of printing an uninitialised host array.
 *
 * Returns 0 on success, 1 if any CUDA call failed.
 */
int main()
{
    GpuTimer timer;
    printf("%d total threads in %d blocks writing into %d arrays\n", num_threads, num_threads / block_width, array_size);

    int h_array[array_size];
    const int array_bytes = array_size * sizeof(int);

    int * d_array = NULL;
    if (!cuda_ok(cudaMalloc((void **)&d_array, array_bytes), "cudaMalloc")) {
        return 1;
    }

    // The counters must start at zero because the kernel only ever adds to them.
    bool ok = cuda_ok(cudaMemset((void *)d_array, 0, array_bytes), "cudaMemset");

    if (ok) {
        timer.Start();
        increment_atomic << <num_threads / block_width, block_width >> >(d_array);
        // A kernel launch returns nothing; configuration errors are only
        // visible through cudaGetLastError(), so read it right after the launch.
        const cudaError_t launch_status = cudaGetLastError();
        timer.Stop();
        ok = cuda_ok(launch_status, "increment_atomic launch");
    }

    if (ok) {
        // Blocking copy: it completes after the kernel and also reports any
        // error that occurred while the kernel was executing.
        ok = cuda_ok(cudaMemcpy(h_array, d_array, array_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy");
    }

    if (ok) {
        print_array(h_array, array_size);
        printf("\nTime elapsed = %g ms\n", timer.Elapsed());
    }

    // Release the device buffer on both the success and the failure path.
    if (!cuda_ok(cudaFree(d_array), "cudaFree")) {
        ok = false;
    }

    return ok ? 0 : 1;
}

编译环境:Visual Studio 2013

posted @ 2017-07-10 14:29  Jason&Hymer  阅读(2247)  评论(0)  编辑  收藏  举报