#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

/**
 * Element-wise product kernel: c[i] = a[i] * b[i].
 *
 * Every launched thread processes exactly one element. The element index is
 * obtained by treating the launch configuration as a row-major 6-D tensor of
 * shape
 *
 *   gridDim.z x gridDim.y x gridDim.x x blockDim.z x blockDim.y x blockDim.x
 *
 * and linearising the thread's coordinate
 * (blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x).
 * For a plain 1-D launch this reduces to blockIdx.x * blockDim.x + threadIdx.x.
 *
 * Preconditions:
 *   - a, b and c are device pointers.
 *   - The kernel receives no element count and performs no bounds check, so
 *     the total number of launched threads must not exceed the number of
 *     elements available in a, b and c.
 *
 * @param a  first input operand (read only)
 * @param b  second input operand (read only)
 * @param c  output; c[i] receives a[i] * b[i]
 */
__global__ void compute(float* a, float* b, float* c) {
    // Extents of each tensor axis, outermost (d1) to innermost (d5). The
    // outermost extent gridDim.z is not needed to linearise a row-major index.
    const size_t d1 = gridDim.y;
    const size_t d2 = gridDim.x;
    const size_t d3 = blockDim.z;
    const size_t d4 = blockDim.y;
    const size_t d5 = blockDim.x;

    // This thread's coordinate along each axis, outermost (p0) to innermost (p5).
    const size_t p0 = blockIdx.z;
    const size_t p1 = blockIdx.y;
    const size_t p2 = blockIdx.x;
    const size_t p3 = threadIdx.z;
    const size_t p4 = threadIdx.y;
    const size_t p5 = threadIdx.x;

    // Horner-style row-major linearisation. size_t is used so that very large
    // launches cannot silently overflow a 32-bit signed index.
    const size_t position =
        (((((p0 * d1) + p1) * d2 + p2) * d3 + p3) * d4 + p4) * d5 + p5;

    c[position] = a[position] * b[position];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

/**
 * Multiplies two small host arrays element-wise on the GPU and prints each
 * product on its own line.
 *
 * @return 0 on success, 1 if any CUDA call or the kernel launch failed.
 */
int main(int argc, char const *argv[]) {
    (void)argc;
    (void)argv;

    const int num = 3;
    float a[num] = {1, 2, 3};
    float b[num] = {5, 7, 9};
    float c[num] = {0};

    const size_t size_array = sizeof(c);

    float* device_a = nullptr;
    float* device_b = nullptr;
    float* device_c = nullptr;

    // Short-circuit evaluation stops at the first failing call, so later
    // steps never run on top of an earlier error.
    bool ok =
        cudaOk(cudaMalloc(&device_a, size_array), "cudaMalloc(device_a)") &&
        cudaOk(cudaMalloc(&device_b, size_array), "cudaMalloc(device_b)") &&
        cudaOk(cudaMalloc(&device_c, size_array), "cudaMalloc(device_c)") &&
        cudaOk(cudaMemcpy(device_a, a, size_array, cudaMemcpyHostToDevice),
               "cudaMemcpy(a -> device_a)") &&
        cudaOk(cudaMemcpy(device_b, b, size_array, cudaMemcpyHostToDevice),
               "cudaMemcpy(b -> device_b)");

    if (ok) {
        // One block with one thread per element: the kernel has no bounds
        // guard, so the thread count must equal the element count.
        compute<<<1, num>>>(device_a, device_b, device_c);

        // A kernel launch returns no status; configuration errors are read
        // back with cudaGetLastError(). The blocking device-to-host copy then
        // waits for the kernel and reports any error raised while it ran.
        ok = cudaOk(cudaGetLastError(), "compute<<<>>> launch") &&
             cudaOk(cudaMemcpy(c, device_c, size_array, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device_c -> c)");
    }

    // Release device memory on every path. cudaFree on a null pointer is a
    // no-op, so this is safe even when an allocation above failed.
    cudaFree(device_a);
    cudaFree(device_b);
    cudaFree(device_c);

    if (!ok) {
        return 1;
    }

    // Iterate as float so fractional products are not truncated to int.
    for (float value : c) {
        std::cout << value << std::endl;
    }
    return 0;
}