// Code buffer
// Layered encapsulation (分层封装): host-side class A wraps the device-side struct A_device.
#include <iostream>
using namespace std;
// Device-resident global, initialized to 1; KernelTest adds it to every value it writes.
__device__ int offset = 1;
// Plain aggregate describing the device-visible state of an `A` instance.
// It is passed by value as a kernel argument, so it must stay trivially copyable.
struct A_device {
    int mH;             // initialized to 0 by A; not read in the visible code (presumably a height - TODO confirm)
    int mW;             // initialized to 0 by A; not read in the visible code (presumably a width - TODO confirm)
    int *m_d_Dynamic;   // buffer allocated with cudaMalloc; only dereferenced inside func()
    int mDynamicSize;   // size of m_d_Dynamic in BYTES (used as the cudaMalloc/cudaMemcpy size)

    // Looks up one element of m_d_Dynamic for index i.
    // Declared __device__ so the declaration agrees with the out-of-class
    // definition, which dereferences m_d_Dynamic (a cudaMalloc'ed pointer)
    // and is qualified __device__.
    __device__ int func( int );
};
// Host-side owner of a small integer table that exists twice: once in host
// memory (m_h_Dynamic) and once in device memory (d.m_d_Dynamic).
// Both buffers are allocated in the constructor (allocMem) and released in the
// destructor (freeMem). Because the class owns raw buffers, copying is
// disabled: a copied instance would release the same buffers a second time.
class A {
public:
    A_device d;         // device-visible state, handed to kernels by value
    int *m_h_Dynamic;   // host staging buffer of d.mDynamicSize bytes

    // Zero-initializes `d`, sets the table size to four ints and allocates
    // the host and device buffers.
    A(): d(), m_h_Dynamic(NULL) {
        d.mH = 0;
        d.mW = 0;
        d.m_d_Dynamic = NULL;
        d.mDynamicSize = sizeof(int) * 4;
        allocMem();
    }

    // Releases the host and device buffers.
    ~A() { freeMem(); }

    // Fills h_sum[0..MAXN) with the result of running the kernel on the device.
    void run_device(int* h_sum, size_t MAXN);

    // Writes the four table values into the host buffer and uploads them to
    // the device buffer. Does nothing (besides reporting to stderr) when
    // either buffer is missing, e.g. because allocation failed.
    void setValue() {
        if (m_h_Dynamic == NULL || d.m_d_Dynamic == NULL) {
            cerr << "A::setValue: host or device buffer is not allocated" << endl;
            return;
        }
        m_h_Dynamic[0] = 1000;
        m_h_Dynamic[1] = 10000;
        m_h_Dynamic[2] = 100000;
        m_h_Dynamic[3] = 1000000;
        const cudaError_t status =
            cudaMemcpy( d.m_d_Dynamic, m_h_Dynamic, d.mDynamicSize, cudaMemcpyHostToDevice);
        if (status != cudaSuccess) {
            cerr << "A::setValue: cudaMemcpy failed: " << cudaGetErrorString(status) << endl;
        }
    }

private:
    // Non-copyable: declared but intentionally never defined.
    A(const A&);
    A& operator=(const A&);

    void freeMem();
    void allocMem();
};
// Returns the table element selected by i, wrapping i around the number of
// ints stored in m_d_Dynamic.
//
// The element count is derived from mDynamicSize (a byte count) instead of a
// hard-coded 4, so the lookup stays in bounds if the table size changes; for
// the four-int table and non-negative i the result equals m_d_Dynamic[i % 4].
// A negative i is wrapped into range rather than producing a negative index.
// Returns 0 when the table is missing or empty (device code cannot throw).
__device__
int
A_device::func( int i) {
    const int count = mDynamicSize / static_cast<int>(sizeof(int));
    if (m_d_Dynamic == NULL || count <= 0) {
        return 0;
    }
    int slot = i % count;
    if (slot < 0) {
        slot += count;  // C++ remainder keeps the sign of the dividend
    }
    return m_d_Dynamic[slot];
}
// Each thread stores threadIdx.x + d_insA.func(threadIdx.x) + offset into
// sum[threadIdx.x].
//
// d_insA is received by value: the host-side A_device is copied into the
// kernel's argument list at launch, so the struct itself is readable on the
// device.
// Only threadIdx.x is used for indexing and there is no bounds check, so
// `sum` must hold at least blockDim.x ints.
__global__ void KernelTest( int *sum, A_device d_insA) {
    const unsigned int tid = threadIdx.x;
    const int tableValue = d_insA.func(tid);
    sum[tid] = tid + tableValue + offset;
}
// Releases the host and device buffers and resets both pointers to NULL so
// that a repeated call is harmless (free(NULL) and cudaFree(NULL) are no-ops).
// A cudaFree failure is reported on stderr; nothing is thrown because this
// runs from the destructor.
__host__
void
A::freeMem() {
    free(m_h_Dynamic);
    m_h_Dynamic = NULL;

    const cudaError_t status = cudaFree(d.m_d_Dynamic);
    if (status != cudaSuccess) {
        cerr << "A::freeMem: cudaFree failed: " << cudaGetErrorString(status) << endl;
    }
    d.m_d_Dynamic = NULL;
}
// Allocates d.mDynamicSize bytes for the host buffer (malloc) and for the
// device buffer (cudaMalloc). Failures are reported on stderr and leave the
// corresponding pointer NULL so later code can detect the missing buffer.
void
A::allocMem() {
    m_h_Dynamic = (int*)malloc(d.mDynamicSize);
    if (m_h_Dynamic == NULL && d.mDynamicSize > 0) {
        cerr << "A::allocMem: malloc of " << d.mDynamicSize << " bytes failed" << endl;
    }

    const cudaError_t status = cudaMalloc( (void**)&d.m_d_Dynamic, d.mDynamicSize);
    if (status != cudaSuccess) {
        d.m_d_Dynamic = NULL;  // never leave a stale or indeterminate device pointer behind
        cerr << "A::allocMem: cudaMalloc failed: " << cudaGetErrorString(status) << endl;
    }
}
// Uploads the table (setValue), runs KernelTest on a zeroed temporary device
// buffer of MAXN ints and copies that buffer back into h_sum.
//
// h_sum : host array with room for at least MAXN ints; left untouched when a
//         CUDA call fails (the failure is reported on stderr).
// MAXN  : number of ints to produce. KernelTest indexes by threadIdx.x only,
//         so a single block of min(MAXN, 128) threads is launched: elements
//         past the first 128 stay 0, and a buffer shorter than 128 ints is
//         no longer written out of bounds.
// The temporary device buffer is always released before returning.
void
A::run_device(int* h_sum, size_t MAXN) {
    setValue();
    if (h_sum == NULL || MAXN == 0) {
        return;  // nothing to compute; a zero-thread launch would be invalid
    }

    const size_t maxThreads = 128;
    const unsigned int threads =
        static_cast<unsigned int>(MAXN < maxThreads ? MAXN : maxThreads);
    const size_t bytes = MAXN * sizeof(int);

    int *d_sum = NULL;
    cudaError_t status = cudaMalloc( (void**)&d_sum, bytes);
    if (status == cudaSuccess) {
        status = cudaMemset( d_sum, 0, bytes);
    }
    if (status == cudaSuccess) {
        KernelTest<<<1, threads>>>(d_sum, d);
        status = cudaGetLastError();  // launch-configuration errors
    }
    if (status == cudaSuccess) {
        // Blocking copy: also surfaces errors raised while the kernel ran.
        status = cudaMemcpy( h_sum, d_sum, bytes, cudaMemcpyDeviceToHost);
    }
    if (status != cudaSuccess) {
        cerr << "A::run_device: CUDA error: " << cudaGetErrorString(status) << endl;
    }

    cudaFree(d_sum);  // cudaFree(NULL) is a no-op, so this is safe on every path
}
// Runs the device computation for 128 elements and prints the results on one
// line, each value followed by a space.
// The result array is zero-initialized so that defined values are printed
// even if the device run fails and never fills it.
int main( int argc, char **argv) {
    (void)argc;
    (void)argv;

    const int kCount = 128;
    int h_sum[kCount] = {0};

    A h_insA;
    h_insA.run_device(h_sum, kCount);

    for( int i = 0; i < kCount; i++ ) {
        cout << h_sum[i] << " ";
    }
    cout << endl;
    return 0;
}