【CUDA并行编程之八】Cuda实现Kmeans算法
本文主要介绍如何使用CUDA并行计算框架编程实现机器学习中的Kmeans算法,Kmeans算法的详细介绍在这里,本文重点在并行实现的过程。
当然还是简单的回顾一下kmeans算法的串行过程:
伪代码:
- 创建k个点作为起始质心(经常是随机选择)
- 当任意一个点的簇分配结果发生改变时
- 对数据集中的每个数据点
- 对每个质心
- 计算质心与数据点之间的距离
- 将数据点分配到距其最近的簇
- 对每一个簇,计算簇中所有点的均值并将均值作为质心
①line03-04:将每个数据点到多个质心的距离计算进行并行化
②line05:将数据点到某个执行的距离计算进行并行化
KMEANS类:
- class KMEANS
- {
- private:
- int numClusters;
- int numCoords;
- int numObjs;
- int *membership;//[numObjs]
- char *filename;
- float **objects;//[numObjs][numCoords] data objects
- float **clusters;//[numClusters][unmCoords] cluster center
- float threshold;
- int loop_iterations;
- public:
- KMEANS(int k);
- void file_read(char *fn);
- void file_write();
- void cuda_kmeans();
- inline int nextPowerOfTwo(int n);
- void free_memory();
- virtual ~KMEANS();
- };//KMEANS
成员变量:
numClusters:中心点的个数numCoords:每个数据点的维度
numObjs:数据点的个数
membership:每个数据点所属类别的数组,维度为numObjs
filename:读入的文件名
objects:所有数据点,维度为[numObjs][numCoords]
clusters:中心点数据,维度为[numObjs][numCoords]
threshold:控制循环次数的一个域值
loop_iterations:循环的迭代次数
成员函数:
KMEANS(int k):含参构造函数。初始化成员变量
file_read(char *fn):读入文件数据并初始化object以及membership变量
file_write():将计算结果写回到结果文件中去
cuda_kmeans():kmeans计算的入口函数
nextPowerOfTwo(int n):它计算大于等于输入参数n的第一个2的幂次数。
free_memory():释放内存空间
~KMEANS():析构函数
并行的代码主要三个函数:
find_nearest_cluster(...)
compute_delta(...)
euclid_dist_2(...)
首先看一下函数euclid_dist_2(...):
- __host__ __device__ inline static
- float euclid_dist_2(int numCoords,int numObjs,int numClusters,float *objects,float *clusters,int objectId,int clusterId)
- {
- int i;
- float ans = 0;
- for( i=0;i<numCoords;i++ )
- {
- ans += ( objects[numObjs * i + objectId] - clusters[numClusters*i + clusterId] ) *
- ( objects[numObjs * i + objectId] - clusters[numClusters*i + clusterId] ) ;
- }
- return ans;
- }
再看一下函数compute_delta(...):
- /*
- * numIntermediates:The actual number of intermediates
- * numIntermediates2:The next power of two
- */
- __global__ static void compute_delta(int *deviceIntermediates,int numIntermediates, int numIntermediates2)
- {
- extern __shared__ unsigned int intermediates[];
- intermediates[threadIdx.x] = (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0 ;
- __syncthreads();
- //numIntermediates2 *must* be a power of two!
- for(unsigned int s = numIntermediates2 /2 ; s > 0 ; s>>=1)
- {
- if(threadIdx.x < s)
- {
- intermediates[threadIdx.x] += intermediates[threadIdx.x + s];
- }
- __syncthreads();
- }
- if(threadIdx.x == 0)
- {
- deviceIntermediates[0] = intermediates[0];
- }
- }
最后再看函数finid_nearest_cluster(...):
- /*
- * objects:[numCoords][numObjs]
- * deviceClusters:[numCoords][numClusters]
- * membership:[numObjs]
- */
- __global__ static void find_nearest_cluster(int numCoords,int numObjs,int numClusters,float *objects, float *deviceClusters,int *membership ,int *intermediates)
- {
- extern __shared__ char sharedMemory[];
- unsigned char *membershipChanged = (unsigned char *)sharedMemory;
- float *clusters = deviceClusters;
- membershipChanged[threadIdx.x] = 0;
- int objectId = blockDim.x * blockIdx.x + threadIdx.x;
- if( objectId < numObjs )
- {
- int index;
- float dist,min_dist;
- /*find the cluster id that has min distance to object*/
- index = 0;
- min_dist = euclid_dist_2(numCoords,numObjs,numClusters,objects,clusters,objectId,0);
- for(int i=0;i<numClusters;i++)
- {
- dist = euclid_dist_2(numCoords,numObjs,numClusters,objects,clusters,objectId,i) ;
- /* no need square root */
- if( dist < min_dist )
- {
- min_dist = dist;
- index = i;
- }
- }
- if( membership[objectId]!=index )
- {
- membershipChanged[threadIdx.x] = 1;
- }
- //assign the membership to object objectId
- membership[objectId] = index;
- __syncthreads(); //for membershipChanged[]
- #if 1
- //blockDim.x *must* be a power of two!
- for(unsigned int s = blockDim.x / 2; s > 0 ;s>>=1)
- {
- if(threadIdx.x < s)
- {
- membershipChanged[threadIdx.x] += membershipChanged[threadIdx.x + s];//calculate all changed values and save result to membershipChanged[0]
- }
- __syncthreads();
- }
- if(threadIdx.x == 0)
- {
- intermediates[blockIdx.x] = membershipChanged[0];
- }
- #endif
- }
- }//find_nearest_cluster
这三个函数将所有能够并行的地方都进行了并行,实现了整体算法的并行化~
在此呈上全部代码:
kmeans.h:
- #ifndef _H_KMEANS
- #define _H_KMEANS
- #include <assert.h>
- #define malloc2D(name, xDim, yDim, type) do { \
- name = (type **)malloc(xDim * sizeof(type *)); \
- assert(name != NULL); \
- name[0] = (type *)malloc(xDim * yDim * sizeof(type)); \
- assert(name[0] != NULL); \
- for (size_t i = 1; i < xDim; i++) \
- name[i] = name[i-1] + yDim; \
- } while (0)
- double wtime(void);
- #endif
wtime.cu:
- #include <sys/time.h>
- #include <stdio.h>
- #include <stdlib.h>
- double wtime(void)
- {
- double now_time;
- struct timeval etstart;
- struct timezone tzp;
- if (gettimeofday(&etstart, &tzp) == -1)
- perror("Error: calling gettimeofday() not successful.\n");
- now_time = ((double)etstart.tv_sec) + /* in seconds */
- ((double)etstart.tv_usec) / 1000000.0; /* in microseconds */
- return now_time;
- }
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <sys/types.h>
- #include <sys/stat.h>
- #include <unistd.h>
- #include <iostream>
- #include <cassert>
- #include "kmeans.h"
- using namespace std;
- const int MAX_CHAR_PER_LINE = 1024;
- class KMEANS
- {
- private:
- int numClusters;
- int numCoords;
- int numObjs;
- int *membership;//[numObjs]
- char *filename;
- float **objects;//[numObjs][numCoords] data objects
- float **clusters;//[numClusters][unmCoords] cluster center
- float threshold;
- int loop_iterations;
- public:
- KMEANS(int k);
- void file_read(char *fn);
- void file_write();
- void cuda_kmeans();
- inline int nextPowerOfTwo(int n);
- void free_memory();
- virtual ~KMEANS();
- };
- KMEANS::~KMEANS()
- {
- free(membership);
- free(clusters[0]);
- free(clusters);
- free(objects[0]);
- free(objects);
- }
- KMEANS::KMEANS(int k)
- {
- threshold = 0.001;
- numObjs = 0;
- numCoords = 0;
- numClusters = k;
- filename = NULL;
- loop_iterations = 0;
- }
- void KMEANS::file_write()
- {
- FILE *fptr;
- char outFileName[1024];
- //output:the coordinates of the cluster centres
- sprintf(outFileName,"%s.cluster_centres",filename);
- printf("Writingcoordinates of K=%d cluster centers to file \"%s\"\n",numClusters,outFileName);
- fptr = fopen(outFileName,"w");
- for(int i=0;i<numClusters;i++)
- {
- fprintf(fptr,"%d ",i) ;
- for(int j=0;j<numCoords;j++)
- fprintf(fptr,"%f ",clusters[i][j]);
- fprintf(fptr,"\n");
- }
- fclose(fptr);
- //output:the closest cluster centre to each of the data points
- sprintf(outFileName,"%s.membership",filename);
- printf("writing membership of N=%d data objects to file \"%s\" \n",numObjs,outFileName);
- fptr = fopen(outFileName,"w");
- for(int i=0;i<numObjs;i++)
- {
- fprintf(fptr,"%d %d\n",i,membership[i]) ;
- }
- fclose(fptr);
- }
- inline int KMEANS::nextPowerOfTwo(int n)
- {
- n--;
- n = n >> 1 | n;
- n = n >> 2 | n;
- n = n >> 4 | n;
- n = n >> 8 | n;
- n = n >> 16 | n;
- //n = n >> 32 | n; // for 64-bit ints
- return ++n;
- }
- __host__ __device__ inline static
- float euclid_dist_2(int numCoords,int numObjs,int numClusters,float *objects,float *clusters,int objectId,int clusterId)
- {
- int i;
- float ans = 0;
- for( i=0;i<numCoords;i++ )
- {
- ans += ( objects[numObjs * i + objectId] - clusters[numClusters*i + clusterId] ) *
- ( objects[numObjs * i + objectId] - clusters[numClusters*i + clusterId] ) ;
- }
- return ans;
- }
- /*
- * numIntermediates:The actual number of intermediates
- * numIntermediates2:The next power of two
- */
- __global__ static void compute_delta(int *deviceIntermediates,int numIntermediates, int numIntermediates2)
- {
- extern __shared__ unsigned int intermediates[];
- intermediates[threadIdx.x] = (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0 ;
- __syncthreads();
- //numIntermediates2 *must* be a power of two!
- for(unsigned int s = numIntermediates2 /2 ; s > 0 ; s>>=1)
- {
- if(threadIdx.x < s)
- {
- intermediates[threadIdx.x] += intermediates[threadIdx.x + s];
- }
- __syncthreads();
- }
- if(threadIdx.x == 0)
- {
- deviceIntermediates[0] = intermediates[0];
- }
- }
- /*
- * objects:[numCoords][numObjs]
- * deviceClusters:[numCoords][numClusters]
- * membership:[numObjs]
- */
- __global__ static void find_nearest_cluster(int numCoords,int numObjs,int numClusters,float *objects, float *deviceClusters,int *membership ,int *intermediates)
- {
- extern __shared__ char sharedMemory[];
- unsigned char *membershipChanged = (unsigned char *)sharedMemory;
- float *clusters = deviceClusters;
- membershipChanged[threadIdx.x] = 0;
- int objectId = blockDim.x * blockIdx.x + threadIdx.x;
- if( objectId < numObjs )
- {
- int index;
- float dist,min_dist;
- /*find the cluster id that has min distance to object*/
- index = 0;
- min_dist = euclid_dist_2(numCoords,numObjs,numClusters,objects,clusters,objectId,0);
- for(int i=0;i<numClusters;i++)
- {
- dist = euclid_dist_2(numCoords,numObjs,numClusters,objects,clusters,objectId,i) ;
- /* no need square root */
- if( dist < min_dist )
- {
- min_dist = dist;
- index = i;
- }
- }
- if( membership[objectId]!=index )
- {
- membershipChanged[threadIdx.x] = 1;
- }
- //assign the membership to object objectId
- membership[objectId] = index;
- __syncthreads(); //for membershipChanged[]
- #if 1
- //blockDim.x *must* be a power of two!
- for(unsigned int s = blockDim.x / 2; s > 0 ;s>>=1)
- {
- if(threadIdx.x < s)
- {
- membershipChanged[threadIdx.x] += membershipChanged[threadIdx.x + s];//calculate all changed values and save result to membershipChanged[0]
- }
- __syncthreads();
- }
- if(threadIdx.x == 0)
- {
- intermediates[blockIdx.x] = membershipChanged[0];
- }
- #endif
- }
- }//find_nearest_cluster
- void KMEANS::cuda_kmeans()
- {
- int index,loop = 0;
- int *newClusterSize;//[numClusters]:no.objects assigned in each new cluster
- float delta; //% of objects changes their clusters
- float **dimObjects;//[numCoords][numObjs]
- float **dimClusters;
- float **newClusters;//[numCoords][numClusters]
- float *deviceObjects; //[numCoords][numObjs]
- float *deviceClusters; //[numCoords][numclusters]
- int *deviceMembership;
- int *deviceIntermediates;
- //Copy objects given in [numObjs][numCoords] layout to new [numCoords][numObjs] layout
- malloc2D(dimObjects,numCoords,numObjs,float);
- for(int i=0;i<numCoords;i++)
- {
- for(int j=0;j<numObjs;j++)
- {
- dimObjects[i][j] = objects[j][i];
- }
- }
- //pick first numClusters elements of objects[] as initial cluster centers
- malloc2D(dimClusters, numCoords, numClusters,float);
- for(int i=0;i<numCoords;i++)
- {
- for(int j=0;j<numClusters;j++)
- {
- dimClusters[i][j] = dimObjects[i][j];
- }
- }
- newClusterSize = new int[numClusters];
- assert(newClusterSize!=NULL);
- malloc2D(newClusters,numCoords,numClusters,float);
- memset(newClusters[0],0,numCoords * numClusters * sizeof(float) );
- //To support reduction,numThreadsPerClusterBlock *must* be a power of two, and it *must* be no larger than the number of bits that will fit into an unsigned char ,the type used to keep track of membership changes in the kernel.
- const unsigned int numThreadsPerClusterBlock = 32;
- const unsigned int numClusterBlocks = (numObjs + numThreadsPerClusterBlock -1)/numThreadsPerClusterBlock;
- const unsigned int numReductionThreads = nextPowerOfTwo(numClusterBlocks);
- const unsigned int clusterBlockSharedDataSize = numThreadsPerClusterBlock * sizeof(unsigned char);
- const unsigned int reductionBlockSharedDataSize = numReductionThreads * sizeof(unsigned int);
- cudaMalloc(&deviceObjects,numObjs*numCoords*sizeof(float));
- cudaMalloc(&deviceClusters,numClusters*numCoords*sizeof(float));
- cudaMalloc(&deviceMembership,numObjs*sizeof(int));
- cudaMalloc(&deviceIntermediates,numReductionThreads*sizeof(unsigned int));
- cudaMemcpy(deviceObjects,dimObjects[0],numObjs*numCoords*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(deviceMembership,membership,numObjs*sizeof(int),cudaMemcpyHostToDevice);
- do
- {
- cudaMemcpy(deviceClusters,dimClusters[0],numClusters*numCoords*sizeof(float),cudaMemcpyHostToDevice);
- find_nearest_cluster<<<numClusterBlocks,numThreadsPerClusterBlock,clusterBlockSharedDataSize>>>(numCoords,numObjs,numClusters,deviceObjects,deviceClusters,deviceMembership,deviceIntermediates);
- cudaDeviceSynchronize();
- compute_delta<<<1,numReductionThreads,reductionBlockSharedDataSize>>>(deviceIntermediates,numClusterBlocks,numReductionThreads);
- cudaDeviceSynchronize();
- int d;
- cudaMemcpy(&d,deviceIntermediates,sizeof(int),cudaMemcpyDeviceToHost);
- delta = (float)d;
- cudaMemcpy(membership,deviceMembership,numObjs*sizeof(int),cudaMemcpyDeviceToHost);
- for(int i=0;i<numObjs;i++)
- {
- //find the array index of nestest
- index = membership[i];
- //update new cluster centers:sum of objects located within
- newClusterSize[index]++;
- for(int j=0;j<numCoords;j++)
- {
- newClusters[j][index] += objects[i][j];
- }
- }
- //average the sum and replace old cluster centers with newClusters
- for(int i=0;i<numClusters;i++)
- {
- for(int j=0;j<numCoords;j++)
- {
- if(newClusterSize[i] > 0)
- dimClusters[j][i] = newClusters[j][i]/newClusterSize[i];
- newClusters[j][i] = 0.0;//set back to 0
- }
- newClusterSize[i] = 0 ; //set back to 0
- }
- delta /= numObjs;
- }while( delta > threshold && loop++ < 500 );
- loop_iterations = loop + 1;
- malloc2D(clusters,numClusters,numCoords,float);
- for(int i=0;i<numClusters;i++)
- {
- for(int j=0;j<numCoords;j++)
- {
- clusters[i][j] = dimClusters[j][i];
- }
- }
- cudaFree(deviceObjects) ;
- cudaFree(deviceClusters);
- cudaFree(deviceMembership);
- cudaFree(deviceMembership);
- free(dimObjects[0]);
- free(dimObjects);
- free(dimClusters[0]);
- free(dimClusters);
- free(newClusters[0]);
- free(newClusters);
- free(newClusterSize);
- }
- void KMEANS::file_read(char *fn)
- {
- FILE *infile;
- char *line = new char[MAX_CHAR_PER_LINE];
- int lineLen = MAX_CHAR_PER_LINE;
- filename = fn;
- infile = fopen(filename,"r");
- assert(infile!=NULL);
- /*find the number of objects*/
- while( fgets(line,lineLen,infile) )
- {
- numObjs++;
- }
- /*find the dimension of each object*/
- rewind(infile);
- while( fgets(line,lineLen,infile)!=NULL )
- {
- if( strtok(line," \t\n")!=0 )
- {
- while( strtok(NULL," \t\n") )
- numCoords++;
- break;
- }
- }
- /*allocate space for object[][] and read all objcet*/
- rewind(infile);
- objects = new float*[numObjs];
- for(int i=0;i<numObjs;i++)
- {
- objects[i] = new float[numCoords];
- }
- int i=0;
- /*read all object*/
- while( fgets(line,lineLen,infile)!=NULL )
- {
- if( strtok(line," \t\n") ==NULL ) continue;
- for(int j=0;j<numCoords;j++)
- {
- objects[i][j] = atof( strtok(NULL," ,\t\n") ) ;
- }
- i++;
- }
- /* membership: the cluster id for each data object */
- membership = new int[numObjs];
- assert(membership!=NULL);
- for(int i=0;i<numObjs;i++)
- membership[i] = -1;
- }
- int main(int argc,char *argv[])
- {
- KMEANS kmeans(atoi(argv[1]));
- kmeans.file_read(argv[2]);
- kmeans.cuda_kmeans();
- kmeans.file_write();
- return 0;
- }
- target:
- nvcc cuda_kmeans.cu
- ./a.out 4 ./Image_data/color100.txt
所有代码和文件数据在这里:http://yunpan.cn/cKBZMPAJ8tcAs(提取码:9476)
运行代码:
kmeans的cuda实现代码相对复杂,在阅读的过程中可能会有困难,有问题请留言~
Author:忆之独秀
Email:leaguenew@qq.com
注明出处:http://blog.csdn.net/lavorange/article/details/41942323