将整个矩阵分解为这样的小块,每次完成一对小块的计算,以提高Cache的命中率。
提示:
图中n=N/m
计算次序为A11*B11, A11*B12,…, A11*B1n,,由于反复使用A11,因此可以提高Cache的命中率。
/* 矩阵分开计算 C=A*B --- C(i,j)等于A的第i行乘以第j列 */ #include <stdio.h> #include <time.h> #include <stdlib.h> #include <math.h> #include <windows.h> /* 生成n*n矩阵 */ void GenerateMatrix(float *m, int n); void PrintMatrix(float *p, int n); void GeneralMul(float *A, float *B, float *C, int n); void ClearMatrix(float *m, int n); /* 矩阵分块计算 */ void BlockCacul(float *A, float *B, float *C, int n, int thread_num, int m); /* 两个矩阵的误差 */ float diff(float *C1, float *C0, int n); struct ARG { float *A; int ax, ay; float *B; int bx, by; float *C; int cx, cy; int m; int n; }; int main(int argc, char **argv) { if (argc != 4) { printf("Usage: %s N thread_num M\n", argv[0]); return 0; } int n=atoi(argv[1]); int thread_num = atoi(argv[2]); int m = atoi(argv[3]); float *A = new float[n*n]; float *B = new float[n*n]; float *C = new float[n*n]; float *C0 = new float[n*n]; GenerateMatrix(A, n); GenerateMatrix(B, n); clock_t start; float time_used; ClearMatrix(C0, n); start=clock(); GeneralMul(A, B, C0, n); time_used = static_cast<float>(clock() - start)/CLOCKS_PER_SEC*1000; printf("General: time = %f\n", time_used); ClearMatrix(C, n); start=clock(); BlockCacul(A, B, C, n, thread_num, m); time_used = static_cast<float>(clock() - start)/CLOCKS_PER_SEC*1000; printf("Block: time = %f\n", time_used); printf("Difference of two result: %f\n", diff(C0, C, n)); delete [] A; delete [] B; delete [] C; delete [] C0; return 0; } void ClearMatrix(float *m, int n) { for (int i=0; i<n; i++) { for (int j=0; j<n; j++) m[i*n+j]=0.0; } } /* 普通矩阵相乘 */ void GeneralMul(float *A, float *B, float *C, int n) { for (int i=0; i<n; i++) { for (int j=0; j<n; j++) { float *p = C+i*n+j; for (int k=0; k<n; k++) { *p += A[i*n+k]*B[k*n+j]; } } } } DWORD WINAPI Mul_Fun(LPVOID arg) { struct ARG *p = (struct ARG *)arg; float *A = p->A; float *B = p->B; float *C = p->C; int m = p->m; int n = p->n; for (int i=0; i<m; i++) { for (int j=0; j<m; j++) { float *t = C+(i+p->cx)*n+p->cy+j; for (int k=0; k<m; k++) { *t += A[(p->ax+i)*n+p->ay+k]*B[(p->bx+k)*n+p->by+j]; } } } return 0; } void BlockCacul(float *A, float *B, float *C, int n, int thread_num, int m) { //m = static_cast<int>(sqrt(m)); struct ARG *args = new struct ARG[thread_num]; HANDLE *handle = new HANDLE[thread_num]; int t=0; int i; for (i=0; i<thread_num; i++) { args[i].A = A; args[i].B = B; args[i].C = C; args[i].m = m; args[i].n = n; } //分成n/m x n/m块 //A i行j列 for (i=0; i<n; i+=m) { for (int j=0; j<n; j+=m) { //B j行k列 for (int k=0; k<n; k+=m) { args[t].ax = i; args[t].ay = j; args[t].bx = j; args[t].by = k; args[t].cx = i; args[t].cy = k; if (t<thread_num) { handle[t] = CreateThread(NULL, 0, Mul_Fun, (LPVOID)(&args[t]), 0, 0 ); t++; } if (t==thread_num) { for (int ii=0; ii<t; ii++) WaitForMultipleObjects(thread_num, &handle[ii],TRUE,INFINITE); t = 0; } } } } } void GenerateMatrix(float *p, int n) { srand(time(NULL)+rand()); for (int i=0; i<n*n; i++) { *p = static_cast<float>(rand())/ (static_cast<float>(rand())+ static_cast<float>(0.55)); p++; } } float diff(float *C1, float *C0, int n) { float rst=0.0; float t; for (int i=0; i<n; i++) { for (int j=0; j<n; j++) { t = C1[i*n+j]-C0[i*n+j]; if (t<0) t = -t; rst += t; } } return rst; } void PrintMatrix(float *p, int n) { for (int i=0; i<n; i++) { for (int j=0; j<n; j++) { printf("%.2f\t", p[i*n+j]); } printf("\n"); } printf("\n"); }