矩阵乘法-分块计算

将整个矩阵分解为这样的小块，每次完成一对小块的计算，以提高Cache的命中率。
提示：
图中n=N/m
计算次序为A11*B11, A11*B12,…, A11*B1n,，由于反复使用A11，因此可以提高Cache的命中率。
/*
矩阵分开计算
C=A*B  --- C(i,j)等于A的第i行乘以第j列
*/
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include <windows.h>


/*
    生成n*n矩阵
*/
void GenerateMatrix(float *m, int n);
void PrintMatrix(float *p, int n);
void GeneralMul(float *A, float *B, float *C, int n);
void ClearMatrix(float *m, int n);
/*
    矩阵分块计算
*/
void BlockCacul(float *A, float *B, float *C, int n, int thread_num, int m);
/*
    两个矩阵的误差
*/
float diff(float *C1, float *C0, int n);

struct ARG {
    float *A;
    int ax, ay;
    float *B;
    int bx, by;
    float *C;
    int cx, cy;
    int m;
    int n;
};
int main(int argc, char **argv)
{
    if (argc != 4)
    {
        printf("Usage: %s N thread_num M\n", argv[0]);
        return 0;
    }
    int n=atoi(argv[1]);
    int thread_num = atoi(argv[2]);
    int m = atoi(argv[3]);
    float *A = new float[n*n];
    float *B = new float[n*n];
    float *C = new float[n*n];
    float *C0 = new float[n*n];

    GenerateMatrix(A, n);
    GenerateMatrix(B, n);


    clock_t start;
    float time_used;

    ClearMatrix(C0, n);
    start=clock();    
    GeneralMul(A, B, C0, n);
    time_used = static_cast<float>(clock() - start)/CLOCKS_PER_SEC*1000;
    printf("General:   time = %f\n", time_used);
    
    ClearMatrix(C, n);
    start=clock();
    BlockCacul(A, B, C, n, thread_num, m);
    time_used = static_cast<float>(clock() - start)/CLOCKS_PER_SEC*1000;
    printf("Block:  time = %f\n", time_used);
    printf("Difference of two result: %f\n", diff(C0, C, n));
    
    delete [] A;
    delete [] B;
    delete [] C;
    delete [] C0;
    return 0;
}


void ClearMatrix(float *m, int n)
{
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<n; j++)
            m[i*n+j]=0.0;
    }
}
/*
    普通矩阵相乘
*/
void GeneralMul(float *A, float *B, float *C, int n)
{
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<n; j++)
        {
            float *p = C+i*n+j;
            for (int k=0; k<n; k++)
            {
                *p += A[i*n+k]*B[k*n+j];
            }
        }
    }
}


DWORD WINAPI Mul_Fun(LPVOID arg)
{
    struct ARG *p = (struct ARG *)arg;

    float *A = p->A;
    float *B = p->B;
    float *C = p->C;
    int m = p->m;
    int n = p->n;
    for (int i=0; i<m; i++)
    {
        for (int j=0; j<m; j++)
        {
            float *t = C+(i+p->cx)*n+p->cy+j;
            
            for (int k=0; k<m; k++)
            {
                *t += A[(p->ax+i)*n+p->ay+k]*B[(p->bx+k)*n+p->by+j];
            }
        }
    }

    return 0;
}

void BlockCacul(float *A, float *B, float *C, int n, int thread_num, int m)
{
    //m = static_cast<int>(sqrt(m));
    struct ARG *args = new struct ARG[thread_num];
    HANDLE *handle = new HANDLE[thread_num];
    int t=0;
    int i;
    for (i=0; i<thread_num; i++)
    {
        args[i].A = A;
        args[i].B = B;
        args[i].C = C;
        args[i].m = m;
        args[i].n = n;
    }
    //分成n/m x n/m块
    //A i行j列
    for (i=0; i<n; i+=m)
    {
        for (int j=0; j<n; j+=m)
        {            
            //B j行k列
            for (int k=0; k<n; k+=m)
            {
                args[t].ax = i;
                args[t].ay = j;
                args[t].bx = j;
                args[t].by = k;
                args[t].cx = i;
                args[t].cy = k;
                if (t<thread_num)
                {                    
                    handle[t] = CreateThread(NULL, 0, Mul_Fun, (LPVOID)(&args[t]), 0, 0 );
                    t++;
                }
                if (t==thread_num)
                {
                    for (int ii=0; ii<t; ii++)
                        WaitForMultipleObjects(thread_num, &handle[ii],TRUE,INFINITE);
                    t = 0;
                }
            } 
        }
    }

}
void GenerateMatrix(float *p, int n)
{
    srand(time(NULL)+rand());
    for (int i=0; i<n*n; i++)
    {
        *p = static_cast<float>(rand())/ (static_cast<float>(rand())+ static_cast<float>(0.55));
        p++;
    }
}

float diff(float *C1, float *C0, int n)
{
    float rst=0.0;
    float t;

    for (int i=0; i<n; i++)
    {
        for (int j=0; j<n; j++)
        {
            t = C1[i*n+j]-C0[i*n+j];
            if (t<0)
                t = -t;
            rst += t;
        }
    }
    return rst;
}

void PrintMatrix(float *p, int n)
{
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<n; j++)
        {
            printf("%.2f\t", p[i*n+j]);
        }
        printf("\n");
    }
    printf("\n");
}
posted on 2012-05-28 20:43 windflying 阅读(10950) 评论(0) 编辑收藏举报
刷新页面返回顶部
行云

公告