词频统计——软工个人作业小结

Posted on 2018-03-30 23:49 HelenL 阅读(232) 评论(3) 编辑收藏举报

词频统计

一、项目要求及基本功能

　　项目要求：

对源文件（*.txt,*.cpp,*.h,*.cs,*.html,*.js,*.java,*.py,*.php等，文件夹内的所有文件）统计字符数、单词数、行数、词频，统计结果以指定格式输出到默认文件中，以及其他扩展功能，并能够快速地处理多个文件。

使用性能测试工具进行分析，找到性能的瓶颈并改进
对代码进行质量分析，消除所有警告，http://msdn.microsoft.com/en-us/library/dd264897.aspx
设计10个测试样例用于测试，确保程序正常运行（例如：空文件，只包含一个词的文件，只有一行的文件，典型文件等等）
使用Github进行代码管理
撰写博客
进行linux环境移植

　　基本功能：

统计文件的字符数（只需要统计Ascii码，汉字不用考虑）
统计文件的单词总数
统计文件的总行数（任何字符构成的行，都需要统计）
统计文件中各单词的出现次数，输出频率最高的10个。
对给定文件夹及其递归子文件夹下的所有文件进行统计
统计两个单词（词组）在一起的频率，输出频率最高的前10个。

二、需求分析

　　1.PSP表格

　　2.项目难点

1.命令行参数不熟悉，命令行操作不熟悉。

2.文件操作不熟悉，特别是遍历文件夹这一部分。

3.大量数据需要处理，空间复杂度和时间复杂度都需要考虑。

三、设计实现

　　1.架构

　　2.代码结构

数据结构：

全局变量：

// Running total of characters classified as printable ASCII (32..126);
// incremented by statistics() for every such character read.
unsigned long charnumber = 0L;
// Running total of lines: statistics() adds one per '\n' plus one per file.
unsigned long linenumber = 0L;
// Running total of recognised words (tokens longer than three characters).
unsigned long wordnumber = 0L;

存放频率最高的单词和词组：

// Ten slots for the most frequent single words; filled by hashsort().
freNode wordfrequen[10];
// Ten slots for the most frequent two-word phrases; filled by hashsort_s().
freNode_s wordsfrequen[10];

存放单词的哈希表和存放词组的哈希表

// Bucket heads of the chained hash table of words (all buckets start empty).
hashNPtr wordhash[HASHSIZE] = { NULL };
// Bucket heads of the chained hash table of two-word phrases.
hashNPtr_s wordshash[HASHSIZE] = { NULL };

结构体：

存放单词的哈希表的结构体，分别储存单词，次数和指向下一个结构体指针（采用拉链法）

// Node of the chained (separate-chaining) word hash table.
// A default-constructed node is fully initialised: no text, zero count, no
// successor. Previously `word` was left indeterminate by the constructor.
typedef struct hashNode {
    char* word;                 // stored spelling of the word (buffer assigned by the inserter)
    unsigned int word_number;   // number of occurrences seen so far
    struct hashNode* next;      // next node in the same bucket, or NULL
    hashNode() : word(NULL), word_number(0), next(NULL) {}
}hashNode, *hashNPtr;

存放词组的哈希表的结构体，分别存放两个指向单词结构体的指针，次数和指向下一个结构体的指针（拉链法）

采用两个指向单词结构体的指针：这样无需单独更新词组中的单词，单词更新后词组会自动随之更新。

typedef struct hashNode_s {
    hashNPtr wordA;
    hashNPtr wordB;
    unsigned int word_number;
    struct hashNode_s* next;
    hashNode_s(){
        word_number=0;
        next=NULL;
    }
}hashNode_s, *hashNPtr_s;

存放频数最高的单词的结构体

// One entry of the top-ten word ranking: a borrowed pointer to the word text
// and its occurrence count. Starts out empty (NULL text, zero count).
struct freNode {
    char* p;              // text of the ranked word, NULL while the slot is unused
    unsigned int count;   // occurrences of that word

    freNode() : p(NULL), count(0) {}
};

存放频数最高的词组的结构体

// One entry of the top-ten phrase ranking: borrowed pointers to the two word
// texts and the phrase's occurrence count. Starts out empty.
struct freNode_s {
    char* pA;             // text of the first word, NULL while the slot is unused
    char* pB;             // text of the second word, NULL while the slot is unused
    unsigned int count;   // occurrences of the phrase

    freNode_s() : pA(NULL), pB(NULL), count(0) {}
};

各模块功能

1、主函数：

　　从命令行读入文件夹路径

　　调用遍历文件函数

　　排序

　　文件输出字符数，行数，单词数

2.　文件夹遍历：

　　Windows下利用_findfirst()和_findnext()函数。

　　linux则采取助教给的文档里的函数。

3.　统计：

　　（1）字符数，行数统计

　　（2）单词识别

　　（3）单词统计

　　（4）调用将已识别成功的单词和词组插入哈希表

4.　哈希：

　　（1）哈希函数

　　（2）查找和存储

　　（3）调用单词比较和更新函数

5.　单词处理

　　（1）单词比较

　　（2）单词更新

6.　排序

遍历哈希表进行排序

具体代码实现

1.main函数

int main(int argc, char* argv[]) {

    listDir(argv[1]);
    //cout << charnumber << endl;
    //cout << linenumber << endl;
    //cout << wordnumber << endl;
    ofstream outfile;
    outfile.open("result.txt", ios::out);
    outfile << "char_number :" << charnumber << endl;
    outfile << "line_number :" << linenumber << endl;
    outfile << "word_number :" << wordnumber << endl;
    outfile.close();
    hashsort(wordhash, wordfrequen);
    hashsort_s(wordshash, wordsfrequen);
    return 0;
}

View Code

2.遍历文件夹

windows下：

// Recursively collects the paths of all regular files below `path` (Windows).
//   path  - folder to scan, without a trailing separator
//   files - receives one "<folder>\<name>" entry per file found
// Returns 1 when `path` is empty, 0 otherwise (including when the folder
// cannot be matched, in which case a message is printed and nothing is added).
int getfiles(string path, vector<string> &files) {

    if (path.empty()) {
        cout << "The path is empty!\n";
        return 1;
    }

    struct _finddata_t fileinfo;
    const string pattern = path + "\\*";

    // _findfirst returns an intptr_t search handle; keeping it in a `long`
    // truncates it on 64-bit Windows, where long is still 32 bits wide.
    const intptr_t hfile = _findfirst(pattern.c_str(), &fileinfo);
    if (hfile == -1) {
        cout << "cannot match the folder path\n";
        return 0;   // nothing was opened, so there is nothing to close
    }

    do {
        if (fileinfo.attrib & _A_SUBDIR) {
            // Skip the self/parent entries, descend into real sub-folders.
            if ((strcmp(fileinfo.name, ".") != 0) && (strcmp(fileinfo.name, "..") != 0)) {
                getfiles(path + "\\" + fileinfo.name, files);
            }
        }
        else {
            files.push_back(path + "\\" + fileinfo.name);
        }
    } while (_findnext(hfile, &fileinfo) == 0);

    _findclose(hfile);
    return 0;
}

// Reports whether `path` names a directory (Windows).
// Returns true for a directory; false for a regular file, for anything else,
// and when the path cannot be inspected at all.
bool judgedir(string path) {
    struct _stat buf;
    // On failure _stat leaves `buf` unset, so st_mode must not be read.
    if (_stat(path.c_str(), &buf) != 0) {
        return false;
    }
    return (_S_IFDIR & buf.st_mode) != 0;
}

View Code

Linux下：

// Recursively walks the directory `path` (Linux) and hands every file found to
// statistics(). `path` is normally argv[1] from main().
//
// Compared with the earlier version this one
//   * returns quietly when the directory cannot be opened instead of passing a
//     NULL handle to readdir(),
//   * closes the directory handle, so deep trees no longer exhaust descriptors,
//   * builds child paths with a bounded snprintf() and skips entries whose full
//     path does not fit into the buffer,
//   * compares d_type for equality: it is an enumeration, not a bit mask, and
//     `d_type & DT_DIR` is also non-zero for DT_BLK and DT_SOCK.
void listDir(char *path)
{
    DIR *pDir = opendir(path);
    if (pDir == NULL)
        return;                       // not a directory, or not readable

    char childpath[512];              // "<path>/<entry name>" of the current entry
    struct dirent *ent;

    while ((ent = readdir(pDir)) != NULL)
    {
        // "." and ".." would make the walk loop forever.
        if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0)
            continue;

        const int written = snprintf(childpath, sizeof(childpath), "%s/%s", path, ent->d_name);
        if (written < 0 || static_cast<size_t>(written) >= sizeof(childpath))
            continue;                 // path too long for the buffer: skip, do not overflow

        switch (ent->d_type)
        {
        case DT_DIR:
            listDir(childpath);       // descend into the sub-directory
            break;
        case DT_REG:
        case DT_LNK:
        case DT_UNKNOWN:
            // NOTE(review): a file system that reports DT_UNKNOWN for everything
            // will have its sub-directories treated as files here, as before.
            statistics(childpath);    // per-file processing hook
            break;
        default:
            break;                    // devices, sockets, FIFOs: nothing to count
        }
    }
    closedir(pDir);
}

View Code

3.统计函数（单词识别）：

void statistics(char* filepath) {
    ifstream infile;
    ofstream outfile;
    hashNPtr curr = NULL;
    hashNPtr pre = NULL;
    char words[MAXWORD] = { 0 };
    char c;
    short ctype;
    short i = 0;
    bool preflag = 0;
    int r = 0;
    infile.open(filepath, ios::in);
    while (!infile.eof()) {
        c = infile.get();
        ctype = is_char(c);
        //如果是字符
        if (ctype == 1 || ctype == 2 || ctype == 3) {
            charnumber++;
            //如果是数字
            if (ctype == 1) {
                if (i > 3) {
                    words[i] = c;
                    i++;
                }
                else {
                    memset(words, 0, sizeof(words));
                    i = 0;
                }
            }
            //如果是字母
            else if (ctype == 2) {
                words[i] = c;
                i++;
            }
            //如果是分隔符
            else {
                if (i > 3) {
                    wordnumber++;
                    curr = insertword(words, wordhash, MAXWORD);
                    if (preflag) {
                        insertwords(pre, curr);
                    }
                    pre = curr;
                    preflag = 1;
                }
                i = 0;
                memset(words, 0, sizeof(words));
            }
            //如果是换行符
        }
        else {
            if (ctype == 4)
                linenumber++;
            if (i > 3) {
                wordnumber++;
                curr = insertword(words, wordhash, MAXWORD);
                if (preflag) {
                    insertwords(pre, curr);
                }
                pre = curr;
                preflag = 1;
            }
                i = 0;
                memset(words, 0, sizeof(words));
            }
            //单词统计
        }
        linenumber++;
        infile.close();
    }

View Code

4.哈希

4.1哈希函数

// Calculates the bucket index of a word.
// Only letters contribute, and lower-case letters are folded to upper case, so
// spellings that differ merely in case or in their digits share a bucket.
// Returns a value in [0, HASHSIZE).
unsigned long hash_function(char *p) {
    unsigned long h = 0;
    while (*p != '\0') {
        const char ch = *p;
        ++p;
        if (ch >= 'A' && ch <= 'Z') {
            h = MUTI*h + ch;
        }
        else if (ch >= 'a' && ch <= 'z') {
            h = MUTI*h + (ch - ('a' - 'A'));   // fold to upper case
        }
        // every other character (digits in particular) is skipped
    }
    return h % HASHSIZE;
}

View Code

4.2哈希查找与存储

单词：

// Records one occurrence of the word `s` in the chained hash table `hashp`.
//   s     - NUL-terminated word
//   hashp - array of bucket heads
//   N     - size in bytes of the text buffer allocated for a new entry
// If an equivalent word (per is_same_word) is already stored, its count is
// incremented and update_word() may replace the stored spelling; otherwise a
// new node with count 1 is pushed onto the front of the bucket.
// Returns the node that now represents the word.
hashNPtr insertword(char* s, hashNPtr *hashp, unsigned int N) {
    const unsigned long bucket = hash_function(s);

    hashNPtr node = hashp[bucket];
    while (node != NULL) {
        if (is_same_word(node->word, s)) {
            update_word(node->word, s);
            ++node->word_number;
            return node;
        }
        node = node->next;
    }

    // Not found: create the entry and link it in at the head of the chain.
    node = new hashNode;
    node->word = new char[N];
    strcpy(node->word, s);
    node->word_number = 1;
    node->next = hashp[bucket];
    hashp[bucket] = node;
    return node;
}

View Code

词组：

// Records one occurrence of the phrase (pre, curr) in the global phrase table.
// The bucket is derived from the sum of both words' hashes. An existing entry
// whose two words match pre and curr (per is_same_word, in this order) gets its
// count incremented; otherwise a new entry with count 1 is pushed onto the
// front of the bucket, referencing the two word nodes.
void insertwords(hashNPtr pre, hashNPtr curr) {
    const unsigned long bucket = (hash_function(pre->word) + hash_function(curr->word)) % HASHSIZE;

    hashNPtr_s entry = wordshash[bucket];
    while (entry != NULL) {
        const bool first_matches = is_same_word(entry->wordA->word, pre->word);
        if (first_matches && is_same_word(entry->wordB->word, curr->word)) {
            ++entry->word_number;
            return;
        }
        entry = entry->next;
    }

    entry = new hashNode_s;
    entry->wordA = pre;
    entry->wordB = curr;
    entry->word_number = 1;
    entry->next = wordshash[bucket];
    wordshash[bucket] = entry;
}

View Code

5.字符与单词处理

字符识别：

// Classifies a single character.
// Returns 1 for a decimal digit, 2 for an ASCII letter, 3 for any other
// printable ASCII character (32..126), 4 for a newline and 0 for everything
// else.
short is_char(char c) {
    if (c == '\n') return 4;
    if (c < ' ' || c > '~') return 0;     // outside the printable ASCII range

    if (c >= '0' && c <= '9') return 1;

    const bool upper = (c >= 'A' && c <= 'Z');
    const bool lower = (c >= 'a' && c <= 'z');
    return (upper || lower) ? 2 : 3;
}

View Code

单词比较：

// Decides whether two tokens count as the same word. Returns 1 if so, else 0.
// Characters with a code above 64 are treated as letters, those below 64 as
// digits. Over the common prefix:
//   * two letters must be equal or differ by exactly 32 (the case offset);
//   * two digits may differ, but once they have, no letter pair may follow;
//   * a letter facing a digit is a mismatch.
// Whatever remains of the longer token must contain no letters.
int is_same_word(char* old, char* s) {
    bool digits_differ = false;

    while (*old != '\0' && *s != '\0') {
        const char a = *old++;
        const char b = *s++;

        if (a > 64 && b > 64) {
            if (digits_differ) return 0;
            const int delta = a - b;
            if (delta != 0 && delta != 32 && delta != -32) return 0;
        }
        else if (a < 64 && b < 64) {
            if (a != b) digits_differ = true;
        }
        else {
            return 0;
        }
    }

    // At most one of the two tails is non-empty.
    for (; *old != '\0'; ++old) {
        if (*old > 64) return 0;
    }
    for (; *s != '\0'; ++s) {
        if (*s > 64) return 0;
    }
    return 1;
}

View Code

单词更新：

// Keeps the spelling that compares smaller under strcmp().
// When `s` sorts before the text stored in `old`, `old` is overwritten with
// `s` and 1 is returned; otherwise `old` is left untouched and 0 is returned.
// The caller must make sure the buffer behind `old` can hold `s`.
int update_word(char* old, char* s) {
    const bool incoming_sorts_first = strcmp(old, s) > 0;
    if (!incoming_sorts_first) {
        return 0;
    }
    strcpy(old, s);
    return 1;
}

View Code

6.排序及输出

单词：

// Builds the top-ten word ranking and appends it to result.txt.
//   hashp - bucket heads of the word hash table (HASHSIZE entries)
//   frep  - ten ranking slots, ordered by descending count
// Each table entry is inserted in front of the first slot whose count it
// strictly exceeds, so among equal counts the entry met first stays ahead.
// Output stops at the first unused slot.
void hashsort(hashNPtr *hashp, freNode *frep) {
    for (int bucket = 0; bucket < HASHSIZE; ++bucket) {
        hashNPtr node = hashp[bucket];
        while (node != NULL) {
            // Find the first slot this word outranks.
            int rank = 0;
            while (rank < 10 && !(node->word_number > frep[rank].count)) {
                ++rank;
            }
            if (rank < 10) {
                // Shift the lower-ranked slots down by one; the last one drops out.
                for (int slot = 9; slot > rank; --slot) {
                    frep[slot] = frep[slot - 1];
                }
                frep[rank].count = node->word_number;
                frep[rank].p = node->word;
            }
            node = node->next;
        }
    }

    ofstream outfile("result.txt", ios::out | ios::app);
    outfile << "the top ten frequency of word:" << endl;
    for (int row = 0; row < 10 && frep[row].p != NULL; ++row) {
        outfile << frep[row].p << " " << frep[row].count << endl;
    }
    outfile.close();
}

View Code

词组：

// Builds the top-ten phrase ranking and appends it to result.txt.
//   hashp - bucket heads of the phrase hash table (HASHSIZE entries)
//   frep  - ten ranking slots, ordered by descending count
// Each table entry is inserted in front of the first slot whose count it
// strictly exceeds, so among equal counts the entry met first stays ahead.
// Output stops at the first slot that lacks either word.
void hashsort_s(hashNPtr_s *hashp, freNode_s *frep) {
    for (int bucket = 0; bucket < HASHSIZE; ++bucket) {
        hashNPtr_s entry = hashp[bucket];
        while (entry != NULL) {
            // Find the first slot this phrase outranks.
            int rank = 0;
            while (rank < 10 && !(entry->word_number > frep[rank].count)) {
                ++rank;
            }
            if (rank < 10) {
                // Shift the lower-ranked slots down by one; the last one drops out.
                for (int slot = 9; slot > rank; --slot) {
                    frep[slot] = frep[slot - 1];
                }
                frep[rank].count = entry->word_number;
                frep[rank].pA = entry->wordA->word;
                frep[rank].pB = entry->wordB->word;
            }
            entry = entry->next;
        }
    }

    ofstream outfile("result.txt", ios::out | ios::app);
    outfile << "the top ten frequency of phrase:" << endl;
    for (int row = 0; row < 10; ++row) {
        if (frep[row].pA == NULL || frep[row].pB == NULL) break;
        outfile << frep[row].pA << " " << frep[row].pB << "    " << frep[row].count << endl;
    }
    outfile.close();
}

View Code

HelenL

公告

词频统计——软工个人作业小结