包含停用词的词频统计(map,set非class版本)<< 0919


// C / POSIX headers
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif /* __STDC_FORMAT_MACROS */
#include <inttypes.h>  // int64_t; with the macro above, values can be printed with "%" PRId64
#include <sys/time.h>  // gettimeofday

// C++ standard library
#include <cctype>      // ispunct
#include <cstdio>      // fprintf, perror
#include <cstdlib>     // exit, EXIT_FAILURE
#include <cstring>     // memset
#include <exception>   // std::exception
#include <fstream>
#include <functional>  // greater
#include <iostream>
#include <map>         // map
#include <set>         // set
#include <sstream>     // string streams (istringstream)
#include <stdexcept>   // runtime_error
#include <string>
#include <vector>
using namespace std;
// Report `m` through perror (which appends the description of the current
// errno) and terminate the process with EXIT_FAILURE.
// The do { ... } while(0) wrapper makes the expansion behave as one statement.
#define ERR_EXIT(m) \
    do { \
        perror(m);\
        exit(EXIT_FAILURE);\
    }while(0)

// Remove every punctuation character (as classified by ispunct) from `word`,
// in place. The surviving characters keep their relative order; a word made
// only of punctuation ends up empty.
void del_punct(string &word)
{
    string::size_type kept = 0;  // next write position for a surviving character
    for (string::size_type ix = 0; ix != word.size(); ++ix)
    {
        // ispunct is only defined for values representable as unsigned char
        // (or EOF); plain char may be negative for non-ASCII bytes, so the
        // byte is converted before classification.
        const unsigned char ch = static_cast<unsigned char>(word[ix]);
        if (!ispunct(ch))
        {
            word[kept] = word[ix];
            ++kept;
        }
    }
    word.resize(kept);  // cut off the stale tail left behind by the compaction
}

// Load the stop-word list stored in `filename` into the set `str`.
// The file is read line by line; each whitespace-separated token has its
// punctuation stripped by del_punct and is then inserted into the set
// (a token consisting only of punctuation is inserted as an empty string).
// Throws runtime_error when the file cannot be opened.
void read_del_punct(const char* filename, set<string> &str)
{
    ifstream stop_file(filename);
    if (!stop_file)
        throw runtime_error("open file failed!");

    for (string text; getline(stop_file, text); )
    {
        // An istringstream splits the line into tokens; it needs no explicit close.
        istringstream tokens(text);
        for (string token; tokens >> token; )
        {
            del_punct(token);
            str.insert(token);
        }
    }
    // stop_file is closed by its destructor when it goes out of scope.
}

// Return true when `word` is an element of the stop-word set `str`,
// false otherwise. The comparison is exact (case-sensitive).
bool in_stoplist(const set<string> &str, const string &word)
{
    // set::find yields end() when the key is absent.
    return str.find(word) != str.end();
}

// Read the text in `filename` token by token (whitespace-separated), strip
// punctuation from each token with del_punct, and append every token that is
// not in the stop-word set `str` to `words`.
// Tokens that become empty after stripping (pure punctuation such as "--")
// are skipped so that they are not counted as a word later on.
// Throws runtime_error when the file cannot be opened.
void read_del_punct(const char* filename, const set<string> &str, vector<string> &words)
{
    ifstream infile(filename);
    if (!infile)
        throw runtime_error("open file failed!");

    string word;
    while (infile >> word)
    {
        del_punct(word);
        if (word.empty())
            continue;  // nothing left once the punctuation is gone
        if (!in_stoplist(str, word))
        {
            words.push_back(word);
        }
    }
}

// Count how often each word occurs: for every element of `words` the
// matching counter in `M` is incremented. Counters already present in `M`
// are kept and added to; `words` itself is only read.
void map_creat(vector<string> &words, map<string,int> &M)
{
    for (vector<string>::const_iterator it = words.begin();
            it != words.end();
            ++it)
    {
        // operator[] creates a zero-initialised counter for an unseen word.
        ++M[*it];
    }
}

// Print every entry of `N` to standard output, one per line, in the form
// "<text> : <count>". Entries appear in the map's own order, i.e. by
// descending count because of the greater<int> comparator.
void map_print(map<int,string, greater<int> > &N)
{
    // The iterator type must come from the same map specialisation as N,
    // including the comparator.
    typedef map<int, string, greater<int> > freq_map;

    for (freq_map::iterator it = N.begin();
            it != N.end();
            ++it)
    {
        cout << it->second << " : " << it->first << '\n';
    }
    // Flush once at the end instead of after every line.
    cout << flush;
}

// Re-key the word counts in `M` by count: after the call, `N` maps each
// count to the word(s) that occur that many times, ordered by descending
// count (greater<int>).
// A map holds one value per key, so words sharing a count are joined into a
// single string separated by ", " (in the iteration order of `M`) rather
// than overwriting one another. If `N` already has an entry for a count, the
// new words are appended to it.
void map_trans(map<string,int> &M, map<int, string, greater<int> > &N)
{
    typedef map<int, string, greater<int> > freq_map;

    for (map<string,int>::iterator it = M.begin();
            it != M.end();
            ++it)
    {
        // insert() leaves an existing entry untouched and reports it via .second.
        pair<freq_map::iterator, bool> slot =
            N.insert(freq_map::value_type(it->second, it->first));
        if (!slot.second)
        {
            slot.first->second += ", ";
            slot.first->second += it->first;
        }
    }
}

// Return the current wall-clock time reported by gettimeofday, expressed in
// microseconds as a 64-bit integer.
// Throws runtime_error if gettimeofday reports failure (returns -1).
int64_t gettime ()
{
    struct timeval tm = {0, 0};
    if (gettimeofday(&tm, NULL) == -1)
        throw runtime_error("gettimeofday");

    // Widen the seconds to 64 bits before scaling so the multiplication
    // cannot overflow where time_t is a 32-bit type.
    const int64_t usec_per_sec = 1000 * 1000;
    return static_cast<int64_t>(tm.tv_sec) * usec_per_sec + tm.tv_usec;
}
// Entry point. Usage: <program> filename stoplist
//   argv[1]  text file whose words are counted
//   argv[2]  stop-word list; words found in it are ignored
// Prints the time spent in each phase (in microseconds) followed by the
// count-keyed result map. Returns EXIT_FAILURE on a usage error or when a
// file cannot be read / the clock cannot be queried, 0 otherwise.
int main(int argc, const char *argv[])
{
    if (argc < 3)
    {
        // A usage error sets no errno, so perror / ERR_EXIT would print a
        // misleading message; write the usage line directly instead.
        fprintf(stderr, "Usage : %s filename stoplist\n", argv[0]);
        return EXIT_FAILURE;
    }

    try
    {
        int64_t start,end;
        vector<string> words;
        set<string> str;

        // Phase 1: load the stop list, then the text filtered by it.
        start = gettime();
        read_del_punct (argv[2],str);
        read_del_punct (argv[1],str,words);
        end = gettime();
        cout << "读取文件花费 " << end - start  << " us" << endl;

        // Phase 2: count occurrences. M is keyed (and ordered) by word; the
        // ordering by count is produced afterwards in a separate map, because
        // a map cannot stay sorted by a value that is still being updated.
        map<string, int> M;
        start = gettime();
        map_creat (words,M);
        end = gettime();
        cout << "读取单词入map花费 " << end - start  << " us" << endl;

        // Phase 3: re-key by count, highest count first.
        map<int, string, greater<int> > N;
        start = gettime();
        map_trans(M,N);
        end = gettime();
        cout << "转置map花费 " << end - start  << " us" << endl;
        map_print(N);
    }
    catch (const std::exception &e)
    {
        // The readers and gettime report failure by throwing runtime_error;
        // turn that into a diagnostic instead of an unhandled-exception abort.
        fprintf(stderr, "%s: %s\n", argv[0], e.what());
        return EXIT_FAILURE;
    }
    return 0;
}

posted on 2014-09-25 02:26  __hello world  阅读(222)  评论(0)  编辑  收藏  举报

导航