结对第二次—文献摘要热词统计及进阶需求

作业链接：结对第二次—文献摘要热词统计及进阶需求
队员
- 221500201_孙文慈：代码测试、上传程序到github仓库、文档编写、查阅资料、进度规划
- 226100125_刘杰：WordCount基础需求和进阶需求主要程序的编写、描述解题思路、设计实现过程、进度规划

- Github
- 基础需求：https://github.com/swc221500201/PairProject1-C/
- 进阶需求：https://github.com/swc221500201/PairProject2-C/

PSP表格

PSP2.1	Personal Software Process Stages	预估耗时（分钟）	实际耗时（分钟）
Planning	计划
Estimate	估计这个任务需要多少时间	1420
Development	开发
Analysis	需求分析 (包括学习新技术)	60	50
Design Spec	生成设计文档	100	200
Design Review	设计复审	60	80
Coding Standard	代码规范 (为目前的开发制定合适的规范)	120	180
Design	具体设计	120	170
Coding	具体编码	660	720
Code Review	代码复审	60	120
Test	测试（自我测试，修改代码，提交修改）	90	100
Reporting	报告
Test Report	测试报告	60	90
Size Measurement	计算工作量	60	70
Postmortem & Process Improvement Plan	事后总结, 并提出过程改进计划	30	60
	合计	1420	1840

解题思路

刚接触题目时，第一反应就是用C++的文件流操作来逐个读取和分隔单词，并且很快写出了可以运行的源码。后续的单词统计和排序功能起初是自己手写的，虽然实现了排序功能，不过在文本大小超过1M时，速度就明显跟不上了。在CSDN和博客园上学习了几篇相关的博客后，发现用哈希表一类的键值容器来统计词频是个不错的选择，而C++的map容器正好适合这个功能（严格来说，map内部是红黑树而不是哈希表，查找和插入的复杂度为O(log n)，并且按键有序），经过改进之后速度上有了很大提升。

实现过程

在考虑好实现基础功能和进阶需求之后：

针对基础需求：考虑到要尽量做到功能独立即"Core"模块化，于是按要求实现了三个主要函数以及一些辅助的bool判断函数来满足题目需求。因为功能比较简单所以可以在一个源文件中实现。

int countCharacter(string f);// f: path of the file to analyse; returns the character count as an int
int countLine(string f);// f: path of the file to analyse; returns the line count as an int
int countWords(string f);// f: path of the file to analyse; returns the word count as an int
void sortwordCount(string f, string resultTxt);// sorts the words of file f and writes them to the file resultTxt

针对进阶需求：相比于基础功能，进阶功能要求多了命令行运行，以及词频统计，权重统计等进一步要求，所以总体上是在基础功能的函数之上重写了一些功能来满足。

void countWordsWithWeight(string f, string resultTxt, int w);// weighted word count; f: path of the file to analyse
// w: weight option; the result is written to the file resultTxt
void countGroupWordsWithLength(string f, string resultTxt, int n);// f: path of the file to analyse; the result is written to the file resultTxt; n: user-defined phrase length
.....

以下是使用命令行运行时的主要写法，即把argv[i]挨个判断并执行相关操作

// Walk over every command-line argument and dispatch on the option it names.
for (int i = 0; i < argc; ++i) {
    if (string(argv[i]) == "-i"){        
        // handling of the "-i" option goes here (omitted in this excerpt)
    }
    else if(){      
        // further options are tested the same way (condition omitted in this excerpt)
    }
    ......
}

针对爬虫：爬虫原计划使用c++的类库实现，不过代码量实在巨大，后续改用java的jsoup爬取整个html文档，并生成可以修改的DOM文档，在DOM文档中使用正则匹配找到了每一篇论文的href超链接，之后对这个href链接挨个发送请求，从返回的文本中解析出.ptitle等我们需要的数据内容。总的来说不是很难，不过自己分析DOM文档结构的时候比较难找到规律，好在这个网站没有设置反爬虫的陷阱。而且不需要翻页等复杂操作。

改进思路

爬虫java

爬虫的爬取速度太慢，爬取全部900多条论文信息需要花费近一分钟；如果启用多线程并发请求，应该能大幅度提高速度，不过受限于时间没有实现。
基础功能

其实我觉得统计行数，统计单词，统计字符的三个功能可以集中于一个函数，这样可以在分析统计的时候节约两遍的读取时间。不过助教说了这是为了使功能独立。
进阶功能

进阶功能基于基础功能，所以还是觉得要改进的是基础功能。

代码说明

基础需求

// Counts the ASCII characters (codes 0..127) contained in a file.
// f: path of the file to analyse.
// Returns the number of ASCII characters; 0 when the file cannot be opened.
// The range is tested by hand instead of calling isascii(c): the crawled text
// contains French (non-ASCII) characters, for which isascii(c) reported an error.
int countCharacter(string f) {
	ifstream read(f, ios::in);
	int ascii = 0;
	char c;
	// get() is unformatted, so blanks, tabs and newlines are counted as well
	// (operator>> would silently skip them). Looping on the result of the read
	// itself stops right after the last character, so nothing is counted twice
	// and an unopened stream ends the loop immediately.
	while (read.get(c)) {
		// Go through unsigned char so the test is correct whether plain char
		// is signed or unsigned on the target platform.
		if (static_cast<unsigned char>(c) <= 127)
			ascii++;
	}
	return ascii; // the ifstream destructor closes the file
}

// Counts the lines of a file, leaving out lines that are completely empty.
// f: path of the file to analyse.
// Returns the number of lines for which eachline.empty() is false.
int countLine(string f) {
	ifstream source;
	source.open(f, ios::in);

	int nonEmpty = 0;
	for (string row; getline(source, row); ) {
		// an empty string means a blank line, which is not counted
		nonEmpty += row.empty() ? 0 : 1;
	}

	source.close();
	return nonEmpty;
}

// Counts the words of a file.
// f: path of the file to analyse.
// Returns the number of space-separated tokens accepted by isword(), the
// project's own word predicate, which filters out tokens that do not qualify.
int countWords(string f) {
	ifstream input(f, ios::in);
	int wordNum = 0;
	string aline;

	while (getline(input, aline))
	{
		// string::size_type is the safe type for positions inside a std::string
		string::size_type start = 0;
		while (start < aline.size())
		{
			string::size_type end = aline.find(' ', start); // a space separates words
			if (end == string::npos)
				end = aline.size(); // last token of the line: it runs to the end of the line
			if (isword(aline.substr(start, end - start)))
				wordNum++;
			start = end + 1;
		}
	}
	return wordNum; // the ifstream destructor closes the file
}


// Builds the word-frequency table of a file and outputs it sorted by frequency.
// f: path of the file to analyse.
// Returns nothing. The ten most frequent words are printed to cout and the whole
// table is appended to the file named by resultTxt, one "<word>:count" per line.
// A map accumulates the per-word counts; a multimap keyed by count (descending)
// then orders the output. Words with the same count keep dictionary order because
// they are inserted in the map's sorted order and equal keys keep insertion order.
// NOTE(review): resultTxt is not a parameter of this function; it has to be visible
// from the enclosing scope (presumably a global holding the output path) - confirm.
void sortwordCount(string f) {

	ofstream out(resultTxt, ios::app);
	ifstream input(f, ios::in);

	map<string, int> mapA; // key: word, value: number of occurrences
	string eachline;

	while (getline(input, eachline))
	{
		// string::size_type is the safe type for positions inside a std::string
		string::size_type start = 0;
		while (start < eachline.size())
		{
			string::size_type end = eachline.find(' ', start); // a space separates words
			if (end == string::npos)
				end = eachline.size(); // last token of the line: it runs to the end of the line
			string content = eachline.substr(start, end - start);
			if (isword(content)) {
				tolowerString(content); // lower-case the word so counting is case-insensitive
				++mapA[content];        // operator[] creates the entry with 0 on first sight
			}
			start = end + 1;
		}
	}

	typedef multimap<int, string, greater<int> > FrequencyTable;
	FrequencyTable mapB; // key: count (highest first), value: word

	// Move every entry of mapA over, swapping key and value.
	for (map<string, int>::const_iterator it1 = mapA.begin(); it1 != mapA.end(); ++it1)
		mapB.insert(pair<int, string>(it1->second, it1->first));

	// Show the top ten on the console.
	int shown = 0;
	for (FrequencyTable::const_iterator it2 = mapB.begin(); shown < 10 && it2 != mapB.end(); ++it2, ++shown)
		cout << "<" << it2->second << ">:" << it2->first << endl;

	// Write the complete sorted table to the result file.
	for (FrequencyTable::const_iterator it2 = mapB.begin(); it2 != mapB.end(); ++it2)
		out << "<" << it2->second << ">:" << it2->first << '\n';

	// out and input are closed by their destructors
}

进阶需求

词频统计

// f: path of the file to analyse.
// Returns nothing.
// Implemented much like the basic-requirement version; the difference is that the
// "title:" and "abstract:" markers are filtered out.
// For brevity only the key fragment is shown below.
void sortwordCount(string f) {
	// When the first token of the line is recognised by isAbstract()/isTitle(),
	// move start/end on to the following token so the marker is not counted.
	if (isAbstract(eachline.substr(start, end - start)) || isTitle(eachline.substr(start, end - start))) {
	start = end + 1;
	end = eachline.find_first_of(" ", start);// a space is the word separator
    }
}


// f: path of the file to analyse; w: weighted counting switch, 0 = off, 1 = on.
// Returns nothing.
// Implemented much like the basic-requirement version, with weighting added: when
// w is 1, every word of a line whose first token is accepted by isTitle() counts
// 10 instead of 1.
// Excerpt only: the code that opens the file, declares mapA/flag/eachline/start/end
// and starts the per-line loop is not shown (the extra closing brace near the end
// belongs to that loop).
void countWordsWithWeight(string f, int w) {

		flag = isTitle(eachline.substr(start, end - start));
		// Do not count the leading "title"/"abstract" marker itself.
		if (isAbstract(eachline.substr(start, end - start))||isTitle(eachline.substr(start, end - start))) {
			start = end + 1;
			end = eachline.find_first_of(" ", start);// a space is the word separator
		}
		while (end != string::npos) // npos means the end of this line has been reached
		{
			string content = eachline.substr(start, end - start);

			if (isword(content)) {
				tolowerString(content);// lower-case the word so counting is case-insensitive
				// The weight applies to the first occurrence too: operator[] creates a
				// missing entry with 0, so a word first seen in a weighted title line
				// starts at 10 rather than 1.
				const int weight = (w == 1 && flag) ? 10 : 1;
				mapA[content] += weight;
			}
			start = end + 1;
			end = eachline.find_first_of(" ", start);// a space is the word separator
		}

	}

}

// f: path of the file to analyse; n: user-defined phrase length.
// Returns nothing.
// Approach: read from the stream up to the n-th separator and cut there.
// Excerpt only: input, eachline, content, i and cntNum are declared in code that is not shown here.
void countGroupWordsWithLength(string f,int n) {

	while (getline(input, eachline))
	{
		// string::size_type is the safest type for holding a std::string size or position (avoids overflow)
		string::size_type start = 0;
		string::size_type end = eachline.find_first_of(" ");// a space is the word separator
						/// do not count the leading "title"/"abstract" marker
		if (isAbstract(eachline.substr(start, end - start)) || isTitle(eachline.substr(start, end - start))) {
			start = end + 1;
			end = eachline.find_first_of(" ", start);// a space is the word separator
		}

		content = eachline.substr(start, end-start);
		// Count the spaces in content, stopping as soon as n of them have been seen.
		// NOTE(review): content stops just before the space found above, so this loop
		// looks unable to ever see a ' ' - confirm against the full source.
		for (i = 0; i < content.size() && cntNum < n; ++i) {
			if (content[i]==' ')
				cntNum++;
		}
		end = end + i;
}

爬取论文信息

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;

/**
 * Crawler for the CVPR 2018 open-access paper list.
 *
 * Downloads the index page, follows the link of every element with class
 * "ptitle", and writes the number, title and abstract of each paper to
 * spider.txt.
 */
public class Spiter {
	/**
	 * Runs the crawl.
	 *
	 * @param args unused
	 * @throws IOException if the index page cannot be downloaded
	 */
	public static void main(String[] args) throws IOException {
		// maxBodySize(0) lifts jsoup's default body-size limit so the whole index page is received.
		Document doc=Jsoup.connect("http://openaccess.thecvf.com/CVPR2018.py").maxBodySize(0).get();
		Elements listClass = doc.getElementsByAttributeValue("class", "ptitle");
		int num=0;
		File file=new File("spider.txt");
		// try-with-resources closes the writer on every exit path, including
		// unchecked exceptions thrown while parsing a page.
		try (Writer out=new FileWriter(file)) {
			System.out.print("爬取开始\n");
			for(Element element:listClass) {

				String link = element.getElementsByTag("a").attr("href");
				link="http://openaccess.thecvf.com/"+link;
				Document paper=Jsoup.connect(link).get();

				Element Etitle=paper.getElementById("papertitle");
				Element Eabstr=paper.getElementById("abstract");
				if (Etitle == null || Eabstr == null) {
					// A page without the expected elements is skipped instead of
					// aborting the whole crawl with a NullPointerException.
					System.err.println("skipped (title or abstract not found): " + link);
					continue;
				}
				String abstr=Eabstr.text();
				String title=Etitle.text();
				out.write(num+"\r\n");
				out.write("Title: "+title+"\r\n");
				out.write("Abstract: "+abstr+"\r\n"); // \r\n is the line break

				out.write("\r\n");
				out.write("\r\n");
				num++;
				out.flush(); // push the buffered text to the file after every paper

			}
			System.out.print("爬取结束");

		} catch (IOException e) {
			// Network or file errors end the crawl; whatever was flushed so far stays in the file.
			e.printStackTrace();
		}
	}
}

测试代码

File *input = fopen("input.txt","r");
File *ans = fopen("ans.txt","r");
String getAns,getInput;
while(!getline(getInput,input)){
    getline(getAns,input);
    if(!getAns.equal(getInput))
        showMessage();
}
cout<<"success"<<endl;

困难

很久没有接触到文件的操作，对于c++的api比较生疏，重新熟悉的过程花了不少时间，另外在爬虫上也花了不少时间。例如Jsoup.connect（）函数会限定一个默认的1M大小，使得我爬取的数据只有500多条，然而其他队伍用python做出来的却有900多条，正当准备重写的时候，队友发现了这个问题，为我们节省了不少时间。以及后期需求不断变更，每次都要重新考虑。不过好在这次的配合默契了不少。

总结

    通过这次作业，我们可以说重新学习了一遍c++，对相关知识有了更深入的理解和掌握。完成基础需求后发现时间还比较充裕，便去尝试写了一下进阶需求，在这个过程中接触了爬虫，开始时遇到了一些问题，好在队友间相互配合探讨，成功发现并解决了问题，使我们进一步体会到了合作的优势。

posted on 2019-03-15 21:09 MikasaAKM 阅读(152) 评论(3) 编辑收藏举报

刷新页面返回顶部

导航

公告