黄聪：TSE分析及完全注释[6] 倒排索引的建立的程序分析(转)

TSE建立索引在运行程序上的大致步骤可以简化分为以下几步：

1、运行命令#./DocIndex
会用到一个文件 tianwang.raw.520 //爬取回来的原始文件，包含多个网页的所有信息，所以很大。这也是一个有待解决的问题：到底存成大文件（过大会超过2G或4G的文件大小限制，而且文件过大时索引效率过低）还是小文件（文件数过多时，打开、关闭文件句柄的消耗过大），还有待思考。另外，存储方案最终肯定要做成分布式的，因为总文件量肯定会上TB；TSE只能满足小型搜索引擎的需求。
会产生以下三个文件 Doc.idx, Url.idx, DocId2Url.idx //即Data文件夹中的Doc.idx、Url.idx和DocId2Url.idx

2、运行命令#sort Url.idx|uniq > Url.idx.sort_uniq //Data文件夹中的Url.idx.sort_uniq
会用到一个文件 Url.idx文件 //md5 hash 之后的url完整地址和document id值对
会产生一个文件 Url.idx.sort_uniq //URL消重，md5 hash排序，提高检索效率

3、运行命令#./DocSegment Tianwang.raw.2559638448
会用到一个文件 Tianwang.raw.2559638448 //Tianwang.raw.2559638448为爬回来的文件，每个页面包含http头；分词是为后面建立倒排索引做准备
会产生一个文件 Tianwang.raw.2559638448.seg //分词文件，由一行document id号和一行文档分词组交替构成（只对每个文档<html></html>中<head></head>、<body></body>等标记内的文本进行分词）

4、运行命令#./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx //建立独立的正向索引

5、运行命令
#set | grep "LANG"
#LANG=en; export LANG;
#sort moon.fidx > moon.fidx.sort

6、运行命令#./CrtInvertedIdx moon.fidx.sort > sun.iidx //建立倒排索引

我们先从建立索引的第一个程序DocIndex.cpp开始分析。(注释约定：Tianwang.raw.2559638448是抓回来合并成的大文件，后面就叫大文件，里面包含了很多篇html文档，里面的文档有规律的分隔就叫做一篇一篇的文档)

// Comm.h start (labelled "DocIndex.h" in the original listing; the include
// guard and the `#include "Comm.h"` in DocIndex.cpp both say Comm.h) ---------

// Shared constants for the TSE indexing and search programs.

#ifndef _COMM_H_040708_
#define _COMM_H_040708_

// NOTE(review): the header names on these #include lines were lost when the
// listing was extracted (the <...> part was stripped). The set below covers
// what the visible code needs (std::string here; streams, sscanf and the C
// string functions in DocIndex.cpp) plus the usual containers.
// TODO confirm against the original Comm.h.
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Kept for backward compatibility: files that include this header use
// unqualified standard-library names.
using namespace std;

const unsigned HEADER_BUF_SIZE = 1024;
const unsigned RstPerPage = 20; // number of search results returned per page to the front end

//iceway
//const unsigned MAX_DOC_IDX_ID = 21312; // used in DocSegment.cpp
const unsigned MAX_DOC_IDX_ID = 22104;

//const string IMG_INFO_NAME("./Data/s1.1");
const string INF_INFO_NAME("./Data/sun.iidx"); // inverted index file
// Sample lines of that file (a term followed by document ids):
//朱德 14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
//朱古力 1085 1222

// Original note: "a file of 90,000+ character entries, including special
// symbols, punctuation and Chinese characters" -- NOTE(review): it is not
// clear from this listing which file the note refers to.
const string DOC_IDX_NAME("./Data/Doc.idx"); // document index file (Doc.idx, as written by DocIndex.cpp)
const string RAWPAGE_FILE_NAME("./Data/Tianwang.swu.iceway.1.0");

//iceway
const string DOC_FILE_NAME = "Tianwang.swu.iceway.1.0"; // used in DocIndex.cpp
const string Data_DOC_FILE_NAME = "./Data/Tianwang.swu.iceway.1.0"; // used in Snapshot.cpp

//const string RM_THUMBNAIL_FILES("rm -f ~/public_html/ImgSE/timg/*");

//const string THUMBNAIL_DIR("/ImgSE/timg/");

#endif // _COMM_H_040708_
// Comm.h end ------------------------------------------------------------------

//DocIndex.cpp start-----------------------------------------------------------

#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>

#include "Md5.h"
#include "Url.h"
#include "Document.h"

//iceway(mnsc)
#include "Comm.h"

using namespace std;

int main(int argc, char* argv[])
{
    //ifstream ifs("Tianwang.raw.2559638448");
	//ifstream ifs("Tianwang.raw.3023555472");
	//iceway(mnsc)
	ifstream ifs(DOC_FILE_NAME.c_str());	//打开Tianwang.raw.3023555472文件，最原始的文件
	if (!ifs) 
	{
    	cerr << "Cannot open " << "tianwang.img.info" << " for input\n";
    	return -1;
    }
	ofstream ofsUrl("Url.idx", ios::in|ios::out|ios::trunc|ios::binary);	//建立并打开Url.idx文件
	if( !ofsUrl )
	{
		cout << "error open file " << endl;
	} 

	ofstream ofsDoc("Doc.idx", ios::in|ios::out|ios::trunc|ios::binary);	//建立并打开Doc.idx文件
	if( !ofsDoc )
	{
		cout << "error open file " << endl;
	} 

	ofstream ofsDocId2Url("DocId2Url.idx", ios::in|ios::out|ios::trunc|ios::binary);	//建立并打开DocId2Url.idx文件
	if( !ofsDocId2Url )
	{
		cout << "error open file " << endl;
	} 

	int cnt=0;	//文档编号从0开始计算
	string strLine,strPage;
	CUrl iUrl;
	CDocument iDocument;
	CMD5 iMD5;
	
	int nOffset = ifs.tellg();
	while (getline(ifs, strLine)) 
	{
		if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
		{
			nOffset = ifs.tellg();
			continue;
		}

		if (!strncmp(strLine.c_str(), "version: 1.0", 12))	//判断第一行是否是version: 1.0如果是就解析下去
		{	
			if(!getline(ifs, strLine)) break;
			if (!strncmp(strLine.c_str(), "url: ", 4))	//判断第二行是否是url: 如果是则解析下去
			{
				iUrl.m_sUrl = strLine.substr(5);	//截取url: 五个字符之后的url内容
				iMD5.GenerateMD5( (unsigned char*)iUrl.m_sUrl.c_str(), iUrl.m_sUrl.size() );	//对url用md5 hash处理
				iUrl.m_sChecksum = iMD5.ToString();	//将字符数组组合成字符串这个函数在Md5.h中实现

			} else 
			{
				continue;
			}

			while (getline(ifs, strLine)) 
			{
				if (!strncmp(strLine.c_str(), "length: ", 8))	//一直读下去直到判断澹澹(相对第五行)惺欠袷莑ength: 是则接下下去
				{
					sscanf(strLine.substr(8).c_str(), "%d", &(iDocument.m_nLength));	//将该块所代表网页的实际网页内容长度放入iDocument数据结构中
					break;
				}
			}

			getline(ifs, strLine);	//跳过相对第六行故意留的一个空行

			iDocument.m_nDocId = cnt;	//将文档编号赋值到iDocument数据结构中
			iDocument.m_nPos = nOffset;	//文档结尾在大文件中的结束行号
			char *pContent = new char[iDocument.m_nLength+1];	//新建该文档长度的字符串指针

			memset(pContent, 0, iDocument.m_nLength+1);	//每一位初始化为0
			ifs.read(pContent, iDocument.m_nLength);	//根据获得的文档长度读取澹(其中包含协议头)读取文档内容
			iMD5.GenerateMD5( (unsigned char*)pContent, iDocument.m_nLength );
			iDocument.m_sChecksum = iMD5.ToString();	//将字符数组组合成字符串这个函数在Md5.h中实现
			
			delete[] pContent;
			
			ofsUrl << iUrl.m_sChecksum ;	//将md5hash后的url写入Url.idx文件
			ofsUrl << "\t" << iDocument.m_nDocId << endl;	//在一行中一个tab距离分隔，将文件编号写入Url.idx文件

			ofsDoc << iDocument.m_nDocId ;	//将文件编号写入Doc.idx文件
			ofsDoc << "\t" << iDocument.m_nPos ;	//在一行中一个tab距离分隔，将该文档结束行号澹(同样也是下一文档开始行号)写入Doc.idx文件
			//ofsDoc << "\t" << iDocument.m_nLength ;
			ofsDoc << "\t" << iDocument.m_sChecksum << endl;	//在一行中一个tab距离分隔，将md5hash后的url写入Doc.idx文件

			ofsDocId2Url << iDocument.m_nDocId ;	//将文件编号写入DocId2Url.idx文件
			ofsDocId2Url << "\t" << iUrl.m_sUrl << endl;	//将该文档的完整url写入DocId2Url.idx文件

			cnt++;	//文档编号加一说明该以文档分析完毕，生成下一文档的编号
		}

		nOffset = ifs.tellg();

	}

	//最后一行只有文档号和上一篇文档结束号
	ofsDoc << cnt ; 
	ofsDoc << "\t" << nOffset << endl;


	return(0);
}

//DocIndex.cpp end-----------------------------------------------------------

posted on 2011-11-08 16:28 黄聪阅读(608) 评论(0) 编辑收藏举报

刷新页面返回顶部

黄聪

公告