Building full-text search over Chinese documents in MongoDB with Whoosh
1. Building the index
#coding=utf-8
from __future__ import unicode_literals
__author__ = 'zh'
import os
import json
import pymongo
from whoosh.fields import Schema, ID, TEXT, DATETIME
from whoosh.filedb.filestore import FileStorage
from jieba.analyse import ChineseAnalyzer


class CreatIndex:
    def __init__(self):
        self.mongoClient = pymongo.MongoClient('192.168.229.128', 27017)
        self.websdb = self.mongoClient['webdb']
        self.pagesCollection = self.websdb['pages']

    def BuiltIndex(self):
        # Use jieba's analyzer so Chinese text is segmented into words.
        analyzer = ChineseAnalyzer()
        # Index schema: only stored fields can be returned with hits;
        # content is indexed but not stored, to keep the index small.
        schema = Schema(
            U_id=ID(stored=True),
            # md5=ID(stored=True),
            title=TEXT(stored=True, analyzer=analyzer),
            location=TEXT(stored=True),
            publish_time=DATETIME(stored=True, sortable=True),
            content=TEXT(stored=False, analyzer=analyzer)
        )
        storage = FileStorage("../whoosh_index")
        if not os.path.exists("../whoosh_index"):
            os.mkdir("../whoosh_index")
            ix = storage.create_index(schema)
            print 'Created new index!'
        else:
            ix = storage.open_index()

        writer = ix.writer()
        num = 0
        try:
            while True:
                # Pull one not-yet-indexed document at a time.
                row = self.pagesCollection.find_one({'indexed': {'$exists': False}})
                if row is None:
                    print 'All documents processed'
                    # Could sleep here and call self.BuiltIndex() again
                    # to poll for newly crawled documents.
                    break
                publish_time = None
                if 'publish_time' in row:
                    publish_time = row['publish_time']
                    if str(publish_time) in ('', '0'):
                        publish_time = None
                location = ''
                if 'location' in row:
                    location = json.JSONEncoder().encode(row['location'])
                writer.add_document(
                    U_id=''.join(str(row['_id'])),   # coerce ObjectId to unicode
                    # md5=row['md5'],
                    title=row['name'],
                    location=''.join(location),      # coerce byte string to unicode
                    publish_time=publish_time,
                    content=row['information']
                )
                # Mark the document so it is not indexed twice.
                self.pagesCollection.update_one({"_id": row["_id"]}, {"$set": {"indexed": True}})
                num += 1
                print row["_id"], "indexed!"
        except Exception as e:
            print 'Exception while indexing:', e
        finally:
            # Commit whatever was added, even if an error interrupted the loop.
            writer.commit()
        print 'Processed', num, 'of', self.pagesCollection.find().count(), 'documents in total'

creatindext = CreatIndex()
creatindext.BuiltIndex()
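Because the loop only picks up documents that lack an "indexed" flag, indexing is incremental: re-running the script processes only new documents. Below is a minimal sketch, not part of the original script, for forcing a full rebuild by clearing the flag (it assumes the same server address and collection layout as above; update_many is a pymongo 3.x call):

# Hypothetical helper: clear the "indexed" flag so that BuiltIndex()
# re-processes every document on its next run.
import pymongo
client = pymongo.MongoClient('192.168.229.128', 27017)
client['webdb']['pages'].update_many({}, {'$unset': {'indexed': ''}})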
Note: mind the encoding. The script declares UTF-8 and imports unicode_literals so that every string literal, including Chinese text, is unicode, which is what Whoosh's TEXT fields expect on Python 2.
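A small illustration of that point (the variable name is just an example):

#coding=utf-8
from __future__ import unicode_literals
# With the future import, the literal below is unicode; without it,
# it would be a UTF-8 byte string, which Whoosh may reject at indexing time.
title = '标题'
assert isinstance(title, unicode)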
2. Searching
#coding=utf-8
from __future__ import unicode_literals
__author__ = 'zh'
from whoosh import qparser, sorting
from whoosh.index import open_dir
from whoosh.query import Term


class FullText:
    def __init__(self, index_home='whoosh_index'):
        self.index_home = index_home
        self.ix = open_dir(self.index_home)
        self.searcher = self.ix.searcher()

    # Full-text search; currently keyword-based.
    def Query(self, parameter):
        # Query a single field with QueryParser, or several at once
        # with MultifieldParser.
        fields = parameter['keys']
        if len(fields) == 1:
            parser = qparser.QueryParser(fields[0], schema=self.ix.schema)
        elif len(fields) > 1:
            parser = qparser.MultifieldParser(fields, schema=self.ix.schema)
        else:
            return None
        keywords = parameter['keywords']
        q = parser.parse(keywords)

        # Sort by relevance score first, then by publish_time, newest first.
        scores = sorting.ScoreFacet()
        date = sorting.FieldFacet("publish_time", reverse=True)

        # Paged or full result set; by default everything is returned.
        _limit = None
        if 'page' in parameter and 'pagesize' in parameter:
            page = parameter['page']
            pagesize = parameter['pagesize']
            if page > 0 and pagesize != 0:
                _limit = page * pagesize

        # Optionally filter on the location field; off by default.
        allow_q = None
        if 'includeFields' in parameter and u'location' in parameter['includeFields']:
            allow_q = Term("location", u"coordinates")

        # Date-range grouping, not used for now:
        # start = datetime.datetime(2000, 1, 1)
        # end = datetime.datetime.now()
        # gap = datetime.timedelta(days=365)
        # bdayfacet = sorting.DateRangeFacet("publish_time", start, end, gap)

        results = self.searcher.search(q, limit=_limit, filter=allow_q,
                                       sortedby=[scores, date])
        return results

fulltext_query = FullText()
Note: supports multi-field queries, grouping, sorting, and so on; a usage sketch follows.
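A minimal usage sketch (the field names and keywords are examples; it assumes the index built in step 1 lives in whoosh_index):

# Example query using the fulltext_query instance defined above. The dict
# keys are the ones Query() reads: 'keys' lists schema fields to search,
# 'keywords' is the query string, paging is optional.
results = fulltext_query.Query({
    'keys': ['title', 'content'],  # searched via MultifieldParser
    'keywords': u'西安 地铁',       # segmented by jieba's ChineseAnalyzer
    'page': 1,                     # optional: return page*pagesize hits
    'pagesize': 20,
})
for hit in results:
    # Only stored fields (U_id, title, location, publish_time) are
    # available on hits; content was indexed but not stored.
    print hit['U_id'], hit['title']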
Whoosh reference