利用whoosh对mongoDB的中文文档建立全文检索

1、建立索引

#coding=utf-8
from __future__ import unicode_literals
__author__ = 'zh'
import sys,os
from whoosh.index import create_in,open_dir
from whoosh.fields import *
from jieba.analyse import ChineseAnalyzer
import pymongo
import json
from pymongo.collection import Collection
from pymongo import database

class CreatIndex:
    """Build a Whoosh full-text index from the ``pages`` MongoDB collection.

    Every document that has no ``indexed`` field is added to the index and
    then flagged with ``indexed: True`` so that it is not picked up again.
    """

    def __init__(self, host='192.168.229.128', port=27017):
        """Connect to MongoDB and bind the ``webdb.pages`` collection.

        Args:
            host: MongoDB host name or address.
            port: MongoDB port.
        """
        self.mongoClient = pymongo.MongoClient(host, port)
        self.websdb = pymongo.database.Database(self.mongoClient, 'webdb')
        self.pagesCollection = Collection(self.websdb, 'pages')

    @staticmethod
    def _row_to_document(row):
        """Map one MongoDB document to the keyword arguments of add_document.

        Raises:
            KeyError: if ``_id``, ``name`` or ``information`` is missing.
        """
        # A missing, empty or '0' publish time is stored as "no date".
        publish_time = row.get('publish_time')
        if str(publish_time) in ('', '0'):
            publish_time = None

        # The location sub-document is stored as its JSON text.
        location = ''
        if 'location' in row:
            location = json.dumps(row['location'])

        # '%s' % value yields a text (unicode) string on Python 2 as well,
        # because the file enables unicode_literals.
        return dict(
            U_id='%s' % row['_id'],
            title=row['name'],
            location='%s' % location,
            publish_time=publish_time,
            content=row['information'],
        )

    def BuiltIndex(self, index_dir="../whoosh_index"):
        """Index every not-yet-indexed page and commit the result.

        Args:
            index_dir: directory holding the Whoosh index; it is created,
                together with a new index, when it does not exist yet.

        The writer is committed on every exit path, so documents that were
        already flagged as indexed in MongoDB are not lost when a later
        document fails. Processing stops at the first failing document.
        """
        from whoosh.filedb.filestore import FileStorage

        analyzer = ChineseAnalyzer()
        # Index schema; the content field is searchable but not stored.
        schema = Schema(
            U_id=ID(stored=True),
            title=TEXT(stored=True, analyzer=analyzer),
            location=TEXT(stored=True),
            publish_time=DATETIME(stored=True, sortable=True),
            content=TEXT(stored=False, analyzer=analyzer)
        )

        if not os.path.exists(index_dir):
            os.mkdir(index_dir)
        storage = FileStorage(index_dir)
        # Check for an actual index rather than just the directory, so an
        # empty pre-existing directory still gets a fresh index.
        if storage.index_exists():
            ix = storage.open_index()
        else:
            ix = storage.create_index(schema)
            print('建立索引文件!')

        pending = {'indexed': {'$exists': False}}
        writer = ix.writer()
        num = 0
        row_id = None
        try:
            while True:
                row_id = None
                row = self.pagesCollection.find_one(pending)
                if row is None:
                    print("全部处理完毕")
                    break
                row_id = row.get('_id')
                writer.add_document(**self._row_to_document(row))
                # NOTE: the flag is set before the commit below; a hard crash
                # of the process in between would leave flagged documents
                # out of the index.
                self.pagesCollection.update_one(
                    {"_id": row_id}, {"$set": {"indexed": True}})
                num += 1
                print('%s 已建立索引!' % row_id)
        except Exception as exc:
            # Stop at the failing document: retrying would fetch the same
            # unflagged document again and loop forever.
            print('%s 异常: %r' % (row_id, exc))
        finally:
            # Always commit so that the added documents are persisted and
            # the index write lock is released.
            writer.commit()

        # NOTE(review): Cursor.count() is deprecated in newer PyMongo
        # releases; switch to count_documents({}) when upgrading.
        print('已处理 %s 共计 %s' % (num, self.pagesCollection.find().count()))

# Script entry: connect to MongoDB, then index every page that has no
# 'indexed' flag yet.
creatindext = CreatIndex()
creatindext.BuiltIndex()
View Code

注:注意编码——源文件需以 UTF-8 保存(#coding=utf-8),并且写入 whoosh 的字段值必须是 unicode 字符串(因此使用了 unicode_literals)。

2、检索

from __future__ import unicode_literals
#coding=utf-8
__author__ = 'zh'
# from whoosh.qparser import QueryParser
from whoosh import qparser,sorting
# from jieba.analyse import ChineseAnalyzer
from whoosh.index import open_dir
from whoosh.query import *
# import pymongo
import datetime
# from pymongo.collection import Collection
# from pymongo import database

class FullText:
    """Keyword search over an existing Whoosh index."""

    def __init__(self,index_home='whoosh_index'):
        """Open the index in ``index_home`` and keep one searcher for reuse.

        Args:
            index_home: directory that contains the Whoosh index.
        """
        self.index_home = index_home
        self.ix = open_dir(self.index_home)
        self.searcher = self.ix.searcher()

    def close(self):
        """Release the searcher opened in ``__init__``."""
        self.searcher.close()

    def Query(self,parameter):
        """Run a full-text query described by the ``parameter`` dict.

        Keys read from ``parameter``:
            keys: non-empty list of field names to search; one name uses a
                single-field parser, several use a multi-field parser.
            keywords: the query string.
            page, pagesize: optional; when both are present and positive,
                at most ``page * pagesize`` hits are returned (everything up
                to and including the requested page). Otherwise all hits
                are returned.
            includeFields: optional; when it contains ``'location'``, only
                documents whose location field contains the term
                ``coordinates`` are returned.

        Returns:
            The Whoosh results object, sorted by score and then by
            ``publish_time`` (newest first).

        Raises:
            KeyError: if ``keys`` or ``keywords`` is missing.
            ValueError: if ``keys`` is empty.
        """
        fields = parameter['keys']
        if not fields:
            # Previously this fell through and failed later with an
            # UnboundLocalError on ``parser``.
            raise ValueError("parameter['keys'] must name at least one field")
        if len(fields) == 1:
            parser = qparser.QueryParser(fields[0], schema=self.ix.schema)
        else:
            parser = qparser.MultifieldParser(fields, schema=self.ix.schema)

        q = parser.parse(parameter['keywords'])

        scores = sorting.ScoreFacet()
        date = sorting.FieldFacet("publish_time", reverse=True)

        # Paged or complete result set; complete by default.
        _limit = None
        if 'page' in parameter and 'pagesize' in parameter:
            page = parameter['page']
            pagesize = parameter['pagesize']
            # Non-positive values mean "no paging" instead of producing a
            # zero or negative limit.
            if page > 0 and pagesize > 0:
                _limit = page * pagesize

        # Optional filter on the location field; off by default.
        allow_q = None
        if 'includeFields' in parameter and u'location' in parameter['includeFields']:
            allow_q = qparser.query.Term("location", u"coordinates")

        return self.searcher.search(
            q, limit=_limit, filter=allow_q, sortedby=[scores, date])
# Shared searcher instance. FullText is defined in this file, so it is
# instantiated directly; no ``fulltext`` module is imported here.
fulltext_query = FullText()
View Code

注:支持多字段检索、分类、排序等

whoosh参考

提供陕西省POI数据(300万条,SQL Server备份文件)

posted on 2017-01-24 09:20  米仓山下  阅读(2582)  评论(0)  编辑  收藏  举报

导航