Redis实现搜索和排序

明日更新文字。

 

建立反向索引

基于文件建立单词与文档的反向索引,使用集合存储。

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import jieba
import codecs
import redis
import uuid
# Tokenization
def cut_words(file):
    """Read a UTF-8 text file and tokenize its content with jieba's search-mode cutter.

    Args:
        file: Path of the text file to tokenize.

    Returns:
        The value produced by ``jieba.cut_for_search`` for the file's full
        text, passed through untouched (presumably a lazy iterable of
        tokens -- confirm against the jieba documentation).
    """
    with open(file, mode='r', encoding="utf-8") as source:
        content = source.read()
    # The whole text is already in memory, so tokenizing after the file is closed is safe.
    return jieba.cut_for_search(content)

# Stop-word removal
def drop_Disable_Words(cut_res, stopwords):
    """Keep tokens longer than two characters that are not stop words.

    Args:
        cut_res: Iterable of string tokens (e.g. the output of ``cut_words``).
        stopwords: Iterable of stop-word strings to exclude.

    Returns:
        A list of the surviving tokens in their original order; duplicates
        are preserved.
    """
    # Build the lookup once so each membership test is O(1) rather than a
    # linear scan of the stop-word list for every token.
    blocked = frozenset(stopwords)
    # The length filter already discards single-character tokens such as the
    # newline and the ideographic space, so no separate check is needed.
    return [word for word in cut_res if len(word) > 2 and word not in blocked]

# Stop-word loading
def read_stop_word(file_path):
    """Load a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        file_path: Path of the stop-word file.

    Returns:
        A list containing one whitespace-stripped string per line of the
        file (blank lines yield empty strings).
    """
    # Use a context manager so the handle is closed even if reading fails;
    # the handle was previously left open.
    with codecs.open(file_path, 'r', encoding='utf8') as handle:
        return [line.strip() for line in handle.readlines()]

# Inverted-index construction
def index_document(conn, docid, keywords):
    """Add a document id to the inverted-index set of every keyword.

    Args:
        conn: Redis client exposing ``pipeline()``.
        docid: Identifier stored as a member of each keyword's set.
        keywords: Iterable of keyword strings.

    Returns:
        The number of replies returned by the pipeline, i.e. one per queued
        SADD command (repeated keywords are counted each time).
    """
    pipe = conn.pipeline(True)  # True requests a transactional pipeline
    for keyword in keywords:
        # Use the same 'idx:' key prefix as the indexing script and the
        # search helpers in this file; without it the sets written here
        # would never be found by a search.
        pipe.sadd('idx:' + keyword, docid)
    return len(pipe.execute())


def _set_conmon(conn,method,names,ttl = 30,execute = True):
    id = str(uuid.uuid4())

# Load the raw corpus files and the stop-word list.
files = ['file1.txt','file2.txt']
stopwords = read_stop_word("stop_word.txt")

dic = {}  # not referenced by any of the visible code
# Tokenize every file and drop stop words; corpus[k] holds the kept tokens of files[k].
corpus = []
for file in files:
    # Tokenize
    cut_res = cut_words(file)
    # Remove stop words
    res = drop_Disable_Words(cut_res,stopwords)
    corpus.append(res)
print(corpus)
# NOTE(review): the Redis password is hard-coded in source; consider loading it from configuration.
pool = redis.ConnectionPool(host='localhost', password='lin@Wen.',port=6379, decode_responses=True)
conn = redis.Redis(connection_pool=pool)
pipeline = conn.pipeline(True)
# Pair each file name with its token list directly instead of indexing by position.
for doc_name, doc_words in zip(files, corpus):
    for word in doc_words:
        # One set per word, keyed 'idx:<word>', whose members are file names.
        pipeline.sadd('idx:'+word, doc_name)
    # Execute once per document and print how many replies came back.
    print(len(pipeline.execute()))

对单词进行搜索

# Search helpers
def _set_common(conn, method, names, ttl=30, execute=True):
    """Run a Redis set-store command over index keys into a temporary result key.

    Args:
        conn: Redis client. When ``execute`` is False the commands are issued
            on this object directly, so pass a pipeline that the caller will
            execute later.
        method: Name of the client method to invoke, e.g. ``'sinterstore'``,
            ``'sunionstore'`` or ``'sdiffstore'``.
        names: Keyword names; each is prefixed with ``'idx:'`` to form the
            source keys.
        ttl: Expiry, in seconds, applied to the result key.
        execute: If True, queue the commands on a fresh transactional
            pipeline, execute it immediately and print the replies.

    Returns:
        The random UUID string; the result set lives under ``'idx:' + id``.
    """
    result_id = str(uuid.uuid4())
    # With execute=False the commands must land on the caller's object; the
    # previous code queued them on a throwaway pipeline that was never
    # executed, so the returned id referred to a key that was never created.
    pipeline = conn.pipeline(True) if execute else conn
    source_keys = ['idx:' + name for name in names]
    target_key = 'idx:' + result_id
    getattr(pipeline, method)(target_key, *source_keys)
    pipeline.expire(target_key, ttl)
    if execute:
        print(pipeline.execute())
    return result_id
# Intersection
def intersect(conn, items, ttl=30, _execute=True):
    """Delegate to ``_set_common`` with the ``'sinterstore'`` method.

    Returns whatever ``_set_common`` returns (the id of the result key).
    """
    return _set_common(conn, 'sinterstore', items, ttl=ttl, execute=_execute)
# Union
def union(conn, items, ttl=30, _execute=True):
    """Delegate to ``_set_common`` with the ``'sunionstore'`` method.

    Returns whatever ``_set_common`` returns (the id of the result key).
    """
    return _set_common(conn, 'sunionstore', items, ttl=ttl, execute=_execute)
# Difference
def difference(conn, items, ttl=30, _execute=True):
    """Delegate to ``_set_common`` with the ``'sdiffstore'`` method.

    Returns whatever ``_set_common`` returns (the id of the result key).
    """
    return _set_common(conn, 'sdiffstore', items, ttl=ttl, execute=_execute)

# Demo: union of the index sets for three keywords.
names = ["DirectX","Unity3D","STL"]
pool = redis.ConnectionPool(
    host='localhost',
    password='lin@Wen.',
    port=6379,
    decode_responses=True,
)
conn = redis.Redis(connection_pool=pool)
id = union(conn, names)
print(id)
# The result set is stored under 'idx:' followed by the returned id.
print(conn.smembers(f"idx:{id}"))
# Troubleshooting note:
# redis.exceptions.ResponseError: WRONGTYPE Operation against a key holding the wrong kind of value
# Cause: the type of the value stored in Redis does not match the command
# the code uses to read it.
# print(conn.sunion("idx:DirectX", "idx:STL"))

 

posted @ 2020-06-03 00:13  -零  阅读(1859)  评论(0)  编辑  收藏  举报