不同关键词查找方法性能比较

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author:ShidongDu time:2020/6/3
import re
import time
from collections import deque

import pandas as pd

# Trie node
class node:
    """One node of the pattern trie used by the Aho-Corasick automaton."""

    def __init__(self, ch):
        # ch:   character stored on this node
        # fail: failure link, unset (None) until the automaton is built
        # tail: 0 for a non-terminal node, otherwise i when the i-th inserted
        #       pattern ends here
        self.ch, self.fail, self.tail = ch, None, 0
        # child holds the child nodes; childvalue holds their characters at
        # the same positions, so the two lists are indexed in parallel
        self.child, self.childvalue = [], []

# Aho-Corasick automaton
class Aho_Corasick:
    """Multi-pattern matcher: a trie of ``node`` objects plus failure links.

    Typical use: call ``insert`` for every pattern, call ``ac_automation``
    once to compute the failure links, then call ``runkmp`` on each text.
    """

    def __init__(self):
        self.root = node("")  # root node carries the empty string
        self.count = 0  # number of patterns inserted so far

    # Step 1: build the trie from the patterns
    def insert(self, strkey):
        """Add the pattern ``strkey`` to the trie.

        The node where the pattern ends gets ``tail`` set to the pattern's
        1-based insertion number (the value of ``self.count`` after this call).
        """
        self.count += 1
        p = self.root
        for ch in strkey:
            if ch in p.childvalue:
                # Character already present: descend into the matching child.
                p = p.child[p.childvalue.index(ch)]
            else:
                # Character missing: create the child and descend into it.
                child = node(ch)
                p.child.append(child)
                p.childvalue.append(ch)
                p = child
        p.tail = self.count

    # Step 2: compute the failure links
    def ac_automation(self):
        """Set ``fail`` on every node currently in the trie, in BFS order.

        Only nodes that exist at call time are processed, so this has to be
        called again if patterns are inserted afterwards.
        """
        # deque gives O(1) pops from the left; removing the head of a list
        # shifts every remaining element on each dequeue.
        queue = deque([self.root])
        while queue:
            temp = queue.popleft()
            for i in temp.child:
                if temp is self.root:
                    # Children of the root always fall back to the root.
                    i.fail = self.root
                else:
                    # Follow the parent's failure chain until some node has a
                    # child with the same character.
                    p = temp.fail
                    while p is not None:
                        if i.ch in p.childvalue:
                            i.fail = p.child[p.childvalue.index(i.ch)]
                            break
                        p = p.fail
                    if p is None:
                        # The chain ran past the root without a hit.
                        i.fail = self.root
                queue.append(i)

    # Step 3: pattern matching
    def runkmp(self, strmode):
        """Scan ``strmode`` and count the occurrences of every pattern.

        Returns a dict mapping a pattern's 1-based insertion number (the
        ``tail`` value) to how many times it was found. ``bool(result)``
        tells whether anything matched; ``len(result)`` is the number of
        distinct patterns that matched.

        ``ac_automation`` must have been called first: the scan follows the
        ``fail`` links, which are ``None`` until then.
        """
        p = self.root
        cnt = {}
        for ch in strmode:
            # Fall back along failure links until ch can be consumed or we
            # are back at the root.
            while ch not in p.childvalue and p is not self.root:
                p = p.fail
            if ch in p.childvalue:
                p = p.child[p.childvalue.index(ch)]
            else:
                p = self.root
            # Every node on the failure chain of the current state is a
            # suffix of the text read so far; report those that end a pattern.
            temp = p
            while temp is not self.root:
                if temp.tail:  # tail == 0 means no pattern ends here
                    cnt[temp.tail] = cnt.get(temp.tail, 0) + 1
                temp = temp.fail
        return cnt

class Trie:
    """Prefix tree stored as nested dicts keyed by character."""

    def __init__(self):
        """Create an empty trie."""
        self.name = 'Tire'
        self.root = {}
        # Key placed in a node's dict to mark that a word ends there.
        self.word_end = -1

    def _descend(self, chars):
        """Follow ``chars`` from the root; return the node reached or None."""
        cursor = self.root
        for ch in chars:
            if ch not in cursor:
                return None
            cursor = cursor[ch]
        return cursor

    def insert(self, word):
        """
        Insert ``word`` into the trie, creating missing nodes on the way.
        :type word: str
        :rtype: None
        """
        cursor = self.root
        for ch in word:
            cursor = cursor.setdefault(ch, {})
        cursor[self.word_end] = True

    def search(self, word):
        """
        Return True only if exactly ``word`` was inserted before.
        :type word: str
        :rtype: bool
        """
        reached = self._descend(word)
        # An empty dict is a valid node, so compare against None explicitly.
        return reached is not None and self.word_end in reached

    def startsWith(self, prefix):
        """
        Return True if the path spelled by ``prefix`` exists in the trie.
        :type prefix: str
        :rtype: bool
        """
        return self._descend(prefix) is not None


class Solution:
    """Run several keyword-search methods over a text file.

    Keywords are loaded from an Excel sheet whose column headers are
    category names. ``besides`` is an optional index object that receives
    every keyword through its ``insert`` method while the sheet is read; the
    'Tire' method reads its ``root`` and ``word_end`` attributes, and the
    'Aho_Corasick' method calls its ``ac_automation`` and ``runkmp`` methods.
    """

    def __init__(self, dict_file: str, besides=None):
        self.besides = besides
        # Must exist before read_xls runs, because read_xls appends to it.
        self.key_word_list = []
        self.a_dict = self.read_xls(dict_file)

    def read_xls(self, file) -> dict:
        """Load the keyword sheet and return ``{category: [keywords]}``.

        Columns whose header contains 'Unnamed' are skipped. Reading a
        column stops at its first non-string cell. Every keyword is also
        appended to ``self.key_word_list`` and, when ``self.besides`` is set,
        passed to ``self.besides.insert``.
        """
        sheet = pd.read_excel(file)
        a_dict = {}
        for key in sheet.keys():
            if 'Unnamed' in key:
                continue
            words = a_dict[key] = []
            for word in sheet[key]:
                if not isinstance(word, str):
                    break
                self.key_word_list.append(word)
                words.append(word)
                if self.besides:
                    self.besides.insert(word)
        return a_dict

    # ------------------------------------------------------------------
    def BF(self, word, doc):
        """Brute-force search: count occurrences of ``word`` in ``doc``.

        Overlapping occurrences are counted. Returns ``(word, count)``, or
        None when there is no occurrence or ``word`` is empty.
        """
        if not word:
            return None
        # The last valid start index is len(doc) - len(word), hence the +1.
        last_start = len(doc) - len(word)
        hits = sum(1 for i in range(last_start + 1) if doc.startswith(word, i))
        return (word, hits) if hits else None

    # ------------------------------------------------------------------
    def KMP(self, word: str, doc: str):
        """Knuth-Morris-Pratt search: count occurrences of ``word`` in ``doc``.

        Overlapping occurrences are counted. Returns ``(word, count)``, or
        None when there is no occurrence or ``word`` is empty.
        """
        if not word:
            return None

        def get_next(pattern: str):
            # fallback[i] is the index of the last character of the longest
            # proper prefix of pattern[:i + 1] that is also its suffix,
            # or -1 when there is none.
            fallback = [-1] * len(pattern)
            k = -1
            for i in range(1, len(pattern)):
                while k != -1 and pattern[i] != pattern[k + 1]:
                    k = fallback[k]
                if pattern[i] == pattern[k + 1]:
                    k += 1
                fallback[i] = k
            return fallback

        fallback = get_next(word)
        last = len(word) - 1
        hits = 0
        j = -1  # index of the last matched character of word
        for ch in doc:
            while j != -1 and ch != word[j + 1]:
                j = fallback[j]
            if ch == word[j + 1]:
                j += 1
            if j == last:
                hits += 1
                # Keep the longest border so overlapping matches are found.
                j = fallback[j]
        return (word, hits) if hits else None

    # ------------------------------------------------------------------
    def Re(self, word: str, doc: str):
        """Regex-engine search: report whether ``word`` occurs in ``doc``.

        ``word`` is matched literally (it is escaped first), so keywords
        containing regex metacharacters behave like in the other methods.
        Returns ``(word, 1)`` on a hit, otherwise None; occurrences are not
        counted.
        """
        if not word:
            return None
        return (word, 1) if re.search(re.escape(word), doc) else None

    # ------------------------------------------------------------------
    def Tire_Tree(self, doc: str):
        """Trie search: collect keywords found at each position of ``doc``.

        For every start index the trie in ``self.besides`` is walked and the
        shortest keyword starting there, if any, is appended to the result.
        Returns the list of matched substrings, or None when nothing matched.
        """
        root = self.besides.root
        end_marker = self.besides.word_end
        res = []
        for start in range(len(doc)):
            current = root
            for stop in range(start, len(doc)):
                ch = doc[stop]
                if ch not in current:
                    break
                current = current[ch]
                # Checked after every step, including the first one, so that
                # single-character keywords are reported too.
                if end_marker in current:
                    res.append(doc[start: stop + 1])
                    break
        return res if res else None

    # ------------------------------------------------------------------
    def Aho_Corasick(self, doc: str):
        """Aho-Corasick search: return ``[(keyword, count), ...]`` for ``doc``.

        ``self.besides.runkmp`` reports 1-based pattern numbers, while
        ``self.key_word_list`` is 0-based, hence the ``- 1``.
        """
        found = self.besides.runkmp(doc)
        return [(self.key_word_list[key - 1], hits) for key, hits in found.items()]

    # ------------------------------------------------------------------
    def operation(self, algorithm: str, file_name: str):
        """Run ``algorithm`` on every line of the UTF-8 file ``file_name``.

        ``algorithm`` is one of 'BF', 'KMP', 'Re', 'Tire', 'Aho_Corasick'.

        Returns one entry per line (lines keep their trailing newline):
        - 'BF' / 'KMP' / 'Re': ``([(match, category), ...], line)`` where
          ``match`` is the non-None result of the method for one keyword.
        - 'Tire' / 'Aho_Corasick': ``([(result, line)], line)`` where
          ``result`` is what the method returned for the whole line.

        Raises ValueError for an unknown algorithm name.
        """
        per_word = {'BF': self.BF, 'KMP': self.KMP, 'Re': self.Re}
        per_line = {'Tire': self.Tire_Tree, 'Aho_Corasick': self.Aho_Corasick}
        if algorithm not in per_word and algorithm not in per_line:
            raise ValueError('unknown algorithm: %r' % (algorithm,))

        if algorithm == 'Aho_Corasick':
            # Failure links have to be in place before any text is scanned.
            self.besides.ac_automation()

        res = []
        with open(file_name, 'r', encoding='utf-8') as f:
            for text in f:
                if algorithm in per_line:
                    res.append(([(per_line[algorithm](text), text)], text))
                    continue
                algo = per_word[algorithm]
                word_pos = []
                for key, words in self.a_dict.items():
                    for word in words:
                        tmp = algo(word, text)
                        if tmp:
                            word_pos.append((tmp, key))
                res.append((word_pos, text))
        return res



if __name__ == '__main__':
    # Benchmark one search method end to end: keyword loading, scanning
    # 'all.txt' and writing the results. perf_counter is a monotonic,
    # high-resolution clock intended for measuring elapsed time.
    started = time.perf_counter()
    aho_corasick = Aho_Corasick()
    solution = Solution('key_word-update.xlsx', aho_corasick)
    res = solution.operation('Aho_Corasick', 'all.txt')
    # To time a per-keyword method instead, build the Solution without an
    # index object and pass e.g. 'Re':
    #   Solution('key_word-update.xlsx').operation('Re', 'all.txt')
    with open('res.txt', 'w', encoding='utf-8') as f:
        f.writelines(str(entry) + '\n' for entry in res)
    print(time.perf_counter() - started)

 

posted @ 2020-06-11 14:46  今夜无风  阅读(282)  评论(0)  编辑  收藏  举报