python实现目录中制定内容查找

#【函数实现代码】------------------------------------------------------------------------------------------------------------------------------------------------------------------

#_*_coding=utf-8_*_
__author__ = 'fang'
__date__ = '2019/2/25 9:58'

import os,codecs,chardet
from time import time
print(__file__)
thistime = time()

def endWith(s, *endstring):
    """
    检查文件名称是否以endstring为结尾
    :param s: 
    :param endstring: 
    :return: 
    """
    array = map(s.endswith, endstring)
    if True in array:
        return True
    else:
        return False

def writeResultLog(allExistsKeywords):
    """
    获取查询结果的内容
    将全部已搜索到的关键字列表中的内容保存到result.log文件中
    :param allExistsKeywords: 
    :return: 
    """

    __logfilename = "result.log"  # 相对路径,文件在.py文件所在的目录中
    # 行分隔符
    ls = os.linesep
    # 结果日志文件名
    try:
        fobj = open(__logfilename, 'w',errors='ignore')
    except IOError as e:
        print("*** file open error:", e)

    else:
        # print(allExistsKeywords)
        fobj.writelines(['%s%s' % (keyword, ls) for keyword in allExistsKeywords])
        fobj.close()
#判断文件的编码方式是否是utf8格式文件，是返回True否则False
# def existBOM(file_obj):
#     code = file_obj.read(3)
#     file_obj.close()
#     if code == codecs.BOM_UTF8:  # 判断是否包含EF BB BF
#         return True   #如果要去掉头部信息的话s = s[len(codecs.BOM_UTF8):]
#     return False

def searchFilesContent(url):
    """
    从searchkeywords.txt文件中初始化待搜索关键字列表
    :param dirname: 
    :return: 
    """
    filename = "searchkeywords.txt"  # 相对路径,文件在.py文件所在的目录中,搜索关键字的文件
    # 待搜索关键字列表
    allSearchKeywords = []
    # 遍历文件当前行已搜索到的关键字列表
    existsKeywordsThisLine = []
    allExistsKeywords = []
    #放置所有搜索的文件
    thistime = time()
    try:
        fobj = open(filename, 'r')
    except IOError as e:
        print("*** file open error:", e)
    else:
        for eachLine in fobj:
            allSearchKeywords.append(eachLine.strip('\n'))  # 使用strip函数去除每行的换行符
        fobj.close()

    # 从excludekeywords.txt文件中初始化要排除的搜索关键字列表
    filename = "excludekeywords.txt"  # 相对路径,文件在.py文件所在的目录中
    # 要排除的搜索关键字列表
    allExcludedKeywords = []
    try:
        fobj = open(filename, 'r')
    except IOError as e:
        print("*** file open error:", e)

    else:
        for eachLine in fobj:
            allExcludedKeywords.append(eachLine.strip('\n'))  # 使用strip函数去除每行的换行符
        fobj.close()

    # 从全部已搜索到的关键字列表排除掉不用搜索的关键字
    for excluedkw in allExcludedKeywords:
        if (excluedkw in allSearchKeywords):
            allSearchKeywords.remove(excluedkw)
    # 遍历打开所有要在其中搜索内容的文件，若待搜索关键字列表为空，则不再继续遍历
    for root, dirs, files in os.walk(url):
        for file in files:
            if endWith(file, '.txt', '.py'):  # 只在扩展名为'.txt', '.py'文件中搜索
                # 打开文件
                filename = root + os.sep + file  # 绝对路径
                filename = filename.replace("\\","\\\\")  # 将路径中的单反斜杠替换为双反斜杠，因为单反斜杠可能会导致将路径中的内容进行转义了，replace函数中"\\"表示单反斜杠，"\\\\"表示双反斜杠
                try:
                    # ==========开始读取目录中的文件遍历查找的过程
                    fobj = codecs.open(filename, 'r', 'utf_8_sig', errors='ignore')
                except IOError as e:
                    print("*** file open error:", e)
                else:
                    # 遍历文件的每一行
                    allSearchKeywords_1 = allSearchKeywords
                    for fileLine in fobj:
                        # 判断当前行是否包含所有搜索关键字
                        for keyword in allSearchKeywords:
                            # 若包含，并添加到该行已搜索到的关键字列表中
                            if keyword.upper() in fileLine.upper():  # 将搜索关键字和该行文本内容都转换为大写后再进行匹配
                                existsKeywordsThisLine.append(keyword)

                        # 将这些搜索到的关键字添加到全部已搜索到的关键字列表中，并包含文件名信息
                        for keyword in existsKeywordsThisLine:
                            allExistsKeywords.append(keyword + "\t" + filename.replace("\\\\", "\\"))
                        if allSearchKeywords is None:
                            existsKeywordsThisLine = []
                            break
                        # 清空该行已搜索到的关键字列表内容
                        existsKeywordsThisLine = []
                    allSearchKeywords = allSearchKeywords_1
                    fobj.close()
                    # 全部文件遍历结束
    writeResultLog(allExistsKeywords)
    print("DONE!", )
# 仅当本python模块直接执行时，才执行如下语句，若被别的python模块引入，则不执行
if __name__ == '__main__':
    url = r"E:\python_data"
    searchFilesContent(url)
    search_time = time() - thistime
    print('The code run {:.0f}m {:.0f}s'.format(search_time // 60, search_time % 60))

#【类实现代码】------------------------------------------------------------------------------------------------------------------------------------------------------------------

#_*_coding=utf-8_*_
__author__ = 'fang'
__date__ = '2019/2/25 9:58'

import os,codecs,chardet
from multiprocessing import Process,Queue, Lock,current_process
from time import time
print(__file__)
class File_Search(object):
    def __init__(self, url):
        """初始化"""
        self.__url = url

    def endWith(self, s, *endstring):
        """
        检查文件名称是否以endstring为结尾
        :param s:
        :param endstring:
        :return:
        """
        array = map(s.endswith, endstring)
        if True in array:
            return True
        else:
            return False

    def proc_read(self):
        allExistsKeywords = []
        while True:
            try:
                data = self.q.get()
                allExistsKeywords.extend(data)
            except:
                print("get读取查询到的数据结束，数据是",allExistsKeywords)
                self.writeResultLog(allExistsKeywords)
                break
        return time() - self.thistime

    def writeResultLog(self):
        """
        获取查询结果的内容
        将全部已搜索到的关键字列表中的内容保存到result.log文件中
        :param allExistsKeywords:
        :return:
        """
        allExistsKeywords = []
        while True:
            try:
                data = self.q.get(block=False)
                allExistsKeywords.extend(data)
            except :
                print("差最后一步保存就可以了....")
                break
        self.__logfilename = "result.log" # 相对路径,文件在.py文件所在的目录中
        # 行分隔符
        ls = os.linesep
        # 结果日志文件名
        try:
            fobj = open(self.__logfilename, 'w',errors='ignore')
        except IOError as e:
            print("*** file open error:", e)

        else:
            # print(allExistsKeywords)
            fobj.writelines(['%s%s' % (keyword, ls) for keyword in allExistsKeywords])
            fobj.close()
        return time() - self.thistime
    #判断文件的编码方式是否是utf8格式文件，是返回True否则False
    # def existBOM(file_obj):
    #     code = file_obj.read(3)
    #     file_obj.close()
    #     if code == codecs.BOM_UTF8: # 判断是否包含EF BB BF
    #         return True   #如果要去掉头部信息的话s = s[len(codecs.BOM_UTF8):]
    #     return False

    def searchFilesContent(self):
        """
        从searchkeywords.txt文件中初始化待搜索关键字列表
        :param dirname:
        :return:
        """
        self.__filename = "searchkeywords.txt" # 相对路径,文件在.py文件所在的目录中,搜索关键字的文件
        # 待搜索关键字列表
        self.__allSearchKeywords = []
        # 遍历文件当前行已搜索到的关键字列表
        self.__existsKeywordsThisLine = []
        # 全部已搜索到的关键字列表
        self.__allExistsKeywords = []
        #子进程创建Queue，并传给各个子进程
        self.q = Queue()
        #放置所有搜索的文件
        self.__filename_list = []
        #放置所有进程的列表
        self.process_list = []
        #进程间锁
        self.lock = Lock()
        self.thistime = time()
        filename_list = []
        try:
            fobj = open(self.__filename, 'r')
        except IOError as e:
            print("*** file open error:", e)
        else:
            for eachLine in fobj:
                self.__allSearchKeywords.append(eachLine.strip('\n')) # 使用strip函数去除每行的换行符
            fobj.close()

        # 从excludekeywords.txt文件中初始化要排除的搜索关键字列表
        filename = "excludekeywords.txt" # 相对路径,文件在.py文件所在的目录中
        # 要排除的搜索关键字列表
        allExcludedKeywords = []
        try:
            fobj = open(filename, 'r')
        except IOError as e:
            print("*** file open error:", e)

        else:
            for eachLine in fobj:
                allExcludedKeywords.append(eachLine.strip('\n')) # 使用strip函数去除每行的换行符
            fobj.close()

        # 从全部已搜索到的关键字列表排除掉不用搜索的关键字
        for excluedkw in allExcludedKeywords:
            if (excluedkw in self.__allSearchKeywords):
                self.__allSearchKeywords.remove(excluedkw)
        # 遍历打开所有要在其中搜索内容的文件，若待搜索关键字列表为空，则不再继续遍历
        for root, dirs, files in os.walk(self.__url):
            for file in files:
                if self.endWith(file, '.txt', '.py'): # 只在扩展名为'.txt', '.py'文件中搜索
                    # 打开文件
                    filename = root + os.sep + file # 绝对路径
                    filename = filename.replace("\\","\\\\") # 将路径中的单反斜杠替换为双反斜杠，因为单反斜杠可能会导致将路径中的内容进行转义了，replace函数中"\\"表示单反斜杠，"\\\\"表示双反斜杠
                    # filename_list.append(filename)
                    self.pp = Process(target=file_search.run, args=(self.q, self.lock, filename))
                    self.pp.start()
                    self.process_list.append(self.pp)
        # return filename_list
    # print("DONE!", )
    def run(self, q, l, filename):
        """
        多进程搜索文件，查找并记录到进程队列中
        :param q: 进程间通信队列
        :param l: 进程间控制锁
        :param filename: 接收的文件的绝对路径
        :return:
        """
        l.acquire()
        try:
            # ==========开始读取目录中的文件遍历查找的过程
            fobj = codecs.open(filename, 'r', 'utf_8_sig', errors='ignore')
        except IOError as e:
            print("*** file open error:", e)
        else:
            # 遍历文件的每一行
            for fileLine in fobj:
                # 判断当前行是否包含所有搜索关键字
                for keyword in self.__allSearchKeywords:
                    # 若包含，并添加到该行已搜索到的关键字列表中
                    if keyword.upper() in fileLine.upper(): # 将搜索关键字和该行文本内容都转换为大写后再进行匹配
                        self.__existsKeywordsThisLine.append(keyword)

                # 将这些搜索到的关键字添加到全部已搜索到的关键字列表中，并包含文件名信息
                for keyword in self.__existsKeywordsThisLine:
                    self.__allExistsKeywords.append(keyword + "\t" + filename.replace("\\\\", "\\"))

                # 清空该行已搜索到的关键字列表内容
                self.__existsKeywordsThisLine = []
            fobj.close()
            # 全部文件遍历结束
            # self.writeResultLog(self.__allExistsKeywords)
            q.put(self.__allExistsKeywords) #放置到队列中
            print('当前进程的名字是： ', current_process().name,'已放置到队列中....')
        l.release()

        def __del__(self):
            """
            析构函数
            join所完成的工作就是线程同步，即主线程任务结束之后，进入阻塞状态，一直等待其他的子线程执行结束之后，主线程再终止
            :param self:
            :return:
            """
            for process in self.process_list:
                process.join()

# 仅当本python模块直接执行时，才执行如下语句，若被别的python模块引入，则不执行
if __name__ == '__main__':
    url = r"E:\python_data"
    file_search = File_Search(url)
    file_search.searchFilesContent()
    search_time = file_search.writeResultLog()
    print('The code run {:.0f}m {:.0f}s'.format(search_time // 60, search_time % 60))

以上函数实现和类实现的功能是一样的，但是类实现需要的时间相比函数实现要长很多，贴在这里期待帮助初学的我指点一二，如何进行代码优化

posted @ 2019-02-28 18:13 朝鲁梦_FQM 阅读(431) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

朝鲁梦_FQM

python实现目录中制定内容查找

公告