查找大文件中的字符串的位置及前后50个字符

import os
import asyncio
import time


async def fun(i, f1, find_str, count, end_offset):
    """Scan segment ``i`` of an open binary file for the first ``find_str``.

    The segment is read in 513 KiB windows taken every 512 KiB, so consecutive
    windows overlap by 1 KiB and a search string of at most 1 KiB is never
    split across two windows.

    Args:
        i: Zero-based segment number.
        f1: Seekable binary file object (anything with ``seek``/``read``).
        find_str: Bytes to look for; a ``str`` is encoded as UTF-8 first.
        count: Number of windows to scan in this segment.
        end_offset: Segment length in bytes; segment ``i`` nominally starts
            at byte ``i * end_offset``.

    Returns:
        ``None`` when nothing is found, otherwise a list
        ``[i, offset, before, after]`` where ``i * end_offset + offset`` is
        the absolute byte position of the match and ``before``/``after`` are
        up to 50 characters of text on either side of it.
    """
    step = 1024 * 512          # distance between window starts
    window = 1024 * 513        # bytes read per window (1 KiB overlap)
    context_chars = 50
    # A UTF-8 character is at most 4 bytes; 3 extra bytes absorb a character
    # that is cut in half at the edge of the context slice.
    context_bytes = 4 * context_chars + 3

    if isinstance(find_str, str):
        find_str = find_str.encode('utf-8')

    # Segments after the first begin one byte early (as the original did);
    # clamp so a zero-length segment never seeks to a negative offset.
    segment_start = max(0, i * end_offset - 1)

    for j in range(count):
        window_start = segment_start + j * step
        f1.seek(window_start, 0)
        r = f1.read(window)
        if not r:
            # Past end of file: later windows would be empty as well.
            break

        index = r.find(find_str)
        if index < 0:
            continue

        match_start = window_start + index
        match_end = match_start + len(find_str)

        # Read the context straight from the file so it is not truncated at
        # a window edge, and decode only these few bytes instead of the whole
        # window. Bytes that do not form valid UTF-8 (including characters cut
        # at the slice edges) are dropped rather than raising.
        context_start = max(0, match_start - context_bytes)
        f1.seek(context_start, 0)
        before_raw = f1.read(match_start - context_start)
        f1.seek(match_end, 0)
        after_raw = f1.read(context_bytes)

        q_str = before_raw.decode('utf-8', errors='ignore')[-context_chars:]
        h_str = after_raw.decode('utf-8', errors='ignore')[:context_chars]

        # Offset is relative to i * end_offset so that the caller's
        # ``i * end_offset + offset`` yields the true absolute position.
        return [i, match_start - i * end_offset, q_str, h_str]

    return None


def find_path(file_path, find_str, child_file_number):
    """Locate the first occurrence of ``find_str`` in a (large) file.

    The file is divided into ``child_file_number`` equal segments and one
    ``fun`` coroutine is started per segment; the hit from the lowest-numbered
    segment wins.

    Args:
        file_path: Path of the file to search; it is opened in binary mode.
        find_str: Search term handed to ``fun`` (bytes, since the file is
            read in binary mode).
        child_file_number: Number of segments to split the file into; must be
            a positive integer (0 raises ``ZeroDivisionError``).

    Returns:
        ``(position, before, after, elapsed_seconds)`` when found, where
        ``position`` is ``segment * segment_length + offset`` as reported by
        ``fun``; otherwise ``('no find', elapsed_seconds)``.
    """
    s_time = time.time()
    all_size = os.stat(file_path).st_size  # file size in bytes
    # Integer division keeps byte offsets exact for very large files.
    end_offset = all_size // child_file_number
    count = end_offset // (1024 * 512) + 1

    async def _scan_segments(f1):
        # gather() returns results in segment order regardless of completion.
        return await asyncio.gather(
            *(fun(i, f1, find_str, count, end_offset)
              for i in range(child_file_number))
        )

    # The context manager guarantees the handle is closed on every path;
    # asyncio.run() creates and tears down its own event loop.
    with open(file_path, 'rb') as f1:
        results = asyncio.run(_scan_segments(f1))

    for result in results:
        if result is not None:
            return (result[0] * end_offset + result[1], result[2], result[3],
                    time.time() - s_time)
    return 'no find', time.time() - s_time


if __name__ == '__main__':
    # Example run: search the sample file for the bytes b'aaa' using 40
    # segments. The search term is a bytes literal because the file is read
    # in binary mode.
    print(find_path('D:\\wyz\\subscript_query\\1.txt', b'aaa', 40))
posted @ 2021-07-27 16:24  wyz_1  阅读(96)  评论(0)  编辑  收藏  举报