搜索本地pdf和txt文件内容
import fitz # PyMuPDF
import re
from pathlib import Path
from colorama import Fore
import sys
import os
def search_pdf(pdf_path, format, keywords, context_len=10, base_dir='C:/Users/tellw'):
    """Search one document for the keywords, in order, and print every hit.

    The document text is normalized (all whitespace removed, lowercased) and
    cached in a file whose path is ``pdf_path`` made relative to ``base_dir``.
    That relative path is resolved against the current working directory, so
    the caller decides where the cache tree lives by changing directory first.
    An existing cache file is reused without touching the source document.

    A hit is a stretch of the normalized text in which the keywords occur in
    the given order with at most 20 characters between consecutive keywords.
    Each hit is printed with up to ``context_len`` characters of context on
    either side, the keywords highlighted in red, followed by the source path.

    Args:
        pdf_path: Path of the source document (``str`` or ``pathlib.Path``).
        format: ``'pdf'`` (text extracted with PyMuPDF) or ``'txt'`` (read as
            UTF-8). Only consulted when the cache file has to be built. The
            name shadows the builtin but is kept for existing callers.
        keywords: Sequence of regular-expression fragments to find in order.
            Matching ignores case because the cached text is lowercased.
        context_len: Maximum number of context characters shown on each side.
        base_dir: Directory that ``pdf_path`` is made relative to when
            deriving the cache path.

    Raises:
        ValueError: If the cache must be built and ``format`` is neither
            ``'pdf'`` nor ``'txt'``.
    """
    if not keywords:
        # An empty keyword list would degenerate into matching the whole text.
        return

    cache_path = os.path.relpath(pdf_path, base_dir)
    if os.path.exists(cache_path):
        with open(cache_path, 'r', encoding='utf8') as f:
            text = f.read()
    else:
        # Extract everything BEFORE creating the cache file, so a failing
        # extraction cannot leave behind an empty cache that later runs would
        # silently treat as valid.
        if format == 'pdf':
            with fitz.open(pdf_path) as document:
                text = ''.join(
                    re.sub(r'\s', '', page.get_text()).lower()
                    for page in document
                )
        elif format == 'txt':
            with open(pdf_path, 'r', encoding='utf8') as f:
                # \s covers spaces, tabs, newlines and full-width spaces.
                text = re.sub(r'\s', '', f.read()).lower()
        else:
            raise ValueError(f"unsupported format {format!r}; expected 'pdf' or 'txt'")

        # dirname() is separator-agnostic and is empty when the cache file
        # sits directly in the working directory.
        cache_dir = os.path.dirname(cache_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, 'w', encoding='utf8') as f:
            f.write(text)

    # Wrap each keyword in a non-capturing group so alternations inside a
    # keyword keep their intended precedence once the fragments are joined.
    fragments = [f'(?:{kw})' for kw in keywords]
    search_re = re.compile(
        f'.{{0,{context_len}}}' + '.{0,20}'.join(fragments) + f'.{{0,{context_len}}}',
        re.IGNORECASE,
    )
    # Highlight all keywords in a single pass; substituting one keyword at a
    # time would let later keywords match inside already inserted colour codes.
    highlight_re = re.compile('|'.join(fragments), re.IGNORECASE)

    for hit in search_re.finditer(text):
        snippet = highlight_re.sub(
            lambda m: f'{Fore.RED}{m.group(0)}{Fore.RESET}' if m.group(0) else '',
            hit.group(0),
        )
        print(snippet + '\t\t\t\t' + str(pdf_path) + '\n')
# Command-line driver: every argument is one keyword; the keywords must occur
# in the given order inside a document for it to produce a hit.
if len(sys.argv) < 2:
    # sys.exit() with a string prints it to stderr and exits with status 1,
    # so the exit code is unchanged but the user learns what was expected.
    sys.exit(f'usage: {sys.argv[0]} KEYWORD [KEYWORD ...]')
keywords = sys.argv[1:]

# Directories scanned (non-recursively) for *.pdf and *.txt documents.
dirs = [
    'C:/Users/tellw/open_title/file_updates',
    'C:/Users/tellw/open_title/papers/benchmark',
    'C:/Users/tellw/open_title/papers/edge_computing',
    'C:/Users/tellw/open_title/papers/guidance',
    'C:/Users/tellw/open_title/papers/methodology',
    'C:/Users/tellw/open_title/papers/misc',
    'C:/Users/tellw/open_title/papers/other-themes-benchmark',
    'C:/Users/tellw/open_title/papers/speech_recognition',
    'C:/Users/tellw/open_title/papers/test',
    'C:/Users/tellw/open_title/papers/to_c',
    'C:/Users/tellw/open_title/papers/books',
]
pdf_file_paths = []
txt_file_paths = []
for d in dirs:
    folder = Path(d)
    pdf_file_paths.extend(folder.glob('*.pdf'))
    txt_file_paths.extend(folder.glob('*.txt'))

# search_pdf() writes its normalized-text cache to a path relative to the
# current working directory, so switch into the search space first. Create it
# on first use instead of crashing when it does not exist yet.
search_space = 'C:/Users/tellw/open_title/paper_search_space'
os.makedirs(search_space, exist_ok=True)
os.chdir(search_space)

context_len = 30
# All PDFs are searched first, then all text files.
for pdf_file_path in pdf_file_paths:
    search_pdf(pdf_file_path, 'pdf', keywords, context_len)
for txt_file_path in txt_file_paths:
    search_pdf(txt_file_path, 'txt', keywords, context_len)
首先由百度GPT给出搜索pdf文件中关键字的代码,后在其基础上改进。搜索关键词作为脚本的参数传入,例如['搜','索','内','容']。
脚本首先找到目标文档——pdf文件和txt文件,并在其对应的搜索空间里保存一份去掉了空格、换行符、大小写差异等与搜索结果展示无关内容的文本;
然后在搜索空间里按照正则表达式 .{0,30}搜.{0,20}索.{0,20}内.{0,20}容.{0,30} 搜索目标字符串。
创建于2404061003,修改于2412042104