搜索本地pdf文件内容

import fitz  # PyMuPDF
import re
from pathlib import Path
from colorama import Fore
import sys
import os
 
def _normalize_text(text):
	"""Return *text* with all whitespace removed and letters lower-cased."""
	# \s matches any whitespace: spaces, tabs, newlines, full-width spaces, ...
	return re.sub(r'\s','',text).lower()


def search_pdf(pdf_path, format,keywords,context_len=10,base_dir='C:/Users/tellw',gap_len=20):
	"""Search one document for keywords that occur in order, and print the hits.

	The document is first turned into a "search space" file: its text with
	all whitespace removed and lower-cased. That file is stored at the path
	of ``pdf_path`` relative to ``base_dir``, resolved against the current
	working directory, and is reused on later calls if it already exists.

	Each hit is printed as the matched snippet with every keyword wrapped in
	colorama colour codes, followed by four tabs and ``pdf_path``.

	Args:
		pdf_path: Path of the source document (``str`` or ``Path``).
		format: ``'pdf'`` (text extracted with PyMuPDF) or ``'txt'`` (read as
			UTF-8). Only consulted when the search-space file must be built.
		keywords: Keywords that must appear in this order. They are matched
			literally and normalised the same way as the document text.
		context_len: Maximum number of characters shown before the first and
			after the last keyword.
		base_dir: Directory that ``pdf_path`` is made relative to when
			deriving the search-space file path.
		gap_len: Maximum number of characters allowed between two
			consecutive keywords.

	Raises:
		ValueError: If the search-space file has to be built and ``format``
			is neither ``'pdf'`` nor ``'txt'``.
	"""
	relp=os.path.relpath(pdf_path,base_dir)
	if os.path.exists(relp):
		with open(relp,'r',encoding='utf8') as f:
			text=f.read()
	else:
		# Extract everything before touching the cache file, so that a failed
		# extraction cannot leave an empty cache behind for later runs.
		if format=='pdf':
			document=fitz.open(pdf_path)
			try:
				text=''.join(_normalize_text(page.get_text()) for page in document)
			finally:
				document.close()
		elif format=='txt':
			with open(pdf_path,'r',encoding='utf8') as f:
				text=_normalize_text(f.read())
		else:
			raise ValueError(f'unsupported format: {format!r} (expected "pdf" or "txt")')
		# dirname is empty when the file sits directly in base_dir.
		ds=os.path.dirname(relp)
		if ds:
			os.makedirs(ds,exist_ok=True)
		with open(relp,'w',encoding='utf8') as f:
			f.write(text)

	# The text has no whitespace or upper-case letters, so the keywords must
	# be normalised identically or they could never match.
	normalized=[_normalize_text(kw) for kw in keywords]
	escaped=[re.escape(kw) for kw in normalized if kw]
	if not escaped:
		# An empty pattern would match (and print) the entire document.
		return

	gap=f'.{{0,{gap_len}}}'
	context=f'.{{0,{context_len}}}'
	search_re=re.compile(context+gap.join(escaped)+context)
	# Highlight in a single pass so that a later keyword cannot match inside
	# the colour codes inserted for an earlier one; longest alternatives go
	# first so a keyword that is a prefix of another does not shadow it.
	highlight_re=re.compile('|'.join(sorted(set(escaped),key=len,reverse=True)))
	for sr in search_re.findall(text):
		# NOTE(review): Fore.BLACK sets the foreground to black rather than
		# restoring the terminal default (Fore.RESET); kept as in the original.
		sr=highlight_re.sub(lambda m:f'{Fore.RED}{m.group(0)}{Fore.BLACK}',sr)
		print(sr+'\t\t\t\t'+str(pdf_path)+'\n')

# Every command-line argument is one keyword; with none there is nothing to search.
if len(sys.argv)<2:
	print(f'usage: {sys.argv[0]} KEYWORD [KEYWORD ...]',file=sys.stderr)
	sys.exit(1)
keywords=sys.argv[1:]

# Directories whose *.pdf and *.txt files are searched (not recursive).
dirs=[
	'C:/Users/tellw/open_title/file_updates',
	'C:/Users/tellw/open_title/papers/benchmark',
	'C:/Users/tellw/open_title/papers/edge_computing',
	'C:/Users/tellw/open_title/papers/guidance',
	'C:/Users/tellw/open_title/papers/methodology',
	'C:/Users/tellw/open_title/papers/misc',
	'C:/Users/tellw/open_title/papers/other-themes-benchmark',
	'C:/Users/tellw/open_title/papers/speech_recognition',
	'C:/Users/tellw/open_title/papers/test',
	'C:/Users/tellw/open_title/papers/to_c',
	'C:/Users/tellw/open_title/papers/books',
]
pdf_file_paths=[]
txt_file_paths=[]
for d in dirs:
	pdf_file_paths.extend(Path(d).glob('*.pdf'))
	txt_file_paths.extend(Path(d).glob('*.txt'))

# search_pdf resolves its search-space files against the working directory,
# so switch to the search-space root before searching.
os.chdir('C:/Users/tellw/open_title/paper_search_space')
context_len=30
# All PDFs are searched first, then all text files.
for file_format,file_paths in (('pdf',pdf_file_paths),('txt',txt_file_paths)):
	for file_path in file_paths:
		try:
			search_pdf(file_path,file_format,keywords,context_len)
		except Exception as e:
			# One unreadable document must not abort the whole search:
			# report it and carry on with the remaining files.
			print(f'skipped {file_path}: {e}',file=sys.stderr)

最初的代码由百度 GPT 给出,用于搜索 PDF 文件中的关键字,之后在其基础上做了改进。搜索关键词作为脚本的命令行参数传入,例如 ['搜','索','内','容']。脚本首先找到目标文档——PDF 文件和 txt 文件,并在搜索空间目录中为每个文档生成对应的文本:去掉源文件中的空格、换行符,并统一转为小写,因为这些内容与搜索结果的展示无关。然后在搜索空间里按照正则表达式 .{0,30}搜.{0,20}索.{0,20}内.{0,20}容.{0,30} 搜索目标字符串。

创建于2404061003,修改于2412042104

posted @ 2024-12-04 21:09  园糯  阅读(2)  评论(0)  编辑  收藏  举报