Python对pdf中的关键字过滤(pdfminer3k或pdfminer使用)
最近在实习,老板一下子发给了我120份研报,然而很多都是没用的。聪明的大脑一定要想办法让电脑帮助自己完成简单的工作!
下面是用Python筛选含有“丙烯”关键字的PDF的程序,由于文件的保密性,只能贴出代码。
注意:
pip install pdfminer3k而不是pdfminer
导入的时候名字是pdfminer,原因我猜是python版本的问题
# -*- coding: utf-8 -*-
"""
Created on Fri May 10 16:54:16 2019

@author: didi.lv

Scan a folder of PDF files and copy every file whose extracted text contains
a keyword into a target folder.
"""

import os
from io import StringIO
import shutil

# NOTE: install the parser with `pip install pdfminer3k`, not `pdfminer`.
# The package is nevertheless imported under the name `pdfminer`.

# Folder whose PDF files are scanned for the keyword.
SCAN_DIR = r'C:\\Users\didi.lv\Desktop\filenames'
# Matching files are copied from COPY_SOURCE_DIR to COPY_TARGET_DIR.
# NOTE(review): the copy source ("filenames1") is a different folder from the
# one that is scanned ("filenames"); the original script did this on purpose,
# so both folders must contain files with the same names -- confirm.
COPY_SOURCE_DIR = r'C:\\Users\didi.lv\Desktop\filenames1'
COPY_TARGET_DIR = r'C:\\Users\didi.lv\Desktop\filenames2'
# Text searched for in each PDF.
KEYWORD = '丙烯'
# Number of PDF files skipped at the start of the list.
# NOTE(review): 48 is carried over unchanged from the original script and
# looks like a "resume after an interrupted run" value; set it to 0 to
# process every file.
START_INDEX = 48


def readPdf(pdf_file):
    """Extract the text content of a PDF.

    Args:
        pdf_file: File object of the PDF, opened in binary mode ('rb').

    Returns:
        The text written by pdfminer's TextConverter, as a single string.
    """
    # Imported here rather than at module level so that the directory helpers
    # in this file can be imported without pdfminer3k being installed.
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr=rsrcmgr, outfp=retstr,
                               laparams=laparams)
        try:
            process_pdf(rsrcmgr=rsrcmgr, device=device, fp=pdf_file)
        finally:
            # Close the converter even when parsing fails.
            device.close()
        return retstr.getvalue()


def file_name(file_dir):
    """Return the names of the files located directly inside *file_dir*.

    Sub-directories are not descended into, because the caller joins each
    returned name onto *file_dir* itself.

    Args:
        file_dir: Path of the directory to list.

    Returns:
        A list of file names (without directory part). The list is empty when
        the directory has no files or cannot be walked.
    """
    # os.walk is top-down by default, so its first item describes file_dir
    # itself; it yields nothing at all when the directory cannot be listed.
    _root, _dirs, files = next(os.walk(file_dir), (file_dir, [], []))
    return files


def _pdf_names(names):
    """Return the entries of *names* ending in '.pdf' (case-insensitive)."""
    return [name for name in names if name.lower().endswith('.pdf')]


def main():
    """Copy every scanned PDF that mentions KEYWORD to COPY_TARGET_DIR."""
    pdf_names = _pdf_names(file_name(SCAN_DIR))
    # The counter restarts at 1 for the first file actually processed.
    for i, name_check in enumerate(pdf_names[START_INDEX:], start=1):
        print('--------------------------------------------------------')
        print(i)
        name_check_open = os.path.join(SCAN_DIR, name_check)
        # The context manager closes the PDF once its text has been read.
        with open(name_check_open, 'rb') as pdf_file:
            content = readPdf(pdf_file)
        if KEYWORD in content:
            file_origin = os.path.join(COPY_SOURCE_DIR, name_check)
            file_target = os.path.join(COPY_TARGET_DIR, name_check)
            shutil.copyfile(file_origin, file_target)
            print('copy No. %d file' % i)


if __name__ == '__main__':
    main()
原文:https://blog.csdn.net/Eric2016_Lv/article/details/90082280