使用Python将pdf文件转换成word,csv

一:下载所需要的库

1 :pdfminer    安装库命令 pip install pdfminer3k

pdfminer3k是pdfminer的Python 3端口。PDFMiner是从PDF文档中提取信息的工具。与其他PDF相关的工具不同,它完全专注于获取和分析文本数据。PDFMiner允许获取页面中文本的确切位置,以及其他信息,如字体或线条。它包含一个PDF转换器,可以将PDF文件转换为其他文本格式(如HTML)。它有一个可扩展的PDF解析器,可用于其他目的而不是文本分析。

2:  docx          安装库命令 pip install python_docx

Python DocX目前是Python OpenXML的一部分,你可以用它打开Word 2007及以后的文档,而用它保存的文档可以在Microsoft Office 2007/2010, Microsoft Mac Office 2008, Google Docs, OpenOffice.org 3, and Apple iWork 08中打开。

复制代码
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter,process_pdf
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from docx import Document
document = Document()
import warnings
warnings.filterwarnings("ignore")
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from urllib.request import urlopen
import pandas as pd

def readPDF(pdfFile):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)

    process_pdf(rsrcmgr, device, pdfFile)
    device.close()

    content = retstr.getvalue()
    retstr.close()
    return content
def save_to_file(file_name, contents):
    fh = open(file_name, 'w')
    fh.write(contents)
    fh.close()

save_to_file('mobiles.txt', 'your contents str')


def main():
    pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")
    outputString = readPDF(pdfFile)
#c.word save_to_file(
'c.csv',outputString) if __name__ == '__main__': main()
复制代码

使用docx 保存为word

复制代码
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from docx import Document
document = Document()
import warnings
warnings.filterwarnings("ignore")
import os
file_name=os.open('/Users/dudu/Desktop/test1/a.pdf',os.O_RDWR )

def main():

    fn = open(file_name,'rb')
    parser = PDFParser(fn)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    resource = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(resource,laparams=laparams)
    interpreter = PDFPageInterpreter(resource,device)
    for i in doc.get_pages():
        interpreter.process_page(i)
        layout = device.get_result()
        for out in layout:
            if hasattr(out,"get_text"):
                content = out.get_text().replace(u'\xa0', u' ') 
                document.add_paragraph(
                    content, style='ListBullet'   
                )
            document.save('a'+'.docx')
    print ('处理完成')
 
if __name__ == '__main__':
    main()
复制代码

加下面的公众号,我会定期发一些资料。

posted @   DUDU1992  阅读(15359)  评论(0编辑  收藏  举报
努力加载评论中...
点击右上角即可分享
微信分享提示