Python 获取 PDF 文本
1. 从 PDF 文件获取文本
import pdfplumber

# Print what pdfplumber exposes for the first page of "4.pdf": the first
# character record, the extracted text, and the extracted tables.
# The context manager closes the PDF when the block exits.
with pdfplumber.open("4.pdf") as pdf:
    first_page = pdf.pages[0]

    # A page without a text layer (e.g. a scanned image) has an empty
    # ``chars`` list; indexing it blindly would raise IndexError and skip
    # the text/table output below, so print None in that case instead.
    page_chars = first_page.chars
    print(page_chars[0] if page_chars else None)  # first character record

    print(first_page.extract_text())  # page text
    print(first_page.extract_tables())  # page tables
2. PDF 单页纵向切割
import io
import math

from PyPDF4 import PdfFileReader, PdfFileWriter

input_file_path = '11.pdf'
# NOTE: the separate left-half output was disabled in the original script;
# this path is kept for reference but nothing is written to it.
output_file_path_l = 'l.pdf'
output_file_path_r = 'r.pdf'


def _append_strips(writer, pdf_bytes, x_left, x_right, page_top,
                   strip_height, strip_count):
    """Append cropped copies of page 0 to *writer*, from top to bottom.

    Each copy has its mediaBox set to the rectangle spanning
    ``x_left..x_right`` horizontally and one ``strip_height``-tall band
    vertically, starting at ``page_top`` and moving downwards.

    Args:
        writer: PdfFileWriter that receives the cropped pages.
        pdf_bytes: Raw bytes of the source PDF.
        x_left: Left x coordinate of the crop.
        x_right: Right x coordinate of the crop.
        page_top: y coordinate of the top of the first strip.
        strip_height: Height of every strip.
        strip_count: Number of strips to append.
    """
    for index in range(strip_count):
        # A fresh reader gives a fresh page object for every strip, as the
        # original script did by re-opening the file (presumably so that the
        # pages already added do not share one mutated mediaBox).  Parsing
        # from an in-memory buffer avoids leaking one file handle per strip.
        page = PdfFileReader(io.BytesIO(pdf_bytes)).getPage(0)
        upper = page_top - strip_height * index
        lower = page_top - strip_height * (index + 1)
        page.mediaBox.lowerLeft = (x_left, lower)
        page.mediaBox.lowerRight = (x_right, lower)
        page.mediaBox.upperLeft = (x_left, upper)
        page.mediaBox.upperRight = (x_right, upper)
        writer.addPage(page)


# Read the source once; every reader below parses this same buffer.
with open(input_file_path, 'rb') as source_file:
    pdf_bytes = source_file.read()

pdf_input = PdfFileReader(io.BytesIO(pdf_bytes))
pdf_output = PdfFileWriter()

page0 = pdf_input.getPage(0)
width = float(page0.mediaBox.getWidth())
height = float(page0.mediaBox.getHeight())

# Each strip is as tall as the source page is wide; the last strip may
# extend below y=0 when the height is not a multiple of the width.
page_height = width
new_page_count = math.ceil(height / page_height)

# Left-half strips first, then right-half strips, all into the same writer.
_append_strips(pdf_output, pdf_bytes, 0, width / 2,
               height, page_height, new_page_count)
_append_strips(pdf_output, pdf_bytes, width / 2, width,
               height, page_height, new_page_count)

# Write inside ``with`` so the output file is flushed and closed.
with open(output_file_path_r, 'wb') as target_file:
    pdf_output.write(target_file)