# Python 操作 PDF —— 提取 PDF 文字内容
# Install the dependency first:  pip install pdfplumber

# Sample document that every extraction demo below reads from.
PDF_PATH = '基于python的网页爬虫.pdf'

# Table-finder settings for tables drawn without ruling lines (typical of
# financial reports): cell boundaries are inferred from text alignment.
TEXT_TABLE_SETTINGS = {
    'vertical_strategy': 'text',
    'horizontal_strategy': 'text',
}


def merge_leading_cells(table, merge_count=3):
    """Drop blank rows and fuse the first ``merge_count`` cells of each row.

    With the text-based strategies a single label is often split across
    several leading columns; concatenating them restores the label.

    Args:
        table: Rows of cells (each cell a string or ``None``), or ``None``
            when no table was found.
        merge_count: Number of leading cells to concatenate into one.

    Returns:
        A new list of rows. Each row starts with the merged text, followed
        by the remaining cells unchanged. ``[]`` when ``table`` is empty or
        ``None``.
    """
    merged_rows = []
    for row in table or []:
        # Treat None as empty text: str(None) would yield 'None', which both
        # defeats the blank-row check and pollutes the merged label.
        texts = ['' if cell is None else str(cell) for cell in row]
        if not ''.join(texts):
            continue  # every cell is empty -> skip the row
        merged_rows.append(
            [''.join(texts[:merge_count]), *row[merge_count:]]
        )
    return merged_rows


def main():
    """Print the text and tables found on the first page of ``PDF_PATH``."""
    # Imported here so merge_leading_cells stays importable (and testable)
    # on machines where pdfplumber is not installed.
    import pdfplumber

    # One open() serves all demos; the output order is text, single table,
    # every table, then the cleaned-up report table.
    with pdfplumber.open(PDF_PATH) as pdf:
        first_page = pdf.pages[0]

        # Extract the page text.
        print(first_page.extract_text())

        # Extract a single table.
        print(first_page.extract_table())

        # Extract every table on the page.
        for table in first_page.extract_tables():
            print(table)

        # Extract a single financial-report table. extract_table() returns
        # one table (a list of rows); extract_tables() would return a list
        # of tables and the row clean-up would run on the wrong level.
        report = first_page.extract_table(table_settings=TEXT_TABLE_SETTINGS)
        print(merge_leading_cells(report))


if __name__ == '__main__':
    main()