使用 Python 自动化下载 PDF 文档

使用 Python，输入 PDF 编号即可自动下载 freepatentsonline.com 上的文档

#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:requests_freepatentsonline_com.py
# Author:lgsp_Harold
import os
import requests
from lxml import etree
 
# Directory (relative to the current working directory) that receives the
# downloaded PDF files.
dir_path = './files/freepatentsonline_com/'

# exist_ok=True creates the directory tree only when it is missing, without
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(dir_path, exist_ok=True)

# HTTP headers passed to every requests.get() call in this script; only a
# Firefox-style User-Agent is set.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
}
 
 
# Seconds to wait on each HTTP request before giving up, so that a stalled
# connection cannot hang the interactive prompt indefinitely.
REQUEST_TIMEOUT = 30


def _download_pdf(number):
    """Download the PDF for document *number* and save it under ``dir_path``.

    The ``<number>.pdf`` URL on freepatentsonline.com is parsed as an HTML
    page; the real PDF location is taken from the ``src`` attribute of the
    iframe embedded in that page.

    Args:
        number: Document number exactly as typed by the user.

    Returns:
        The path of the file that was written.

    Raises:
        requests.RequestException: A request failed, timed out, or returned
            an HTTP error status.
        LookupError: The page contains no matching PDF iframe.
        OSError: The output file could not be written.
    """
    url = 'https://www.freepatentsonline.com/' + number + '.pdf'
    page = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail early on 4xx/5xx instead of parsing an error page.
    page.raise_for_status()

    doc = etree.HTML(page.text)
    # etree.HTML() may yield None for an empty document; treat that the same
    # as "no iframe found" rather than crashing on None.xpath().
    sources = [] if doc is None else doc.xpath(
        '//center[@style="border: 2px inset;"]/iframe/@src')
    if not sources:
        raise LookupError('页面中未找到PDF下载链接')

    # The iframe src is a pre-signed S3 URL followed by a '#view=FitH' viewer
    # fragment; the fragment is not part of the download address.
    download_url = sources[0].replace('#view=FitH', '')

    pdf_file = requests.get(download_url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Do not save an error body to disk as if it were a PDF.
    pdf_file.raise_for_status()

    # Flatten any path separators in the number so the file always lands
    # directly inside dir_path instead of a non-existent subdirectory.
    safe_name = number.replace('/', '_').replace('\\', '_')
    file_path = os.path.join(dir_path, safe_name + '.pdf')

    with open(file_path, 'wb') as f:
        f.write(pdf_file.content)
    return file_path


# Interactive loop: keep prompting for document numbers until the user quits.
while True:
    try:
        number = input('(输入Q退出)输入pdf编号:').strip()
    except EOFError:
        # stdin was closed (e.g. piped input ran out): leave the loop cleanly.
        break
    if number.upper() == 'Q':
        break
    if not number:
        # Nothing was typed; ask again instead of requesting '/.pdf'.
        continue
    try:
        _download_pdf(number)
    except (requests.RequestException, etree.LxmlError, LookupError, OSError) as exc:
        # One failed download must not end the session; report and go on.
        print('%s-PDF下载失败: %s' % (number, exc))
    else:
        print('%s-PDF成功下载' % number)

 

posted @ 2021-08-18 16:01  嘆世殘者——華帥  阅读(875)  评论(0编辑  收藏  举报