使用 Selenium 爬取详情页中的附件链接并下载附件

code1

import sys,os
sys.path.append("/".join(os.path.dirname(os.path.abspath(__file__)).split("/")[:-1])+'/lib')
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys

#coding=utf-8

import requests
import time
import os


def formatFloat(num):
    """Return ``num`` rendered as a string with exactly two decimal places."""
    two_decimals = '.2f'
    return format(num, two_decimals)


# Download a file
def downloadFile(name, url, timeout=None):
    """Stream ``url`` into the local file ``name``, printing progress roughly every 2 seconds.

    Args:
        name: Destination path; opened in binary write mode.
        url: Address fetched with an HTTP GET request.
        timeout: Forwarded to ``requests.get``. The default ``None`` keeps the
            original behaviour of waiting indefinitely.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status (4xx/5xx).
        OSError: If the destination file cannot be opened or written.
    """
    headers = {'Proxy-Connection': 'keep-alive'}
    # The context manager releases the connection even when the download fails midway.
    with requests.get(url, stream=True, headers=headers, timeout=timeout) as r:
        # Fail before touching the disk instead of saving an error page as the file.
        r.raise_for_status()
        # Content-Length is optional (e.g. chunked responses); 0 means "size unknown".
        try:
            length = float(r.headers.get('content-length', 0))
        except ValueError:
            length = 0.0
        count = 0
        count_tmp = 0
        time1 = time.time()
        # ``with`` guarantees the file is closed if writing or reading raises.
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=512):
                if not chunk:
                    continue
                f.write(chunk)
                count += len(chunk)
                elapsed = time.time() - time1
                if elapsed > 2:
                    if length > 0:
                        percent = formatFloat(count / length * 100) + '%'
                    else:
                        percent = '?%'
                    # Divide by the real elapsed time rather than assuming exactly 2 s.
                    speed = (count - count_tmp) / 1024 / 1024 / elapsed
                    count_tmp = count
                    print(name + ': ' + percent + ' Speed: ' + formatFloat(speed) + 'M/S')
                    time1 = time.time()
    
# Directory where downloaded files are saved: "files" next to this script.
file_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "files")
# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(file_dir, exist_ok=True)


def asleep(driver, implicit_wait=3.5, pause=2):
    """Set the driver's implicit wait, then pause the script.

    Args:
        driver: Object exposing ``implicitly_wait(seconds)`` (the Selenium
            WebDriver created by this script).
        implicit_wait: Seconds passed to ``driver.implicitly_wait``.
        pause: Seconds passed to ``time.sleep`` afterwards.
    """
    driver.implicitly_wait(implicit_wait)
    time.sleep(pause)


# Headless alternative -- build the driver like this instead:
#   chrome_options = webdriver.ChromeOptions()
#   chrome_options.add_argument('headless')
#   driver = webdriver.Chrome(options=chrome_options)
# Local import: find_element(By.XPATH, ...) replaces find_element_by_xpath,
# which no longer exists in Selenium 4.3+.
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()

try:
    asleep(driver)
    driver.get("http://www.baidu.com/#/login")
    driver.maximize_window()
    # NOTE(review): credentials are hard-coded; consider reading them from the
    # environment before using this outside a demo.
    driver.find_element(By.XPATH, ".//div[@class='login-container-r']/div[2]/input").send_keys("abc")
    driver.find_element(By.XPATH, ".//div[@class='login-container-r']/div[3]/input").send_keys("icloudeep123")
    driver.find_element(By.XPATH, ".//div[@class='login-container-r']/div[4]/input").send_keys("12345")

    # Submit the login form.
    driver.find_element(By.XPATH, ".//div[@class='login-container-r']/div[6]").click()

    asleep(driver)

    # error.txt holds one contract number per line: the contracts still to download.
    error = os.path.join(os.path.dirname(os.path.abspath(__file__)), "error.txt")
    with open(error, "r") as f:
        for i in f:
            contractNo = i.strip()
            if not contractNo:
                # A blank line would otherwise request an empty contract number.
                continue
            detail_url = "http://www.baidu.com/#/contractDetail?contractNum={}".format(contractNo)

            # The page load and link lookup sit inside the try so that one bad
            # contract is reported and skipped instead of aborting the whole run.
            try:
                driver.get(detail_url)
                asleep(driver)
                contractUrl = driver.find_element(
                    By.XPATH,
                    ".//div[@class='float-left column list-r']/div[4]/div[2]/a",
                ).get_attribute("href")
                downloadFile(os.path.join(file_dir, "{}.pdf".format(contractNo)), contractUrl)
                print("下载成功！")
            except Exception as e:
                print(e)
                print(contractNo)
finally:
    # Always shut the browser down, even if login or the loop raised.
    driver.quit()

code2

#coding=utf-8

import requests
import time
import os

def formatFloat(num):
    """Format ``num`` as text with two digits after the decimal point."""
    template = '{value:.2f}'
    return template.format(value=num)

# Download a file
def downloadFile(name, url, timeout=None):
    """Stream ``url`` into the local file ``name``, printing progress roughly every 2 seconds.

    Args:
        name: Destination path; opened in binary write mode.
        url: Address fetched with an HTTP GET request.
        timeout: Forwarded to ``requests.get``. The default ``None`` keeps the
            original behaviour of waiting indefinitely.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status (4xx/5xx).
        OSError: If the destination file cannot be opened or written.
    """
    headers = {'Proxy-Connection': 'keep-alive'}
    # The context manager releases the connection even when the download fails midway.
    with requests.get(url, stream=True, headers=headers, timeout=timeout) as r:
        # Fail before touching the disk instead of saving an error page as the file.
        r.raise_for_status()
        # Content-Length is optional (e.g. chunked responses); 0 means "size unknown".
        try:
            length = float(r.headers.get('content-length', 0))
        except ValueError:
            length = 0.0
        count = 0
        count_tmp = 0
        time1 = time.time()
        # ``with`` guarantees the file is closed if writing or reading raises.
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=512):
                if not chunk:
                    continue
                f.write(chunk)
                count += len(chunk)
                elapsed = time.time() - time1
                if elapsed > 2:
                    if length > 0:
                        percent = formatFloat(count / length * 100) + '%'
                    else:
                        percent = '?%'
                    # Divide by the real elapsed time rather than assuming exactly 2 s.
                    speed = (count - count_tmp) / 1024 / 1024 / elapsed
                    count_tmp = count
                    print(name + ': ' + percent + ' Speed: ' + formatFloat(speed) + 'M/S')
                    time1 = time.time()
    
# Directory where downloaded files are saved: "files" next to this script.
file_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "files")
# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(file_dir, exist_ok=True)

# records.txt: labelled "HTML contracts" by the original author; this script
# only prints its lines.
record = os.path.join(os.path.dirname(os.path.abspath(__file__)), "records.txt")
with open(record, "r") as f:
    print(f.readlines())

# error.txt: one contract number per line -- the contracts whose download failed.
error = os.path.join(os.path.dirname(os.path.abspath(__file__)), "error.txt")
# Retry each listed contract; a failure is printed and the loop moves on.
with open(error, "r") as f:
    for i in f:
        contractNo = i.strip()
        if not contractNo:
            # A blank line would otherwise request an empty contract number
            # and write a file named ".pdf".
            continue
        contractUrl = "http://www.baidu.com/contract/view?contractNo={}".format(contractNo)

        try:
            downloadFile(os.path.join(file_dir, "{}.pdf".format(contractNo)), contractUrl)
        except Exception as e:
            print(e)
            print(contractNo)

posted @ 2020-12-28 11:27 anobscureretreat 阅读(613) 评论(0) 编辑收藏举报

刷新页面返回顶部