Selenium 爬取详情页附件链接并下载
code1
# coding=utf-8
# Logs in through a Chrome WebDriver session, opens the detail page of every
# contract number listed in error.txt, reads the attachment link from the page
# and downloads it as "<contractNo>.pdf" into the "files" directory next to
# this script.
import os
import sys

# Make the "lib" directory that sits next to this script's directory importable
# before the third-party imports below run.
# NOTE(review): presumably vendored packages live there -- confirm.
sys.path.append(
    os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'lib')
)

import time

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


def formatFloat(num):
    """Return ``num`` formatted as a string with two decimal places."""
    return '{:.2f}'.format(num)


def downloadFile(name, url, chunk_size=8192, timeout=60):
    """Stream ``url`` to the local file ``name``, printing progress every ~2s.

    Args:
        name: Destination file path; it is opened in binary write mode.
        url: Address to fetch with an HTTP GET request.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Timeout in seconds handed to ``requests.get``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response (via ``raise_for_status``).
        OSError: If the destination file cannot be written.
    """
    headers = {'Proxy-Connection': 'keep-alive'}
    # The response is used as a context manager so the connection is released
    # even when the download fails half-way.
    with requests.get(url, stream=True, headers=headers, timeout=timeout) as r:
        # Without this check an HTML error page would be stored as the file.
        r.raise_for_status()
        # The header is absent for chunked responses; progress is then shown
        # without a percentage instead of failing before the download starts.
        raw_length = r.headers.get('content-length')
        length = float(raw_length) if raw_length else 0.0
        count = 0
        count_tmp = 0
        time1 = time.time()
        try:
            with open(name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if not chunk:
                        continue
                    f.write(chunk)
                    count += len(chunk)
                    elapsed = time.time() - time1
                    if elapsed > 2:
                        if length > 0:
                            progress = formatFloat(count / length * 100) + '%'
                        else:
                            progress = '?%'
                        # Divide by the real interval rather than a fixed 2s.
                        speed = (count - count_tmp) / 1024 / 1024 / elapsed
                        count_tmp = count
                        print(name + ': ' + progress + ' Speed: ' + formatFloat(speed) + 'M/S')
                        time1 = time.time()
        except BaseException:
            # Do not leave a truncated file behind that looks like a finished
            # download.
            if os.path.exists(name):
                os.remove(name)
            raise


# Directory where downloaded files are saved (created on first run).
file_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "files")
if not os.path.exists(file_dir):
    os.mkdir(file_dir)


def asleep(driver):
    """Set a 3.5s implicit wait on ``driver`` and then sleep for 2 seconds."""
    driver.implicitly_wait(3.5)
    time.sleep(2)


# Headless alternative that the original kept disabled:
#   chrome_options = webdriver.ChromeOptions()
#   chrome_options.add_argument('headless')
#   driver = webdriver.Chrome(options=chrome_options)
driver = webdriver.Chrome()
try:
    asleep(driver)
    driver.get("http://www.baidu.com/#/login")
    driver.maximize_window()
    # NOTE(review): credentials are hard-coded in the script; consider reading
    # them from the environment or a config file instead.
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which no
    # longer exists in current Selenium releases.
    driver.find_element(By.XPATH, ".//div[@class='login-container-r']/div[2]/input").send_keys("abc")
    driver.find_element(By.XPATH, ".//div[@class='login-container-r']/div[3]/input").send_keys("icloudeep123")
    driver.find_element(By.XPATH, ".//div[@class='login-container-r']/div[4]/input").send_keys("12345")
    driver.find_element(By.XPATH, ".//div[@class='login-container-r']/div[6]").click()
    asleep(driver)

    # File listing the contract numbers to download, one per line.
    error = os.path.join(os.path.dirname(os.path.abspath(__file__)), "error.txt")
    with open(error, "r") as f:
        for line in f:
            contractNo = line.strip()
            if not contractNo:
                # A blank line would otherwise be treated as a contract number.
                continue
            detail_url = "http://www.baidu.com/#/contractDetail?contractNum={}".format(contractNo)
            try:
                # The page lookup is inside the try so that one contract
                # without an attachment link does not abort the whole run.
                driver.get(detail_url)
                asleep(driver)
                contractUrl = driver.find_element(
                    By.XPATH,
                    ".//div[@class='float-left column list-r']/div[4]/div[2]/a",
                ).get_attribute("href")
                downloadFile(os.path.join(file_dir, "{}.pdf".format(contractNo)), contractUrl)
                print("下载成功!")
            except Exception as e:
                # Best effort: report the failing contract and move on.
                print(e)
                print(contractNo)
finally:
    # Always shut the browser and its driver process down.
    driver.quit()
code2
# coding=utf-8
# Prints the contents of records.txt, then downloads the contract PDF for
# every contract number listed in error.txt into the "files" directory next to
# this script.
import os
import time

import requests


def formatFloat(num):
    """Return ``num`` formatted as a string with two decimal places."""
    return '{:.2f}'.format(num)


def downloadFile(name, url, chunk_size=8192, timeout=60):
    """Stream ``url`` to the local file ``name``, printing progress every ~2s.

    Args:
        name: Destination file path; it is opened in binary write mode.
        url: Address to fetch with an HTTP GET request.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Timeout in seconds handed to ``requests.get``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response (via ``raise_for_status``).
        OSError: If the destination file cannot be written.
    """
    headers = {'Proxy-Connection': 'keep-alive'}
    # The response is used as a context manager so the connection is released
    # even when the download fails half-way.
    with requests.get(url, stream=True, headers=headers, timeout=timeout) as r:
        # Without this check an HTML error page would be stored as the file.
        r.raise_for_status()
        # The header is absent for chunked responses; progress is then shown
        # without a percentage instead of failing before the download starts.
        raw_length = r.headers.get('content-length')
        length = float(raw_length) if raw_length else 0.0
        count = 0
        count_tmp = 0
        time1 = time.time()
        try:
            with open(name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if not chunk:
                        continue
                    f.write(chunk)
                    count += len(chunk)
                    elapsed = time.time() - time1
                    if elapsed > 2:
                        if length > 0:
                            progress = formatFloat(count / length * 100) + '%'
                        else:
                            progress = '?%'
                        # Divide by the real interval rather than a fixed 2s.
                        speed = (count - count_tmp) / 1024 / 1024 / elapsed
                        count_tmp = count
                        print(name + ': ' + progress + ' Speed: ' + formatFloat(speed) + 'M/S')
                        time1 = time.time()
        except BaseException:
            # Do not leave a truncated file behind that looks like a finished
            # download.
            if os.path.exists(name):
                os.remove(name)
            raise


# Directory where downloaded files are saved (created on first run).
file_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "files")
if not os.path.exists(file_dir):
    os.mkdir(file_dir)

# records.txt (labelled "HTML contracts" in the original comment); its lines
# are only printed here.
record = os.path.join(os.path.dirname(os.path.abspath(__file__)), "records.txt")
with open(record, "r") as f:
    print(f.readlines())

# File listing the contract numbers to download, one per line.
error = os.path.join(os.path.dirname(os.path.abspath(__file__)), "error.txt")
with open(error, "r") as f:
    for line in f:
        contractNo = line.strip()
        if not contractNo:
            # A blank line would otherwise be treated as a contract number.
            continue
        contractUrl = "http://www.baidu.com/contract/view?contractNo={}".format(contractNo)
        try:
            downloadFile(os.path.join(file_dir, "{}.pdf".format(contractNo)), contractUrl)
        except Exception as e:
            # Best effort: report the failing contract and move on.
            print(e)
            print(contractNo)