Scraping Baidu Images with Selenium
Baidu Images presents its results as a waterfall flow, with each new batch loaded asynchronously via AJAX, so a packet-capture tool (Fiddler) is used here to inspect the requests that carry the image links.
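For reference, the paginated AJAX requests seen in such a capture can also be replayed directly with requests. The snippet below is only a minimal sketch: the /search/acjson endpoint, the parameter names (tn, word, queryWord, pn, rn) and the thumbURL field are assumptions based on a typical capture and should be confirmed against your own Fiddler session.

import json
import requests

def fetch_thumbnails(keyword, page, page_size=30):
    # endpoint and parameter names taken from a typical capture (assumed, verify in Fiddler)
    url = 'https://image.baidu.com/search/acjson'
    params = {
        'tn': 'resultjson_com',      # ask for JSON-formatted results (assumed)
        'word': keyword,
        'queryWord': keyword,
        'pn': page * page_size,      # offset of the first result on this page
        'rn': page_size,             # results per page
    }
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, params=params, headers=headers, timeout=10)
    resp.raise_for_status()
    # the body sometimes contains stray backslash escapes; strip them before parsing
    data = json.loads(resp.text.replace(r"\'", "'")).get('data', [])
    return [item['thumbURL'] for item in data if item.get('thumbURL')]

The Selenium approach below avoids having to reverse-engineer these parameters, at the cost of driving a real browser.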
With that out of the way, here is the code:
from gevent import monkey
monkey.patch_all()          # patch blocking I/O first so the download greenlets can overlap

import os
import time
from queue import Queue
from urllib import request

import gevent
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By


def get_img(q):
    # take one page snapshot off the queue and download its newest batch of images
    html = etree.HTML(q.get())
    # the last child div under #imgid holds the batch loaded by the most recent scroll
    img_urls = html.xpath('//div[@id="imgid"]/div[last()]//li/@data-objurl')

    path = './baidupic/'
    if not os.path.exists(path):
        os.makedirs(path)

    for url in img_urls:
        print(url)
        try:
            fname = url.split('/')[-1]
            request.urlretrieve(url, os.path.join(path, fname))
            print('download succeeded')
        except Exception:
            print('image not available')


def get_page():
    # queue of page-source snapshots handed over to the download greenlets
    q = Queue()

    # Baidu image search home page
    base_url = 'https://image.baidu.com/'
    # start the browser (path to the local chromedriver)
    browser = webdriver.Chrome(executable_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe')
    # open the search page
    browser.get(base_url)
    # type the search keyword
    browser.find_element(By.ID, 'kw').send_keys('美女')
    # click the search button
    browser.find_element(By.CLASS_NAME, 's_search').click()
    time.sleep(2)

    # snapshot the first batch, then scroll ten times; each scroll triggers
    # another AJAX batch in the waterfall flow
    q.put(browser.page_source)
    for i in range(10):
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(2)                  # give the new batch time to render
        q.put(browser.page_source)     # snapshot after this scroll so its batch can be parsed
    browser.quit()

    # one download greenlet per snapshot in the queue
    g_list = [gevent.spawn(get_img, q) for _ in range(q.qsize())]
    gevent.joinall(g_list)


if __name__ == '__main__':
    get_page()
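One possible refinement: the fixed time.sleep(2) after each scroll either wastes time or, on a slow connection, snapshots the page before the next batch has rendered. Below is a minimal sketch of an explicit wait, assuming the same #imgid result container, that could stand in for the scroll-plus-sleep step.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait


def scroll_and_wait(browser, timeout=10):
    # count the result items already rendered, scroll, then block until more appear
    before = len(browser.find_elements(By.XPATH, '//div[@id="imgid"]//li'))
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    WebDriverWait(browser, timeout).until(
        lambda d: len(d.find_elements(By.XPATH, '//div[@id="imgid"]//li')) > before
    )

Dropping scroll_and_wait(browser) into the scroll loop lets it continue as soon as new items appear, and WebDriverWait raises a TimeoutException if nothing new loads within the timeout.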