selenium+phantomjs+pyquery 爬取淘宝商品信息
"""Scrape Taobao search results with Selenium (PhantomJS) and PyQuery.

The script searches Taobao for a keyword, walks through the result pages,
extracts basic information about every listed product and stores each
product as one document in MongoDB. Connection settings and PhantomJS
flags are read from the sibling ``config`` module.
"""
import re

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import MONGO_DB, MONGO_TABLE, MONGO_URL, SERVICE_ARGS

# Default search keyword typed into the Taobao search box.
KEYWORD = '美食'
# How many times a page interaction is attempted before the timeout is
# propagated to the caller (the original code retried without any limit).
MAX_ATTEMPTS = 3
# Highest page number requested by main(); the original loop was
# ``range(2, 20)``, i.e. it stopped after page 19.
MAX_PAGE = 19
# CSS selector matching one product card in the result list. The spaces
# matter: they are descendant combinators between the class names.
ITEM_SELECTOR = "#mainsrp-itemlist .items .item"

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# Wait up to 10 seconds for a condition (polled every 0.5 s by default)
# before continuing; a TimeoutException is raised when the condition is
# not met in time.
wait = WebDriverWait(browser, 10)
browser.set_window_size(1400, 900)


def search(keyword=KEYWORD, max_attempts=MAX_ATTEMPTS):
    """Run the keyword search and scrape the first result page.

    Args:
        keyword: Text typed into the search box.
        max_attempts: Number of tries before giving up (values below 1
            are treated as 1).

    Returns:
        The text of the pager's "total" element, from which the total
        number of result pages can be parsed.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(1, max_attempts)
    for attempt in range(1, attempts + 1):
        print("正在搜索")
        try:
            browser.get('https://www.taobao.com/')
            # Search input box.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            # Search button.
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 "#J_TSearchForm > div.search-button > button")))
            search_box.send_keys(keyword)
            submit.click()
            total = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 "#mainsrp-pager > div > div > div > div.total")))
            get_product()
            return total.text
        except TimeoutException:
            # Retry from a fresh page load, but never loop forever.
            if attempt == attempts:
                raise


def next_page(page_number, max_attempts=MAX_ATTEMPTS):
    """Jump to result page ``page_number`` and scrape it.

    The pager's input box is cleared, the target page number is typed in
    and the form is submitted; once the pager highlights that number the
    products on the page are scraped.

    Args:
        page_number: Page number to jump to.
        max_attempts: Number of tries before giving up (values below 1
            are treated as 1).

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(1, max_attempts)
    for attempt in range(1, attempts + 1):
        print("正在翻页", page_number)
        try:
            page_box = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 "#mainsrp-pager > div > div > div > div.form > input")))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 "#mainsrp-pager > div > div > div > div.form"
                 " > span.btn.J_Submit")))
            page_box.clear()
            page_box.send_keys(str(page_number))
            submit.click()
            # The jump is complete once the active pager item shows the
            # requested page number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 "#mainsrp-pager > div > div > div > ul"
                 " > li.item.active > span"),
                str(page_number)))
            get_product()
            return
        except TimeoutException:
            if attempt == attempts:
                raise


def get_product():
    """Extract every product on the current result page and store it.

    Waits until at least one product card is present, parses the page
    source with PyQuery and passes one dict per card to save_to_mongo().

    Raises:
        TimeoutException: If no product card appears in time.
    """
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ITEM_SELECTOR)))
    doc = pq(browser.page_source)
    for item in doc(ITEM_SELECTOR).items():
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal count text;
            # presumably the trailing "人付款" suffix -- TODO confirm.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert ``result`` into the configured MongoDB collection.

    Storage is best effort: a failure is reported on stdout together
    with the error and does not abort the crawl.

    Args:
        result: Product dict to store.
    """
    try:
        # insert_one replaces Collection.insert, which is deprecated and
        # no longer exists in PyMongo 4.
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        print('存储失败->', result, exc)
    else:
        print('存储成功->', result)


def main(max_page=MAX_PAGE):
    """Crawl the search results from page 1 up to ``max_page``.

    The crawl never goes past the total page count reported by the site,
    and the browser is always shut down at the end, even on errors.

    Args:
        max_page: Highest page number to request.
    """
    try:
        total_text = search()
        found = re.search(r'\d+', total_text)
        if found:
            total = int(found.group(0))
        else:
            # No page count in the pager text: only page 1 (already
            # scraped by search()) is known to exist.
            total = 1
        for page in range(2, min(total, max_page) + 1):
            try:
                next_page(page)
            except TimeoutException:
                # One stubborn page should not abort the whole crawl.
                print('翻页失败->', page)
    finally:
        # quit() ends the WebDriver session and the PhantomJS process;
        # close() would only close the current window.
        browser.quit()


if __name__ == '__main__':
    main()
# Settings for the Taobao crawler; these appear to be the contents of the
# ``config`` module that the crawler script imports.

# Address handed to pymongo.MongoClient.
MONGO_URL = 'localhost'
# Name of the database selected on the client.
MONGO_DB = 'taobao'
# Name of the collection the scraped products are inserted into.
MONGO_TABLE = 'product'

# Command-line flags passed to PhantomJS via ``service_args``: do not load
# images and enable the disk cache.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']