Dbass

导航

selenium+phantomjs+pyquery 爬取淘宝商品信息

 1 from selenium import webdriver
 2 from selenium.common.exceptions import TimeoutException
 3 from selenium.webdriver.common.by import By
 4 from selenium.webdriver.support.ui import WebDriverWait
 5 from selenium.webdriver.support import expected_conditions as EC
 6 import re
 7 from pyquery import PyQuery as pq
 8 from config import *
 9 import pymongo
10 
# MongoDB handle: connect once at import time and select the target database.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Headless PhantomJS browser shared by every scraping function below.
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# Explicit wait: polls (every 0.5 s by default) until a condition holds and
# raises TimeoutException if it has not been met within 10 seconds.
wait = WebDriverWait(browser, timeout=10)
browser.set_window_size(width=1400, height=900)
def search(keyword='美食', max_retries=3):
    """Search Taobao for *keyword* and scrape the first page of results.

    Loads the Taobao home page, types the keyword into the search box,
    submits the form, waits for the pager's "total" element and then calls
    ``get_product()`` for the first results page.

    Args:
        keyword: Search term typed into the search box. Defaults to '美食',
            the value that used to be hard-coded.
        max_retries: Number of extra attempts made after a timeout. The
            previous implementation recursed without limit, which ended in
            a RecursionError when the site never responded.

    Returns:
        The text of the pager's "total" element.

    Raises:
        TimeoutException: If the final attempt also times out.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(1, attempts + 1):
        print("正在搜索")
        try:
            browser.get('https://www.taobao.com/')
            # Search box and search button on the home page.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
            search_box.send_keys(keyword)
            submit.click()
            # The pager's "total" element only appears once results are loaded.
            total = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")))
            get_product()
            return total.text
        except TimeoutException:
            # Start over from the home page, but give up after the last attempt.
            if attempt == attempts:
                raise
def next_page(page_number, max_retries=3):
    """Jump to results page *page_number* and scrape it.

    Clears the pager's page-number box, types the requested page, submits,
    waits until the pager highlights that page as active and then calls
    ``get_product()``.

    Args:
        page_number: Page to jump to.
        max_retries: Number of extra attempts made after a timeout. The
            previous implementation recursed without limit, which ended in
            a RecursionError when the page never became active.

    Raises:
        TimeoutException: If the final attempt also times out.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(1, attempts + 1):
        print("正在翻页", page_number)
        try:
            page_box = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
            # Replace whatever page number is currently in the box.
            page_box.clear()
            page_box.send_keys(str(page_number))
            submit.click()
            # The jump is complete once the active pager item shows our number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                str(page_number)))
            get_product()
            return
        except TimeoutException:
            if attempt == attempts:
                raise
def get_product():
    """Parse every product on the current results page and store each one.

    Waits for at least one product element to be present, parses the page
    source with pyquery, prints each product dict and passes it to
    ``save_to_mongo``.
    """
    # Descendant selector: the spaces between the class names are significant.
    item_selector = "#mainsrp-itemlist .items .item"
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
    page = pq(browser.page_source)
    for node in page(item_selector).items():
        record = {
            'image': node.find('.pic .img').attr('src'),
            'price': node.find('.price').text(),
            # Drop the last three characters of the deal-count text.
            'deal': node.find('.deal-cnt').text()[:-3],
            'title': node.find('.title').text(),
            'shop': node.find('.shop').text(),
            'location': node.find('.location').text(),
        }
        print(record)
        save_to_mongo(record)
def save_to_mongo(result):
    """Insert one scraped product into the MongoDB collection ``MONGO_TABLE``.

    Storage is best-effort: a failure is reported on stdout, together with
    the exception that caused it, and the scrape carries on.

    Args:
        result: Product document (dict) to insert.
    """
    try:
        # Collection.insert() is deprecated since PyMongo 3.0 and removed in
        # 4.0; insert_one() is the supported single-document insert.
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        print('存储失败->', result, exc)
    else:
        print('存储成功->', result)
def main(max_pages=19):
    """Run the scrape: search, then walk the remaining result pages.

    Args:
        max_pages: Highest page number to visit. Defaults to 19, the limit
            that used to be hard-coded. The crawl also stops at the page
            count reported by the site, so pages that do not exist are
            never requested.

    Raises:
        ValueError: If no page count can be found in the pager text.
    """
    try:
        total_text = search()
        match = re.search(r'\d+', total_text)
        if match is None:
            raise ValueError(
                'could not find a page count in pager text: %r' % (total_text,))
        total = int(match.group(0))
        # Page 1 was already scraped by search(); continue from page 2.
        last_page = min(total, max_pages)
        for page_number in range(2, last_page + 1):
            next_page(page_number)
    finally:
        # quit() also ends the driver process, and running it in `finally`
        # releases the browser even when scraping fails part-way.
        browser.quit()


if __name__ == '__main__':
    main()

# Settings the scraper pulls in through `from config import *`.
# Host/URI handed to pymongo.MongoClient.
MONGO_URL = 'localhost'
# Name of the database selected on the Mongo client.
MONGO_DB = 'taobao'
# Name of the collection that scraped products are inserted into.
MONGO_TABLE = 'product'

# Command-line flags passed to PhantomJS via `service_args`; judging by the
# flag names they turn image loading off and the disk cache on.
SERVICE_ARGS  = ['--load-images=false', '--disk-cache=true']
config

 

posted on 2017-11-30 16:38  Dbass  阅读(252)  评论(0)  编辑  收藏  举报