使用selenium爬取天猫美食店铺

'''利用selenium爬取网页内容'''

import re

import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from config import *

# Alternative headless backend that was tried previously (kept for reference):
# driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# Module-level browser session shared by every function in this script.
driver = webdriver.Chrome()
# Shared explicit wait: each wait.until(...) call gives up after 10 seconds
# and raises TimeoutException.
wait = WebDriverWait(driver, 10)

# Author's note: with the line below enabled the page content could be scraped;
# without it a TimeOut error occurred (presumably when using PhantomJS -- confirm).
# driver.set_window_size(1400,900)

def search(max_retries=5):
    """Search Tmall for KEYWORD, switch to the shop listing and scrape page 1.

    The whole flow (load home page, type the keyword, submit, click the
    ``#J_Filter > a.fType-w`` filter link, wait for the pagination form) is
    retried from scratch whenever an explicit wait times out.

    Args:
        max_retries: Maximum number of attempts before giving up. Values
            below 1 are treated as 1. The previous implementation retried by
            unbounded recursion, which never terminated cleanly when the page
            was permanently unreachable.

    Returns:
        The visible text of the pagination "skip to page" form; the caller
        extracts the total page count from it.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        print('正在搜索')
        try:
            driver.get('http://www.tmall.com')
            # Search box and its submit button on the home page.
            s_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mq')))
            sumbit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mallSearch > form > fieldset > div > button')))
            # KEYWORD is not defined in this file; presumably provided by `config`.
            s_input.send_keys(KEYWORD)
            sumbit.click()
            # Filter link on the result page (presumably the "shops" tab -- confirm).
            shop = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_Filter > a.fType-w')))
            shop.click()
            # Pagination form whose text is returned to the caller.
            totle = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#content > div > div.ui-page > div > b.ui-page-skip > form')))
            get_shopname()
            return totle.text
        except TimeoutException:
            print('TimeOut')
            if attempt == attempts:
                # Out of attempts: surface the failure instead of looping forever.
                raise

def next_page(page_num, max_retries=5):
    """Jump to result page ``page_num`` via the pagination form and scrape it.

    Types the page number into the "skip to" input, submits the form, waits
    until the highlighted current-page indicator shows ``page_num`` and then
    calls ``get_shopname()`` to print the shops on that page.

    Args:
        page_num: Page number to navigate to.
        max_retries: Maximum number of attempts before giving up. Values
            below 1 are treated as 1. The previous implementation retried by
            unbounded recursion, which never terminated cleanly when the page
            could not be loaded.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        print('正在翻页', page_num)
        try:
            s_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#content > div > div.ui-page > div > b.ui-page-skip > form > input.ui-page-skipTo')))
            sumbit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#content > div > div.ui-page > div > b.ui-page-skip > form > button')))
            # Replace whatever page number is currently in the input.
            s_input.clear()
            s_input.send_keys(page_num)
            sumbit.click()
            # The navigation is complete once the current-page marker shows the target number.
            wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#content > div > div.ui-page > div > b.ui-page-num > b.ui-page-cur'), str(page_num)))
            get_shopname()
            return
        except TimeoutException:
            print('TimeOut')
            if attempt == attempts:
                # Out of attempts: surface the failure instead of looping forever.
                raise

def get_shopname():
    """Print the name/info, link and score text of every shop on the current page.

    Waits for at least one ``#J_ItemList .shopBox .shopHeader`` element, parses
    the current page source with PyQuery and prints one dict per shop header
    with the keys ``shopmessage``, ``shoplink`` and ``shop_score``.

    Raises:
        TimeoutException: If no shop header appears before the shared wait
            expires.
    """
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_ItemList .shopBox .shopHeader')))
    doc = pq(driver.page_source)
    for item in doc('#J_ItemList .shopBox .shopHeader').items():
        href = item.find('.sHe-shop').attr('href')
        if href and href.startswith('//'):
            # Protocol-relative link: add a scheme so the URL is usable on its own.
            href = 'http:' + href
        # A missing anchor/href yields None here instead of crashing the whole
        # page with a TypeError on string concatenation.
        shopname = {
            'shopmessage': item.find('.shopHeader-info').text(),
            'shoplink': href or None,
            'shop_score': item.find('.shopDsr-con').text(),
        }
        print(shopname)

# def login():
#     login_sumbit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_Quick2Static')))
#     print('点击使用用户名和密码登录')
#     login_sumbit.click()
#     user = driver.find_element_by_id('TPL_username_1')
#     print('输入用户名')
#     user.send_keys(USER)
#     password = driver.find_element_by_id('TPL_password_1')
#     print('输入密码')
#     password.send_keys(PASSWORD)
#     sumbit = driver.find_element_by_id('J_SubmitStatic')
#     sumbit.click()
#     return driver.page_source

def main():
    """Run the crawl: scrape page 1 via ``search()``, then pages 2..N.

    The total page count N is the first run of digits found in the text that
    ``search()`` returns. The browser is always closed when the crawl ends,
    whether it finished normally or failed.

    Raises:
        ValueError: If no page count can be found in the pagination text.
    """
    try:
        totle = search()
        # Raw string: '\d' in a normal string literal is an invalid escape sequence.
        match = re.search(r'(\d+)', totle or '')
        if match is None:
            raise ValueError('could not find a page count in pagination text: %r' % (totle,))
        totle = int(match.group(1))
        # Page 1 was already scraped by search(); continue from page 2.
        for num in range(2, totle + 1):
            next_page(num)
            # Short pause between page turns.
            time.sleep(2)
    finally:
        # Release the browser process even if the crawl aborted.
        driver.quit()

# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()
View Code

 

posted @ 2017-07-23 21:36  睚一  阅读(381)  评论(0)  编辑  收藏  举报