使用Selenium爬取淘宝商品

由于 PhantomJS 已经停止更新,所以改用 Chrome 浏览器的 headless(无界面)模式代替,代码如下:

from selenium import webdriver
# Run Chrome without a visible window (used here in place of PhantomJS).
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(options=chrome_options)

try:
    browser.get('https://www.baidu.com/')
    # Printing the resolved URL confirms that the headless session loaded the page.
    print(browser.current_url)
finally:
    # Always shut the driver down; a headless Chrome has no window to close
    # by hand, so a forgotten session would keep running in the background.
    browser.quit()

 爬取淘宝的代码:

别人的代码:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import re
from pyquery import PyQuery as pq
from pymongo import MongoClient


# MongoDB client created with no arguments; scraped items are written to the
# database literally named 'MONGO_DB'.
client = MongoClient()
db = client['MONGO_DB']

# One Chrome session shared by every helper below, plus an explicit wait that
# gives each expected condition up to 10 seconds.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, timeout=10)


# Open Taobao in Chrome, search for the keyword 'ipad', scrape the first
# result page and return the pager's "total pages" text.
def search(max_retries=3):
    """Run the keyword search and scrape the first page of results.

    Args:
        max_retries: Number of additional attempts made after a timeout
            before the TimeoutException is re-raised. The original retried
            by unbounded recursion, which never terminates cleanly when the
            page cannot be loaded.

    Returns:
        The text of the pager's ``div.total`` element.

    Raises:
        TimeoutException: If every attempt times out.
    """
    attempts_left = max_retries
    while True:
        try:
            browser.get('https://www.taobao.com')
            # Named search_box rather than `input` to avoid shadowing the builtin.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
            search_box.send_keys('ipad')
            submit.click()
            total = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
            get_products()
            return total.text
        except TimeoutException:
            print('timeout!')
            if attempts_left <= 0:
                raise
            attempts_left -= 1


# Jump to another result page: type the page number into the pager, press the
# confirm button, then check the highlighted pager item to verify the jump.
def next_page(page_num, max_retries=3):
    """Navigate to result page ``page_num`` and scrape it.

    Args:
        page_num: Page number typed into the pager's input box.
        max_retries: Number of additional attempts made after a timeout
            before the TimeoutException is re-raised (the original recursed
            without any limit).

    Raises:
        TimeoutException: If every attempt times out.
    """
    pager = '#mainsrp-pager > div > div > div > '
    attempts_left = max_retries
    while True:
        try:
            # Named page_box rather than `input` to avoid shadowing the builtin.
            page_box = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, pager + 'div.form > input')))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, pager + 'div.form > span.btn.J_Submit')))
            page_box.clear()
            page_box.send_keys(str(page_num))
            submit.click()
            # The active pager item must show the requested number before the
            # page content is parsed.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, pager + 'ul > li.item.active > span'),
                str(page_num)))
            get_products()
            return
        except TimeoutException:
            if attempts_left <= 0:
                raise
            attempts_left -= 1

# Extract the product details from the currently loaded result page.
def get_products():
    """Parse every product card on the current page, print it and store it.

    Waits until at least one card is present, then reads the page source
    through pyquery and hands each extracted record to ``save_to_mongo``.
    """
    card_selector = '#mainsrp-itemlist .items .item'
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, card_selector)))

    for card in pq(browser.page_source)(card_selector).items():
        product = dict(
            image=card.find('.pic .img').attr('src'),
            price=card.find('.price').text(),
            # The last three characters of the deal text are dropped
            # (presumably a trailing unit suffix -- confirm against the page).
            deal=card.find('.deal-cnt').text()[:-3],
            title=card.find('.title').text(),
            shop=card.find('.shop').text(),
            location=card.find('.location').text(),
        )
        print(product)
        save_to_mongo(product)

def save_to_mongo(result):
    """Insert one scraped product into the ``MONGO_DB`` collection.

    Failures are reported on stdout instead of being raised, so a single bad
    record does not abort the crawl.

    Args:
        result: The product dict to store.
    """
    try:
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        db['MONGO_DB'].insert_one(result)
    except Exception as exc:
        # Include the exception so the reason for the failure is visible.
        print('存储失败',result, exc)
    else:
        print('存储成功',result)


def main():
    """Run the search, then visit result pages 2..N.

    N is the first number found in the pager text returned by ``search()``.
    The browser is closed when the crawl ends, whether it succeeded or not.

    Raises:
        ValueError: If the pager text contains no number.
    """
    try:
        total_text = search()
        # Raw string: '\d' inside a normal literal is an invalid escape
        # sequence and triggers a warning on current Python versions.
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError('no page count found in %r' % (total_text,))
        total = int(match.group(1))
        for i in range(2, total + 1):
            next_page(i)
    finally:
        browser.quit()


if __name__ == '__main__':
    main()

崔老师的代码:

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import quote
from pyquery import PyQuery as pq
from pymongo import MongoClient

# Search term; index_page() URL-quotes it into the results-page address.
KEYWORD = 'iPad'

# Shared Chrome session and an explicit wait of up to 10 seconds per condition.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, timeout=10)

def index_page(page, max_retries=3):
    """Load result page ``page`` for ``KEYWORD`` and scrape it.

    The search URL is opened first; for any page after the first, the page
    number is typed into the pager and confirmed. Once the active pager item
    shows ``page`` and a product card is present, ``get_products()`` runs.

    Args:
        page: 1-based result page number.
        max_retries: Number of additional attempts made after a timeout
            before the TimeoutException is re-raised (the original recursed
            without any limit).

    Raises:
        TimeoutException: If every attempt times out.
    """
    url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
    attempts_left = max_retries
    while True:
        try:
            browser.get(url)
            if page > 1:
                # Named page_box rather than `input` to avoid shadowing the builtin.
                page_box = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                page_box.clear()
                page_box.send_keys(str(page))
                submit.click()
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.m-itemlist .items .item')))
            get_products()
            return
        except TimeoutException:
            if attempts_left <= 0:
                raise
            attempts_left -= 1



def get_products():
    """
    Extract the product data from the current page.

    Each product card is turned into a dict, printed, and passed to
    ``save_to_mongo``.
    """
    cards = pq(browser.page_source)('#mainsrp-itemlist .items .item')
    for card in cards.items():
        product = dict(
            image=card.find('.pic .img').attr('data-src'),
            price=card.find('.price').text(),
            deal=card.find('.deal-cnt').text(),
            title=card.find('.title').text(),
            shop=card.find('.shop').text(),
            location=card.find('.location').text(),
        )
        print(product)
        save_to_mongo(product)


# MongoDB settings: server address, database name and target collection name.
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_COLLECTION = 'products'

# Client and database handle used by save_to_mongo().
client = MongoClient(host=MONGO_URL)
db = client[MONGO_DB]
def save_to_mongo(result):
    """
    Save one record to MongoDB.

    Failures are reported on stdout instead of being raised, so a single bad
    record does not abort the crawl.

    :param result: the product dict to store
    """
    try:
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        db[MONGO_COLLECTION].insert_one(result)
    except Exception as exc:
        # Include the exception so the reason for the failure is visible.
        print('存储到MongoDB失败', exc)
    else:
        print('存储到MongoDB成功')


# Highest result page number that the crawl visits.
MAX_PAGE = 100
if __name__ == '__main__':
    try:
        for i in range(1, MAX_PAGE + 1):
            index_page(i)
    finally:
        # Close Chrome even when the crawl stops on an error; the original
        # left the browser process running.
        browser.quit()

 其他人帮助的代码

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
from urllib.parse import quote
from pyquery import PyQuery as pq
import os
import openpyxl
import random
import time

# Search settings: the keyword is URL-quoted and appended to the search address.
keywords = 'ipad'
base_url = 'https://s.taobao.com/search?q='
url = base_url + quote(keywords)

# Highest result page number that main() visits.
page_max = 100

# Shared Chrome session and an explicit wait of up to 15 seconds per condition.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, timeout=15)

def log_out(browser):
    """Fill in and submit the Weibo login form reached from the login page.

    Despite its name, this function performs a login: it switches the login
    panel, picks the Weibo option, types the credentials and submits.

    Args:
        browser: Accepted for the caller's convenience; every lookup goes
            through the module-level ``wait`` object instead.
    """
    def clickable(css):
        return wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css)))

    def present(css):
        return wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, css)))

    clickable('div.login-switch i#J_Quick2Static').click()
    clickable('a.weibo-login').click()

    # Both fields are located before anything is typed into either of them.
    user_field = present('div.inp.username input')
    pass_field = present('div.inp.password input')
    user_field.send_keys('xxx')
    pass_field.send_keys('xxx')

    clickable('span[node-type="submitStates"]').click()
    
    
def get_page(page, max_retries=3):
    """Bring the browser to result page ``page`` and return the driver.

    Page 1 is opened by URL (logging in first when the QR-code login prompt
    is shown); later pages are reached by typing the number into the pager.

    Args:
        page: 1-based result page number.
        max_retries: Number of additional attempts made after a timeout
            before the TimeoutException is re-raised.

    Returns:
        The module-level ``browser`` once the requested page is displayed.
        The original dropped the result of its recursive retry calls, so any
        retry made it return None to the caller.

    Raises:
        TimeoutException: If every attempt times out.
    """
    print('正在打印 %d 页'%page)
    attempts_left = max_retries
    while True:
        try:
            if page == 1:
                browser.get(url)
                if '手机扫码,安全登录' in browser.page_source:
                    log_out(browser)
            else:
                # Named page_box rather than `input` to avoid shadowing the builtin.
                page_box = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'input.J_Input')))
                submit = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, 'span.J_Submit')))
                page_box.clear()
                page_box.send_keys(str(page))
                submit.click()
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, 'div#mainsrp-pager ul.items li.item.active span'),
                str(page)))
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div.m-itemlist')))
            if browser.page_source is not None:
                return browser
            # No page source yet: fall through and try again, within the
            # same retry budget as a timeout.
        except TimeoutException:
            pass
        if attempts_left <= 0:
            raise TimeoutException('page %d could not be loaded' % page)
        attempts_left -= 1
        
def get_products(browser):
    """Yield one row per product card found in the browser's current page.

    Args:
        browser: Driver whose ``page_source`` is parsed.

    Yields:
        ``[image, price, deal, title, shop, location]`` with newlines removed
        from the price and spaces removed from the location.
    """
    cards = pq(browser.page_source)('#mainsrp-itemlist .items .item')
    for card in cards.items():
        def text_of(css, card=card):
            return card.find(css).text()

        yield [
            card.find('.pic .img').attr('data-src'),
            text_of('.price').replace('\n', ''),
            text_of('.deal-cnt'),
            text_of('.title'),
            text_of('.shop'),
            text_of('.location').replace(' ', ''),
        ]
            
def save(out):
    """Append the rows in ``out`` to the spreadsheet for the current keyword.

    The file ``taobao_<keywords>.xlsx`` is created with a header row on first
    use; afterwards it is reopened, extended and saved again on every call.

    Args:
        out: Iterable of row lists; each row is printed and appended.
    """
    path = 'taobao_' + keywords + '.xlsx'

    if not os.path.exists(path):
        # First run: create the workbook with a sheet named after the keyword.
        fresh_book = openpyxl.Workbook()
        header_sheet = fresh_book.create_sheet(title=keywords, index=0)
        header_sheet.append(['图片','价格','成交人数','商品','店铺','地点'])
        fresh_book.save(path)

    book = openpyxl.load_workbook(path)
    target_sheet = book[keywords]
    for row in out:
        print(row)
        target_sheet.append(row)
    book.save(path)
        
def main():
    """Crawl result pages 1..page_max and append each page to the spreadsheet.

    A random pause of 1-5 seconds follows every page. The browser is closed
    when the crawl ends, whether it succeeded or not.
    """
    try:
        for page in range(1, page_max + 1):
            # Local name `driver` keeps the module-level `browser` reachable
            # for the cleanup below.
            driver = get_page(page)
            save(get_products(driver))
            time.sleep(random.randint(1, 5))
    finally:
        # The original never closed Chrome, leaving the process running.
        browser.quit()


if __name__ == '__main__':
    main()

 

自己的代码:

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from pyquery import PyQuery as pq
from pymongo import MongoClient

# Create the MongoDB handles: database 'taobao' and its collection 'taobao'.
client = MongoClient()
db = client['taobao']
collection = db['taobao']

# Shared Chrome session and an explicit wait of up to 10 seconds per condition.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, timeout=10)

# Upper bound of the page loop in the script entry point.
max_page = 100

def index_page(page=1, max_retries=3):
    """Search Taobao for 'ipad', scrape the result page, then click "next page".

    Args:
        page: Page number shown in the progress message. The original took
            no parameters although the entry point calls ``index_page(page)``,
            which raised TypeError on the first call.
        max_retries: Number of additional attempts made after a timeout
            before the TimeoutException is re-raised (the original recursed
            without any limit).

    Raises:
        TimeoutException: If every attempt times out.
    """
    # NOTE(review): every call reopens the home page and repeats the search,
    # so each call scrapes the first result page again -- confirm whether the
    # search should only happen once.
    attempts_left = max_retries
    while True:
        try:
            browser.get('https://www.taobao.com')
            # Named search_box rather than `input` to avoid shadowing the builtin.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
            search_box.send_keys('ipad')
            submit.click()
            print('正在爬取第', page, '')
            get_products()
            # find_element_by_link_text() was removed in Selenium 4;
            # find_element(By.LINK_TEXT, ...) works on Selenium 3 and 4.
            next_link = browser.find_element(By.LINK_TEXT, '下一页')
            next_link.click()
            return
        except TimeoutException:
            print('time out!')
            if attempts_left <= 0:
                raise
            attempts_left -= 1

# Extract the product data from the current result page.
def get_products():
    """Parse every product card on the current page, print it and store it.

    Waits until at least one card is present, then reads the page source
    through pyquery and hands each extracted record to ``save_to_mogo``.
    """
    # The original selector 'mainsrp-itemlist.items.item' had no '#' and no
    # descendant spaces, so it could never match and the wait always timed out.
    card_selector = '#mainsrp-itemlist .items .item'
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, card_selector)))
    html = browser.page_source
    doc = pq(html)
    for item in doc(card_selector).items():
        product = {
            'image': item.find('.pic a img').attr('data-src'),
            'price': item.find('.price').text(),
            # Was `item.finc(...)`, a typo that raised AttributeError.
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mogo(product)

# Save one record to MongoDB.
def save_to_mogo(result):
    """Insert one scraped product into the module-level ``collection``.

    Failures are reported on stdout instead of being raised, so a single bad
    record does not abort the crawl.

    Args:
        result: The product dict to store.
    """
    try:
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        collection.insert_one(result)
    except Exception as exc:
        # Include the exception so the reason for the failure is visible.
        print('保存失败',result, exc)
    else:
        print('保存成功',result)

# Script entry point: call index_page() once for every number from 2 up to
# and including max_page.
if __name__=='__main__':
    # `page` is bound at module level by this loop.
    # NOTE(review): the range starts at 2, so no iteration is labelled as
    # page 1 -- confirm that this is intended.
    for page in range(2, max_page + 1):
        index_page(page)

 

posted @ 2019-05-31 00:13  舒畅123  阅读(869)  评论(0编辑  收藏  举报