使用Selenium爬取淘宝商品
由于 PhantomJS 已经停止更新，所以改用 Chrome 浏览器的 headless（无界面）模式代替，代码如下：
from selenium import webdriver

# Run Chrome without a visible window; this replaces the discontinued PhantomJS.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')

browser = webdriver.Chrome(options=chrome_options)
try:
    browser.get('https://www.baidu.com/')
    print(browser.current_url)
finally:
    # A headless browser has no window to close by hand, so always shut the
    # driver down explicitly or the chrome process keeps running.
    browser.quit()
爬取淘宝的代码:
别人的代码:
import re

from pymongo import MongoClient
from pymongo.errors import PyMongoError
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# NOTE(review): the database and the collection are both literally named
# 'MONGO_DB'. This looks like a placeholder, but it is kept unchanged so that
# previously stored data stays where it is.
client = MongoClient()
db = client['MONGO_DB']

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)

# Upper bound on attempts for one page. The original retried through unbounded
# recursion, which ends in RecursionError when the site never responds.
MAX_RETRIES = 3


def search():
    """Open Taobao, search for 'ipad', scrape page one and return the pager text.

    Returns:
        The text of the pager's "total" element (contains the page count).

    Raises:
        TimeoutException: if the search page could not be loaded within
            MAX_RETRIES attempts.
    """
    for _ in range(MAX_RETRIES):
        try:
            browser.get('https://www.taobao.com')
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
            search_box.send_keys('ipad')
            submit.click()
            total = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
            get_products()
            return total.text
        except TimeoutException:
            print('timeout!')
    raise TimeoutException(
        'search did not complete after %d attempts' % MAX_RETRIES)


def next_page(page_num):
    """Jump to result page *page_num* through the pager form and scrape it.

    The page number is typed into the pager input, the confirm button is
    clicked, and the highlighted pager item is used to verify the jump. A page
    that keeps timing out is skipped after MAX_RETRIES attempts.
    """
    for _ in range(MAX_RETRIES):
        try:
            page_box = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.form > input')))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
            page_box.clear()
            page_box.send_keys(str(page_num))
            submit.click()
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                str(page_num)))
            get_products()
            return
        except TimeoutException:
            print('timeout!')
    print('giving up on page', page_num)


def get_products():
    """Parse the currently loaded result page and store every product found."""
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal text
            # (presumably a fixed suffix after the number -- verify).
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert one product dict into MongoDB, reporting success or failure."""
    try:
        # Collection.insert() no longer exists in PyMongo 4; insert_one() is
        # the supported call for a single document.
        db['MONGO_DB'].insert_one(result)
    except PyMongoError:
        print('存储失败', result)
    else:
        print('存储成功', result)


def main():
    """Run the search, then walk through every remaining result page."""
    try:
        total_text = search()
        match = re.search(r'(\d+)', total_text)  # first run of digits = page count
        if match is None:
            raise ValueError('no page count found in pager text: %r' % total_text)
        total = int(match.group(1))
        for page_num in range(2, total + 1):
            next_page(page_num)
    finally:
        # Always release the browser, even when the crawl aborts.
        browser.quit()


if __name__ == '__main__':
    main()
崔老师的代码:
from urllib.parse import quote

from pymongo import MongoClient
from pymongo.errors import PyMongoError
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

KEYWORD = 'iPad'
MAX_PAGE = 100

MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_COLLECTION = 'products'

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)

client = MongoClient(MONGO_URL)
db = client[MONGO_DB]


def index_page(page, retries=3):
    """Load result page *page* for KEYWORD and scrape its products.

    The search URL is opened fresh on every call; for pages after the first,
    the page number is typed into the pager form and the highlighted pager
    item is used to confirm the jump.

    :param page: 1-based result page number
    :param retries: attempts before the page is skipped (the original retried
        by unbounded recursion, which ends in RecursionError on a dead site)
    """
    url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
    for _ in range(retries):
        try:
            browser.get(url)
            if page > 1:
                page_box = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                page_box.clear()
                page_box.send_keys(str(page))
                submit.click()
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'),
                str(page)))
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.m-itemlist .items .item')))
            get_products()
            return
        except TimeoutException:
            continue
    print('giving up on page', page)


def get_products():
    """Extract product data from the currently loaded page and store it."""
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('data-src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Save one product to MongoDB.

    :param result: product dict to insert
    """
    try:
        # Collection.insert() no longer exists in PyMongo 4; insert_one() is
        # the supported call for a single document.
        db[MONGO_COLLECTION].insert_one(result)
    except PyMongoError:
        print('存储到MongoDB失败')
    else:
        print('存储到MongoDB成功')


if __name__ == '__main__':
    try:
        for i in range(1, MAX_PAGE + 1):
            index_page(i)
    finally:
        # Always release the browser, even when the crawl aborts.
        browser.quit()
其他人帮助的代码
import os
import random
import time
from urllib.parse import quote

import openpyxl
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

browser = webdriver.Chrome()
base_url = 'https://s.taobao.com/search?q='
keywords = 'ipad'
url = base_url + quote(keywords)
wait = WebDriverWait(browser, 15)
page_max = 100


def log_out(browser):
    """Log in through the Weibo account form.

    Despite its name, this function performs a login: it switches from the
    QR-code view to the static login form, picks the Weibo login link, fills in
    the credentials and submits. The *browser* argument is accepted for the
    caller's convenience; the waits run on the module-level ``wait``.
    """
    login_switch = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, 'div.login-switch i#J_Quick2Static')))
    login_switch.click()
    weibo_login = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, 'a.weibo-login')))
    weibo_login.click()
    username = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'div.inp.username input')))
    password = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'div.inp.password input')))
    # Placeholder credentials -- replace with a real account before running.
    username.send_keys('xxx')
    password.send_keys('xxx')
    submit = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, 'span[node-type="submitStates"]')))
    submit.click()


def get_page(page, retries=3):
    """Navigate to result page *page* and return the driver once it is loaded.

    Page 1 opens the search URL (logging in first if the QR-code login wall is
    shown); later pages are reached through the pager's jump form.

    The original retried by calling itself without returning the result, so
    any retry made the function return None. This version retries in a loop
    and always returns the driver on success.

    Raises:
        TimeoutException: if the page did not load within *retries* attempts.
    """
    for _ in range(retries):
        print('正在打印 %d 页' % page)
        try:
            if page == 1:
                browser.get(url)
                if '手机扫码,安全登录' in browser.page_source:
                    log_out(browser)
            else:
                page_box = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'input.J_Input')))
                submit = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, 'span.J_Submit')))
                page_box.clear()
                page_box.send_keys(str(page))
                submit.click()
                wait.until(EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     'div#mainsrp-pager ul.items li.item.active span'),
                    str(page)))
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div.m-itemlist')))
            return browser
        except TimeoutException:
            continue
    raise TimeoutException('page %d did not load after %d attempts' % (page, retries))


def get_products(browser):
    """Yield one ``[image, price, deal, title, shop, location]`` row per product.

    This is a generator: the page source is read when iteration starts, so the
    result must be consumed before the driver navigates elsewhere.
    """
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        image = item.find('.pic .img').attr('data-src')
        price = item.find('.price').text().replace('\n', '')
        deal = item.find('.deal-cnt').text()
        title = item.find('.title').text()
        shop = item.find('.shop').text()
        location = item.find('.location').text().replace(' ', '')
        yield [image, price, deal, title, shop, location]


def save(out):
    """Append the rows in *out* to ``taobao_<keywords>.xlsx``.

    The workbook (with a header row on a sheet named after the keyword) is
    created on first use; afterwards rows are appended to the existing file.
    """
    filename = 'taobao_' + keywords + '.xlsx'
    if os.path.exists(filename):
        workbook = openpyxl.load_workbook(filename)
        sheet = workbook[keywords]
    else:
        workbook = openpyxl.Workbook()
        sheet = workbook.create_sheet(index=0, title=keywords)
        sheet.append(['图片', '价格', '成交人数', '商品', '店铺', '地点'])
    for row in out:
        print(row)
        sheet.append(row)
    workbook.save(filename)


def main():
    """Crawl pages 1..page_max, saving each page's products to the workbook."""
    try:
        for page in range(1, page_max + 1):
            try:
                driver = get_page(page)
            except TimeoutException:
                print('skipping page %d: timed out' % page)
                continue
            save(get_products(driver))
            # Random pause between pages to avoid hammering the site.
            time.sleep(random.randint(1, 5))
    finally:
        # Always release the browser, even when the crawl aborts.
        browser.quit()


if __name__ == '__main__':
    main()
自己的代码:
from pymongo import MongoClient
from pymongo.errors import PyMongoError
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# MongoDB handles: database 'taobao', collection 'taobao'.
client = MongoClient()
db = client['taobao']
collection = db['taobao']

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
max_page = 100

ITEM_SELECTOR = '#mainsrp-itemlist .items .item'
# NOTE(review): selector of the highlighted pager entry, assumed from the page
# layout the item selector targets -- verify against the live page.
ACTIVE_PAGE_SELECTOR = '#mainsrp-pager li.item.active > span'


def index_page(page, retries=3):
    """Bring result page *page* into the browser and scrape it.

    Page 1 opens taobao.com and submits the 'ipad' search. Every later page is
    reached by clicking the '下一页' (next page) link from the page currently
    shown, so pages must be requested in ascending order.

    The original definition took no parameter although it was called with one,
    and it repeated the whole search on every call.

    Returns:
        True when the page was scraped, False when it kept timing out.
    """
    for _ in range(retries):
        try:
            if page == 1:
                browser.get('https://www.taobao.com')
                search_box = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
                submit = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
                search_box.send_keys('ipad')
                submit.click()
            else:
                active = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, ACTIVE_PAGE_SELECTOR)))
                # A timed-out earlier attempt may already have advanced the
                # pager; click only while still behind to avoid skipping a page.
                if active.text.strip() != str(page):
                    next_link = wait.until(
                        EC.element_to_be_clickable((By.LINK_TEXT, '下一页')))
                    next_link.click()
                    wait.until(EC.text_to_be_present_in_element(
                        (By.CSS_SELECTOR, ACTIVE_PAGE_SELECTOR), str(page)))
            print('正在爬取第', page, '页')
            get_products()
            return True
        except TimeoutException:
            print('time out!')
    return False


def get_products():
    """Extract product data from the currently loaded page and store it."""
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ITEM_SELECTOR)))
    doc = pq(browser.page_source)
    for item in doc(ITEM_SELECTOR).items():
        product = {
            'image': item.find('.pic a img').attr('data-src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mogo(product)


def save_to_mogo(result):
    """Save one product dict to MongoDB, reporting success or failure."""
    try:
        # Collection.insert() no longer exists in PyMongo 4; insert_one() is
        # the supported call for a single document.
        collection.insert_one(result)
    except PyMongoError:
        print('保存失败', result)
    else:
        print('保存成功', result)


if __name__ == '__main__':
    try:
        for page in range(1, max_page + 1):
            # Later pages are reached from the current one, so stop at the
            # first page that cannot be loaded.
            if not index_page(page):
                break
    finally:
        # Always release the browser, even when the crawl aborts.
        browser.quit()