selenium+PhantomJS
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Scrape Taobao search results with Selenium + PhantomJS and store them in MongoDB.

Workflow:
1. Search a keyword: drive the browser with Selenium to submit the keyword
   and obtain the first page of the product list.
2. Analyse the page count and turn pages: read the total number of pages and
   simulate page turns to reach the product lists of the following pages.
3. Extract product details: parse the page source with PyQuery to build the
   product records.
4. Store in MongoDB: save every product record to the MongoDB database.
"""
import re

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
import pymongo

LOCATION = "localhost"
MONGO_DB = 'taobao'
MONGO_TABLE = "taobao"
KEYWORD = '零食'

# Upper bound on attempts for one page load / page turn. The previous
# implementation retried through unbounded recursion.
MAX_RETRIES = 3

# CSS selector matching one product entry in the search result list.
ITEM_SELECTOR = '#mainsrp-itemlist .items .item'

# Extracts the first run of digits from the "total pages" label text.
_PAGE_COUNT_RE = re.compile(r"\d+")

client = pymongo.MongoClient(LOCATION)
db = client[MONGO_DB]

# PhantomJS is started with image loading disabled.
driver = webdriver.PhantomJS(service_args=['--load-images=false', ])
wait = WebDriverWait(driver, 20)
driver.set_window_size(1366, 768)


def search(keyword, retries=MAX_RETRIES):
    """Submit *keyword* on the Taobao home page and scrape the first result page.

    Args:
        keyword: Text typed into the search box.
        retries: Maximum number of attempts when the page times out.

    Returns:
        The text of the ``.total`` element (the page-count label), or ``None``
        when every attempt timed out.
    """
    for _ in range(retries):
        print("正在搜索关键字:%s" % keyword)
        try:
            driver.get('http://www.taobao.com')
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search'))
            )
            search_box.clear()
            search_box.send_keys(keyword)
            submit.click()
            total = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.total'))
            )
            # Read the label before scraping so the value is already in hand
            # when the product list is processed.
            total_text = total.text
            get_product()
            return total_text
        except TimeoutException:
            # Retry in a loop; the result of a successful retry is returned
            # to the caller instead of being dropped.
            continue
    print("搜索超时,已放弃:%s" % keyword)
    return None


def next_page(page_number, retries=MAX_RETRIES):
    """Jump to result page *page_number* and scrape its products.

    Args:
        page_number: 1-based index of the page to open.
        retries: Maximum number of attempts when the page turn times out.

    Returns:
        ``True`` when the page was reached and scraped, ``False`` when every
        attempt timed out or another error occurred.
    """
    target = str(page_number)
    for _ in range(retries):
        print("正在翻页:%s" % page_number)
        try:
            page_input = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'input.input:nth-child(2)')))
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, 'span.btn:nth-child(4)')))
            page_input.clear()
            page_input.send_keys(target)
            submit.click()
            # The page turn is confirmed once the pager's `span.num` element
            # contains the requested page number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, 'span.num'), target))
            get_product()
            return True
        except TimeoutException:
            continue
        except Exception as e:
            print(e)
            return False
    print("翻页超时,已放弃:%s" % page_number)
    return False


def _parse_product(item):
    """Build the product record (a dict) from one PyQuery result item."""
    return {
        "img": item.find(".pic-box div a img").attr('src'),
        'title': item.find(".title a").text(),
        "sales": item.find(".deal-cnt").text(),
        "shop": item.find(".shopname").text(),
        "location": item.find(".location").text(),
        "price": item.find(".price strong").text(),
    }


def get_product():
    """Parse every product on the current page and save each one to MongoDB.

    Best effort: any error is printed and swallowed so that a single bad page
    does not abort the whole crawl.
    """
    print("正在获取产品信息...")
    try:
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ITEM_SELECTOR)))
        doc = pq(driver.page_source)
        for item in doc(ITEM_SELECTOR).items():
            save_to_mongo(_parse_product(item))
    except Exception as e:
        print(e)


def save_to_mongo(result):
    """Insert one product record into the ``MONGO_TABLE`` collection.

    Args:
        result: The product record (a dict) to store.

    Failures are printed rather than raised so the crawl can continue.
    """
    try:
        # `Collection.insert` is deprecated since PyMongo 3.0 and removed in
        # 4.0; `insert_one` raises on failure, so reaching the print means
        # the write succeeded.
        db[MONGO_TABLE].insert_one(result)
        print("产品信息成功保存到mongodb", result)
    except Exception as e:
        print("保存失败!", e)


def main():
    """Run the crawl: search ``KEYWORD``, then walk every remaining page.

    Returns:
        ``True`` when the search and every page turn succeeded, ``False``
        otherwise.
    """
    total_text = search(KEYWORD)
    match = _PAGE_COUNT_RE.search(total_text) if total_text else None
    if match is None:
        # Either the search never completed or the label held no digits.
        print("未能获取总页数:%r" % total_text)
        return False

    page_count = int(match.group(0))
    all_ok = True
    # Page 1 was already scraped by search(); continue from page 2.
    for page_number in range(2, page_count + 1):
        if not next_page(page_number):
            all_ok = False
    return all_ok


if __name__ == "__main__":
    try:
        main()
    finally:
        # Release the PhantomJS process and the MongoDB connection.
        driver.quit()
        client.close()