Selenium 爬取淘宝网
"""Scrape Taobao search results with Selenium and store them in MongoDB.

The script opens the Taobao home page in a headless browser, searches for
``KEYWORD``, walks through every result page using the pager widget, parses
each product card with PyQuery and inserts one document per product into
the ``MONGO_TABLE`` collection of the ``MONGO_DB`` database.
"""
import re

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# MongoDB connection settings.
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_TABLE = 'product'

# Command-line switches handed to the PhantomJS process: do not load images,
# and enable the disk cache.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

# Search term typed into the Taobao search box.
KEYWORD = '美食'

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# NOTE(review): PhantomJS support was deprecated in Selenium 3.8 and removed
# in Selenium 4; this script needs an older Selenium (or a switch to a
# headless Chrome/Firefox driver) to run.
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# Every explicit wait below gives up after 10 seconds with TimeoutException.
wait = WebDriverWait(browser, 10)

browser.set_window_size(1400, 900)


def search():
    """Run the keyword search and scrape the first result page.

    Loads the Taobao home page, types ``KEYWORD`` into the search box,
    submits the form, waits for the pager's "total" element and then calls
    ``get_products()`` for the first page.

    If any wait times out, the whole sequence is retried from the start.
    The retry is a loop rather than a recursive call so that a persistently
    failing page cannot exhaust the interpreter's recursion limit; there is
    no upper bound on the number of attempts.

    Returns:
        The text of the pager's "total" element (``main()`` extracts the
        page count from it).
    """
    while True:
        print('正在搜索')
        try:
            browser.get('https://www.taobao.com')
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#J_TSearchForm > div.search-button > button')
                )
            )
            search_box.send_keys(KEYWORD)
            submit.click()
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.total')
                )
            )
            get_products()
            return total.text
        except TimeoutException:
            # Start over with a fresh page load.
            continue


def next_page(page_number):
    """Jump to result page ``page_number`` and scrape it.

    Types the page number into the pager's input box, clicks the pager's
    submit button, waits until the highlighted pager item shows that number
    and then calls ``get_products()``.

    If any wait times out, the jump is retried. The retry is a loop rather
    than a recursive call so that repeated timeouts cannot exhaust the
    recursion limit; there is no upper bound on the number of attempts.

    Args:
        page_number: Page number to navigate to.
    """
    while True:
        print('正在翻页', page_number)
        try:
            page_input = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > input')
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
                )
            )
            page_input.clear()
            page_input.send_keys(page_number)
            submit.click()
            # The jump is complete once the active pager item displays the
            # requested page number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(page_number),
                )
            )
            get_products()
            return
        except TimeoutException:
            continue


def get_products():
    """Parse every product card on the current page and store each one.

    Waits for at least one item to be present in the item list, parses the
    page source with PyQuery, builds one dict per item, prints it and passes
    it to ``save_to_mongo()``.

    Raises:
        TimeoutException: If no product item appears within the wait
            timeout. ``search()`` and ``next_page()`` catch this and retry.
    """
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')
        )
    )
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal-count text;
            # presumably a fixed suffix such as "人付款" -- TODO confirm.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert one product document into the ``MONGO_TABLE`` collection.

    Uses ``insert_one``; the legacy ``Collection.insert`` method was
    deprecated in PyMongo 3.0 and removed in PyMongo 4.0. Failures are
    reported on stdout and not re-raised, so one bad document does not stop
    the crawl.

    Args:
        result: Product dict to store.
    """
    try:
        if db[MONGO_TABLE].insert_one(result):
            print('存储到MONGODB成功', result)
    except Exception:
        print('存储到MONGODB失败', result)


def main():
    """Crawl all result pages for ``KEYWORD``, then shut the browser down."""
    try:
        total_text = search()
        # The first run of digits in the pager's "total" text is used as
        # the number of result pages.
        total_pages = int(re.search(r'(\d+)', total_text).group(1))
        # Page 1 was already scraped by search().
        for page in range(2, total_pages + 1):
            next_page(page)
    except Exception as exc:
        # Top-level boundary: report what went wrong instead of hiding it.
        print('出错啦', exc)
    finally:
        # quit() ends the whole WebDriver session and its driver process;
        # close() would only close the current window.
        browser.quit()


if __name__ == '__main__':
    main()