Selenium 模拟浏览器爬取淘宝产品信息
"""Scrape Taobao product listings with a Selenium-driven Chrome browser.

The script searches Taobao for a keyword, fills in the login form when one
is shown, walks every result page and stores each product in MongoDB.
"""
import os
import re

import pymongo
from pyquery import PyQuery
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Explicit names replace the original wildcard import; these three are the
# only configuration values this script reads.
from day01.config import MONGO_DB, MONGO_TABLE, MONGO_URL

# Login credentials. The fallbacks are the masked placeholders that were
# hard-coded in the original script; set the environment variables instead of
# committing real credentials to the source.
TAOBAO_USERNAME = os.environ.get("TAOBAO_USERNAME", "********")
TAOBAO_PASSWORD = os.environ.get("TAOBAO_PASSWORD", "*********")

client = pymongo.MongoClient(MONGO_URL)  # connect to MongoDB
db = client[MONGO_DB]

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)  # every explicit wait gives up after 10 s


def _login_if_prompted():
    """Fill in and submit the login form, but only if one is shown.

    ``wait.until`` never returns ``None``: it either returns the element or
    raises ``TimeoutException``.  The original ``if login is not None`` check
    therefore could not detect a missing form, and a missing form caused the
    whole search to be restarted forever.  Here a timeout on the username
    field is treated as "no login form shown" and the function returns.
    """
    try:
        login = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#fm-login-id"))
        )
    except TimeoutException:
        return

    login.send_keys(TAOBAO_USERNAME)
    password = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#fm-login-password"))
    )
    password.send_keys(TAOBAO_PASSWORD)
    login_button = wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#login-form > div.fm-btn > button")
        )
    )
    login_button.click()


def search(keyword="美食", max_retries=3):
    """Search Taobao for *keyword* and save the first page of results.

    Args:
        keyword: Text typed into the search box.
        max_retries: How many additional attempts to make after a timeout.

    Returns:
        The text of the pager's "total" element; ``main`` extracts the page
        count from it.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        try:
            browser.get("https://www.taobao.com")
            # Search input box.
            input_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#J_TSearchForm > div.search-button > button")
                )
            )
            input_box.send_keys(keyword)
            submit.click()
            _login_if_prompted()
            total_page = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.total")
                )
            )
            # Read the text before scraping so the element reference cannot
            # go stale while the products are being processed.
            total_text = total_page.text
            get_products()
            return total_text
        except TimeoutException:
            # Bounded retry instead of the original unbounded recursion.
            if attempt == attempts - 1:
                raise


def next_page(page_number, max_retries=3):
    """Jump to result page *page_number* and save its products.

    Args:
        page_number: Page number typed into the pager's input box.
        max_retries: How many additional attempts to make after a timeout.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        try:
            input_page = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > input")
                )
            )
            confirm_button = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")
                )
            )
            input_page.clear()
            input_page.send_keys(str(page_number))
            confirm_button.click()
            # Wait until the highlighted pager item shows the requested page
            # number, i.e. the page switch has actually happened.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                    str(page_number),
                )
            )
            get_products()
            return
        except TimeoutException:
            # Bounded retry instead of the original unbounded recursion.
            if attempt == attempts - 1:
                raise


def get_products():
    """Parse the current result page and save every product to MongoDB.

    Raises:
        TimeoutException: If no product item appears within the wait limit.
    """
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")
        )
    )
    html = browser.page_source  # full source of the rendered page
    doc = PyQuery(html)  # parse the page with PyQuery
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('src'),  # tag attribute
            'price': item.find('.price').text(),
            # Deal count with its last three characters dropped
            # (presumably a fixed unit suffix - TODO confirm).
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert *result* into the configured MongoDB collection.

    Storage is best-effort: any failure is printed and swallowed so that one
    bad record does not stop the crawl.
    """
    try:
        # ``Collection.insert`` was removed in PyMongo 4; ``insert_one`` is
        # the supported single-document API.
        db[MONGO_TABLE].insert_one(result)
    except Exception as e:  # deliberate catch-all at the storage boundary
        print("存储到mongodb异常,%s"%e)
    else:
        print("存储到mongodb成功")


def main():
    """Run the crawl: search, then visit every remaining result page."""
    try:
        result = search()
        match = re.search(r"(\d+)", result)
        if match is None:
            raise ValueError(
                "could not find the total page count in %r" % result
            )
        total = int(match.group(1))
        for page in range(2, total + 1):
            try:
                next_page(page)
            except TimeoutException:
                # Skip a page that keeps timing out and carry on.
                print("第%s页翻页超时,已跳过" % page)
    finally:
        # Always close the browser so no Chrome process is left behind.
        browser.quit()


if __name__ == '__main__':
    main()