selenium爬取数据+存储

1 爬取数据代码

#coding=utf-8
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
#加载TimeoutException模块,用于进行超时处理
from selenium.common.exceptions import TimeoutException
#正则表达式
import re,sys
from pyquery import PyQuery as pq
from config import *
#加载数据库操作模块
import mysqlOp 

# Launch a Chrome browser session through Selenium
driver=webdriver.Chrome()
# Alternative: use the PhantomJS browser driver instead
#driver=webdriver.PhantomJS()
# Open the Taobao home page and fix the window size
driver.get("https://www.taobao.com")
driver.set_window_size(1400,900)

# Shared explicit wait (timeout argument: 10) used by the functions below
wait=WebDriverWait(driver, 10)
def search():
    """Search Taobao for "美食" and scrape the first page of results.

    Waits for the search box (``#q``) and the search button, types the
    keyword, submits the form and then calls ``get_goods()`` to store the
    first result page. If any of the waits raises ``TimeoutException`` the
    whole sequence is attempted again until it succeeds.

    Returns:
        None.
    """
    # A loop (instead of self-recursion) keeps retrying without growing
    # the call stack.
    while True:
        try:
            # The locator has to be passed as ONE (by, value) tuple;
            # passing two positional arguments raises TypeError.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#J_TSearchForm > div.search-button > button")
                )
            )
            search_box.clear()
            search_box.send_keys("美食")
            submit.click()
            # Scrape the data of the first page
            get_goods()
            return
        except TimeoutException:
            continue
# Get the total page count text
def get_total():
    """Return the text of the pager's ``div.total`` element.

    Blocks on the shared ``wait`` until the element is present in the DOM.
    """
    locator = (
        By.CSS_SELECTOR,
        "#mainsrp-pager > div > div > div > div.total",
    )
    total_element = wait.until(EC.presence_of_element_located(locator))
    return total_element.text
# Jump to a given result page
def next_page(page):
    """Navigate to result page ``page`` and scrape it.

    Types the page number into the pager's input box, clicks the submit
    button, waits until the active pager item shows ``str(page)`` and then
    calls ``get_goods()``. If any wait raises ``TimeoutException`` the whole
    sequence is attempted again until it succeeds.

    Args:
        page: Page number to jump to.

    Returns:
        The count returned by ``get_goods()`` for that page.
    """
    # Loop until a full attempt succeeds. The previous recursive retry
    # threw away its result and then fell through to ``return count`` with
    # ``count`` never assigned.
    while True:
        try:
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > input")
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")
                )
            )
            page_box.clear()
            page_box.send_keys(page)
            submit.click()
            # Confirm the pager really switched to the requested page
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                    str(page),
                )
            )
            # Scrape the data of the current page
            return get_goods()
        except TimeoutException:
            continue
def get_goods():
    """Scrape every item on the current result page and store it.

    Waits until at least one ``#mainsrp-itemlist .items .item`` element is
    present, parses the page source with pyquery, and for each item builds a
    dict with the keys ``image``, ``price``, ``deal``, ``title``, ``shop`` and
    ``location``. Each dict is printed and handed to ``mysqlOp.mysqlOp``.

    Returns:
        The number of items processed on this page.
    """
    item_selector = "#mainsrp-itemlist .items .item"
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))

    page = pq(driver.page_source)
    count = 0
    for count, entry in enumerate(page(item_selector).items(), start=1):
        deal_text = entry.find('.deal-cnt').text()
        goods = {
            'image': entry.find('.pic .img').attr('src'),
            'price': entry.find('.price').text(),
            # Drop the last three characters of the deal-count text
            # (presumably a trailing unit suffix -- confirm against the page).
            'deal': deal_text[:-3],
            'title': entry.find('.title').text(),
            'shop': entry.find('.shop').text(),
            'location': entry.find('.location').text(),
        }
        print(goods)
        # Insert the record into the database
        mysqlOp.mysqlOp(goods)
    return count
def main():
    """Run the crawl: search, read the page total, then walk pages 2..total.

    Prints the total page number parsed from the pager text, then prints the
    sum of the counts returned by ``next_page`` for pages 2 through the
    last page. The first page is scraped inside ``search()`` and is not part
    of that sum.
    """
    search()
    pager_text = get_total()
    # Pull the first run of digits out of the pager text
    page_match = re.search(r"(\d+)", pager_text)
    total = int(page_match.group(1))
    print(total)
    total_count = sum(next_page(page_no) for page_no in range(2, total + 1))
    print(total_count)
                
        
# Run the crawl only when this file is executed as a script
if __name__=="__main__":
        main()

 2 存入到mysql中

创建一个mysqlOp.py的文件

#coding=utf-8
from pymysql import *
def mysqlOp(goods):
    """Insert one scraped goods record into the ``goods`` table.

    Opens a new connection to the ``taobao_meishi`` database, runs a
    parameterized INSERT, commits, and closes the cursor and connection.
    Both are closed even when the insert or commit raises.

    Args:
        goods: Mapping with the keys ``image``, ``price``, ``deal``,
            ``title``, ``shop`` and ``location``.

    Raises:
        KeyError: If ``goods`` lacks one of the required keys.
        Any error raised by the database driver is propagated to the caller.
    """
    insert_sql = (
        "insert into goods(image,price,deal,title,shop,location) "
        "values(%s,%s,%s,%s,%s,%s)"
    )
    # Build the parameters first so a malformed record fails before a
    # connection is opened.
    params = (
        goods['image'],
        goods['price'],
        goods['deal'],
        goods['title'],
        goods['shop'],
        goods['location'],
    )
    # NOTE(review): credentials are hard-coded here; consider loading them
    # from configuration or the environment instead.
    conn = connect(host='127.0.0.1', port=3306, user='root', passwd='1qaz2wsx#EDC', db='taobao_meishi', charset='utf8')
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(insert_sql, params)
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even if the insert failed
        conn.close()

 

posted @ 2018-07-30 17:31  音量  阅读(668)  评论(0)  编辑  收藏  举报