第五次作业

作业①

爬取手机信息

要求：
熟练掌握 Selenium 查找HTML元素、爬取Ajax网页数据、等待HTML元素等内容。
使用Selenium框架爬取京东商城某类商品信息及图片。
候选网站：http://www.jd.com/
关键词：学生自由选择
输出信息：MYSQL的输出信息如下：

代码：

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import urllib.request
import threading
import sqlite3
import os
import datetime
from selenium.webdriver.common.keys import Keys
import time

class MySpider:
    """Selenium crawler that stores JD.com search results in SQLite.

    For every product on each result page the crawler records a sequence
    number, the brand (first word of the title), the price, the title and the
    image file name in the ``phones`` table of ``phones.db``.  Product images
    are downloaded into ``imagePath`` on background threads.
    """

    # HTTP headers sent with every image download request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
    # Directory (relative to the working directory) that receives the images.
    imagePath = "download"
    # One layout for header and rows so the columns printed by showDB line up.
    _rowFormat = "%-8s %-16s %-8s %-16s %s"

    def startUp(self, url, key):
        """Open the browser, reset database and image folder, start a search.

        Args:
            url: Address of the page that contains the search box.
            key: Keyword typed into the search box.
        """
        # Imported here because the import header of this script lacks By.
        from selenium.webdriver.common.by import By

        # Per-run state comes first so later cleanup never meets a missing
        # attribute, even when the browser fails to start.
        self.threads = []
        self.No = 0
        self.imgNo = 0

        # Add '--headless' / '--disable-gpu' to the options to hide the window.
        chrome_options = Options()
        self.driver = webdriver.Chrome(options=chrome_options)

        # Start every crawl from an empty "phones" table.
        try:
            self.con = sqlite3.connect("phones.db")
            self.cursor = self.con.cursor()
            self.cursor.execute("drop table if exists phones")
            self.cursor.execute(
                "create  table  phones  (mNo  varchar(32) primary key, mMark varchar(256),"
                "mPrice varchar(32),mNote varchar(1024),mFile varchar(256))"
            )
        except sqlite3.Error as err:
            print(err)

        # Start every crawl from an empty image folder.
        try:
            os.makedirs(MySpider.imagePath, exist_ok=True)
            for img in os.listdir(MySpider.imagePath):
                os.remove(os.path.join(MySpider.imagePath, img))
        except OSError as err:
            print(err)

        self.driver.get(url)
        keyInput = self.driver.find_element(By.ID, "key")
        keyInput.send_keys(key)         # type the keyword
        keyInput.send_keys(Keys.ENTER)  # submit the search

    def closeUp(self):
        """Commit and close the database, then shut the browser down.

        The two steps are guarded separately so that a database error cannot
        leave the browser running.
        """
        try:
            self.con.commit()
            self.con.close()
        except Exception as err:
            print(err)
        try:
            # quit() ends the whole WebDriver session, not just the window.
            self.driver.quit()
        except Exception as err:
            print(err)

    def insertDB(self, mNo, mMark, mPrice, mNote, mFile):
        """Insert one product row; errors are printed, not raised.

        Args:
            mNo: Zero-padded sequence number (primary key).
            mMark: Brand, i.e. the first word of the title.
            mPrice: Price text as shown on the page.
            mNote: Cleaned product title.
            mFile: Image file name, or "" when the product has no image.
        """
        try:
            sql = "insert into phones (mNo,mMark,mPrice,mNote,mFile) values (?,?,?,?,?)"
            self.cursor.execute(sql, (mNo, mMark, mPrice, mNote, mFile))
        except Exception as err:
            # Best effort: one bad row must not abort the whole crawl.
            print(err)

    def showDB(self):
        """Print every row of the ``phones`` table ordered by sequence number."""
        try:
            con = sqlite3.connect("phones.db")
            try:
                cursor = con.cursor()
                print(MySpider._rowFormat % ("No", "Mark", "Price", "Image", "Note"))
                cursor.execute("select mNo,mMark,mPrice,mFile,mNote from phones  order by mNo")
                for row in cursor.fetchall():
                    print(MySpider._rowFormat % (row[0], row[1], row[2], row[3], row[4]))
            finally:
                # Release the connection even when the query fails.
                con.close()
        except Exception as err:
            print(err)

    def download(self, src1, src2, mFile):
        """Download an image and save it as ``mFile`` inside ``imagePath``.

        ``src1`` is tried first; ``src2`` is the fallback when ``src1`` is
        empty, fails, or returns no data.  Nothing is written when neither
        URL yields data.

        Args:
            src1: Preferred image URL (may be empty).
            src2: Fallback image URL (may be empty).
            mFile: File name to create inside ``imagePath``.
        """
        data = None
        for src in (src1, src2):
            if not src:
                continue
            try:
                req = urllib.request.Request(src, headers=MySpider.headers)
                with urllib.request.urlopen(req, timeout=10) as resp:
                    data = resp.read()
            except Exception as err:
                # Best effort: report and fall through to the next candidate.
                print("download failed", src, err)
                continue
            if data:
                break
        if not data:
            return
        print("download begin", mFile)
        # os.path.join keeps the file inside imagePath on every platform.
        with open(os.path.join(MySpider.imagePath, mFile), "wb") as fobj:
            fobj.write(data)
        print("download finish", mFile)

    @staticmethod
    def _cleanTitle(raw):
        """Return ``(mark, note)`` for a raw product title.

        ``mark`` is the first space-separated word of the title and ``note``
        the whole title, both without the "爱心东东" tag line and commas.
        """
        def strip(text):
            return text.replace("爱心东东\n", "").replace(",", "")

        return strip(raw.split(" ")[0]), strip(raw)

    @staticmethod
    def _imageFile(no, src):
        """Return the file name for image ``src`` of product ``no``.

        The extension comes from the path part of the URL, so query strings
        never end up in the file name.  An empty ``src`` gives "".
        """
        from urllib.parse import urlsplit

        if not src:
            return ""
        return no + os.path.splitext(urlsplit(src).path)[1]

    def processSpider(self):
        """Scrape every result page, following the "next" link to the end.

        Each product row is stored through insertDB() and its image download
        is started on a separate thread (collected in ``self.threads``).
        Any error ends the crawl and is printed instead of raised.
        """
        # Imported here because the import header of this script lacks them.
        from urllib.parse import urljoin
        from selenium.webdriver.common.by import By

        try:
            # A loop (rather than one recursive call per page) keeps the call
            # stack flat however many result pages there are.
            while True:
                time.sleep(1)
                page_url = self.driver.current_url
                print(page_url)
                lis = self.driver.find_elements(
                    By.XPATH, "//div[@id='J_goodsList']//li[@class='gl-item']")
                for li in lis:
                    # The image URL is either in src or in data-lazy-img.
                    try:
                        img = li.find_element(By.XPATH, ".//div[@class='p-img']//a//img")
                        src1 = img.get_attribute("src") or ""
                        src2 = img.get_attribute("data-lazy-img") or ""
                    except Exception:
                        src1 = src2 = ""
                    try:
                        price = li.find_element(By.XPATH, ".//div[@class='p-price']//i").text
                    except Exception:
                        price = "0"
                    try:
                        raw = li.find_element(
                            By.XPATH, ".//div[@class='p-name p-name-type-2']//em").text
                        mark, note = self._cleanTitle(raw)
                    except Exception:
                        mark = note = ""

                    self.No = self.No + 1
                    no = str(self.No).zfill(6)
                    print(no, mark, price)

                    # Make both candidates absolute: download() may use either.
                    src1 = urljoin(page_url, src1) if src1 else ""
                    src2 = urljoin(page_url, src2) if src2 else ""
                    mFile = self._imageFile(no, src1 or src2)
                    if mFile:
                        T = threading.Thread(
                            target=self.download, args=(src1, src2, mFile), daemon=False)
                        T.start()
                        self.threads.append(T)
                    self.insertDB(no, mark, price, note, mFile)

                # The exact class match fails once the link also carries
                # "disabled", so an empty result means the last page.
                nextLinks = self.driver.find_elements(
                    By.XPATH, "//span[@class='p-num']//a[@class='pn-next']")
                if not nextLinks:
                    break
                time.sleep(10)
                nextLinks[0].click()
        except Exception as err:
            print(err)

    def executeSpider(self, url, key):
        """Run a complete crawl: start up, scrape, close, wait for downloads.

        Args:
            url: Address of the page that contains the search box.
            key: Keyword to search for.
        """
        starttime = datetime.datetime.now()
        print("Spider starting......")
        try:
            self.startUp(url, key)
            print("Spider processing......")
            self.processSpider()
        finally:
            # Release browser and database even when start-up fails.
            print("Spider closing......")
            self.closeUp()
            for t in getattr(self, "threads", []):
                t.join()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        # total_seconds() does not wrap around for runs longer than a day.
        elapsed = int((endtime - starttime).total_seconds())
        print("Total ", elapsed, " seconds elapsed")

url = "http://www.jd.com"
spider = MySpider()

# Text menu: 1 runs a crawl for "手机", 2 prints the stored rows, 3 exits.
# Any other answer simply shows the menu again.
choice = ""
while choice != "3":
    for entry in ("1.爬取", "2.显示", "3.退出"):
        print(entry)
    choice = input("请选择(1,2,3):")
    if choice == "1":
        spider.executeSpider(url, "手机")
    elif choice == "2":
        spider.showDB()

运行结果部分展示：

心得体会

主要是复现课上的示例代码，加深了对 Selenium 爬取流程的理解。

实验②

爬取股票信息

要求：
熟练掌握 Selenium 查找HTML元素、爬取Ajax网页数据、等待HTML元素等内容。
使用Selenium框架+ MySQL数据库存储技术路线爬取“沪深A股”、“上证A股”、“深证A股”3个板块的股票数据信息。
候选网站：东方财富网：http://quote.eastmoney.com/center/gridlist.html#hs_a_board

输出信息：MYSQL数据库存储和输出格式如下，表头应是英文命名例如：序号id，股票代码：bStockNo……，由同学们自行定义设计表头：

代码：

from selenium import webdriver
from selenium.webdriver import ChromeOptions
from time import sleep
import pymysql

from selenium.webdriver.common.by import By

# Scrape the stock tables of three boards on quote.eastmoney.com and store
# every row in the MySQL table "stocks".

option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)
conn = None
cursor = None

# Placeholders let the driver quote the scraped text instead of pasting it
# into the SQL statement.
insert_sql = 'insert into stocks values(%s,%s,%s,%s,%s,%s,%s,%s)'
# XPath of each cell to store, relative to a table row, in column order:
# index, code, name, latest price, change percent, change amount, volume,
# turnover.
cell_xpaths = ("./td[1]", "./td[2]/a", "./td[3]/a", "./td[5]/span",
               "./td[6]/span", "./td[7]/span", "./td[8]", "./td[9]")
# The "next" link while it is clickable; the exact class match fails as soon
# as the link also carries the "disabled" class.
next_xpath = '//*[@id="main-table_paginate"]/a[@class="next paginate_button"]'

try:
    driver.get("http://quote.eastmoney.com/center/gridlist.html#hs_a_board")
    driver.maximize_window()
    conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='root', db='spider', charset='utf8')
    cursor = conn.cursor()
    modules = ["hs_a_board", "sh_a_board", "sz_a_board"]   # boards to visit
    for module in modules:
        sleep(1)
        driver.find_element(By.XPATH, '//*[@id="nav_{}"]'.format(module)).click()   # open the board
        sleep(2)
        driver.execute_script("var q=document.documentElement.scrollTop=10000")   # scroll to the bottom
        while True:
            trs = driver.find_elements(By.XPATH, '//*[@id="table_wrapper-table"]/tbody/tr')
            for tr in trs:
                row = tuple(tr.find_element(By.XPATH, xpath).text for xpath in cell_xpaths)
                print(*row)
                try:
                    cursor.execute(insert_sql, row)
                    conn.commit()
                except pymysql.MySQLError as err:
                    # Report the failed row and keep going with the next one.
                    print(err)
                    conn.rollback()
            # No clickable "next" link means this was the board's last page.
            next_buttons = driver.find_elements(By.XPATH, next_xpath)
            if not next_buttons:
                break
            next_buttons[0].click()   # next page
            sleep(2)
        driver.execute_script("var q=document.documentElement.scrollTop=0")    # back to the top
finally:
    # Release browser and database even when scraping fails half-way.
    driver.quit()
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()

运行结果部分展示：

心得体会：

加深对selenium的了解程度

实验③

模拟登录mooc以及爬取课程信息

要求：

熟练掌握 Selenium 查找HTML元素、实现用户模拟登录、爬取Ajax网页数据、等待HTML元素等内容。
使用Selenium框架+MySQL爬取中国mooc网课程资源信息（课程号、课程名称、学校名称、主讲教师、团队成员、参加人数、课程进度、课程简介）
候选网站：中国mooc网：https://www.icourse163.org

输出信息：MYSQL数据库存储和输出格式：

代码：

from selenium import webdriver
from selenium.webdriver import ChromeOptions
from time import sleep
import pymysql
from selenium.webdriver.common.by import By

# Log in to icourse163.org, search for "python" and store the details of
# every course found in the MySQL table "mooc".

option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)
conn = None
cursor = None

# Placeholders let the driver quote the scraped text instead of pasting it
# into the SQL statement.
insert_sql = 'insert into mooc values(%s,%s,%s,%s,%s,%s,%s,%s)'
# Course cards on a search-result page.
course_xpath = '//div[@class="m-course-list"]/div/div'
# The pager's "next" link while it is still clickable.
next_xpath = '//li[@class="ux-pager_btn ux-pager_btn__next"]/a[@class="th-bk-main-gh"]'

try:
    driver.get("https://www.icourse163.org/")
    driver.maximize_window()
    sleep(2)

    driver.find_element(By.XPATH, '//div[@class="unlogin"]//a[@class="f-f0 navLoginBtn"]').click()   # log in / register
    sleep(2)
    driver.find_element(By.CLASS_NAME, 'ux-login-set-scan-code_ft_back').click()              # other login methods
    sleep(2)
    driver.find_element(By.XPATH, "//ul[@class='ux-tabs-underline_hd']//li[@class='']").click()
    sleep(2)
    # The login form lives inside an iframe.
    driver.switch_to.frame(driver.find_element(By.XPATH, "//div[@class='ux-login-set-container']//iframe"))
    driver.find_element(By.XPATH, '//input[@id="phoneipt"]').send_keys("******")        # account
    sleep(2)
    driver.find_element(By.XPATH, '//input[@placeholder="请输入密码"]').send_keys("******")   # password
    sleep(2)
    driver.find_element(By.XPATH, '//div[@class="f-cb loginbox"]//a[@id="submitBtn"]').click()  # submit login
    sleep(3)
    driver.find_element(
        By.XPATH,
        '//div[@class="u-baseinputui"]/input[@class="j-textarea inputtxt"]').send_keys("python")    # course keyword
    sleep(2)
    driver.find_element(
        By.XPATH, '//div[@class="u-search-icon"]/span[@class="u-icon-search2 j-searchBtn"]').click()   # run the search
    sleep(2)

    conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='root', db='spider', charset='utf8')
    cursor = conn.cursor()
    course_id = 0
    while True:
        sleep(2)
        list_window = driver.current_window_handle
        course_count = len(driver.find_elements(By.XPATH, course_xpath))
        for i in range(course_count):
            try:
                # Re-locate the cards on every pass: references taken before
                # the previous click may be stale.  Python lists are 0-based,
                # so card i is simply index i.
                driver.find_elements(By.XPATH, course_xpath)[i].click()
                sleep(3)
                driver.switch_to.window(driver.window_handles[-1])   # newest tab = course page
                sleep(2)
                course = driver.find_element(By.XPATH, '//span[@class="course-title f-ib f-vam"]').text
                process = driver.find_element(
                    By.XPATH,
                    '//div[@class="course-enroll-info_course-info_term-info_term-time"]/span[2]').text
                college = driver.find_element(By.XPATH, '//*[@id="j-teacher"]/div/a/img').get_attribute("alt")
                count = driver.find_element(
                    By.XPATH,
                    '//span[@class="course-enroll-info_course-enroll_price-enroll_enroll-count"]').text
                brief = driver.find_element(By.XPATH, '//*[@id="j-rectxt2"]').text
                teacher = driver.find_element(By.XPATH, '//div[@class="cnt f-fl"][1]/h3').text
                teas = driver.find_elements(By.XPATH, '//div[@class="um-list-slider_con"]/div')
                if len(teas) > 1:
                    # Comma-separated so the individual names stay readable.
                    team = ",".join(
                        tea.find_element(By.XPATH, './/div[@class="cnt f-fl"]/h3').text for tea in teas)
                else:
                    team = teacher
                print(course, college, teacher, team, process, brief)
                # Number only the courses that were scraped successfully.
                course_id += 1
                try:
                    cursor.execute(
                        insert_sql,
                        (course_id, course, college, teacher, team, count, process, brief))
                    conn.commit()
                except pymysql.MySQLError as err:
                    print(err)
                    conn.rollback()
            except Exception as e:
                print(e)
            finally:
                # Close the course tab and return to the result list even when
                # scraping failed, so the next course starts from a known page.
                if driver.current_window_handle != list_window:
                    driver.close()
                    sleep(2)
                    driver.switch_to.window(list_window)
                    sleep(2)
        # Turn the page only after every course on it was visited; stop when
        # the pager no longer offers a clickable "next" link.
        next_links = driver.find_elements(By.XPATH, next_xpath)
        if not next_links:
            break
        next_links[0].click()
        sleep(3)
finally:
    # Release browser and database even when scraping fails half-way.
    driver.quit()
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()

运行结果部分展示：

心得体会：

遇到了问题：Message: stale element reference: element is not attached to the page document
divs = driver.find_elements_by_xpath('//div[@class="m-course-list"]/div/div')
for div in divs:
    div.click()
    ...
这样写时，点击第一个 div 之后页面会刷新，之前取到的元素引用随之失效，再点击第二个 div 就会报这个错。
可以改成：
divs = driver.find_elements_by_xpath('//div[@class="m-course-list"]/div/div')
for i in range(len(divs)):
    driver.find_elements_by_xpath('//div[@class="m-course-list"]/div/div')[i].click()
也就是说，需要在页面刷新后重新定位元素，再进行操作。

posted on 2020-11-21 21:34 无名狼狈阅读(75) 评论(0) 编辑收藏举报

刷新页面返回顶部

无名狼狈

第五次作业

作业①

爬取手机信息

心得体会

实验②

爬取股票信息

心得体会：

实验③

模拟登录mooc以及爬取课程信息

心得体会：

导航

公告