第五次作业

作业①:
（1）要求：
熟练掌握 Selenium 查找HTML元素、爬取Ajax网页数据、等待HTML元素等内容。
使用Selenium框架爬取京东商城某类商品信息及图片。
（2）code：

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import urllib.request
import threading
import sqlite3
import os
import datetime
from selenium.webdriver.common.keys import Keys
import time


class MySpider:
    headers = {
        "User-Agent": "SMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.361"
        }

    imagePath = "download"

    def startUp(self, url, key):
        # Initializing Chrome browser
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)

        # Initializing variables
        self.threads = []
        self.No = 0
        self.imgNo = 0
        # Initializing database
        try:
            self.con = sqlite3.connect("phones.db")
            self.cursor = self.con.cursor()
            try:
                # 如果有表就删除
                self.cursor.execute("drop table phones")
            except:
                pass
            try:
                #  建立新的表
                sql = "create  table  phones  (mNo  varchar(32) primary key, mMark varchar(256),mPrice varchar(32),mNote varchar(1024),mFile varchar(256))"
                self.cursor.execute(sql)
            except:
                pass

        except Exception as err:
            print(err)
            # Initializing images folder
        try:
            if not os.path.exists(MySpider.imagePath):
                os.mkdir(MySpider.imagePath)
            images = os.listdir(MySpider.imagePath)
            for img in images:
                s = os.path.join(MySpider.imagePath, img)
                os.remove(s)
        except Exception as err:
            print(err)
        self.driver.get(url)
        keyInput = self.driver.find_element_by_id("key")
        keyInput.send_keys(key)
        keyInput.send_keys(Keys.ENTER)

    def closeUp(self):
        try:

            self.con.commit()
            self.con.close()
            self.driver.close()

        except Exception as err:
            print(err)

    def insertDB(self, mNo, mMark, mPrice, mNote, mFile):
        try:
            sql = "insert into phones (mNo,mMark,mPrice,mNote,mFile) values (?,?,?,?,?)"
            self.cursor.execute(sql, (mNo, mMark, mPrice, mNote, mFile))
        except Exception as err:
            print(err)

    def showDB(self):
        try:
            con = sqlite3.connect("phones.db")
            cursor = con.cursor()
            print("%-8s%-16s%-8s%-16s%s" % ("No", "Mark", "Price", "Image", "Note"))
            cursor.execute("select mNo,mMark,mPrice,mFile,mNote from phones  order by mNo")

            rows = cursor.fetchall()
            for row in rows:
                print("%-8s %-16s %-8s %-16s %s" % (row[0], row[1], row[2], row[3], row[4]))

            con.close()
        except Exception as err:
            print(err)

    def download(self, src1, src2, mFile):
        data = None
        if src1:
            try:
                req = urllib.request.Request(src1, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
            except:
                pass
        if not data and src2:
            try:
                req = urllib.request.Request(src2, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
            except:
                pass
        if data:
            print("download begin", mFile)
            fobj = open(MySpider.imagePath + "\\" + mFile, "wb")
            fobj.write(data)
            fobj.close()
            print("download finish", mFile)

    def processSpider(self):
        try:
            time.sleep(1)
            print(self.driver.current_url)
            lis = self.driver.find_elements_by_xpath("//div[@id='J_goodsList']//li[@class='gl-item']")
            for li in lis:
                # We find that the image is either in src or in data-lazy-img attribute
                try:
                    src1 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("src")
                except:
                    src1 = ""

                try:
                    src2 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("data-lazy-img")
                except:
                    src2 = ""
                try:
                    price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
                except:
                    price = "0"

                try:
                    note = li.find_element_by_xpath(".//div[@class='p-name p-name-type-2']//a//em").text
                    mark = note.split(" ")[0]
                    mark = mark.replace("爱心东东\n", "")
                    mark = mark.replace(",", "")
                    note = note.replace("爱心东东\n", "")
                    note = note.replace(",", "")

                except:
                    note = ""
                    mark = ""
                    src2 = ""
                self.No = self.No + 1
                no = str(self.No)
                while len(no) < 6:
                    no = "0" + no
                    print(no, mark, price)
                if src1:
                    src1 = urllib.request.urljoin(self.driver.current_url, src1)
                    p = src1.rfind(".")
                    mFile = no + src1[p:]
                elif src2:
                    src2 = urllib.request.urljoin(self.driver.current_url, src2)
                    p = src2.rfind(".")
                    mFile = no + src2[p:]
                if src1 or src2:
                    T = threading.Thread(target=self.download, args=(src1, src2, mFile))
                    T.setDaemon(False)
                    T.start()
                    self.threads.append(T)
                else:
                    mFile = ""
                self.insertDB(no, mark, price, note, mFile)
            try:
                self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-prev disabled']")
            except:
                nextPage = self.driver.find_elements_by_xpath("//span[@class='p-num']//a[@class='pn-next']")
                time.sleep(10)
                nextPage.click()
                self.processSpider()
        except Exception as err:
            print(err)

    def executeSpider(self, url, key):
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url, key)
        print("Spider processing......")
        self.processSpider()
        print("Spider closing......")
        self.closeUp()
        for t in self.threads:
            t.join()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total ", elapsed, " seconds elapsed")


url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=3b49f8ac7fda4e90be468e2e23de7bfc"
spider = MySpider()
while True:
    print("1.爬取")
    print("2.显示")
    print("3.退出")
    s = input("请选择(1,2,3):")
    if s == "1":
        spider.executeSpider(url, "手机")
        continue
    elif s == "2":
        spider.showDB()
        continue
    elif s == "3":
        break

结果：

（2）心得体会：
一开始因为不是很理解selenium，所以在参考老师给的代码的时候，有一行的对齐缩进是错误的，以至于调试了半天，一直爬取错误，到最后才终于成功了。

作业②
（1）要求:
熟练掌握 Selenium 查找HTML元素、爬取Ajax网页数据、等待HTML元素等内容。
使用Selenium框架+ MySQL数据库存储技术路线爬取“沪深A股”、“上证A股”、“深证A股”3个板块的股票数据信息。
（2）code:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import urllib.request
import threading
import sqlite3
import os
import datetime
from selenium.webdriver.common.keys import Keys
import time
class gupiao:
    header = {
        "User-Agent": "Mozilla/5.0(Windows;U;Windows NT 6.0 x64;en-US;rv:1.9pre)Gecko/2008072531 Minefield/3.0.2pre"
    }
    def start(self, url):#开始只需要利用url模拟浏览器搜索爬取
        chrome_options = Options()#调用chrome的浏览器
        chrome_options.add_argument("——headless")
        chrome_options.add_argument("——disable-gpu")
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
        self.count = 0#编号
        #连接数据库，如果存在就删除并建立
        try:
            self.con = sqlite3.connect("gupiao.db")
            self.cursor = self.con.cursor()
            try:
                self.cursor.execute("drop table gupiao")
            except:
                pass
            try:
                #数据库表的样式
                sql = "create table gupiao(count varchar(256) ,num varchar(256),stockname varchar(256),lastest_price varchar(64),ddf varchar(64),dde varchar(64),cjl varchar(64),cje varchar(32),zhenfu varchar(32),top varchar(32),low varchar(32),today varchar(32),yestd varchar(32))"
                self.cursor.execute(sql)
            except:
                pass
        except Exception as err:
            print(err)
        self.driver.get(url)#浏览器得到地址

    def closeUp(self):
        try:
            self.con.commit()
            self.con.close()
            self.driver.close()
        except Exception as err:
            print(err)

    def insertDB(self, count,num,stockname,lastest_price,ddf,dde,cjl,cje,zhenfu,top,low,today,yestd):
        #将从浏览器爬取得到的数据插入数据库
        try:
            sql = "insert into gupiao (count,num,stockname,lastest_price,ddf,dde,cjl,cje,zhenfu,top,low,today,yestd) values (?,?,?,?,?,?,?,?,?,?,?,?,?)"
            self.cursor.execute(sql, (count,num,stockname,lastest_price,ddf,dde,cjl,cje,zhenfu,top,low,today,yestd))
        except Exception as err:
            print(err)

    def showDB(self):
        try:
            con = sqlite3.connect("gupiao.db")
            cursor = con.cursor()
            print("count","num","stockname","lastest_price","ddf","dde","cjl","cje","zhenfu","top","low","today","yestd")
            cursor.execute("select count,num,stockname,lastest_price,ddf,dde,cjl,cje,zhenfu,top,low,today,yestd from gupiao order by count")#sql语句获取数据
            rows = cursor.fetchall()
            for row in rows:
                print(row[0], row[1], row[2], row[3], row[4],row[5], row[6], row[7], row[8], row[9],row[10], row[11], row[12])
            con.close()#读取完数据，关闭数据库
        except Exception as err:
            print(err)

    def execute(self, url):
        print("Starting......")
        self.start(url)
        print("Processing......")
        self.process()
        print("Closing......")
        self.closeUp()
        print("Completed......")


    def process(self):#真正的爬取数据开始！
        time.sleep(1)
        try:
            lis = self.driver.find_elements_by_xpath("//div[@class='listview full']/table[@id='table_wrapper-table']/tbody/tr")
            time.sleep(1)
            for li in lis:
                time.sleep(1)
                num = li.find_element_by_xpath(".//td[position()=2]/a[@href]").text
                stockname = li.find_element_by_xpath(".//td[@class='mywidth']/a[@href]").text #在网页审查元素找到对应的右键Copy 可以查看具体位置
                lastest_price = li.find_element_by_xpath(".//td[position()=5]/span").text
                ddf = li.find_element_by_xpath(".//td[position()=6]/span").text #//tr[1]/td[6]/span
                dde = li.find_element_by_xpath(".//td[position()=7]/span").text #//*[@id="table_wrapper-table"]/tbody/tr[1]/td[7]/span
                cjl = li.find_element_by_xpath(".//td[position()=8]").text #//*[@id="table_wrapper-table"]/tbody/tr[1]/td[8]
                time.sleep(1)
                cje = li.find_element_by_xpath(".//td[position()=9]").text #//*[@id="table_wrapper-table"]/tbody/tr[1]/td[9]
                zhenfu = li.find_element_by_xpath(".//td[position()=10]").text #//*[@id="table_wrapper-table"]/tbody/tr[1]/td[10]
                top = li.find_element_by_xpath(".//td[position()=11]/span").text #//./td[11]/span
                low = li.find_element_by_xpath(".//td[position()=12]/span").text #//tr[1]/td[12]/span
                today = li.find_element_by_xpath(".//td[position()=13]/span").text #//td[13]/span
                yestd = li.find_element_by_xpath(".//td[position()=14]").text #
                time.sleep(1)
                self.count = self.count + 1
                count=self.count
                self.insertDB(count,num,stockname,lastest_price,ddf,dde,cjl,cje,zhenfu,top,low,today,yestd )
        except Exception as err:
            print(err)

url = "http://quote.eastmoney.com/center/gridlist.html#hs_a_board"
spider = gupiao()
gupiao().execute(url)
gupiao().showDB()

结果：

（2）心得体会：
因为是差不多将第一题的代码修改了下而已，所以比较简单吧。

作业③:
（1）要求：熟练掌握 Selenium 查找HTML元素、实现用户模拟登录、爬取Ajax网页数据、等待HTML元素等内容。
使用Selenium框架+MySQL爬取中国mooc网课程资源信息（课程号、课程名称、学校名称、主讲教师、团队成员、参加人数、课程进度、课程简介）

code：

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import sqlite3
import os
import datetime
from selenium.webdriver.common.keys import Keys
import time
import pymysql


class mooc:
    header = {
        "User-Agent": "Mozilla/5.0(Windows;U;Windows NT 6.0 x64;en-US;rv:1.9pre)Gecko/2008072531 Minefield/3.0.2pre"
    }

    def startUp(self, url):
        chrome_options = Options()
        chrome_options.add_argument("——headless")
        chrome_options.add_argument("——disable-gpu")
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
        try:
            self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd="1394613257", db="MyDB",
                                       charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from mooc")
            self.opened = True
        except Exception as err:
            print(err)
        self.opened = False
        # 获取url链接
        self.driver.get(url)
        # 计数
        self.No = 0
# (No  varchar(32) primary key, Course varchar(256),College varchar(256),Teacher varchar(256),Team varchar(256),Count varchar(256),Process varchar(256),Brief varchar(1024))"

    def closeUp(self):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
            self.driver.close()
        print("closed")

    def insertDB(self, No, Course, College, Teacher, Team, Count, Process, Brief):
        try:
            sql = "insert into mooc (No, Course, College, Teacher, Team, Count, Process, Brief) values (?,?,?,?,?,?,?,?)"
            self.cursor.execute(sql, (No, Course, College, Teacher, Team, Count, Process, Brief))
        except Exception as err:
            print(err)


    def processSpider(self):
        time.sleep(1)
        try:
            lis = self.driver.find_elements_by_xpath("//div[@class='m-course-list']/div/div[@class]")
            for li in lis:
                self.No = self.No + 1
                No=self.No
                Course= li.find_element_by_xpath(".//div[@class='t1 f-f0 f-cb first-row']").text
                College =li.find_element_by_xpath(".//a[@class='t21 f-fc9']").text
                Teacher=li.find_element_by_xpath(".//div[@class='t2 f-fc3 f-nowrp f-f0']/a[position()=2]").text
                Team=li.find_element_by_xpath(".//div[@class='t2 f-fc3 f-nowrp f-f0']").text
                Count= li.find_element_by_xpath(".//span[@class='hot']").text
                Brief = li.find_element_by_xpath(".//span[@class='p5 brief f-ib f-f0 f-cb']").text
                Process=li.find_element_by_xpath(".//span[@class='txt']").text
                time.sleep(1)
                print(No,Course,College,Teacher,Team,Count,Process,Brief )
                if self.opened:
                    self.cursor.execute(
                        "insert into mooc(No, Course, College, Teacher, Team, Count, Process, Brief)values(%s, %s, %s, %s, %s, %s, %s, %s)",
                        (str(No), Course, College, Teacher, Team, str(Count), Process, Brief))
        except Exception as err:
            print("process")

    def executeSpider(self, url):
        print("Starting")
        self.startUp(url)
        print("Processing")
        self.processSpider()
        print("Closing")
        self.closeUp()
        print("Completed")

url = "https://www.icourse163.org/search.htm?search=%E6%B3%95%E5%AD%A6#/"
spider = mooc()
mooc().executeSpider(url)

模拟登录的话，只学会了这个简单的

from selenium import webdriver
import time

base_url = "https://www.imooc.com/user/newlogin"
driver = webdriver.Chrome()
driver.get(base_url)

# 账号密码方式登录慕课网
name_input = driver.find_element_by_name('email')  # 找到用户名的框框
pass_input = driver.find_element_by_name('password') # 找到输入密码的框框
login_button = driver.find_element_by_xpath('//div[@class="rlf-group clearfix"]/input')  # 找到登录按钮

username = "185******"  # 这里换成自己的账号
password = "********"  ##这里换成自己的密码

name_input.clear()
name_input.send_keys(username) # 填写账号
time.sleep(0.2)  # 休眠一下，模拟人工登录，不然可能被拦截
pass_input.clear()
pass_input.send_keys(password)  # 填写密码
time.sleep(0.2)
login_button.click()   # 点击登录
time.sleep(0.2)

print(driver.get_cookies())  # 打印cookies
time.sleep(2)
print(driver.title)  # 打印标题
# driver.close()   #关闭浏览器

答案：

（2）心得体会：
一开始直接爬取，但是找不到团队信息和课程介绍，还以为要先获取链接，再通过链接进入子页面，后面折腾了下，才知道从js那边获取。因为直接连sql跑出来乱码了，所以就采取了之前的连接mysql方式。

posted @ 2020-11-20 23:02 Jelor 阅读(95) 评论(0) 编辑收藏举报

刷新页面返回顶部

Jelor

第五次作业

公告