Assignment 6

I. Assignment Content

Assignment ①:

Requirements:

        Use the requests and BeautifulSoup libraries to scrape the Douban Movie Top 250 data.
        Download each movie's poster with multiple threads, naming each image after its movie (see the title-naming sketch after the code).
        Learn how to use regular expressions (a small regex sketch follows the candidate website below).

Candidate website: Douban Movies: https://movie.douban.com/top250
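The scraper below locates every field with CSS selectors and never actually uses a regular expression. As a nod to the third requirement, here is a minimal sketch of pulling the same kind of fields with re; the HTML fragment is a simplified assumption about the page, not verbatim markup.

import re

# Simplified, assumed fragment of one Top 250 entry (not the exact page markup).
html = '<em class="">1</em> ... <span class="title">肖申克的救赎</span>'

# Capture the rank inside <em class=""> and the title inside <span class="title">.
rank = re.search(r'<em class="">(\d+)</em>', html).group(1)
title = re.search(r'<span class="title">([^<]+)</span>', html).group(1)
print(rank, title)  # 1 肖申克的救赎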

Code:

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import threading
import os

def imageSpider(start_url):
    global thread
    global count
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, "html.parser")
        movie_list = soup.select('ol[class="grid_view"] li')
        for li in movie_list:
            # .text never contains the tag name, so stripping whitespace is enough
            rank = li.select('div[class="pic"] em[class=""]')[0].text.strip()
            fn = li.select('div[class="info"] div a span')[0].text.strip()
            # the <p> holds two lines split by <br>: crew info, then year/region/genre
            info_lines = [s.strip() for s in li.select('div[class="bd"] p')[0].text.split("\n") if s.strip()]
            info_l = info_lines[0] if info_lines else ""
            info_f = info_lines[1] if len(info_lines) > 1 else ""
            comment = li.select('div[class="bd"] div[class="star"] span[class="rating_num"]')[0].text.strip()  # rating
            audi = li.select('div[class="star"] span')[3].text.strip()  # number of raters
            print(rank, fn, info_l, info_f, comment, audi)
        # grab every <img> on the page and hand each new URL to a download thread
        images1 = soup.select("img")
        for image in images1:
            try:
                src = image["src"]
                url = urllib.request.urljoin(start_url, src)
                if url not in urls:
                    urls.append(url)
                    print(url)
                    count = count + 1
                    T = threading.Thread(target=download, args=(url, count))
                    T.daemon = False  # setDaemon() is deprecated
                    T.start()
                    thread.append(T)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)


def download(url, count):
    try:
        # keep the original extension when the URL ends in one such as ".jpg"
        if url[len(url) - 4] == ".":
            ext = url[len(url) - 4:]
        else:
            ext = ""
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        os.makedirs("photo", exist_ok=True)  # make sure the target folder exists
        with open("photo\\" + str(count) + ext, "wb") as fobj:
            fobj.write(data)
        print("downloaded " + str(count) + ext)
    except Exception as err:
        print(err)
start_url = "https://movie.douban.com/top250"
headers = {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
count = 0
thread = []


imageSpider(start_url)
for t in thread:
    t.join()
print("the End")

Results:

Reflections:

This assignment was mainly a review of the BeautifulSoup and requests scraping from the first half of the semester. Compared with before, I had mostly forgotten how to locate elements in static pages, so I relearned that part.

Assignment ②:

Requirements:

    Master the serialized output of Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage route to scrape the Shanghai Ranking (软科) data.
    Scrape the university ranking, obtain each school's detail link, then download and store the school logo and collect the official website URL, school profile, and other information. (Only items.py and pipelines.py are shown below; a sketch of the spider side follows pipelines.py.)

Candidate website: https://www.shanghairanking.cn/rankings/bcur/2020

Code:

items.py:

import scrapy
class UniversityrankingItem(scrapy.Item):
    sNo=scrapy.Field()
    schoolName=scrapy.Field()
    city=scrapy.Field()
    officalUrl=scrapy.Field()
    info=scrapy.Field()

pipelines.py:

import sqlite3

class UniversityrankingPipeline:
    def open_spider(self, spider):
        # open the sqlite database and recreate the University table from scratch
        self.con = sqlite3.connect("University.db")
        self.cursor = self.con.cursor()
        self.opened = True
        try:
            self.cursor.execute("drop table University")
        except:
            pass
        sql = "create table University(sNo int,schoolName varchar(32),city varchar(16),officalUrl varchar(64),"\
                "info text);"
        self.cursor.execute(sql)

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
        self.con.close()
        self.opened = False
        print("closed")

    def process_item(self, item, spider):
        try:
            print(item["sNo"])
            print(item["schoolName"])
            print(item["city"])
            print(item["officalUrl"])
            print(item["info"])
            print()
            self.cursor.execute("insert into University(sNo,schoolName,city,officalUrl,info) "
                "values(?,?,?,?,?)",
                (item["sNo"], item["schoolName"], item["city"], item["officalUrl"], item["info"]))
        except Exception as err:
            print(err)
        return item
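The post includes only items.py and pipelines.py; the spider itself is not shown. Here is a minimal sketch of what it might look like, following the Scrapy + XPath route the requirement names. The project module name (UniversityRanking), the spider name, and every XPath are my assumptions; the ranking page may render its rows with JavaScript, in which case they would need adjusting.

import scrapy
from UniversityRanking.items import UniversityrankingItem  # assumed project name

class MySpider(scrapy.Spider):
    name = "universityranking"
    start_urls = ["https://www.shanghairanking.cn/rankings/bcur/2020"]

    def parse(self, response):
        # each table row is assumed to hold rank, school name, and city
        for row in response.xpath('//table[@class="rk-table"]/tbody/tr'):
            item = UniversityrankingItem()
            item["sNo"] = row.xpath('./td[1]//text()').get("").strip()
            item["schoolName"] = row.xpath('./td[2]//a/text()').get("").strip()
            item["city"] = row.xpath('./td[3]//text()').get("").strip()
            detail = row.xpath('./td[2]//a/@href').get()
            if detail:
                # follow the detail link for the official URL and profile
                yield response.follow(detail, callback=self.parse_detail, meta={"item": item})

    def parse_detail(self, response):
        item = response.meta["item"]
        item["officalUrl"] = response.xpath('//div[@class="univ-website"]/a/@href').get("")
        item["info"] = response.xpath('//div[@class="univ-introduce"]/p/text()').get("")
        yield item

For the pipeline to run it must also be enabled in settings.py, e.g. ITEM_PIPELINES = {'UniversityRanking.pipelines.UniversityrankingPipeline': 300} (module path assumed as above).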

Results:


Since I never managed to store the images in the database, I eventually gave up on that idea and stored only five fields: rank, school name, city, official website, and profile. (A sketch of what storing an image as a BLOB could look like follows below.)
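For reference, a minimal sketch of the part I gave up on: storing image bytes in the database as a BLOB with sqlite3. The table name and the example image URL are illustrative placeholders, not part of the original project.

import sqlite3
import urllib.request

con = sqlite3.connect("University.db")
cur = con.cursor()
cur.execute("create table if not exists Logo(schoolName varchar(32), logo blob)")

# download any image and insert the raw bytes as a BLOB (URL is a placeholder)
data = urllib.request.urlopen("https://www.shanghairanking.cn/favicon.ico").read()
cur.execute("insert into Logo(schoolName, logo) values(?,?)", ("example", sqlite3.Binary(data)))
con.commit()
con.close()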

Thoughts:

In the end I still never got around to using pymysql... (a sketch of a pymysql pipeline follows below)
This problem reused the earlier project structure; I had forgotten some of the XPath syntax, but on the whole it was completed and came out fairly clean.
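Since the requirement names MySQL, here is a minimal, untested sketch of how the sqlite3 pipeline above could be swapped to pymysql; the connection parameters are placeholders, not values from the original project.

import pymysql

class UniversityrankingMySQLPipeline:
    def open_spider(self, spider):
        # placeholder credentials; replace with your own MySQL settings
        self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                   password="password", database="university",
                                   charset="utf8mb4")
        self.cursor = self.con.cursor()
        self.cursor.execute("drop table if exists University")
        self.cursor.execute(
            "create table University(sNo int, schoolName varchar(32), city varchar(16),"
            "officalUrl varchar(64), info text)")

    def close_spider(self, spider):
        self.con.commit()
        self.con.close()

    def process_item(self, item, spider):
        # pymysql uses %s placeholders rather than sqlite3's ?
        self.cursor.execute(
            "insert into University(sNo,schoolName,city,officalUrl,info) values(%s,%s,%s,%s,%s)",
            (item["sNo"], item["schoolName"], item["city"], item["officalUrl"], item["info"]))
        return item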

Assignment ③:

Requirements:

    Master Selenium techniques: locating HTML elements, scraping Ajax-loaded pages, waiting for elements to load, and navigating between pages.
    Use the Selenium framework + MySQL storage to simulate logging in to the MOOC site and save the courses enrolled in my own account to MySQL.
    The simulated-login step must be recorded as a GIF.

Candidate website: China University MOOC: https://www.icourse163.org

Code:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import sqlite3
import time

class MOOC:
    def Load_in(self):
        # simulate the login: open the login panel, switch from the QR code to
        # account login, pick the phone-number tab, then fill in the form
        time.sleep(1)
        user = self.driver.find_element_by_xpath('//*[@id="j-topnav"]/div')
        user.click()
        time.sleep(1)
        way = self.driver.find_element_by_xpath('//div[@class="ux-login-set-scan-code_ft"]/span')
        way.click()
        time.sleep(1)
        telephone = self.driver.find_element_by_xpath('//ul[@class="ux-tabs-underline_hd"]/li[2]')
        telephone.click()
        time.sleep(1)
        # the login form lives inside an iframe, so switch into it first
        frame = self.driver.find_element_by_xpath(
            "/html/body/div[13]/div[2]/div/div/div/div/div/div[1]/div/div[1]/div[2]/div[2]/div/iframe")
        self.driver.switch_to.frame(frame)
        self.driver.find_element_by_xpath('//input[@type="tel"]').send_keys('your_phone_number')  # fill in your own phone number
        time.sleep(1)
        self.driver.find_element_by_xpath('//input[@class="j-inputtext dlemail"]').send_keys('your_password')  # and your own password
        time.sleep(1)
        load_in = self.driver.find_element_by_xpath('//*[@id="submitBtn"]')
        load_in.click()

    def MyClass(self):
        # open "My Courses" from the top navigation bar
        time.sleep(1)
        myclass = self.driver.find_element_by_xpath('//*[@id="j-indexNav-bar"]/div/div/div/div/div[7]/div[3]/div')
        myclass.click()
        self.all_spider()

    def all_spider(self):
        # scrape the current page, then page forward until the "next" button
        # carries the disabled style (th-bk-disable-gh), i.e. the last page
        time.sleep(1)
        self.spider()
        time.sleep(1)
        try:
            self.driver.find_element_by_xpath(
                '//ul[@class="ux-pager"]/li[@class="ux-pager_btn ux-pager_btn__next"]/a[@class="th-bk-disable-gh"]')
        except Exception:
            # no disabled button found, so a clickable "next" must exist
            self.driver.find_element_by_xpath(
                '//ul[@class="ux-pager"]/li[@class="ux-pager_btn ux-pager_btn__next"]/a[@class="th-bk-main-gh"]').click()
            self.all_spider()

    def spider(self):
        id = 0
        time.sleep(1)
        lis = self.driver.find_elements_by_xpath('//div[@class="course-card-wrapper"]')
        for li in lis:
            time.sleep(1)
            # clicking a course card opens a new tab, and that tab in turn
            # opens the course introduction page in a third tab
            li.click()
            window = self.driver.window_handles
            self.driver.switch_to.window(window[1])
            time.sleep(1)
            window = self.driver.window_handles
            self.driver.switch_to.window(window[2])
            time.sleep(1)
            id += 1
            course = self.driver.find_element_by_xpath(
                '//*[@id="g-body"]/div[1]/div/div[3]/div/div[1]/div[1]/span[1]').text
            teacher = self.driver.find_element_by_xpath('//*[@id="j-teacher"]//h3[@class="f-fc3"]').text
            collage = self.driver.find_element_by_xpath('//*[@id="j-teacher"]/div/a/img').get_attribute('alt')
            process = self.driver.find_element_by_xpath('//*[@id="course-enroll-info"]/div/div[1]/div[2]/div[1]').text
            count = self.driver.find_element_by_xpath('//*[@id="course-enroll-info"]/div/div[2]/div[1]/span').text
            brief = self.driver.find_element_by_xpath('//*[@id="j-rectxt2"]').text
            self.cursor.execute("insert into mooc(id, course, teacher, collage, count, process, brief) "
                                "values( ?,?,?,?,?,?,?)",
                                (id, course, teacher, collage, count, process, brief))

            # close the two course tabs and return to the course list
            time.sleep(1)
            self.driver.close()
            self.driver.switch_to.window(window[1])
            time.sleep(1)
            self.driver.close()
            self.driver.switch_to.window(window[0])

    def start(self):
        # open the sqlite database and recreate the mooc table from scratch
        self.con = sqlite3.connect("mooc.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("drop table mooc")
        except:
            pass
        sql = 'create table mooc(id int,course varchar(32),teacher varchar(16),collage varchar(32),count varchar(64),' \
               'process varchar(64),brief text);'
        self.cursor.execute(sql)

    def stop(self):
        try:
            self.con.commit()
            self.con.close()
        except Exception as err:
            print(err)

    def executespider(self, url):
        chrome_options = Options()
        self.driver = webdriver.Chrome(options=chrome_options)  # chrome_options= is deprecated
        self.driver.get(url)
        self.start()
        self.Load_in()
        self.MyClass()
        self.stop()

def main():
    url = 'https://www.icourse163.org/'
    spider = MOOC()
    spider.executespider(url)

if __name__ == '__main__':
    main()

Results:

Reflections:

I reused the Selenium framework from last time, and this time the simulated account login worked; I also dropped the selection screen.
Sometimes the crawl fails midway with: selenium.common.exceptions.WebDriverException: Message: chrome not reachable
Sometimes it runs fine; I'm still looking into it (a sketch of explicit waits, which may help, follows below).
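The script paces itself entirely with fixed time.sleep(1) calls, which can lose the race against a slow page load and may explain the intermittent failures; the requirement also mentions waiting for elements. A minimal sketch of Selenium's explicit waits, reusing the login XPath from the script above and assuming driver is the webdriver instance created in executespider:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the top-nav login entry instead of sleeping blindly
wait = WebDriverWait(driver, 10)  # driver: the webdriver instance from executespider
user = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="j-topnav"]/div')))
user.click()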

Final words:

A semester of study has come to an end just like this. I'm not the especially bright type, and my assignments were only so-so; every one of them made me keenly aware of the gap between people...
Still, I want to thank Professor Wu, TA Chen, and all the talented classmates in the class. Everyone was very patient with me and helped me a great deal, and I benefited a lot.
For a course or a technique, interest may not be strictly proportional to results, but the two are certainly positively correlated.
May we keep bringing that interest to every corner we reach. Thanks again!

posted @ 2020-12-02 22:55  159ggg