数据采集——第六次作业

作业一:

要求：
用requests和BeautifulSoup库方法爬取豆瓣电影Top250数据。
每部电影的图片，采用多线程的方法爬取，图片名字为电影名
了解正则的使用方法
候选网站：豆瓣电影：https://movie.douban.com/top250
输出信息：

1）作业结果

代码：

import threading
import urllib
import requests
from bs4 import BeautifulSoup
import re

'''
    多线程爬取豆瓣电影top250
'''

class ConsumerThread(threading.Thread):
    def __init__(self, startUrl, headers, startNum, endNum):
        threading.Thread.__init__(self)
        self.startUrl = startUrl
        self.headers = headers
        self.startNum = startNum
        self.endNum = endNum

    def run(self):
        for page in range(self.startNum, self.endNum + 25, 25):
            lst =[]
            html = request_page(self.startUrl + str(page), self.headers)
            # res.encoding = 'utf-8'
            # print("这是html")
            # print(html)
            try:
                # 调用bs4库里的BeautifulSoup类的方法，利用BeautifulSoup的html.parser解析网页
                soup = BeautifulSoup(html, "html.parser")
                div = soup.find_all('div', attrs={'class': 'item'})
                for i in div:
                    # name = i.find_all('span') 下边为简写写法
                    name = i('span', attrs={'class': 'title'})
                    # 开始对'导演演员'进行处理
                    dir = str(i('p', attrs={'class': ''})).replace(' ', '').replace('\n', '').split(' ')
                    dir = str(dir).strip('[\'[<pclass="">导演:</p>]\']')
                    # print(dir)
                    dir1 = dir.split('\\xa0\\xa0\\xa0')
                    director = dir1[0]   # 导演
                    # print(director)
                    try:
                        dir2 = dir1[1].strip('主演:').split('<br/>')
                        main_actor = dir2[0]
                    except:
                        main_actor = ' '
                    dir3 = dir2[1].split('\\xa0/\\xa0')
                    pbyear = dir3[0]
                    country = dir3[1]
                    moivetype = dir3[2]
                    #print(dir3[0], dir3[1], dir3[2])
                    # 评分
                    score = i('span', attrs={'class': 'rating_num'})
                    #print(score)
                    # 评价人数
                    comment1 = i('div', attrs={'class': 'star'})
                    comment = re.findall(r'\d+', str(comment1))[5]
                    # print(comment)
                    # 引用
                    try:
                        quote = i('span', attrs={'class': 'inq'})[0].string
                        # print(quote[0].string)
                    except Exception:
                        quote = " "
                    # 电影链接
                    href = i('a')
                    #print(href)
                    # 将href从bs4.element.ResultSet类型转换为str类型
                    for h in href:
                        lst.append(str(h))
                    # 此时"".join(lst)即为href的str类型
                    soup1 = BeautifulSoup("".join(lst), "html.parser")
                    # Href即为得到电影的链接
                    Href = soup1.a.attrs['href']
                    # 图片链接
                    pic = i('a')[0]
                    pic = pic('img')[0]['src']
                    # print(pic)
                    # 清空lst列表
                    lst.clear()
                    # 将数据存入列表中
                    download(self, pic, name[0].string)
                    if (pic[len(pic) - 4] == "."):
                        ext = pic[len(pic) - 4:]
                    else:
                        ext = ""
                    list.append([name[0].string, director, main_actor, pbyear, country, moivetype, score[0].string, comment, quote, Href, name[0].string+ext])
            except:
                print("解析出错", name[0].string)


def request_page(startUrl, headers):
    try:
        r = requests.get(startUrl, headers=headers, timeout=20)
        r.raise_for_status()
        # 分析得到网页的编码格式
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "error"

def get_headers():
    headers = {
        'cookie': 'miid=871473411901751759; tk_trace=oTRxOWSBNwn9dPyorMJE%2FoPdY8zfvmw%2Fq5v3gzlT6TfC88%2B8VsR2LZeh0x0uZ2lavUunwk7FaQvkKtSF8tQ1fLpJrqsz14B%2FyobMQW8Exzoggkl4YwBJKriMzZBFIh%2FIQKv1FPsNnhV6P2HnGBVjbylCyWiAFwW1fvt3aOBQ3uoAwtp6Of66wJHZ8YPC4wCiq2449kBKT6ufwkDkOEQ0bb3%2BTywKDqprZHDdiDO7zqxULUBvZcPasjtJ6gwye5pjKqluzqIlGZw5KubUzluZEGHyoRYm; lLtC1_=1; t=da76da827c8812bfc2ef4d091ab75358; v=0; _tb_token_=3f7e30eaede6e; _m_h5_tk=55efe72e4a7e4752c144c9d9a52250c0_1600601808838; _m_h5_tk_enc=480b7ec5d4482475e3f5f76e0f27d20f; cna=ZgXuFztWFlECAdpqkRhAo4LI; xlly_s=1; _samesite_flag_=true; cookie2=13aa06fa1e7317d5023153e70e45e95f; sgcookie=E100jT6nEM5pUv7ZaP1urbIUwS5ucPPcz2pG%2FHs9VDYB91E14tzvEV8mMtRfZtLe4dg8rBjdN9PEuYVfhzjqARIFKQ%3D%3D; unb=2341324810; uc3=id2=UUtP%2BSf9IoxzEQ%3D%3D&nk2=t2JNBkErCYOaTw%3D%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D&vt3=F8dCufeJMlcWOvLCctE%3D; csg=6e737d42; lgc=%5Cu6E90%5Cu6E90%5Cu624B%5Cu62C9%5Cu624B; cookie17=UUtP%2BSf9IoxzEQ%3D%3D; dnk=%5Cu6E90%5Cu6E90%5Cu624B%5Cu62C9%5Cu624B; skt=d79995fc094af782; existShop=MTYwMDU5MTk1Ng%3D%3D; uc4=id4=0%40U2l1DTh9%2F%2BK9Ym%2BCcxbZCceodFVg&nk4=0%40tRGCEUSTUyhwCCuPpi69tV6rFUkp; tracknick=%5Cu6E90%5Cu6E90%5Cu624B%5Cu62C9%5Cu624B; _cc_=W5iHLLyFfA%3D%3D; _l_g_=Ug%3D%3D; sg=%E6%89%8B09; _nk_=%5Cu6E90%5Cu6E90%5Cu624B%5Cu62C9%5Cu624B; cookie1=AVdDOYh1aRzl%2BX23%2BL5DKrLq3hm6%2F%2BaYgNDVsRdW80M%3D; enc=Amz2MhdAq1awG9cmHzc5MLNi%2Bing6y4cR5EbHVPlJhWJxNKvr0B40mznTsnKW0JcubOujE6qhizMA84u7mBq2Q%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=85_1; uc1=existShop=false&cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D&cookie14=Uoe0bU1NOIsDOA%3D%3D&cookie21=WqG3DMC9Fb5mPLIQo9kR&pas=0&cookie15=UIHiLt3xD8xYTw%3D%3D; thw=cn; JSESSIONID=6ECE41F4DC42E868790A52C0B362FD3C; isg=BNTUg0Kud-Dz7-OsCcrIFwHdpRJGLfgXLSfdwW61YN_iWXSjlj3Ip4rbWVFBujBv; l=eBgY5g8rOF5Pi0CSBOfanurza77OSIRYYuPzaNbMiOCP9b1B5N21WZru8mY6C3GVh60WR3rZpo7BBeYBqQAonxv92j-la_kmn; tfstk=cyJcBsT6YI5jdt1kOx6XAqHF5iMdwE3VudJvULmsgZ-4urfcF7PEePnlzEvT1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    }
    return headers


def printList(list,num):

    print("排名", '\t', "电影名", '\t',"导演",'\t', "主演", '\t',"上映时间",'\t', "国家", '\t',"电影类型", '\t',"评分", '\t',"评价人数", '\t',"引用",'\t', "电影链接",'\t', "图片链接")
    count = 0
    # 输出的格式
    for i in range(num):
        count = count + 1
        lst = list[i]
        print(count, '\t',lst[0],'\t', lst[1],'\t',  lst[2],'\t',  lst[3], '\t', lst[4], '\t', lst[5],'\t',  lst[6], '\t', lst[7],'\t',  lst[8],'\t',  lst[9],'\t',lst[10])

def download(self, url, movieName):
    if (url[len(url) - 4] == "."):
        ext = url[len(url) - 4:]
    else:
        ext = ""
    req = urllib.request.Request(url, headers=self.headers)
    data = urllib.request.urlopen(req, timeout=100)
    data = data.read()
    fobj = open("./image/" + movieName + ext, "wb")
    fobj.write(data)
    fobj.close()


if __name__ == '__main__':
    url = 'https://movie.douban.com/top250?start='
    list = []
    header = get_headers()
    thread1 = ConsumerThread(url, header, 0, 100)
    thread2 = ConsumerThread(url, header, 125, 225)
    thread1.start()
    thread1.join()
    thread2.start()
    thread2.join()
    printList(list, 250)

结果贴图

结果输出

电影封面

2）心得体会

这次的多线程采用了前一半和后一半使用两个线程进行爬取
大体上结构以及下载图片都可以仿照以前的作业
但是在内容爬取上，不好搞哇，折腾了半天
在爬取时会有几部电影的结构不符合自己写的爬取思路，反复修改
最后没有在纠结输出格式了，有一些乱

作业二:

要求：
熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法；Scrapy+Xpath+MySQL数据库存储技术路线爬取科软排名信息
爬取科软学校排名，并获取学校的详细链接，进入下载学校Logo存储、获取官网Url、院校信息等内容。
候选网站：https://www.shanghairanking.cn/rankings/bcur/2020
关键词：学生自由选择
输出信息：MYSQL的输出信息如下

1）作业结果

代码：

rank

import scrapy
from bs4 import UnicodeDammit
import urllib.request
from ..items import RankingItem

class RankSpider(scrapy.Spider):
    name = 'rank'
    start_urls = 'https://www.shanghairanking.cn/rankings/bcur/2020'

    def start_requests(self):
        url = RankSpider.start_urls
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            lis = selector.xpath("//div[@id='content-box']/div[2]/table/tbody/tr")
            for li in lis:
                try:
                    num = li.xpath("./td[position()=1]/text()").extract_first()
                    # print(num)
                    schoolname = li.xpath("./td[@class='align-left']/a/text()").extract_first()
                    # print(schoolname)
                    city = li.xpath("./td[position()=3]/text()").extract_first()
                    # print(city)
                    page_URL = li.xpath("././td[@class='align-left']/a/@href").extract_first()
                    # print(page_URL)
                    school_page_URL = 'https://www.shanghairanking.cn'+page_URL
                    # 链接学校
                    html = urllib.request.urlopen(school_page_URL)
                    html = html.read()
                    dammit = UnicodeDammit(html, ["utf-8", "gbk"])
                    data2 = dammit.unicode_markup
                    selector2 = scrapy.Selector(text=data2)
                    schoolinfo = selector2.xpath("//div[@class='univ-introduce']/p/text()").extract_first()
                    # print(schoolinfo)
                    officalURL = selector2.xpath("//div[@class='univ-website']/a/@href").extract_first()
                    # print(officalURL)
                    mfile = selector2.xpath("//td[@class='univ-logo']/img/@src").extract_first()
                    # print(mfile)
                    self.download(mfile, schoolname)

                    item = RankingItem()
                    item["num"] = num.strip() if num else ""
                    item["schoolname"] = schoolname.strip() if schoolname else ""
                    item["city"] = city.strip() if city else ""
                    item["officalURL"] = officalURL.strip() if officalURL else ""
                    item["schoolinfo"] = schoolinfo.strip() if schoolinfo else ""
                    item["mfile"] = mfile.strip() if mfile else ""
                    yield item
                except:
                    print("不存在该大学信息\n")
        except Exception as err:
            print(err)

    def download(self, url, schoolname):
        if (url[len(url) - 4] == "."):
            ext = url[len(url) - 4:]
        else:
            ext = ""
        data = urllib.request.urlopen(url, timeout=100)
        data = data.read()
        fobj = open("D:\\pycharm\\爬虫\\final\\image_school_logo\\" + schoolname + ext, "wb")
        fobj.write(data)
        fobj.close()

piplines

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface

import pymysql

class RankingPipeline(object):
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd="gh02120425", db="gc2",
                                       charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from ranking")
            self.opened = True
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")

    def process_item(self, item, spider):
        try:
            print(item["num"])
            print(item["schoolname"])
            print(item["city"])
            print(item["officalURL"])
            print(item["schoolinfo"])
            print(item["mfile"])
            print('\n')
            if self.opened:
                self.cursor.execute(
                    "insert into ranking (n, school_name, school_city, school_officalURL, schoolinfo, mfile) values (%s,%s,%s,%s,%s,%s)",
                    (item['num'], item['schoolname'], item['city'], item['officalURL'], item['schoolinfo'],
                     item['mfile']))
        except Exception as err:
            print(err)

        return item

items

import scrapy

class DownimageItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_url = scrapy.Field()
    file_name = scrapy.Field()
    title = scrapy.Field()

settings

ITEM_PIPELINES = {
    'ranking.pipelines.RankingPipeline': 300,
}

结果贴图

结果输出

数据库

logo集

2）心得体会

这道题就好很多了！
xpath还是好用啊，scrapy框架也是仿照之前的作业
唯一就是爬着爬着突然报错，后来发现是有一个学校连接error，maybe是没有了叭hhhh

作业三:

要求：
熟练掌握 Selenium 查找HTML元素、爬取Ajax网页数据、等待HTML元素加载、网页跳转等内容。
使用Selenium框架+ MySQL数据库存储技术模拟登录慕课网，并获取学生自己账户中已学课程的信息并保存在MYSQL中。
其中模拟登录账号环节需要录制gif图。
候选网站：中国mooc网：https://www.icourse163.org
输出信息：MYSQL数据库存储和输出格式如下

1）作业结果

代码：

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import pymysql
import pandas as pd
import time
import datetime

class MOOC:
    header = {
        "User-Agent": "Mozilla/5.0(Windows;U;Windows NT 6.0 x64;en-US;rv:1.9pre)Gecko/2008072531 Minefield/3.0.2pre"
    }

    def startUp(self, url):
        chrome_options = Options()
        chrome_options.add_argument("——headless")
        chrome_options.add_argument("——disable-gpu")
        self.driver = webdriver.Chrome(options=chrome_options)
        try:
            self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd='gh02120425',
                                       db="gc2", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from mooc")
            self.opened = True
        except Exception as err:
            print(err)
            self.opened = False
        self.driver.get(url)
        time.sleep(2.0)

    def closeUp(self):
        try:
            self.con.commit()
            self.con.close()
            self.driver.close()
            self.opened = False
        except Exception as err:
            print(err)

    def showMOOC(self):
        try:
            self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd='gh02120425', db="gc2",
                                       charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.opened = True
            sqlcmd = "select * from mooc"
            a = pd.read_sql(sqlcmd, self.con)
            print(a)
            self.opened = False
        except Exception as err:
            print("出错，无法打印")
            print(err)

    def gif(self):
        # 一系列点击到手机号密码登陆界面
        time.sleep(0.5)
        self.driver.find_element_by_xpath("//div[@class='_3uWA6']").click()
        time.sleep(0.5)
        self.driver.find_element_by_xpath("//span[@class='ux-login-set-scan-code_ft_back']").click()
        time.sleep(0.5)
        self.driver.find_element_by_xpath("//ul[@class='ux-tabs-underline_hd']/li[2]").click()
        time.sleep(1.0)
        # 手机号密码的方式登录慕课网
        self.driver.switch_to.frame(1)
        self.driver.find_element_by_xpath("//input[@placeholder='请输入手机号']").clear()  # 找到手机号的框框
        self.driver.find_element_by_xpath("//input[@placeholder='请输入手机号']").send_keys("13123186121")
        time.sleep(1.0)
        print('ok1')
        self.driver.find_element_by_xpath("//input[@placeholder='请输入密码']").clear()  # 找到输入密码的框框
        self.driver.find_element_by_xpath("//input[@placeholder='请输入密码']").send_keys("gh02120425")
        time.sleep(1.0)
        print('ok2')
        self.driver.find_element_by_xpath("//a[@class='u-loginbtn btncolor tabfocus ']").click()
        print('ok3')
        time.sleep(2.0)

    def processMOOC(self):
        time.sleep(2)
        try:
            self.driver.find_element_by_xpath("//div[@class='_3uWA6']").click()
            self.driver.get(self.driver.current_url)  # 获取当前页面
            time.sleep(1.0)
            lis = self.driver.find_elements_by_xpath("//div[@class='course-panel-body-wrapper']//div[@class='course-card-wrapper']")
            gId = 0
            for li in lis:
                li.find_element_by_xpath(".//div[@class='box']").click()
                now_window = self.driver.window_handles[-1]
                self.driver.switch_to.window(now_window)
                time.sleep(2.0)
                self.driver.find_element_by_xpath("//div[@class='f-fl info']//a").click()
                last_now_window = self.driver.window_handles[-1]
                self.driver.switch_to.window(last_now_window)
                time.sleep(2.0)
                gId = gId + 1
                gCourse = self.driver.find_element_by_xpath("//span[@class='course-title f-ib f-vam']").text
                print(gCourse)
                gCollege = self.driver.find_element_by_xpath("//div[@class='m-teachers']//a").get_attribute("data-label")
                print(gCollege)
                gTeacher = self.driver.find_element_by_xpath("//div[@class='cnt f-fl']//h3[@class='f-fc3']").text
                print(gTeacher)
                gT = self.driver.find_elements_by_xpath("//div[@class='cnt f-fl']//h3[@class='f-fc3']")
                try:
                    gTeam = gT[0].text
                    for i in range(len(gT)):
                        if i > 0:
                            gTeam = gTeam + "," + gT[i].text
                except Exception as err:
                    print(err)
                print(gTeam)
                gCount = self.driver.find_element_by_xpath("//span[@class='course-enroll-info_course-enroll_price-enroll_enroll-count']").text
                print(gCount)
                gProcess = self.driver.find_element_by_xpath("//div[@class='course-enroll-info_course-info_term-info_term-time']//span[2]").text
                print(gProcess)
                gBrief = self.driver.find_element_by_xpath("//div[@class='course-heading-intro_intro']").text
                print(gBrief)
                self.cursor.execute(
                    "insert into mooc(gId,gCourse,gCollege,gTeacher,gTeam,gCount,gProcess,gBrief)"
                    "values (%s, %s, %s, %s, %s, %s, %s, %s)",
                    (gId, gCourse, gCollege, gTeacher, gTeam, gCount, gProcess, gBrief))
                self.driver.close()
                self.driver.switch_to.window(self.driver.window_handles[0])
        except Exception as err:
            print(err)

    def executeMOOC(self, url):
        starttime = datetime.datetime.now()
        print("starting!")
        self.startUp(url)
        self.gif()
        print("processing!")
        self.processMOOC()
        print("closing!")
        self.closeUp()
        print("complete!")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total", elapsed, "seconds elasped")

url = "https://www.icourse163.org/"
spider = MOOC()
while True:
    print("1.登陆并爬取")
    print("2.显示")
    print("3.退出")
    s = input("请选择（1，2，3）；")
    if s == "1":
        MOOC().executeMOOC(url)
        continue
    elif s == "2":
        MOOC().showMOOC()
        continue
    elif s == "3":
        break

结果贴图

结果输出

数据库

gif

2）心得体会

使用了上次的selenium框架，我觉得注意的点在于是否产生新页面，产生要定位到最后一个页面才能成功爬取
以及课程详情爬取完成后要关闭详情页，再跳转到第一个页面
在爬取时，Google会弹出一个data页面，后来百度是因为Google版本与webdriver版本不一致，那只能让程序多sleep一会儿，我手动×掉了hhh
后来发现下载了对应版本发现还是会出现，可能还有其他问题叭

posted @ 2020-12-01 11:40 DanspG 阅读(464) 评论(0) 编辑收藏举报

刷新页面返回顶部

DanspG

数据采集——第六次作业

作业一:

1）作业结果

代码：

结果贴图

2）心得体会

作业二:

1）作业结果

代码：

结果贴图

2）心得体会

作业三:

1）作业结果

代码：

结果贴图

2）心得体会

公告