Data Collection and Fusion Technology: Assignment 6

Assignment ①:

Requirements:

  • Use the requests and BeautifulSoup libraries to scrape the Douban Movie Top 250 data.
  • Download each movie's poster image with multiple threads, naming each image file after the movie.
  • Get familiar with regular expressions (see the regex sketch right after this list).
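A minimal, self-contained sketch of how those regular expressions pull the director/actor and the year/country/genre line out of each movie's info paragraph (the sample string below is made up for illustration):

import re

# made-up sample of the text found in each movie's <p> block on the Top 250 page
info = "导演: 弗兰克·德拉邦特 Frank Darabont   主演: 蒂姆·罗宾斯 Tim Robbins /...\n1994 / 美国 / 犯罪 剧情"

director = re.findall(r'导演: (.*?) ', info)          # non-greedy: stop at the first space
actor = re.findall(r'主演: (.*?) ', info)
array = re.findall(r'\d+.+', info)[0].split('/')      # "1994 / 美国 / 犯罪 剧情" -> year / country / genre
print(director, actor, [s.strip() for s in array])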

Code:

from bs4 import BeautifulSoup
import threading
import re
import requests
import urllib.request
import pymysql
import datetime
import time

class Myspiders:

    def get_html(self, start_url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'}
        res = requests.get(start_url, headers=headers)
        res.encoding = res.apparent_encoding
        html = res.text
        self.parse(html)

    def start(self):
        try:
            self.con = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='yang6106', db='test',
                                       charset='utf8')
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            try:
                # clear any existing rows (the table may already exist from a previous run)
                self.cursor.execute("delete from movie")
            except:
                pass
            try:
                sql = 'create table movie(mNo int,mName varchar(32), director varchar(32),actor varchar(32),time varchar(32),' \
                  'country varchar(16),type varchar(32),score varchar(32),num varchar(32),quote varchar(32),mFile varchar(32));'
                self.cursor.execute(sql)
            except:
                pass
        except:
            pass

    def parse(self, html):
        urls = []
        soup = BeautifulSoup(html, "html.parser")
        movies = soup.find('ol')
        movies = movies.find_all('li')
        for i in movies:
            try:
                mNo = i.em.string
                mName = i.find('span').text
                info = i.find('p').text
                # re.findall returns a list; keep the first match (empty string if none)
                director = (re.findall(r'导演: (.*?) ', info) or [''])[0]
                actor = (re.findall(r'主演: (.*?) ', info) or [''])[0]
                array = re.findall(r'\d+.+', info)[0].split('/')
                time = array[0].strip()
                country = array[1].strip()
                type = array[2].strip()
                score = i.find('span', attrs={"class": "rating_num"}).text
                num = i.find('span',
                             attrs={"class": "rating_num"}).next_sibling.next_sibling.next_sibling.next_sibling.text
                quote = i.find('span', attrs={"class": "inq"}).text
                mFile = str(mName) + ".jpg"
                self.cursor.execute(
                    "insert into movie(mNo,mName,director,actor,time,country,type,score,num,quote,mFile) "
                    "values( % s, % s, % s, % s, % s, % s, % s, % s, % s, % s, % s)",
                    (mNo, mName, director, actor, time, country, type, score, num, quote, mFile))
            except Exception:
                pass
        # collect all poster images on the page
        images = soup.select("img")
        for image in images:
            try:
                url = image['src']
                mName = image['alt']
                if url not in urls:
                    T = threading.Thread(target=self.download, args=(mName, url))
                    T.setDaemon(False)
                    T.start()
                    self.thread.append(T)
            except Exception as err:
                print(err)

    def closeup(self):
        try:
            self.con.commit()
            self.con.close()

        except Exception as err:
            print(err)

    def download(self, pic_name, img):
        req = urllib.request.Request(img)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        fobj = open("E:/imgs/movies_imgs/" + str(pic_name) + ".jpg", "wb")
        fobj.write(data)
        fobj.close()

    def executespider(self, url):
        # start crawling
        starttime = datetime.datetime.now()
        print("Start crawling")
        print("Connecting to the database")
        self.start()
        self.thread = []
        for page in range(0, 10):  # Top 250 = 10 pages of 25 movies each
            self.get_html("https://movie.douban.com/top250?start=" + str(page * 25) + "&filter=")
        for t in self.thread:
            t.join()
        self.closeup()
        # crawling finished
        print("Crawling finished!")
        endtime = datetime.datetime.now()  # measure how long the crawl took
        elapsed = (endtime - starttime).seconds
        print("Total time:", elapsed, "seconds")


def main():
    url = 'https://movie.douban.com/top250'
    spider = Myspiders()
    spider.executespider(url)


if __name__ == '__main__':
    main()

Results:

Reflections: At first the crawl was blocked by Douban's anti-scraping measures; after logging in to an account and trying again, it worked.
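A minimal sketch of how a logged-in session can be reused by attaching the browser's cookie to the requests call (the cookie value below is a placeholder; copy the real Cookie header from the browser's developer tools after logging in):

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
    # placeholder value; paste the real Cookie header captured from a logged-in browser session
    'Cookie': 'paste-your-logged-in-cookie-string-here',
}

res = requests.get('https://movie.douban.com/top250', headers=headers)
print(res.status_code)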

Assignment ②

Requirements:

  • Get familiar with serializing and outputting data through Scrapy's Item and Pipeline classes; use the Scrapy + XPath + MySQL technical route to scrape the ShanghaiRanking (软科) 2020 university rankings (see the settings sketch right after this list).
  • Scrape the university rankings, follow each school's detail link, download and store the school logo, and collect the official website URL, the school introduction, and other information.
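For the pipeline below to receive items, it has to be registered in the project's settings.py. A minimal sketch, assuming the Scrapy project is called university_gets (the project/module name is an assumption; use whatever name scrapy startproject generated):

# settings.py -- only the lines relevant to this assignment
BOT_NAME = 'university_gets'      # assumed project name

ROBOTSTXT_OBEY = False            # fetch the ranking pages directly

ITEM_PIPELINES = {
    # module path follows the assumed project name above
    'university_gets.pipelines.UniversityGetsPipeline': 300,
}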

  • universitys:

import scrapy
from bs4 import UnicodeDammit
import requests
from ..items import UniversityGetsItem
from bs4 import BeautifulSoup
import os


class UniversitysSpider(scrapy.Spider):
    name = 'universitys'
    allowed_domains = ['www.shanghairanking.cn']
    start_urls = ['https://www.shanghairanking.cn/rankings/bcur/2020']

    def start_requests(self, ):
        start_url = 'https://www.shanghairanking.cn/rankings/bcur/2020'
        yield scrapy.Request(url=start_url, callback=self.parse)

    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", 'gbk'])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            trs = selector.xpath("//div[@class='rk-table-box']/table/tbody/tr")

            for tr in trs:
                no = tr.xpath("./td[position()=1]/text()").extract_first().strip()
                name = tr.xpath("./td[position()=2]/a/text()").extract_first()
                next = tr.xpath("./td[position()=2]/a/@href").extract_first()
                city = tr.xpath("./td[position()=3]/text()").extract_first().strip()
                # print(no,name,city)
                start_url = 'https://www.shanghairanking.cn/' + next
                # print(start_url)
                html = requests.get(url=start_url)
                dammit = UnicodeDammit(html.content, ['utf-8', 'gbk'])
                newdata = dammit.unicode_markup
                soup = BeautifulSoup(newdata, 'lxml')
                url = File = brief = ''  # defaults in case the detail page is missing a field
                try:
                    url = soup.select("div[class='univ-website'] a")[0].text
                    # print(url)
                    mFileq = soup.select("td[class='univ-logo'] img")[0]["src"]
                    File = str(no) + '.jpg'
                    logodata = requests.get(url=mFileq).content
                    path = r'E:/imgs/university/'
                    if not os.path.exists(path):
                        os.mkdir(path)
                    file_path = path + '/' + File
                    with open(file_path, 'wb') as fp:
                        fp.write(logodata)
                    # print(File)
                    brief = soup.select("div[class='univ-introduce'] p")[0].text
                    # print(brief)
                except Exception as err:
                    print(err)
                item = UniversityGetsItem()
                item["sNo"] = no
                item["universityName"] = name
                item["city"] = city
                item["officalUrl"] = url
                item["info"] = brief
                item["mFile"] = File
                yield item

        except Exception as err:
            print(err)

  • pipelines:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class UniversityGetsPipeline:
    def open_spider(self, spider):
        print("建立连接")
        try:
            self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd='yang6106', db='test',
                                       charset='utf8')
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)

            try:
                self.cursor.execute("drop table if exists university")
                sql = """create table university(
                    sNo varchar(32) primary key,
                    schoolName varchar(32),
                    city varchar(32),
                    officalUrl varchar(64),
                    info text,
                    mFile varchar(32)
                )character set = utf8
                """
                self.cursor.execute(sql)
            except Exception as err:
                print(err)
                print("表格创建失败")
            self.open = True
            self.count = 1
        except Exception as err:
            print(err)
            self.open = False
            print("数据库连接失败")

    def process_item(self, item, spider):
        print(item['sNo'], item['schoolName'], item['city'], item['officalUrl'], item['info'], item['mFile'])
        if self.open:
            try:
                self.cursor.execute(
                    "insert into university (sNo,schoolName,city,officalUrl,info,mFile) values(%s,%s,%s,%s,%s,%s)", \
                    (item['sNo'], item['schoolName'], item['city'], item['officalUrl'], item['info'], item['mFile']))
                self.count += 1
            except:
                print("数据插入失败")
        else:
            print("数据库未连接")
        return item

    def close_spider(self, spider):
        if self.open:
            self.con.commit()
            self.con.close()
            self.open = False

  • items:
import scrapy


class UniversityGetsItem(scrapy.Item):
    sNo = scrapy.Field()
    schoolName = scrapy.Field()
    city = scrapy.Field()
    officalUrl = scrapy.Field()
    info = scrapy.Field()
    mFile = scrapy.Field()
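With the item, spider, and pipeline in place, the crawl can be started with scrapy crawl universitys from the project root, or programmatically with a small runner script; a sketch (the file name run.py is arbitrary):

# run.py, placed in the Scrapy project root (next to scrapy.cfg)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('universitys')   # spider name defined in UniversitysSpider.name
process.start()                # blocks until the crawl finishes
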
  • Results:

Assignment ③

Requirements:

  • Get familiar with using Selenium to locate HTML elements, scrape Ajax-loaded pages, wait for elements to load, and navigate between pages (see the explicit-wait sketch right after this list).
  • Use Selenium + MySQL to simulate logging in to the icourse163 MOOC site, fetch the courses already taken in my own account, and save them to MySQL.
  • Record a GIF of the simulated-login step.
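The code below paces itself with time.sleep. For the "wait for elements to load" requirement, Selenium's explicit wait is a common alternative; a minimal sketch (the XPath is the login entry used in Load_in below):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.icourse163.org/')

# wait up to 10 seconds for the top-right login entry to become clickable, then click it
login_entry = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="j-topnav"]/div'))
)
login_entry.click()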

Code:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql
import time

class MOOC:

    def Load_in(self):
        time.sleep(1)
        # click the login entry
        user = self.driver.find_element_by_xpath('//*[@id="j-topnav"]/div')
        user.click()
        time.sleep(1)
        # choose "other ways to log in"
        way = self.driver.find_element_by_xpath('//div[@class="ux-login-set-scan-code_ft"]/span')
        way.click()
        time.sleep(1)
        # choose to log in with a phone number
        telephone = self.driver.find_element_by_xpath('//ul[@class="ux-tabs-underline_hd"]/li[2]')
        telephone.click()
        time.sleep(1)
        frame = self.driver.find_element_by_xpath(
            "/html/body/div[13]/div[2]/div/div/div/div/div/div[1]/div/div[1]/div[2]/div[2]/div/iframe")
        # switch into the login popup's iframe
        self.driver.switch_to.frame(frame)
        self.driver.find_element_by_xpath('//input[@type="tel"]').send_keys('13115920283')
        time.sleep(1)
        self.driver.find_element_by_xpath('//input[@class="j-inputtext dlemail"]').send_keys('5102095yang')
        time.sleep(1)
        load_in = self.driver.find_element_by_xpath('//*[@id="submitBtn"]')
        load_in.click()

    def MyClass(self):
        time.sleep(2)
        # go to the personal center (my courses)
        myclass = self.driver.find_element_by_xpath('//*[@id="j-indexNav-bar"]/div/div/div/div/div[7]/div[3]/div')
        myclass.click()
        self.all_spider()

    def all_spider(self):
        time.sleep(1)
        self.spider()
        time.sleep(1)
        # pagination: if the "next page" button is disabled we are on the last page;
        # otherwise click it and crawl the next page recursively
        try:
            self.driver.find_element_by_xpath(
                '//ul[@class="ux-pager"]/li[@class="ux-pager_btn ux-pager_btn__next"]/a[@class="th-bk-disable-gh"]')
        except Exception:
            self.driver.find_element_by_xpath(
                '//ul[@class="ux-pager"]/li[@class="ux-pager_btn ux-pager_btn__next"]/a[@class="th-bk-main-gh"]').click()
            self.all_spider()

    def spider(self):
        id = 0
        time.sleep(1)
        lis = self.driver.find_elements_by_xpath('//div[@class="course-card-wrapper"]')
        print(lis)
        for li in lis:
            time.sleep(1)
            li.click()
            # get the window handles: index 0 is the original page, index 1 the newly opened one
            window = self.driver.window_handles
            # switch to the new window
            self.driver.switch_to.window(window[1])
            time.sleep(1)
            self.driver.find_element_by_xpath('//*[@id="g-body"]/div[3]/div/div[1]/div/a').click()
            # refresh the window handles
            window = self.driver.window_handles
            # switch to the newest window
            self.driver.switch_to.window(window[2])
            time.sleep(1)
            id += 1
            course = self.driver.find_element_by_xpath(
                '//*[@id="g-body"]/div[1]/div/div[3]/div/div[1]/div[1]/span[1]').text
            teacher = self.driver.find_element_by_xpath('//*[@id="j-teacher"]//h3[@class="f-fc3"]').text
            collage = self.driver.find_element_by_xpath('//*[@id="j-teacher"]/div/a/img').get_attribute('alt')
            process = self.driver.find_element_by_xpath('//*[@id="course-enroll-info"]/div/div[1]/div[2]/div[1]').text
            count = self.driver.find_element_by_xpath('//*[@id="course-enroll-info"]/div/div[2]/div[1]/span').text
            brief = self.driver.find_element_by_xpath('//*[@id="j-rectxt2"]').text
            self.cursor.execute("insert into mooc(id, course, teacher, collage, process, count, brief) "
                                "values( % s, % s, % s, % s, % s, % s, % s)",
                                (id, course, teacher, collage, process, count, brief))
            time.sleep(1)
            # close this window
            self.driver.close()
            # switch back to the previous window
            self.driver.switch_to.window(window[1])
            time.sleep(1)
            # two new windows were opened, so close() has to be called twice
            self.driver.close()
            self.driver.switch_to.window(window[0])

    def start(self):
        self.con = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='yang6106', db='test', charset='utf8')
        self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
        # sql = 'create table mooc(id int,course varchar(32),teacher varchar(16),collage varchar(32),process varchar(64),' \
        #       'count varchar(64),brief text);'
        # '''cursor.execute(sql)'''
        self.cursor.execute("delete from mooc")

    def stop(self):
        try:
            self.con.commit()
            self.con.close()
        except Exception as err:
            print(err)

    def executespider(self, url):
        chrome_options = Options()
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
        self.driver.get(url)
        self.start()
        self.Load_in()
        self.MyClass()
        self.stop()

def main():
    url = 'https://www.icourse163.org/'
    spider = MOOC()
    spider.executespider(url)


if __name__ == '__main__':
    main()
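The create-table statement in start() is commented out; if the mooc table does not exist yet, it can be created once beforehand with a short helper using the same schema and connection parameters, e.g. this sketch:

import pymysql

# one-off helper: create the mooc table with the schema from the commented-out SQL above
con = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='yang6106', db='test', charset='utf8')
cursor = con.cursor()
cursor.execute('create table if not exists mooc(id int,course varchar(32),teacher varchar(16),'
               'collage varchar(32),process varchar(64),count varchar(64),brief text);')
con.commit()
con.close()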

Results:
