
2023 Data Collection and Fusion Technology — Practice Assignment 3

Assignment ①:

Requirements:

Pick a website and crawl all of the images on it, e.g. China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement the crawl in both single-threaded and multi-threaded modes.
– Be sure to limit the crawl, e.g. cap the total number of pages crawled (last 2 digits of the student ID) and the total number of downloaded images (last 3 digits).

Output:

Print the URL of each downloaded image to the console, store the downloaded images in the images subfolder, and include screenshots.

Gitee folder link

Gitee repository

Key code

items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class pics(scrapy.Item):
    url = scrapy.Field()  # URL of one image to download

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os

import requests


class DemoPipeline:
    def open_spider(self, spider):
        self.count = 0  # number of images downloaded so far

    def process_item(self, item, spider):
        path = r"D:\大三上\数据采集与融合技术\实践3\images_1"
        os.makedirs(path, exist_ok=True)  # make sure the target folder exists
        url = item['url']
        resp = requests.get(url)  # fetch the raw bytes of the image
        img = resp.content
        with open(path + '\\image%d_' % (self.count + 1) + '.jpg', 'wb') as f:
            f.write(img)
            print("Image %d downloaded successfully" % (self.count + 1))
        self.count += 1
        return item
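
For this pipeline to take effect it also has to be enabled in the project's settings.py. A minimal sketch, assuming the Scrapy project module is named demo (the actual module name may differ):

# settings.py — enable the image pipeline (300 is an arbitrary priority)
ITEM_PIPELINES = {
    'demo.pipelines.DemoPipeline': 300,
}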

mySpider.py

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy

from items import pics


class MySpider(scrapy.Spider):
    name = 'mySpider'
    allowed_domains = ['p.weather.com.cn']
    start_urls = ['http://p.weather.com.cn/tqxc/index.shtml']

    def parse(self, response):
        # Build a Selector from the response body
        data = response.body.decode()
        selector = scrapy.Selector(text=data)
        # Collect the links to the photo sub-pages
        links = selector.xpath("//div[@class='oi']/div[@class='tu']/a/@href")
        for link in links:
            yield scrapy.Request(url=link.extract(), callback=self.parse1)

    def parse1(self, response):
        # Build a Selector from the response body
        data = response.body.decode()
        selector = scrapy.Selector(text=data)
        # Extract the image URLs on the sub-page
        pics_url = selector.xpath("//li[@class='child']/a[@class='img_back']/img/@src")
        for i in pics_url:
            url = i.extract()
            print(url)
            item = pics()
            item['url'] = url
            yield item
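
Scrapy itself schedules requests on an asynchronous engine rather than explicit threads, so the usual way to get a "single-threaded" versus "multi-threaded" style crawl out of the same spider is to change the concurrency settings. A small sketch of the two configurations in settings.py (the values are only illustrative assumptions):

# settings.py — "single-threaded": one request in flight at a time
CONCURRENT_REQUESTS = 1

# settings.py — "multi-threaded": many requests in flight at once
# CONCURRENT_REQUESTS = 32
# CONCURRENT_REQUESTS_PER_DOMAIN = 16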

Single-threaded

# Single-threaded: save the collected images one by one to the target folder
import requests

path = r"D:\大三上\数据采集与融合技术\实践3\images_1"
cnt = 1
for url in srcs_str:  # srcs_str holds the image URLs collected by the spider
    resp = requests.get(url)  # fetch the raw bytes of the image
    img = resp.content
    with open(path + '\\image%d_' % cnt + '.jpg', 'wb') as f:
        f.write(img)
        print("Image %d downloaded successfully" % cnt)
    if cnt == 149:  # cap the total number of downloaded images
        break
    cnt += 1

Multi-threaded


# Multi-threaded: start one download thread per image
import threading

import requests
from bs4 import BeautifulSoup

try:
    threads = []
    c = 0
    for i in range(22):  # cap the number of pages crawled
        # Build a soup object for each sub-page
        count = 0
        pic_url = links[2 * i]["href"]
        rp = requests.get(pic_url)
        rp.encoding = 'utf-8'
        do = rp.text
        sp = BeautifulSoup(do, "lxml")
        pics = sp.select('img[src$=".jpg"]')
        for pic in pics:
            print(pic["src"])
            url = pic["src"]
            count += 1
            if count == 5:  # take at most a handful of images from each page
                break
            # Start a new download thread (Download is the helper sketched below)
            path = r"D:\大三上\数据采集与融合技术\实践3\images_2"

            if c < 149:  # cap the total number of downloaded images
                t = threading.Thread(target=Download, args=(path, url, c))
                t.daemon = False  # keep it a foreground (non-daemon) thread
                t.start()
                threads.append(t)
                c += 1
            else:
                break

    for thread in threads:
        thread.join()
except Exception as err:
    print(err)
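
The Download helper that each thread runs is not shown in this snippet; a minimal sketch of what it could look like, assuming the (path, url, index) argument order used in the threading.Thread call above:

import requests


def Download(path, url, index):
    # Fetch the image bytes and write them under the target folder
    resp = requests.get(url)
    with open(path + '\\image%d_' % (index + 1) + '.jpg', 'wb') as f:
        f.write(resp.content)
        print("Image %d downloaded successfully" % (index + 1))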

Result


Assignment ②

Requirements:

Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage to crawl stock information.
Candidate site: Eastmoney: https://www.eastmoney.com/

Output:

The MySQL storage and output format is as follows:
column headers use English names, e.g. serial number: id, stock code: bStock

Gitee folder link

Gitee repository

Key code

items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class GupiaoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    Transaction_date = scrapy.Field()  # trading date
    Opening_price = scrapy.Field()  # opening price
    Number_of_transactions = scrapy.Field()  # trading volume
    Closing_price = scrapy.Field()  # closing price
    minimum_price = scrapy.Field()  # lowest price
    Highest_price = scrapy.Field()  # highest price
    Securities_code = scrapy.Field()  # stock code
    Securities_abbreviation = scrapy.Field()  # stock abbreviation
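
The requirement stresses the serialized output of Item data: a Scrapy Item behaves like a dictionary, so it can be filled field by field and turned into a plain dict before being written out. A tiny sketch with made-up placeholder values:

item = GupiaoItem()
item['Securities_code'] = '600000'        # placeholder stock code
item['Securities_abbreviation'] = 'demo'  # placeholder abbreviation
item['Opening_price'] = '0.00'            # placeholder price
print(dict(item))  # dict(item) is exactly what the pipeline below serializes to one line of text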


pipelines.py



class BaidustocksInfoPipeline(object):
    # Called when the spider is opened
    def open_spider(self, spider):
        self.f = open('gupiao.txt', 'w', encoding='utf-8')

    # Called when the spider is closed
    def close_spider(self, spider):
        self.f.close()

    # Called once for every scraped item
    def process_item(self, item, spider):
        try:
            line = str(dict(item)) + '\n'
            self.f.write(line)
        except Exception:
            pass
        return item
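
The requirement calls for MySQL storage, whereas the pipeline above serializes the items into gupiao.txt. A minimal sketch of a MySQL-backed alternative, assuming pymysql is installed, that the spider fills the GupiaoItem fields from items.py, and that the table follows the English-header convention from the requirement (id, bStock, ...); the connection parameters and extra column names are assumptions:

import pymysql


class MysqlStocksPipeline:
    def open_spider(self, spider):
        # Connection parameters are placeholders; adjust them to the local MySQL setup
        self.con = pymysql.connect(host='localhost', user='root', password='123456',
                                   database='stocks', charset='utf8mb4')
        self.cursor = self.con.cursor()
        self.cursor.execute("create table if not exists gupiao ("
                            "id int primary key auto_increment,"
                            "bStock varchar(16), bName varchar(32),"
                            "bOpen varchar(16), bClose varchar(16),"
                            "bHigh varchar(16), bLow varchar(16))")

    def process_item(self, item, spider):
        data = dict(item)
        self.cursor.execute("insert into gupiao (bStock,bName,bOpen,bClose,bHigh,bLow) "
                            "values (%s,%s,%s,%s,%s,%s)",
                            (data.get('Securities_code'), data.get('Securities_abbreviation'),
                             data.get('Opening_price'), data.get('Closing_price'),
                             data.get('Highest_price'), data.get('minimum_price')))
        self.con.commit()
        return item

    def close_spider(self, spider):
        self.con.close()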


mySpider.py

import re

import scrapy


class StocksSpider(scrapy.Spider):
    name = 'stocks'
    start_urls = ['http://quote.eastmoney.com/stock_list.html']

    def parse(self, response):
        # Iterate over the href attribute of every <a> tag in the stock list page
        for href in response.css('a::attr(href)').extract():
            try:
                # Pull the stock code (sh/sz + 6 digits) out of the link with a regular expression
                stock = re.findall(r"[s][hz]\d{6}", href)[0]
                # Build the URL of the corresponding stock detail page
                url = 'http://gu.qq.com/' + stock + '/gp'
                # yield makes this method a generator:
                # the new request is handed back to the Scrapy engine,
                # and callback names parse_stock as the handler for its response
                yield scrapy.Request(url, callback=self.parse_stock)
            except Exception:
                continue

    # Extract the information from a single stock detail page
    def parse_stock(self, response):
        # Each page produces one dictionary, so start with an empty one
        infoDict = {}
        stockName = response.css('.title_bg')
        stockInfo = response.css('.col-2.fr')
        name = stockName.css('.col-1-1').extract()[0]
        code = stockName.css('.col-1-2').extract()[0]
        info = stockInfo.css('li').extract()
        # Save the extracted key/value pairs into the dictionary
        for i in info[:13]:
            key = re.findall(r'>.*?<', i)[1][1:-1]
            key = key.replace('\u2003', '')
            key = key.replace('\xa0', '')
            try:
                val = re.findall(r'>.*?<', i)[3][1:-1]
            except IndexError:
                val = '--'
            infoDict[key] = val

        # Add the stock name (abbreviation + code) to the dictionary
        infoDict.update({'股票名称': re.findall(r'>.*<', name)[0][1:-1] +
                                 re.findall(r'>.*<', code)[0][1:-1]})
        yield infoDict

Result

Assignment ③:

Requirements:

Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage to crawl foreign exchange data.
Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/

Gitee folder link

Gitee repository

Key code

items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class MoneyItem(scrapy.Item):
    Currency = scrapy.Field()  # currency name
    TBP = scrapy.Field()  # telegraphic transfer buying price (现汇买入价)
    CBP = scrapy.Field()  # cash buying price (现钞买入价)
    TSP = scrapy.Field()  # telegraphic transfer selling price (现汇卖出价)
    CSP = scrapy.Field()  # cash selling price (现钞卖出价)
    Time = scrapy.Field()  # publication time

pipelines.py

# Define your item pipelines here
import sqlite3

import requests


class MoneyDB:
    def openDB(self):
        self.con = sqlite3.connect("movies.db")  # open the database, creating it if it does not exist
        self.cursor = self.con.cursor()  # create a cursor
        try:
            self.cursor.execute("create table money "
                                "(mSeq int(4),mName varchar(16),"
                                "mDirector varchar(32),mActors varchar(64),"
                                "mQuote varchar(32),mScore varchar(8),mDyfm varchar(64))")
        except Exception:
            self.cursor.execute("delete from money")  # table already exists: clear it

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, mSeq, mName, mDirector, mActors, mQuote, mScore, mDyfm):
        try:
            self.cursor.execute("insert into money (mSeq,mName,mDirector,mActors,mQuote,mScore,mDyfm) values (?,?,?,?,?,?,?)",
                                (mSeq, mName, mDirector, mActors, mQuote, mScore, mDyfm))
        except Exception as err:
            print(err)


class Demo2Pipeline:
    def open_spider(self, spider):
        print("Start crawling")
        self.count = 1
        self.db = MoneyDB()
        self.db.openDB()

    def process_item(self, item, spider):
        # Download the cover image
        path = r"D:\大三上\数据采集与融合技术\实践3"
        url = item['dyfm']
        resp = requests.get(url)  # fetch the raw bytes of the image
        img = resp.content
        fm_path = 'imgs\\image%d_' % self.count + '.jpg'
        with open(path + '\\image%d_' % self.count + '.jpg', 'wb') as f:
            f.write(img)
            print("Image %d downloaded successfully" % self.count)
        self.count += 1
        # Save the record to the database
        self.db.insert(item['seq'], item['name'], item['director'], item['actors'], item['quote'], item['score'], fm_path)
        return item

    def close_spider(self, spider):
        self.db.closeDB()
        print("Finished crawling")


mySpider.py

import re

import scrapy

import main_items3


class MySpider(scrapy.Spider):
    name = 'mySpider'
    allowed_domains = ['boc.cn']
    url = 'https://www.boc.cn/sourcedb/whpj/'
    start_urls = []
    for i in range(10):  # build the list of pages to crawl
        if i == 0:
            start_urls.append(url)
        else:
            start_urls.append(url + "?start=" + str(i * 25))

    def parse(self, response):
        print(self.start_urls)
        data = response.body.decode()
        selector = scrapy.Selector(text=data)
        print(selector)
        moneys = selector.xpath('//div[@id="content"]//li')
        cnt = 1
        for money in moneys:
            item = main_items3.MoneyItem()

            item['Currency'] = cnt

            item['TBP'] = money.xpath('./div/div[@class="pic"]/a[@href]/img/@alt').extract_first()

            text = money.xpath('./div/div[@class="info"]/div[@class="bd"]/p[@class]/text()').extract_first()
            text = text.strip().replace('\xa0\xa0\xa0', '')
            item['CBP'] = re.split(r': |: ', text)[1]
            item['TSP'] = re.split(r': |: ', text)[2]

            item['CSP'] = money.xpath('./div/div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()').extract_first()

            item['Time'] = money.xpath('./div/div[@class="pic"]/a[@href]/img/@src').extract_first()
            yield item
            cnt += 1

Result

posted on 2023-10-22 19:39 by 柚子湖