Data Collection and Fusion Practice: Assignment 3

Gitee link
https://gitee.com/zxbaixuexi/2024scrapy/tree/master/第三次实验

Assignment ①:

1)

Requirement: Specify a website and crawl all of its images, for example the China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement both single-threaded and multi-threaded crawling.

Code:
weatherspider.py

import scrapy
from ..items import WeatherItem


class WeatherspiderSpider(scrapy.Spider):
    name = "weatherspider"
    # allow both the main site and the image host, otherwise image requests get filtered out
    allowed_domains = ["weather.com.cn", "pi.weather.com.cn"]
    start_urls = ["http://www.weather.com.cn/"]

    def parse(self, response):
        # collect the src of every <img> on the page and hand the list to the ImagesPipeline
        img_address = response.xpath("//img/@src").getall()
        item = WeatherItem()
        item['image_urls'] = img_address
        yield item

items.py

import scrapy

class WeatherItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
    image_paths = scrapy.Field()

pipelines.py

from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

class WeatherPipeline(ImagesPipeline):
    # browser-like request headers; the referer is overwritten per request below
    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'cookie': 'bid=yQdC/AzTaCw',
        'referer': 'https://www.douban.com/photos/photo/2370443040/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def get_media_requests(self, item, info):
        # issue one download request per image URL collected by the spider
        for image_url in item['image_urls']:
            self.default_headers['referer'] = image_url
            yield Request(image_url, headers=self.default_headers)

    def item_completed(self, results, item, info):
        # keep only the successfully downloaded image paths; drop the item if none succeeded
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item


settings.py

ITEM_PIPELINES = {
   "weather.pipelines.WeatherPipeline": 300
}
IMAGES_STORE = r'C:\Users\supermejane\Desktop\爬虫实践\第三次实验\pythonProject1\weather\images'
# expiration period in days
IMAGES_EXPIRES = 90  # images fetched within the last 90 days will not be re-downloaded
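
The requirement also asks for single-threaded and multi-threaded crawling. In Scrapy this is normally controlled through the downloader concurrency settings rather than explicit threads; a minimal sketch of the two configurations (the values below are illustrative, not taken from the original project):

# settings.py -- "single-threaded" crawl: only one request in flight at a time
CONCURRENT_REQUESTS = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 1

# settings.py -- concurrent crawl: let Scrapy download in parallel (Scrapy's default is 16)
# CONCURRENT_REQUESTS = 16
# CONCURRENT_REQUESTS_PER_DOMAIN = 8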

Run results

2) Reflections

This task mainly practiced using Scrapy to download all images under a given domain. The images could also be fetched with plain request calls and saved with "with open(...) as ..." (a sketch of that manual approach follows below), but here Scrapy's built-in ImagesPipeline is used instead. Note that the get_media_requests and item_completed methods must be overridden, the custom ImagesPipeline must be enabled in settings.py, and IMAGES_STORE must point to the image storage path. In addition, allowed_domains in weatherspider.py has to be set correctly so that the image host is not filtered out.
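
For comparison, a minimal sketch of the manual download approach mentioned above, using the requests library (the file naming and error handling here are simplified assumptions, not part of the original code):

import os
import requests

def download_images(urls, out_dir="images"):
    # fetch each image URL and write the raw bytes to a local file
    os.makedirs(out_dir, exist_ok=True)
    for i, url in enumerate(urls):
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            continue  # skip URLs that fail to download
        path = os.path.join(out_dir, f"{i}.jpg")  # assumes jpg; real code should inspect the URL or headers
        with open(path, "wb") as f:
            f.write(resp.content)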

Assignment ②:

1)

Requirement: Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage technical route to crawl stock information. The site crawled here is Eastmoney: https://www.eastmoney.com/

Code
stockspider.py

import scrapy
from scrapy.http import Request
import json
from ..items import StockItem

class StockspiderSpider(scrapy.Spider):
    name = "stockspider"
    allowed_domains = ["eastmoney.com"]
    start_urls = [
        "http://38.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112406848566904145428_1697696179672&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1697696179673"]

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url)

    def parse(self, response):
        '''
        Field mapping used below:
        stock code: f12, stock name: f14, latest price: f2, change percent: f3, change amount: f4,
        volume: f5, turnover: f6, amplitude: f7, high: f15, low: f16, open: f17, previous close: f18
        '''
        # the API returns JSONP; extract the JSON payload between the callback parentheses
        start_index = response.text.find('(') + 1
        end_index = response.text.rfind(')')
        json_data = response.text[start_index:end_index]
        # parse the JSON string
        json_obj = json.loads(json_data)
        # the stock records live in data.diff
        data = json_obj['data']['diff']
        goods_list = []
        name = ['f12', 'f14', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f15', 'f16', 'f17', 'f18']
        count = 0
        for li in data:
            # build one row per stock: [sequence number, f12, f14, ...]
            row = [count]
            for n in name:
                row.append(li[n])
            count += 1
            goods_list.append(row)
        for k in goods_list:
            # e.g. [1, '301348', '蓝箭电子', 50.53, 20.0, 8.42, 172116, 815409272.27, 22.56, 50.53, 41.03, 41.04, 42.11]
            stock = StockItem()
            stock['id'] = str(k[0])
            stock['number'] = str(k[1])
            stock['name'] = str(k[2])
            stock['new_price'] = str(k[3])
            stock['up_down_precent'] = str(k[4])
            stock['up_down_num'] = str(k[5])
            stock['turnover'] = str(k[6])
            stock['Transaction_volume'] = str(k[7])
            stock['vibration'] = str(k[8])
            stock['maxx'] = str(k[9])
            stock['minn'] = str(k[10])
            stock['today'] = str(k[11])
            stock['yesterday'] = str(k[12])
            yield stock

items.py

import scrapy

class StockItem(scrapy.Item):
    # define the fields for your item here like:
    id = scrapy.Field()
    number = scrapy.Field()
    name = scrapy.Field()
    new_price = scrapy.Field()
    up_down_precent = scrapy.Field()
    up_down_num = scrapy.Field()
    turnover = scrapy.Field()
    Transaction_volume = scrapy.Field()
    vibration = scrapy.Field()
    maxx = scrapy.Field()
    minn = scrapy.Field()
    today = scrapy.Field()
    yesterday = scrapy.Field()

pipelines.py

import pymysql

class StockPipeline:
    def open_spider(self, spider):
        # open the MySQL connection once when the spider starts
        self.client = pymysql.connect(host="localhost", port=3306, user="root", password="123456", db="homework1",
                                      charset="utf8")
        self.cursor = self.client.cursor()

    def process_item(self, item, spider):
        # map the item fields onto the column order of the stock table
        args = [
            item.get("id"),
            item.get("number"),
            item.get("name"),
            item.get("new_price"),
            item.get("up_down_precent"),
            item.get("up_down_num"),
            item.get("turnover"),
            item.get("Transaction_volume"),
            item.get("vibration"),
            item.get("maxx"),
            item.get("minn"),
            item.get("today"),
            item.get("yesterday"),
        ]
        sql = "insert into stock values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        self.cursor.execute(sql, args)
        self.client.commit()
        return item

    def close_spider(self, spider):
        # close the cursor before the connection
        self.cursor.close()
        self.client.close()

settings.py

ITEM_PIPELINES = {
   "stock.pipelines.StockPipeline": 300,
}
COOKIES_ENABLED = False
ROBOTSTXT_OBEY = False

Run results

2) Reflections

This task mainly practiced using Scrapy to request an API, fetch the raw JSON data, and parse it. Note that COOKIES_ENABLED = False and ROBOTSTXT_OBEY = False must both be set in settings.py, otherwise the crawl is blocked with [scrapy.downloadermiddlewares.robotstxt] DEBUG: Forbidden by robots.txt.
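
As a side note on the JSONP handling: parse() strips the jQuery callback wrapper with find/rfind before calling json.loads. An equivalent way to do the same with a regular expression (just a sketch, not part of the original project):

import json
import re

def strip_jsonp(text):
    # remove the callback wrapper, e.g. "jQuery123({...});" -> {...}
    match = re.search(r'\((.*)\)\s*;?\s*$', text, re.S)
    if match is None:
        raise ValueError("response is not JSONP")
    return json.loads(match.group(1))

# usage inside parse(): records = strip_jsonp(response.text)['data']['diff']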

Assignment ③:

1)

Requirement: Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage technical route to crawl foreign-exchange rate data.

Code
bankspider.py

import scrapy
from ..items import BankItem

class BankspiderSpider(scrapy.Spider):
    name = "bankspider"
    allowed_domains = ["boc.cn"]
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]

    def parse(self, response):
        # select all data rows with XPath, skipping the header <tr>
        rows = response.xpath("//tr[position()>1]")
        '''
        Table columns on the page:
        货币名称 (currency name), 现汇买入价 (spot-exchange buying rate), 现钞买入价 (cash buying rate),
        现汇卖出价 (spot-exchange selling rate), 现钞卖出价 (cash selling rate),
        中行折算价 (BOC conversion rate), 发布日期 (release date), 发布时间 (release time)
        '''
        for row in rows:
            # extract the text of each <td> in the current row
            currencyname = row.xpath("./td[1]//text()").get()
            hui_in = row.xpath("./td[2]//text()").get()
            chao_in = row.xpath("./td[3]//text()").get()
            hui_out = row.xpath("./td[4]//text()").get()
            chao_out = row.xpath("./td[5]//text()").get()
            zhonghang = row.xpath("./td[6]//text()").get()
            date = row.xpath("./td[7]//text()").get()
            time = row.xpath("./td[8]//text()").get()
            print(currencyname)
            print(hui_in)
            print(chao_in)
            print(hui_out)
            print(chao_out)
            print(zhonghang)
            print(date)
            print(time)
            currency = BankItem()
            currency['currencyname'] = str(currencyname)
            currency['hui_in'] = str(hui_in)
            currency['chao_in'] = str(chao_in)
            currency['hui_out'] = str(hui_out)
            currency['chao_out'] = str(chao_out)
            currency['zhonghang'] = str(zhonghang)
            currency['date'] = str(date)
            currency['time'] = str(time)
            yield currency

items.py

import scrapy

class BankItem(scrapy.Item):
    currencyname = scrapy.Field()
    hui_in = scrapy.Field()
    chao_in = scrapy.Field()
    hui_out = scrapy.Field()
    chao_out = scrapy.Field()
    zhonghang = scrapy.Field()
    date = scrapy.Field()
    time = scrapy.Field()

pipelines.py

import pymysql

class BankPipeline:
    def open_spider(self, spider):
        # open the MySQL connection once when the spider starts
        self.client = pymysql.connect(host="localhost", port=3306, user="root", password="123456", db="homework1",
                                      charset="utf8")
        self.cursor = self.client.cursor()

    def process_item(self, item, spider):
        # map the item fields onto the named columns of the currency table
        args = [
            item.get("currencyname"),
            item.get("hui_in"),
            item.get("chao_in"),
            item.get("hui_out"),
            item.get("chao_out"),
            item.get("zhonghang"),
            item.get("date"),
            item.get("time"),
        ]
        sql = "insert into currency(currencyname,hui_in,chao_in,hui_out,chao_out,zhonghang,date,time) values(%s,%s,%s,%s,%s,%s,%s,%s)"
        self.cursor.execute(sql, args)
        self.client.commit()
        return item

    def close_spider(self, spider):
        # close the cursor before the connection
        self.cursor.close()
        self.client.close()

settings.py

ITEM_PIPELINES = {
   "bank.pipelines.BankPipeline": 300,
}

Run results

2) Reflections
This task practiced crawling and parsing web-page data with Scrapy, overriding the pipeline's process_item to process each record and insert it into MySQL, and overriding the built-in open_spider and close_spider hooks to open and close the database connection. The library used is pymysql.
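
The INSERT in BankPipeline names the eight columns of the currency table explicitly, but the table itself has to exist before the spider runs. A minimal sketch of a matching schema, created through pymysql (column types are assumptions; the original DDL is not shown in the post):

import pymysql

# hypothetical DDL matching the column list used by BankPipeline; VARCHAR types are assumed
ddl = """
CREATE TABLE IF NOT EXISTS currency (
    currencyname VARCHAR(64),
    hui_in VARCHAR(32), chao_in VARCHAR(32),
    hui_out VARCHAR(32), chao_out VARCHAR(32),
    zhonghang VARCHAR(32),
    date VARCHAR(32), time VARCHAR(32)
)
"""

client = pymysql.connect(host="localhost", port=3306, user="root",
                         password="123456", db="homework1", charset="utf8")
with client.cursor() as cursor:
    cursor.execute(ddl)
client.commit()
client.close()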
