数据采集与融合技术实践第三次作业

作业1

要求:指定一个网站,爬取这个网站中的所有的所有图片,例如:中国气象网(http://www.weather.com.cn)。使用scrapy框架分别实现单线程和多线程的方式爬取。
务必控制总页数(学号尾数2位)、总下载的图片数量(尾数后3位)等限制爬取的措施。
输出信息:

代码:

weather_images.py
import scrapy
import re
import os
from scrapy.http import Request
from weather_image_spider.items import WeatherImageSpiderItem

class WeatherImagesSpider(scrapy.Spider):
    name = "weather_images"
    allowed_domains = ["weather.com.cn"]
    start_urls = ["http://www.weather.com.cn/"]
    
    # 设置图片数量和页面限制
    total_images = 125
    total_pages = 25  # 按学号尾数设置总页数
    images_downloaded = 0
    pages_visited = 0

    def parse(self, response):
        # 获取页面中的链接
        url_list = re.findall(r'<a href="(.*?)"', response.text, re.S)
        
        for url in url_list:
            if self.pages_visited >= self.total_pages:
                break
            self.pages_visited += 1
            yield Request(url, callback=self.parse_images)
    
    def parse_images(self, response):
        img_list = re.findall(r'<img.*?src="(.*?)"', response.text, re.S)
        
        for img_url in img_list:
            if self.images_downloaded >= self.total_images:
                break
            self.images_downloaded += 1
            print(f"正在保存第{self.images_downloaded}张图片 路径: {img_url}")
            yield WeatherImageSpiderItem(image_urls=[img_url])
pipelines.py
from scrapy.pipelines.images import ImagesPipeline
import os

class WeatherImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        image_name = request.url.split('/')[-1]
        return f'images/{image_name}'
settings.py
# Scrapy settings for weather_image_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "weather_image_spider"

SPIDER_MODULES = ["weather_image_spider.spiders"]
NEWSPIDER_MODULE = "weather_image_spider.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "weather_image_spider (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "weather_image_spider.middlewares.WeatherImageSpiderSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "weather_image_spider.middlewares.WeatherImageSpiderDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "weather_image_spider.pipelines.WeatherImageSpiderPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# 限制单线程
CONCURRENT_REQUESTS = 1
DOWNLOAD_DELAY = 0.25  # 可以根据需要调整

# 激活图片管道
ITEM_PIPELINES = {
    'weather_image_spider.pipelines.WeatherImagePipeline': 1,
}
IMAGES_STORE = './images'

心得体会:对于单线程爬取和多线程爬取的区别还不是很懂,我认为似乎就是在pipelines.py中CONCURRENT_REQUESTS是否被赋值

作业2

要求:熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法;使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取股票相关信息。
候选网站:东方财富网:https://www.eastmoney.com/
代码:

eastmoney.py
import scrapy
import json
from ..items import StockItem

class StockSpider(scrapy.Spider):
    name = "stock"
    allowed_domains = ["eastmoney.com"]
    
    def start_requests(self):
        base_url1 = 'http://95.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112404577990037157569_1696660645140'
        base_url2 = '&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1696660645141'
        total_pages = 4  #爬取前4页

        for page_number in range(1, total_pages + 1):
            page_url = f"{base_url1}&pn={page_number}{base_url2}"
            yield scrapy.Request(page_url, callback=self.parse)

    def parse(self, response):
        data = response.text 
        json_data = json.loads(data[data.find('{'):data.rfind('}') + 1])
        stock_list = json_data['data']['diff']

        for stock in stock_list:
            item = StockItem()
            item['code'] = stock['f12']
            item['name'] = stock['f14']
            item['latest_price'] = stock['f2']
            item['change_percent'] = stock['f3']
            item['change_amount'] = stock['f4']
            item['volume'] = stock['f5']
            item['turnover'] = stock['f6']
            item['amplitude'] = stock['f7']
            item['highest'] = stock['f15']
            item['lowest'] = stock['f16']
            item['open_price'] = stock['f17']
            item['close_price'] = stock['f18']
            yield item

items.py
import scrapy

class StockItem(scrapy.Item):
    code = scrapy.Field()
    name = scrapy.Field()
    latest_price = scrapy.Field()
    change_percent = scrapy.Field()
    change_amount = scrapy.Field()
    volume = scrapy.Field()
    turnover = scrapy.Field()
    amplitude = scrapy.Field()
    highest = scrapy.Field()
    lowest = scrapy.Field()
    open_price = scrapy.Field()
    close_price = scrapy.Field()
pipelines.py
import sqlite3
import MySQLdb  # 引入 MySQLdb

class StockPipeline:
    def __init__(self):
        self.create_database()

    def create_database(self):
        self.conn = sqlite3.connect('stock.db')
        self.cursor = self.conn.cursor()
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS stocks (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                code TEXT,
                name TEXT,
                latest_price REAL,
                change_percent REAL,
                change_amount REAL,
                volume INTEGER,
                turnover REAL,
                amplitude REAL,
                highest REAL,
                lowest REAL,
                open_price REAL,
                close_price REAL
            )
        ''')
        self.conn.commit()

    def process_item(self, item, spider):
        self.save_stock_data_to_database(item)
        return item

    def save_stock_data_to_database(self, item):
        self.cursor.execute('''
            INSERT INTO stocks (
                code, name, latest_price, change_percent, change_amount,
                volume, turnover, amplitude, highest, lowest, open_price, close_price
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (
            item['code'], item['name'], item['latest_price'], item['change_percent'],
            item['change_amount'], item['volume'], item['turnover'], item['amplitude'],
            item['highest'], item['lowest'], item['open_price'], item['close_price']
        ))
        self.conn.commit()

    def close_spider(self, spider):
        self.conn.close()

class TerminalOutputPipeline(object):
    def process_item(self, item, spider):
        # 在终端输出数据
        print(f"股票代码: {item['code']}")
        print(f"股票名称: {item['name']}")
        print(f"最新报价: {item['latest_price']}")
        print(f"涨跌幅: {item['change_percent']}")
        print(f"跌涨额: {item['change_amount']}")
        print(f"成交量: {item['volume']}")
        print(f"成交额: {item['turnover']}")
        print(f"振幅: {item['amplitude']}")
        print(f"最高: {item['highest']}")
        print(f"最低: {item['lowest']}")
        print(f"今开: {item['open_price']}")
        print(f"昨收: {item['close_price']}")
        return item

class MySQLPipeline:
    def open_spider(self, spider):
        # 初始化数据库连接
        self.connection = MySQLdb.connect(
            host='localhost',        # 数据库主机名
            user='root',    # 数据库用户名
            password='123456', # 数据库密码
            db='stocks_db',          # 数据库名称
            charset='utf8mb4',
            use_unicode=True
        )
        self.cursor = self.connection.cursor()

    def close_spider(self, spider):
        # 关闭数据库连接
        self.connection.commit()
        self.cursor.close()
        self.connection.close()

    def process_item(self, item, spider):
        # 处理 item,并将其存入数据库
        self.save_stock_data_to_database(item)
        return item

    def save_stock_data_to_database(self, item):
        sql = '''INSERT INTO stocks (code, name, latest_price, change_percent, change_amount,
                    volume, turnover, amplitude, highest, lowest, open_price, close_price)
                 VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
        self.cursor.execute(sql, (
            item['code'],
            item['name'],
            item['latest_price'],
            item['change_percent'],
            item['change_amount'],
            item['volume'],
            item['turnover'],
            item['amplitude'],
            item['highest'],
            item['lowest'],
            item['open_price'],
            item['close_price']
        ))
settings.py
BOT_NAME = "stock_scraper"

SPIDER_MODULES = ["stock_scraper.spiders"]
NEWSPIDER_MODULE = "stock_scraper.spiders"
ITEM_PIPELINES = {
    'stock_scraper.pipelines.MySQLPipeline': 2,
    'stock_scraper.pipelines.StockPipeline': 1,
    'stock_scraper.pipelines.TerminalOutputPipeline': 3,
}


USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

ROBOTSTXT_OBEY = False

输出信息:

心得体会:这一题用时比较长,因为股票网页是动态的,开始的时候想的比较简单。后来用了F12抓包获取对应的XPATH后才取得了成功。至于对于上课时候讲的通过Selenium框架如何结合着操作,还未实践,听的也不是很懂。

作业3

要求:熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法;使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取外汇网站数据。
候选网站:中国银行网:https://www.boc.cn/sourcedb/whpj/
代码:

bank.py
import scrapy
from bank.items import BankItem

class BankSpider(scrapy.Spider):
    name = 'bank'
    start_urls = ['https://www.boc.cn/sourcedb/whpj/index.html']
    
    def start_requests(self):
        num_pages = int(getattr(self, 'pages', 4))  

        for page in range(1, num_pages + 1):
            if page == 1:
                start_url = f'https://www.boc.cn/sourcedb/whpj/index.html'
            else:
                start_url = f'https://www.boc.cn/sourcedb/whpj/index_{page-1}.html'
            yield scrapy.Request(start_url, callback=self.parse)
    
    def parse(self, response):   
        bank_list = response.xpath('//tr[position()>1]')
        for bank in bank_list:
            item = BankItem()
            item['Currency'] = bank.xpath('.//td[1]/text()').get()
            item['TBP'] = bank.xpath('.//td[2]/text()').get()
            item['CBP'] = bank.xpath('.//td[3]/text()').get()
            item['TSP'] = bank.xpath('.//td[4]/text()').get()
            item['CSP'] = bank.xpath('.//td[5]/text()').get()
            item['Time'] = bank.xpath('.//td[8]/text()').get()
            
            yield item
items.py
import scrapy
class BankItem(scrapy.Item):
    Currency = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    Time = scrapy.Field()
pipelines.py
from itemadapter import ItemAdapter
import sqlite3

class BankPipeline:
    def open_spider(self, spider):
        self.conn = sqlite3.connect('bank.db')
        self.cursor = self.conn.cursor()
        self.create_database()

    def create_database(self):
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS banks (
                Currency TEXT ,
                TBP REAL,
                CBP REAL,
                TSP REAL,
                CSP REAL,
                Time TEXT
            )
        ''')
        self.conn.commit()

    def process_item(self, item, spider):
        self.cursor.execute('''
            INSERT INTO banks (
                Currency,
                TBP,
                CBP,
                TSP,
                CSP,
                Time
            ) VALUES (?, ?, ?, ?, ?, ?)
        ''', (item['Currency'],item['TBP'],item['CBP'],item['TSP'],item['CSP'],item['Time'] ))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()
settings.py
BOT_NAME = "bank"

SPIDER_MODULES = ["bank.spiders"]
NEWSPIDER_MODULE = "bank.spiders"

CONCURRENT_REQUESTS = 1
DOWNLOAD_DELAY = 1

ITEM_PIPELINES = {
    'bank.pipelines.BankPipeline': 300,
}

输出信息:

心得体会:做完第二题再来做第三题就会轻松许多了,主要加强了我对Scrapy框架和MySQL数据库(Sqlite3)操作的综合运用能力,最后我是在Navicat Prenium中进行了数据库中表的可视化

posted @ 2024-10-29 21:15  Russell-Cadian  阅读(7)  评论(0编辑  收藏  举报