Data Collection and Fusion Technology: Assignment 3

Student ID and name: 102202103 王文豪
Gitee repository: https://gitee.com/wwhpower/project_wwh.git

Task ①:

Requirements: pick a website and crawl all of the images on it, for example the China Weather site (http://www.weather.com.cn). Implement the crawl with the Scrapy framework in both single-threaded and multi-threaded modes. Be sure to limit the crawl, e.g. cap the total number of pages (last two digits of the student ID) and the total number of downloaded images (last three digits). Output: print the downloaded URLs to the console, store the downloaded images in an images subfolder, and provide screenshots.

(1) Code:

#items.py
import scrapy

class WeatherImageItem(scrapy.Item):
    image_urls = scrapy.Field()  # URLs of the images to download
    images = scrapy.Field()  # metadata of the downloaded images, filled in by the pipeline
#spider.py
import scrapy
from weather_image_scraper.items import WeatherImageItem

class WeatherSpiderSpider(scrapy.Spider):
    name = 'weather_images'
    allowed_domains = ['weather.com.cn']
    start_urls = ['http://www.weather.com.cn/']
    
    # Limits on total pages and total downloaded images (derived from the student ID digits)
    total_pages = 3
    total_images = 103
    images_crawled = 0
    current_page = 1

    def parse(self, response):
        # Extract the image links on the current page
        img_urls = response.css('img::attr(src)').getall()

        for img_url in img_urls:
            if self.images_crawled < self.total_images:
                full_url = response.urljoin(img_url)  # make relative URLs absolute
                self.logger.info(f'Image URL: {full_url}')  # print the URL to the console
                item = WeatherImageItem()
                item['image_urls'] = [full_url]  # the field the images pipeline reads
                yield item
                self.images_crawled += 1
            else:
                break  # image-count limit reached, stop collecting

        # Decide whether to continue to the next page
        if self.current_page < self.total_pages:
            self.current_page += 1
            next_page_url = f'http://www.weather.com.cn/page/{self.current_page}'  # assumed pagination URL pattern
            yield scrapy.Request(url=next_page_url, callback=self.parse)
#pipelines.py
from scrapy.pipelines.images import ImagesPipeline
import scrapy

class WeatherImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        item['images'] = image_paths
        return item
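
# Aside (my own note, not part of the submission): this subclass reproduces the
# default behaviour of Scrapy's built-in ImagesPipeline, which already reads
# item['image_urls'] and records the downloads in item['images']. Assuming
# Pillow is installed (the images pipeline requires it), an equivalent setup is
# simply to reference the built-in class in settings.py:
#
#   ITEM_PIPELINES = {
#       'scrapy.pipelines.images.ImagesPipeline': 1,
#   }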
#settings.py
ITEM_PIPELINES = {
    'weather_image_scraper.pipelines.WeatherImagesPipeline': 1,
}

IMAGES_STORE = 'C:/Users/86158/examples'  # where downloaded images are stored
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 1  # single-threaded: one request in flight at a time
# CONCURRENT_REQUESTS = 10  # "multi-threaded": up to 10 concurrent requests
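
To compare the single-threaded and concurrent runs without editing settings.py each time, one option is to drive the crawl from a small script and override CONCURRENT_REQUESTS there. The snippet below is a minimal sketch of my own (run from the project root, where scrapy.cfg lives), not part of the submitted code:

#run.py (sketch)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()  # picks up the project's settings.py
settings.set('CONCURRENT_REQUESTS', 10, priority='cmdline')  # 1 = serial run, 10 = concurrent run

process = CrawlerProcess(settings)
process.crawl('weather_images')  # spider name defined in spider.py above
process.start()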

Results:

Downloaded images:

(Only a limited number of images were available, so only these were crawled.)
(2) Reflections: while implementing both the single-threaded and the multi-threaded crawl, I clearly felt the speed advantage of the latter. Thanks to Scrapy's asynchronous processing, a concurrent crawl keeps multiple requests in flight at once, which noticeably improves crawling efficiency.

Task ②

Requirements: become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage to crawl stock information.
Candidate site: Eastmoney: https://www.eastmoney.com/

Output: the data is stored in MySQL and displayed in the format below.
Column headers use English names (e.g. id for the serial number, bStockNo for the stock code, and so on), designed by each student.

(1) Code:

#items.py
import scrapy

class StockItem(scrapy.Item):
    id = scrapy.Field()
    code = scrapy.Field()
    name = scrapy.Field()
    newPrice = scrapy.Field()
    price_change_amplitude = scrapy.Field()
    price_change_Lines = scrapy.Field()
    volume = scrapy.Field()
    turnover = scrapy.Field()
    amplitude = scrapy.Field()
    highest = scrapy.Field()
    lowest = scrapy.Field()
    today = scrapy.Field()
    yesterday = scrapy.Field()
#spider.py
import scrapy
from stock_scraper.items import StockItem

class StockSpider(scrapy.Spider):
    name = 'stock_spider'
    allowed_domains = ['eastmoney.com']  # covers quote.eastmoney.com as well
    start_urls = ['https://quote.eastmoney.com/center/gridlist.html#hs_a_board']

    def parse(self, response):
        stocks = response.xpath("//tbody//tr")
        for stock in stocks:
            item = StockItem()
            item['id'] = stock.xpath('.//td[1]//text()').get()
            item['code'] = stock.xpath('.//td[2]//text()').get()
            item['name'] = stock.xpath('.//td[3]//text()').get()
            item['newPrice'] = stock.xpath('.//td[5]//text()').get()
            item['price_change_amplitude'] = stock.xpath('.//td[6]//text()').get()
            item['price_change_Lines'] = stock.xpath('.//td[7]//text()').get()
            item['volume'] = stock.xpath('.//td[8]//text()').get()
            item['turnover'] = stock.xpath('.//td[9]//text()').get()
            item['amplitude'] = stock.xpath('.//td[10]//text()').get()
            item['highest'] = stock.xpath('.//td[11]//text()').get()
            item['lowest'] = stock.xpath('.//td[12]//text()').get()
            item['today'] = stock.xpath('.//td[13]//text()').get()
            item['yesterday'] = stock.xpath('.//td[14]//text()').get()
            yield item
#pipelines.py
import mysql.connector
from mysql.connector import Error

class MySQLPipeline:

    def open_spider(self, spider):
        try:
            self.connection = mysql.connector.connect(
                host='127.0.0.1',
                database='wwh',  # your database name
                user='root',
                password='123456'  # your password
            )
            self.cursor = self.connection.cursor()
            self.cursor.execute('''
                CREATE TABLE IF NOT EXISTS stockData (
                    id INTEGER PRIMARY KEY AUTO_INCREMENT,
                    code VARCHAR(255),
                    name VARCHAR(255),
                    newPrice VARCHAR(255),
                    price_change_amplitude VARCHAR(255),
                    price_change_Lines VARCHAR(255),
                    volume VARCHAR(255),
                    turnover VARCHAR(255),
                    amplitude VARCHAR(255),
                    highest VARCHAR(255),
                    lowest VARCHAR(255),
                    today VARCHAR(255),
                    yesterday VARCHAR(255)
                )
            ''')
        except Error as e:
            spider.logger.error(f"Error connecting to MySQL: {e}")

    def close_spider(self, spider):
        try:
            self.connection.commit()
        except Error as e:
            spider.logger.error(f"Error committing to MySQL: {e}")
        finally:
            self.cursor.close()
            self.connection.close()

    def process_item(self, item, spider):
        try:
            # Use the cursor opened in open_spider (it is the one closed in close_spider)
            self.cursor.execute('''
                INSERT INTO stockData (code, name, newPrice, price_change_amplitude, price_change_Lines, volume, turnover, amplitude, highest, lowest, today, yesterday)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE
                newPrice=VALUES(newPrice),
                price_change_amplitude=VALUES(price_change_amplitude),
                price_change_Lines=VALUES(price_change_Lines),
                volume=VALUES(volume),
                turnover=VALUES(turnover),
                amplitude=VALUES(amplitude),
                highest=VALUES(highest),
                lowest=VALUES(lowest),
                today=VALUES(today),
                yesterday=VALUES(yesterday)
            ''', (
                item['code'],
                item['name'],
                item['newPrice'],
                item['price_change_amplitude'],
                item['price_change_Lines'],
                item['volume'],
                item['turnover'],
                item['amplitude'],
                item['highest'],
                item['lowest'],
                item['today'],
                item['yesterday']
            ))
            self.connection.commit()
        except Error as e:
            spider.logger.error(f"Error inserting data into MySQL: {e}")
        return item
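
#add_unique_index.py (a standalone sketch of my own, not part of the submission)
# Caveat: the ON DUPLICATE KEY UPDATE clause above only takes effect when the
# `code` column carries a unique index, which the CREATE TABLE statement does
# not declare. This one-off script adds such an index; the index name uniq_code
# is my own choice, and the credentials are the same local ones used above.
import mysql.connector
from mysql.connector import Error

conn = mysql.connector.connect(host='127.0.0.1', user='root',
                               password='123456', database='wwh')
cur = conn.cursor()
try:
    cur.execute('ALTER TABLE stockData ADD UNIQUE KEY uniq_code (code)')
    conn.commit()
except Error as e:
    print(f"Index not added (it may already exist): {e}")
finally:
    cur.close()
    conn.close()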
#middlewares.py
import time
from selenium import webdriver
from scrapy.http import HtmlResponse

class SeleniumMiddleware:
    def process_request(self, request, spider):
        # Start a Selenium WebDriver (a new Edge instance per request)
        driver = webdriver.Edge()

        try:
            # Load the URL
            driver.get(request.url)
            # Wait for the page (and its JS-rendered content) to load
            time.sleep(3)
            # Grab the rendered page source
            data = driver.page_source
        finally:
            # Shut the WebDriver down
            driver.quit()

        # Return the rendered HTML as an HtmlResponse so Scrapy skips the normal download
        return HtmlResponse(url=request.url, body=data.encode('utf-8'), encoding='utf-8', request=request)
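
#middlewares.py (optional variant, a sketch of my own rather than the submitted code)
# The middleware above starts and quits a fresh Edge browser for every request,
# which works but becomes slow once more pages are fetched. This variant keeps a
# single headless driver alive for the whole crawl and closes it when the spider ends.
import time
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.edge.options import Options

class SharedSeleniumMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def __init__(self):
        options = Options()
        options.add_argument('--headless')  # no visible browser window
        self.driver = webdriver.Edge(options=options)

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(3)  # crude wait for the JS-rendered table
        return HtmlResponse(url=request.url,
                            body=self.driver.page_source.encode('utf-8'),
                            encoding='utf-8', request=request)

    def spider_closed(self, spider):
        self.driver.quit()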
#settings.py
ITEM_PIPELINES = {
    'stock_scraper.pipelines.MySQLPipeline': 300,
}
DOWNLOADER_MIDDLEWARES = {
    'stock_scraper.middlewares.SeleniumMiddleware': 543,
}

MYSQL_HOST = '127.0.0.1'
MYSQL_DATABASE = 'wwh'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
ROBOTSTXT_OBEY = False
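
Incidentally, the MYSQL_* values above are never read: MySQLPipeline hard-codes its connection parameters. Below is a minimal sketch of my own (the constructor parameters are my naming) for wiring those settings through Scrapy's from_crawler hook instead:

#pipelines.py (sketch: read the MYSQL_* settings instead of hard-coding them)
class MySQLPipeline:
    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this with the running crawler, which exposes settings.py
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
        )

    def __init__(self, host, database, user, password):
        self.host, self.database = host, database
        self.user, self.password = user, password
        # open_spider / process_item / close_spider would then use these values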

Crawl results:

(2) Reflections: this task deepened my understanding of parsing HTML documents with XPath; XPath selectors in Scrapy make it possible to locate and extract data from a page precisely.

Task ③:

Requirements: become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage to crawl foreign-exchange data.
Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/

(1) Code:

#items.py
import scrapy

class ForexItem(scrapy.Item):
    currency = scrapy.Field()
    tbp = scrapy.Field()
    cbp = scrapy.Field()
    tsp = scrapy.Field()
    csp = scrapy.Field()
    time = scrapy.Field()
#spider.py
import scrapy
from forex_scraper.items import ForexItem

class BankSpider(scrapy.Spider):
    name = "forex_spider"
    allowed_domains = ["www.boc.cn"]
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]

    def parse(self, response):
        # Select all rows of the first tbody
        rows = response.xpath('//tbody[1]/tr')

        # Skip the header rows at the top and the footer rows at the bottom
        for row in rows[2:-2]:
            item = ForexItem()
            item['currency'] = row.xpath(".//td[1]//text()").get()  # .get() returns the first match (or None)
            item['tbp'] = row.xpath(".//td[2]//text()").get()
            item['cbp'] = row.xpath(".//td[3]//text()").get()
            item['tsp'] = row.xpath(".//td[4]//text()").get()
            item['csp'] = row.xpath(".//td[5]//text()").get()
            item['time'] = row.xpath(".//td[8]//text()").get()
            yield item
#pipelines.py
import mysql.connector
from mysql.connector import Error

class MySQLPipeline:
    def open_spider(self, spider):
        try:
            self.connection = mysql.connector.connect(
                host='127.0.0.1',
                user='root',  # replace with your MySQL username
                password='123456',  # replace with your MySQL password
                database='wwh',  # replace with your database name
                charset='utf8mb4',
                use_unicode=True
            )
            self.cursor = self.connection.cursor()
            self.cursor.execute('''
                CREATE TABLE IF NOT EXISTS exchange_rates (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    currency VARCHAR(255),
                    tbp VARCHAR(255),
                    cbp VARCHAR(255),
                    tsp VARCHAR(255),
                    csp VARCHAR(255),
                    time VARCHAR(255)
                )
            ''')
            self.connection.commit()
        except Error as e:
            print(f"Error connecting to MySQL: {e}")

    def close_spider(self, spider):
        if self.connection.is_connected():
            self.cursor.close()
            self.connection.close()

    def process_item(self, item, spider):
        try:
            self.cursor.execute('''
                INSERT INTO exchange_rates (currency, tbp, cbp, tsp, csp, time) 
                VALUES (%s, %s, %s, %s, %s, %s)
            ''', (item['currency'], item['tbp'], item['cbp'], item['tsp'], item['csp'], item['time']))
            self.connection.commit()
        except Error as e:
            print(f"Error inserting item into MySQL: {e}")
        return item
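
After a run, a quick way to confirm that the rows actually landed in MySQL is a small standalone query script (a sketch of my own, assuming the same local credentials as above):

#check_rates.py (sketch)
import mysql.connector

conn = mysql.connector.connect(host='127.0.0.1', user='root',
                               password='123456', database='wwh')
cur = conn.cursor()
cur.execute('SELECT currency, tbp, cbp, tsp, csp, time FROM exchange_rates LIMIT 5')
for row in cur.fetchall():
    print(row)
cur.close()
conn.close()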

The remaining files (settings.py and so on) are similar to the previous task.
Crawl results:

(2) Reflections: essentially the same takeaways as the previous task.
