Data Collection and Fusion Technology – Practical Assignment 3

Assignment 1

Requirement: pick a website and crawl all of the images on it, for example China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement both a single-threaded and a multi-threaded crawl.

Note: be sure to cap the total number of downloaded images (the last two digits of your student ID) and put a measure in place that limits the crawl accordingly.

Gitee link
https://gitee.com/xiaoaibit/102202131_LX/tree/master/homework3/demo1

Single-threaded spider code

import requests
import scrapy

from demo1.items import ImgItem


class FZuSpider(scrapy.Spider):
    name = "FZu"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw.htm"]

    def parse(self, response):
        # Collect every <img> on the page and download it synchronously with requests.
        imgs = response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img).split('?')[0]
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        # Follow the "next page" link: the sibling element right after the current page number.
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        yield scrapy.Request(url=next_url, callback=self.parse)
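
The spider above keeps following the pagination links without an explicit stop, so the required cap on total downloads has to come from somewhere else. A minimal way to enforce it is through settings; the excerpt below is only a sketch with illustrative numbers, not the repo's actual configuration.

# demo1/settings.py (illustrative excerpt)

# Stop the crawl automatically once this many items have been scraped;
# replace 31 with the last two digits of your student ID.
CLOSESPIDER_ITEMCOUNT = 31

# Keep this variant effectively single-threaded.
CONCURRENT_REQUESTS = 1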

Multi-threaded spider code (six spiders, each covering a different page range)

import requests
import scrapy

from demo1.items import ImgItem


# Each of the six spiders below starts from a different listing page and stops
# once the visible page number reaches its own boundary (10, 20, ..., 60), so the
# page range is split into disjoint chunks that can be crawled at the same time.
class FZuSpider1(scrapy.Spider):
    name = "FZu1"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw.htm"]

    def parse(self, response):
        imgs= response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img).split('?')[0]
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())
        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 10 :
            yield scrapy.Request(url=next_url,callback=self.parse)
class FZuSpider2(scrapy.Spider):
    name = "FZu2"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw/711.htm"]

    def parse(self, response):
        imgs= response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img)
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())

        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 20:
            yield scrapy.Request(url=next_url,callback=self.parse)
class FZuSpider3(scrapy.Spider):
    name = "FZu3"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw/701.htm"]

    def parse(self, response):
        imgs= response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img)
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())

        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 30:
            yield scrapy.Request(url=next_url,callback=self.parse)
class FZuSpider4(scrapy.Spider):
    name = "FZu4"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw/691.htm"]

    def parse(self, response):
        imgs= response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img)
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())

        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 40:
            yield scrapy.Request(url=next_url,callback=self.parse)
class FZuSpider5(scrapy.Spider):
    name = "FZu5"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw/681.htm"]

    def parse(self, response):
        imgs= response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img)
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())

        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 50:
            yield scrapy.Request(url=next_url,callback=self.parse)
class FZuSpider6(scrapy.Spider):
    name = "FZu6"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw/671.htm"]

    def parse(self, response):
        imgs= response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img)
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())

        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 60:
            yield scrapy.Request(url=next_url,callback=self.parse)
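
The six spiders are intended to run concurrently. How they are launched is not shown here, so the script below is only a sketch, assuming a small launcher placed next to scrapy.cfg and that the spider classes live in a module named demo1/spiders/fzu_multi.py (adjust the import to the actual file name).

# run.py (hypothetical launcher)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# The module name is an assumption; point it at wherever the six spider classes are defined.
from demo1.spiders.fzu_multi import (
    FZuSpider1, FZuSpider2, FZuSpider3, FZuSpider4, FZuSpider5, FZuSpider6,
)

if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    # All six crawls are scheduled on the same reactor and run concurrently.
    for spider_cls in (FZuSpider1, FZuSpider2, FZuSpider3, FZuSpider4, FZuSpider5, FZuSpider6):
        process.crawl(spider_cls)
    process.start()  # blocks until every spider has finished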

Items code

import scrapy


class ImgItem(scrapy.Item):
    img_url = scrapy.Field()
    img = scrapy.Field()

Pipelines code

import os
from urllib.parse import urlparse, unquote


class Demo1Pipeline:
    def process_item(self, item, spider):
        img_url = item['img_url']
        img = item['img']

        # Parse the URL and take the path component (the query string was already stripped).
        parsed_url = urlparse(img_url)
        path = unquote(parsed_url.path)  # decode the path in case it contains escaped characters
        filename = os.path.basename(path)  # file name portion of the path

        # Make sure the images directory exists.
        img_dir = 'images'
        os.makedirs(img_dir, exist_ok=True)

        # Build the full target path and write the image bytes to disk.
        file_path = os.path.join(img_dir, filename)
        try:
            with open(file_path, 'wb') as f:
                f.write(img)
        except Exception as e:
            print(f"Failed to write file {file_path}: {e}")
        return item

Results

Reflections

Efficiency

Multi-threaded crawling can significantly improve efficiency, especially when a large number of images has to be downloaded. Concurrent requests reduce waiting time and speed up the overall crawl.

Resource consumption

Multi-threaded crawling consumes more system resources such as CPU and memory. When resources are limited, the number of concurrent requests has to be set sensibly to avoid overloading the system; in Scrapy this is controlled through settings, as sketched below.
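
The values below are only illustrative, not the project's actual configuration.

# settings.py (illustrative concurrency limits)
CONCURRENT_REQUESTS = 8             # total concurrent requests (Scrapy's default is 16)
CONCURRENT_REQUESTS_PER_DOMAIN = 4  # cap per domain so a single site is not hammered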

Anti-crawling mechanisms

Many websites deploy anti-crawling mechanisms, and sending too many requests can trigger them and get an IP banned. A crawler therefore needs reasonable request intervals and concurrency limits, for example via the throttling settings sketched below.
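
A sketch of the corresponding throttling settings, again with illustrative values:

# settings.py (illustrative throttling)
DOWNLOAD_DELAY = 1.0          # fixed delay, in seconds, between requests to the same site

AUTOTHROTTLE_ENABLED = True   # let Scrapy adapt the delay to the server's response times
AUTOTHROTTLE_START_DELAY = 1.0
AUTOTHROTTLE_MAX_DELAY = 10.0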

Error handling

Various errors can occur during a crawl, such as network failures or 404 responses. Handling them gracefully improves the spider's stability and reliability; one option is to attach an errback to each request, as in the sketch below.
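
A minimal sketch of request-level error handling with an errback; this is illustrative and not part of the original spiders.

import scrapy


class ErrbackExampleSpider(scrapy.Spider):
    """Illustrative only: an errback keeps the crawl alive when a request fails."""

    name = "errback_example"
    start_urls = ["https://news.fzu.edu.cn/fdyw.htm"]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, errback=self.handle_error)

    def parse(self, response):
        self.logger.info("Fetched %s (%d)", response.url, response.status)

    def handle_error(self, failure):
        # failure wraps the underlying exception (DNS error, timeout, non-200 response, ...).
        self.logger.error("Request failed: %r", failure)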

Legal and policy compliance

When crawling a website, comply with the relevant laws and regulations, respect the site's robots.txt, and avoid putting excessive load on the server. Scrapy can enforce the robots.txt part automatically (see below).
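
Honouring robots.txt is a single built-in switch; whether the project enables it is not visible from the snippets above.

# settings.py
ROBOTSTXT_OBEY = True  # RobotsTxtMiddleware then skips URLs the site disallows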

Assignment 2

Requirement: become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage to crawl stock information. Candidate site: Eastmoney (https://www.eastmoney.com/). Output: store the results in a MySQL database, with English column names of your own design, e.g. id for the sequence number, bStockNo for the stock code, and so on.

Gitee link
https://gitee.com/xiaoaibit/102202131_LX/tree/master/homework3/demo2

Spider code

import scrapy

from demo2.items import StockItem


class StockSpider(scrapy.Spider):
    name = "stock"
    allowed_domains = ["www.eastmoney.com"]
    start_urls = ["https://quote.eastmoney.com/center/gridlist.html#hs_a_board"]

    def parse(self, response):
        # The quote grid is built by JavaScript, so the HTML arriving here is the
        # page source captured by the Selenium downloader middleware shown below.
        stocks = response.xpath("//tbody//tr")
        for stock in stocks:
            item = StockItem()
            item['id'] = stock.xpath('.//td[position() = 1]//text()').extract_first()
            item['code'] = stock.xpath('.//td[position() = 2]//text()').extract_first()
            item['name'] = stock.xpath('.//td[position() = 3]//text()').extract_first()
            item['newPrice'] = stock.xpath('.//td[position() = 5]//text()').extract_first()
            item['price_change_amplitude'] = stock.xpath('.//td[position() = 6]//text()').extract_first()
            item['price_change_Lines'] = stock.xpath('.//td[position() = 7]//text()').extract_first()
            item['volume'] = stock.xpath('.//td[position() = 8]//text()').extract_first()
            item['turnover'] = stock.xpath('.//td[position() = 9]//text()').extract_first()
            item['amplitude'] = stock.xpath('.//td[position() = 10]//text()').extract_first()
            item['highest'] = stock.xpath('.//td[position() = 11]//text()').extract_first()
            item['lowest'] = stock.xpath('.//td[position() = 12]//text()').extract_first()
            item['today'] = stock.xpath('.//td[position() = 13]//text()').extract_first()
            item['yesterday'] = stock.xpath('.//td[position() = 14]//text()').extract_first()
            yield item

Items code

import scrapy


class StockItem(scrapy.Item):
    id = scrapy.Field()
    code = scrapy.Field()
    name = scrapy.Field()
    newPrice = scrapy.Field()
    price_change_amplitude = scrapy.Field()
    price_change_Lines = scrapy.Field()
    volume = scrapy.Field()
    turnover = scrapy.Field()
    amplitude = scrapy.Field()
    highest = scrapy.Field()
    lowest = scrapy.Field()
    today = scrapy.Field()
    yesterday = scrapy.Field()

Using Selenium as a downloader middleware

import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware:
    def process_request(self, request, spider):
        # Render the page in a real browser so the JavaScript-built quote grid is present.
        url = request.url
        driver = webdriver.Edge()
        driver.get(url)
        time.sleep(3)  # crude wait for the grid to finish rendering
        data = driver.page_source
        driver.quit()  # quit() releases the whole browser process, not just the window
        return HtmlResponse(url=url, body=data.encode('utf-8'), encoding='utf-8', request=request)
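
For the middleware to take effect it has to be registered in settings.py. The module path below is an assumption and depends on where the class is actually defined in the repo.

# settings.py (assumed module path)
DOWNLOADER_MIDDLEWARES = {
    "demo2.middlewares.SeleniumMiddleware": 543,
}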

Pipelines code

import pymysql


host = '127.0.0.1'
port = 3306
user = 'root'
password = 'yabdylm'
database = 'pycharm'

class Demo2Pipeline:
    def __init__(self):
        self.con = pymysql.connect(host=host, port=port, user=user, password=password, database=database, charset='utf8mb4')
        self.cursor = self.con.cursor()
        self.cursor.execute(
            "CREATE TABLE IF NOT EXISTS stockData (id Integer,code VARCHAR(255),name VARCHAR(255),newPrice VARCHAR(255),price_change_amplitude VARCHAR(255),price_change_Lines VARCHAR(255), volume VARCHAR(255),turnover VARCHAR(255),amplitude VARCHAR(255),highest VARCHAR(255),lowest VARCHAR(255),today VARCHAR(255),yesterday VARCHAR(255));")

    def process_item(self, item, spider):
        try:
            id = item['id']
            code = item['code']
            name = item['name']
            newPrice = item['newPrice']
            price_change_amplitude = item['price_change_amplitude']
            price_change_Lines = item['price_change_Lines']
            volume = item['volume']
            turnover = item['turnover']
            amplitude = item['amplitude']
            highest = item['highest']
            lowest = item['lowest']
            today = item['today']
            yesterday = item['yesterday']

            # Insert one row of stock data.
            self.cursor.execute("""
                INSERT INTO stockData VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            """, (id, code, name, newPrice, price_change_amplitude, price_change_Lines, volume, turnover, amplitude,
                  highest, lowest, today, yesterday))
            self.con.commit()  # commit the transaction
        except Exception as e:
            print(f"An error occurred: {e}")
        return item

    def __del__(self):
        self.con.close()

Results

Reflections

Project structure and code organization:

Before writing the spider it pays to plan the project structure and code organization: defining the Item, Pipeline, and Spider components and deciding how their files are laid out. Good organization makes the code easier to read and maintain; a typical layout is sketched below.
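
For reference, this is the standard skeleton that scrapy startproject demo2 generates; the file names under spiders/ are assumptions rather than the repo's exact contents.

demo2/
    scrapy.cfg            # project/deployment configuration
    demo2/
        items.py          # StockItem definition
        middlewares.py    # SeleniumMiddleware
        pipelines.py      # Demo2Pipeline (MySQL storage)
        settings.py       # concurrency, middleware registration, etc.
        spiders/
            stock.py      # StockSpider (file name assumed)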

Item definition:

When defining an Item, the fields should be designed around the structure of the target site's data. This requires a clear understanding of that structure and some foresight about fields that may be needed later. For Eastmoney, this means fields such as the stock code, name, price, and trading volume.

Assignment 3

Requirement: become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage to crawl foreign-exchange data. Candidate site: Bank of China (https://www.boc.cn/sourcedb/whpj/).

Gitee link
https://gitee.com/xiaoaibit/102202131_LX/tree/master/homework3/demo3

Spider code

import scrapy


from demo3.items import BankItem


class BankSpider(scrapy.Spider):
    name = "bank"
    allowed_domains = ["www.boc.cn"]
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]

    def parse(self, response):
        banks = response.xpath('//tbody[position() = 1]/tr')
        # The first two and the last two <tr> elements are not data rows, so skip them.
        for i in range(2, len(banks) - 2):
            bank = banks[i]
            item = BankItem()
            item['Currency'] = bank.xpath(".//td[position() = 1]//text()").extract_first()
            item['TBP'] = bank.xpath(".//td[position() = 2]//text()").extract_first()
            item['CBP'] = bank.xpath(".//td[position() = 3]//text()").extract_first()
            item['TSP'] = bank.xpath(".//td[position() = 4]//text()").extract_first()
            item['CSP'] = bank.xpath(".//td[position() = 5]//text()").extract_first()
            item['Time'] = bank.xpath(".//td[position() = 8]//text()").extract_first()
            yield item


Items code

import scrapy


class BankItem(scrapy.Item):
    Currency = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    Time = scrapy.Field()

Selenium middleware code

import time

from scrapy.http import HtmlResponse
from selenium import webdriver

class SeleniumMiddleware:
    def process_request(self, request, spider):
        url = request.url
        driver = webdriver.Edge()
        driver.get(url)
        time.sleep(1)  # brief wait for the rate table to finish rendering
        data = driver.page_source
        driver.quit()  # release the browser before handing the HTML back to Scrapy
        return HtmlResponse(url=url, body=data.encode('utf-8'), encoding='utf-8', request=request)

Pipelines code

import pymysql
from scrapy.exceptions import DropItem

class BankPipeline:
    def __init__(self):
        # Database connection settings.
        self.host = 'localhost'
        self.database = 'pycharm'
        self.user = 'root'
        self.password = 'yabdylm'

        # Open the database connection.
        self.con = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            charset='utf8mb4'  # utf8mb4 supports the full character set
        )
        self.cursor = self.con.cursor()


    def process_item(self, item, spider):
        # SQL insert statement.
        insert_sql = """
            INSERT INTO bankData (Currency, TBP, CBP, TSP, CSP, Time)
            VALUES (%s, %s, %s, %s, %s, %s)
        """
        try:
            # Execute the insert.
            self.cursor.execute(
                insert_sql,
                (
                    item['Currency'],
                    item['TBP'],
                    item['CBP'],
                    item['TSP'],
                    item['CSP'],
                    item['Time']
                )
            )
            # Commit the transaction.
            self.con.commit()
        except pymysql.Error as e:
            # Roll back the transaction if anything went wrong.
            self.con.rollback()
            raise DropItem(f"Error inserting row {item!r} into database: {e}")

        return item

    def close_spider(self, spider):
        # Close the database connection.
        self.cursor.close()
        self.con.close()

Results

Reflections

XPath selectors:

Extracting data with XPath selectors requires some familiarity with XPath syntax. Because a site's structure may change, the selectors should be written with some flexibility and robustness, for example by anchoring on attributes or text rather than bare positions (see the sketch below).
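
A small illustrative comparison, not taken from the repo, of a purely positional selector versus one anchored on an attribute (the class names here are hypothetical):

from scrapy.selector import Selector

# Tiny HTML sample, only to demonstrate the two selector styles.
row = Selector(text="<table><tr><td class='currency'>USD</td><td class='tbp'> 718.3 </td></tr></table>")

# Position-based: breaks as soon as a column is inserted or reordered.
print(row.xpath("//td[position() = 2]/text()").get())

# Anchored on an attribute instead, with whitespace normalized.
print(row.xpath("normalize-space(//td[contains(@class, 'tbp')]/text())").get())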

Data cleaning and processing:

Database design:

When designing the table schema, consider the relationships between the data and query efficiency, for example whether foreign keys or indexes are needed. English column names should follow database naming conventions, be easy to understand, and avoid reserved words. A possible definition of the bankData table used above is sketched below.
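
The pipeline above assumes that the bankData table already exists. One possible way to create it; the column types and the index are assumptions, not taken from the repo:

import pymysql

con = pymysql.connect(host='localhost', user='root', password='yabdylm',
                      database='pycharm', charset='utf8mb4')
with con.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS bankData (
            id       INT AUTO_INCREMENT PRIMARY KEY,
            Currency VARCHAR(64),
            TBP      VARCHAR(32),
            CBP      VARCHAR(32),
            TSP      VARCHAR(32),
            CSP      VARCHAR(32),
            Time     VARCHAR(32),
            INDEX idx_currency (Currency)
        ) DEFAULT CHARSET=utf8mb4;
    """)
con.commit()
con.close()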
