数据采集第三次实践作业

第三次作业

作业①:

1.要求:

指定一个网站,爬取这个网站中的所有图片,例如:中国气象网(

http://www.weather.com.cn

)。使用scrapy框架分别实现单线程和多线程的方式爬取。

–务必控制总页数(学号尾数2位)、总下载的图片数量(尾数后3位)等限制爬取的措施。

  • 输出信息: 将下载的Url信息在控制台输出,并将下载的图片存储在images子文件中,并给出截图。
  • Gitee文件夹链接

2.代码片段

import scrapy
import os
from urllib.parse import urljoin
from scrapy.exceptions import CloseSpider


class WeatherInfoSpider(scrapy.Spider):
    """Crawl weather.com.cn and yield the URL of every ``<img>`` found.

    The crawl ends by raising ``CloseSpider`` once ``page_limit`` pages have
    been visited or ``image_limit`` image URLs have been yielded.
    """

    # Scrapy identifies a spider through ``name`` and restricts the crawl
    # through ``allowed_domains``; the attribute names used originally
    # (``spider_name`` / ``allowed_sites``) are not read by the framework.
    name = 'weather'
    allowed_domains = ['weather.com.cn']
    start_urls = ['http://www.weather.com.cn/']

    # Original attribute names, kept as aliases for backward compatibility.
    spider_name = name
    allowed_sites = allowed_domains
    initial_urls = start_urls

    # Limits on the number of visited pages and emitted image URLs.
    page_limit = 17
    image_limit = 117

    page_counter = 0
    image_counter = 0

    def start_requests(self):
        """Yield one request per entry in ``initial_urls``."""
        self.logger.info('Starting requests...')
        for url in self.initial_urls:
            yield scrapy.Request(url, callback=self.process_page)

    def process_page(self, response):
        """Yield image-URL items for one page, then requests for its links.

        Raises:
            CloseSpider: when the page limit was already reached on entry, or
                as soon as the image limit is reached.
        """
        # Stop once the page budget is exhausted.
        if self.page_counter >= self.page_limit:
            raise CloseSpider('Reached maximum page limit.')
        self.page_counter += 1
        self.logger.info(f'Visited page {self.page_counter}.')

        # Emit every image URL on the page, resolved against the page URL.
        for image in response.css('img::attr(src)').getall():
            full_image_url = urljoin(response.url, image)
            self.image_counter += 1
            self.logger.info(f'Found image URL: {full_image_url}')
            yield {
                'image_url': full_image_url
            }
            # The limit is checked after the yield so the item that reaches
            # the limit is still emitted.
            if self.image_counter >= self.image_limit:
                raise CloseSpider('Reached maximum image download limit.')

        # Follow the links on this page while page budget remains.
        for link in response.css('a::attr(href)').getall():
            if self.page_counter >= self.page_limit:
                break
            full_link = urljoin(response.url, link)
            # Only follow http(s) links; hrefs such as ``javascript:`` or
            # ``mailto:`` are not crawlable pages.
            if not full_link.startswith(('http://', 'https://')):
                continue
            yield scrapy.Request(full_link, callback=self.process_page)

    def close(self, reason):
        """Log the reason the spider was closed."""
        self.logger.info(f'Spider closed. Reason: {reason}')

3.截图

心得体会:

单线程爬取:适用于需要爬取的页面较少,且目标网站响应速度较快。
多线程爬取:适用于需要抓取大量数据,或者目标网站的响应速度较慢时。

作业②

1.要求

熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法;使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取股票相关信息。

  • 候选网站:东方财富网:https://www.eastmoney.com/

  • 输出信息:MySQL数据库存储和输出格式如下:

  • 表头英文命名例如:序号id,股票代码:bStockNo……,由同学们自行定义设计

  • 序号 股票代码 股票名称 最新报价 涨跌幅 涨跌额 成交量 振幅 最高 最低 今开 昨收
    1 688093 N世华 28.47 10.92 26.13万 7.6亿 22.34 32.0 28.08 30.20 17.55
    2……
  • Gitee文件夹链接

2.代码片段

from typing import Any, Dict
import scrapy
import re
import json
import pymysql

class StockItem(scrapy.Item):
    """One stock quote record; StockSpider.parse fills each field from an API field code."""
    latest_price = scrapy.Field()  # from API field 'f2'
    change_percentage = scrapy.Field()  # from API field 'f3'
    change_amount = scrapy.Field()  # from API field 'f4'
    volume = scrapy.Field()  # from API field 'f5'
    turnover = scrapy.Field()  # from API field 'f6'
    amplitude = scrapy.Field()  # from API field 'f7'
    code = scrapy.Field()  # from API field 'f12'
    name = scrapy.Field()  # from API field 'f14'
    high = scrapy.Field()  # from API field 'f15'
    low = scrapy.Field()  # from API field 'f16'
    today_open = scrapy.Field()  # from API field 'f17'
    yesterday_close = scrapy.Field()  # from API field 'f18'

class StockSpider(scrapy.Spider):
    """Fetch one page of the eastmoney stock list API and yield a StockItem per record."""
    name = 'stock_spider'
    start_urls = [
        'http://25.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124021313927342030325_some_timestamp&pn=1&pz=20&po=1&np=1&ut=some_unique_token&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,f18&_=some_other_timestamp'
    ]

    # API field code -> StockItem key, in the order the item is filled.
    FIELD_MAP = {
        'f2': 'latest_price',
        'f3': 'change_percentage',
        'f4': 'change_amount',
        'f5': 'volume',
        'f6': 'turnover',
        'f7': 'amplitude',
        'f12': 'code',
        'f14': 'name',
        'f15': 'high',
        'f16': 'low',
        'f17': 'today_open',
        'f18': 'yesterday_close',
    }

    @staticmethod
    def _load_payload(body: str) -> Any:
        """Return the JSON object carried by *body*, or ``None`` if it cannot be parsed.

        *body* may be plain JSON or JSON wrapped in a callback call
        (``callback({...});``), which is what the ``cb=`` query parameter
        requests.
        """
        text = body.strip()
        if not text.startswith('{'):
            # Strip the ``callback( ... )`` wrapper around the JSON document.
            start, end = text.find('('), text.rfind(')')
            if start == -1 or end <= start:
                return None
            text = text[start + 1:end]
        try:
            payload = json.loads(text)
        except ValueError:
            return None
        return payload if isinstance(payload, dict) else None

    def parse(self, response: scrapy.http.Response) -> Any:
        """Yield one StockItem for every record under ``data.diff`` in the response.

        Parsing the whole payload with ``json.loads`` replaces the previous
        regex slicing, which raised AttributeError whenever the response did
        not contain a ``"diff":[...]`` section.
        """
        payload = self._load_payload(response.text)
        if payload is None:
            self.logger.error('Could not parse stock list response from %s', response.url)
            return

        # ``data`` may be null/absent when the API returns no rows.
        records = (payload.get('data') or {}).get('diff') or []
        if isinstance(records, dict):
            # Tolerate a mapping of index -> record as well as a list.
            records = records.values()

        for stock_data in records:
            item = StockItem()
            for field_code, item_key in self.FIELD_MAP.items():
                item[item_key] = stock_data.get(field_code)
            yield item

class StockPipeline:
    """Store StockItem records in the MySQL table ``stocks``.

    Database work is best-effort: failures are printed and the item is still
    passed on to later pipeline stages.
    """

    def __init__(self):
        # Start as None so process_item/close_spider stay safe when
        # open_spider could not connect.
        self.connection = None
        self.cursor = None

    def open_spider(self, spider: scrapy.Spider):
        """Connect to MySQL and recreate the ``stocks`` table."""
        try:
            # NOTE(review): credentials are hard-coded here; consider loading
            # them from project settings or the environment.
            self.connection = pymysql.connect(
                host='127.0.0.1',
                user='root',
                password='Cjkmysql.',
                port=3306,
                charset='utf8',
                database='chenoojkk'
            )
            self.cursor = self.connection.cursor()
            self.cursor.execute('DROP TABLE IF EXISTS stocks')
            create_table_sql = """
            CREATE TABLE stocks (
                latest_price DOUBLE,
                change_percentage DOUBLE,
                change_amount DOUBLE,
                volume DOUBLE,
                turnover DOUBLE,
                amplitude DOUBLE,
                code VARCHAR(12) PRIMARY KEY,
                name VARCHAR(32),
                high DOUBLE,
                low DOUBLE,
                today_open DOUBLE,
                yesterday_close DOUBLE
            )
            """
            self.cursor.execute(create_table_sql)
        except Exception as e:
            print(f"Error opening spider: {e}")

    def process_item(self, item: StockItem, spider: scrapy.Spider) -> StockItem:
        """Insert *item* into ``stocks`` and return it unchanged."""
        if self.cursor is None or self.connection is None:
            # No usable connection (already reported by open_spider).
            return item

        try:
            insert_sql = """
            INSERT INTO stocks (
                latest_price, change_percentage, change_amount, volume, turnover, amplitude, code, name, high, low, today_open, yesterday_close
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """
            values = (
                item['latest_price'], item['change_percentage'], item['change_amount'],
                item['volume'], item['turnover'], item['amplitude'], item['code'],
                item['name'], item['high'], item['low'], item['today_open'], item['yesterday_close']
            )
            self.cursor.execute(insert_sql, values)
            self.connection.commit()
        except Exception as e:
            print(f"Error processing item: {e}")
            self._rollback_quietly()

        return item

    def _rollback_quietly(self):
        """Roll back the open transaction, printing instead of raising on failure."""
        try:
            self.connection.rollback()
        except Exception as e:
            print(f"Error rolling back transaction: {e}")

    def close_spider(self, spider: scrapy.Spider):
        """Close the cursor and the connection if they were opened."""
        if self.cursor is not None:
            self.cursor.close()
        if self.connection is not None:
            self.connection.close()

3.截图

心得体会:

学会了如何将数据存入数据库,由于使用终端较难可视化,下次将使用navicat

作业③:

1.要求

熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法;使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取外汇网站数据。

Currency TBP CBP TSP CSP Time
阿联酋迪拉姆 198.58 192.31 199.98 206.59 11:27:14

2.代码

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags
from Practical_work3.items import Work3Item
import pymysql

class Work3Spider(scrapy.Spider):
    """Scrape the exchange-rate table on the Bank of China page into Work3Item records."""
    name = 'work3_revised'
    # ``self`` does not exist in a class body, so the start page is declared
    # through ``start_urls``; Scrapy's default start logic then sends the
    # response to ``parse``.
    start_urls = ['https://www.boc.cn/sourcedb/whpj/']

    def parse(self, response):
        """Yield one loaded item per table row that contains ``<td>`` cells."""
        for row in response.css('table[align="left"] tr'):
            # Rows without <td> cells (e.g. a header row) carry no data.
            if not row.css('td'):
                continue

            loader = ItemLoader(item=Work3Item(), selector=row)
            # Must be set as an attribute: keyword arguments given to the
            # ItemLoader constructor only become loader context.
            loader.default_output_processor = TakeFirst()

            # Clean the currency name while it is collected rather than
            # re-adding it afterwards.
            loader.add_css('name', 'td:nth-child(1)::text', MapCompose(remove_tags))
            # price1..price5 come from table columns 2..6.
            for index in range(1, 6):
                loader.add_css(f'price{index}', f'td:nth-child({index + 1})::text')
            loader.add_css('date', 'td:nth-last-child(1)::text')
            yield loader.load_item()

class Work3Pipeline:
    """Store scraped exchange-rate items in the MySQL table ``bank``.

    Database work is best-effort: failures are printed and the item is still
    passed on to later pipeline stages.
    """

    def __init__(self):
        self.db = None
        self.cursor = None

    def open_spider(self, spider):
        """Connect to MySQL and recreate the ``bank`` table."""
        try:
            # NOTE(review): credentials are hard-coded here; consider loading
            # them from project settings or the environment.
            self.db = pymysql.connect(host='127.0.0.1', user='root', passwd='Cjkmysql.', port=3306, charset='utf8', database='chenoojkk')
            self.cursor = self.db.cursor()
            self.create_table()
        except Exception as e:
            print(f"Failed to connect to database: {e}")

    def create_table(self):
        """Drop any existing ``bank`` table and create a fresh one."""
        try:
            self.cursor.execute('DROP TABLE IF EXISTS bank')
            self.cursor.execute("""
                CREATE TABLE bank (
                    Currency varchar(32),
                    p1 varchar(17),
                    p2 varchar(17),
                    p3 varchar(17),
                    p4 varchar(17),
                    p5 varchar(17),
                    Time varchar(32)
                )
            """)
        except Exception as e:
            print(f"Failed to create table: {e}")

    def process_item(self, item, spider):
        """Insert *item* into ``bank`` and return it unchanged."""
        if self.cursor is None or self.db is None:
            # No usable connection (already reported by open_spider).
            return item

        try:
            self.cursor.execute("""
                INSERT INTO bank (Currency, p1, p2, p3, p4, p5, Time)
                VALUES (%s, %s, %s, %s, %s, %s, %s)
            """, (
                item['name'],
                item['price1'],
                item['price2'],
                item['price3'],
                item['price4'],
                item['price5'],
                item['date']
            ))
            self.db.commit()
        except Exception as e:
            print(f"Failed to insert item: {e}")
            self._rollback_quietly()
        return item

    def _rollback_quietly(self):
        """Roll back the open transaction, printing instead of raising on failure."""
        try:
            self.db.rollback()
        except Exception as e:
            print(f"Failed to roll back transaction: {e}")

    def close_spider(self, spider):
        """Close the cursor and the connection if they were opened."""
        if self.cursor:
            self.cursor.close()
        if self.db:
            self.db.close()

3.截图

心得体会:

Scrapy 框架 提供了一个完整的爬取和数据存储的解决方案,能够处理请求、解析、数据存储等问题,非常适合构建大规模爬虫。
posted @ 2024-11-12 01:08  念影苑  阅读(1)  评论(0)  编辑  收藏  举报