Data Collection Assignment 3

Task 1

Requirements: pick a website and crawl all of its images, for example the China Weather Network (http://www.weather.com.cn). Use the scrapy framework to implement the crawl both single-threaded and multi-threaded. Be sure to cap the crawl: total pages (last 2 digits of the student ID), total number of downloaded images (last 3 digits of the student ID), and so on. Output: print each downloaded URL to the console, store the downloaded images in the images subfolder, and provide screenshots.

Code and results:

Spider code:

import scrapy
from urllib.parse import urljoin
from scrapy import Item, Field


class WeatherItem(Item):
    image_urls = Field()


class Myspider31Spider(scrapy.Spider):
    name = "myspider31"
    allowed_domains = ["weather.com.cn"]
    start_urls = ["https://weather.com.cn"]

    def parse(self, response):
        # Grab every <img src=...> on the page and resolve relative URLs
        # against the page URL.
        image_urls = response.css('img::attr(src)').getall()
        full_image_urls = [urljoin(response.url, img_url) for img_url in image_urls]

        # Hand the URL list to the built-in ImagesPipeline via the
        # standard image_urls field.
        item = WeatherItem()
        item['image_urls'] = full_image_urls

        yield item
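        # Hedged extension (not part of the original run): follow in-site
        # links so the crawl can reach the required page count; the caps in
        # the settings below then stop it.
        for href in response.css('a::attr(href)').getall():
            yield response.follow(href, callback=self.parse)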

Settings code:

ITEM_PIPELINES = {
    # "project31.pipelines.Project31Pipeline": 300,
    'scrapy.pipelines.images.ImagesPipeline': 300,
}
IMAGES_STORE = r'D:\数据集\数据采集实践3-1'  # raw string so the backslashes stay literal
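Scrapy is event-driven in a single process, so the "single-threaded vs multi-threaded" comparison in the task is usually switched through the concurrency settings, and the page/image caps can use the built-in CloseSpider extension. A minimal sketch of the extra settings, with placeholder numbers standing in for the student-ID digits:

# Single-threaded run: one request in flight at a time.
CONCURRENT_REQUESTS = 1
# Concurrent ("multi-threaded") run: raise the limit, e.g. the default 16.
# CONCURRENT_REQUESTS = 16

# Crawl caps required by the task (placeholders for the student-ID digits).
CLOSESPIDER_PAGECOUNT = 23    # total pages: last 2 digits of the student ID
CLOSESPIDER_ITEMCOUNT = 123   # rough image cap: last 3 digits of the student ID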

Pipeline code:

from itemadapter import ItemAdapter


class Project31Pipeline:
    # Default project pipeline; unused here, since the built-in
    # ImagesPipeline handles the downloads.
    def process_item(self, item, spider):
        return item
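The task also asks for each downloaded URL to be printed to the console. One way to get that, sketched here rather than taken from the original project (the class name Project31ImagesPipeline is made up), is a small subclass of the built-in ImagesPipeline registered in ITEM_PIPELINES in place of the stock one:

from scrapy.pipelines.images import ImagesPipeline


class Project31ImagesPipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, detail) pairs; print the URL of
        # every image that actually downloaded.
        for ok, detail in results:
            if ok:
                print("downloaded:", detail["url"])
        return super().item_completed(results, item, info)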

Results:

Task 2

Requirements: become proficient with the serialized output of Item and Pipeline data in scrapy; crawl stock information using the scrapy framework + XPath + MySQL storage pipeline. Candidate site: 东方财富网 (Eastmoney).

Code and results:

Spider code:

import scrapy

from demo2.items import StockItem


class StockSpider(scrapy.Spider):
    name = "stock"
    allowed_domains = ["www.eastmoney.com"]
    start_urls = ["https://quote.eastmoney.com/center/gridlist.html#hs_a_board"]

    def parse(self, response):
        # One <tr> per stock; the td positions below match the column
        # order of the grid on the page.
        stocks = response.xpath("//tbody//tr")
        for stock in stocks:
            item = StockItem()
            item['id'] = stock.xpath('.//td[position() = 1]//text()').extract_first()
            item['code'] = stock.xpath('.//td[position() = 2]//text()').extract_first()
            item['name'] = stock.xpath('.//td[position() = 3]//text()').extract_first()
            item['newPrice'] = stock.xpath('.//td[position() = 5]//text()').extract_first()
            item['price_change_amplitude'] = stock.xpath('.//td[position() = 6]//text()').extract_first()
            item['price_change_Lines'] = stock.xpath('.//td[position() = 7]//text()').extract_first()
            item['volume'] = stock.xpath('.//td[position() = 8]//text()').extract_first()
            item['turnover'] = stock.xpath('.//td[position() = 9]//text()').extract_first()
            item['amplitude'] = stock.xpath('.//td[position() = 10]//text()').extract_first()
            item['highest'] = stock.xpath('.//td[position() = 11]//text()').extract_first()
            item['lowest'] = stock.xpath('.//td[position() = 12]//text()').extract_first()
            item['today'] = stock.xpath('.//td[position() = 13]//text()').extract_first()
            item['yesterday'] = stock.xpath('.//td[position() = 14]//text()').extract_first()
            yield item
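The spider imports StockItem from demo2.items, which the report does not reproduce. A minimal sketch consistent with the fields the spider fills:

import scrapy


class StockItem(scrapy.Item):
    id = scrapy.Field()
    code = scrapy.Field()
    name = scrapy.Field()
    newPrice = scrapy.Field()
    price_change_amplitude = scrapy.Field()
    price_change_Lines = scrapy.Field()
    volume = scrapy.Field()
    turnover = scrapy.Field()
    amplitude = scrapy.Field()
    highest = scrapy.Field()
    lowest = scrapy.Field()
    today = scrapy.Field()
    yesterday = scrapy.Field()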
Pipeline code:

import pymysql

# MySQL connection settings
host = '127.0.0.1'
port = 3306
user = 'root'
password = 'yabdylm'
database = 'pycharm'

class Demo2Pipeline:
    def __init__(self):
        self.con = pymysql.connect(host=host, port=port, user=user, password=password, database=database, charset='utf8mb4')
        self.cursor = self.con.cursor()
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS stockData (
                id INTEGER,
                code VARCHAR(255),
                name VARCHAR(255),
                newPrice VARCHAR(255),
                price_change_amplitude VARCHAR(255),
                price_change_Lines VARCHAR(255),
                volume VARCHAR(255),
                turnover VARCHAR(255),
                amplitude VARCHAR(255),
                highest VARCHAR(255),
                lowest VARCHAR(255),
                today VARCHAR(255),
                yesterday VARCHAR(255)
            );
        """)

    def process_item(self, item, spider):
        try:
            id = item['id']
            code = item['code']
            name = item['name']
            newPrice = item['newPrice']
            price_change_amplitude = item['price_change_amplitude']
            price_change_Lines = item['price_change_Lines']
            volume = item['volume']
            turnover = item['turnover']
            amplitude = item['amplitude']
            highest = item['highest']
            lowest = item['lowest']
            today = item['today']
            yesterday = item['yesterday']

            # insert one row of stock data
            self.cursor.execute("""
                INSERT INTO stockData VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            """, (id, code, name, newPrice, price_change_amplitude, price_change_Lines, volume, turnover, amplitude,
                  highest, lowest, today, yesterday))
            self.con.commit()  # commit the transaction
        except Exception as e:
            print(f"An error occurred: {e}")
        return item

    def close_spider(self, spider):
        # Close the connection when the spider finishes; more reliable
        # than relying on __del__.
        self.con.close()
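For the pipeline to run it must be registered in the project settings; a minimal sketch, assuming the default demo2 layout:

ITEM_PIPELINES = {
    "demo2.pipelines.Demo2Pipeline": 300,
}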

Results:

Task 3

Requirements: become proficient with the serialized output of Item and Pipeline data in scrapy; crawl foreign-exchange data using the scrapy framework + XPath + MySQL storage pipeline. Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/

Code and results:

Spider code:

import scrapy


from demo3.items import BankItem


class BankSpider(scrapy.Spider):
    name = "bank"
    allowed_domains = ["www.boc.cn"]
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]

    def parse(self, response):
        banks = response.xpath('//tbody[position() = 1]/tr')
        # Skip the first two rows and the last two rows, which are not
        # data rows of the rate table.
        for i in range(2, len(banks) - 2):
            bank = banks[i]
            item = BankItem()
            item['Currency'] = bank.xpath(".//td[position() = 1]//text()").extract_first()
            item['TBP'] = bank.xpath(".//td[position() = 2]//text()").extract_first()
            item['CBP'] = bank.xpath(".//td[position() = 3]//text()").extract_first()
            item['TSP'] = bank.xpath(".//td[position() = 4]//text()").extract_first()
            item['CSP'] = bank.xpath(".//td[position() = 5]//text()").extract_first()
            item['Time'] = bank.xpath(".//td[position() = 8]//text()").extract_first()
            yield item
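The spider imports BankItem from demo3.items, which is not shown in the report. A minimal sketch consistent with the fields used (the comments gloss the usual meaning of the Bank of China column abbreviations, which is an assumption):

import scrapy


class BankItem(scrapy.Item):
    Currency = scrapy.Field()
    TBP = scrapy.Field()   # telegraphic transfer buying price (现汇买入价)
    CBP = scrapy.Field()   # cash buying price (现钞买入价)
    TSP = scrapy.Field()   # telegraphic transfer selling price (现汇卖出价)
    CSP = scrapy.Field()   # cash selling price (现钞卖出价)
    Time = scrapy.Field()  # publication time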

Pipeline code:

import pymysql
from scrapy.exceptions import DropItem

class BankPipeline:
    def __init__(self):
        # database connection settings
        self.host = 'localhost'
        self.database = 'pycharm'
        self.user = 'root'
        self.password = 'yabdylm'

        # open the database connection
        self.con = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            charset='utf8mb4'  # utf8mb4 supports the full Unicode character set
        )
        self.cursor = self.con.cursor()


    def process_item(self, item, spider):
        # SQL insert statement
        insert_sql = """
            INSERT INTO bankData (Currency, TBP, CBP, TSP, CSP, Time)
            VALUES (%s, %s, %s, %s, %s, %s)
        """
        try:
            # run the insert
            self.cursor.execute(
                insert_sql,
                (
                    item['Currency'],
                    item['TBP'],
                    item['CBP'],
                    item['TSP'],
                    item['CSP'],
                    item['Time']
                )
            )
            # commit the transaction
            self.con.commit()
        except pymysql.Error as e:
            # roll back the transaction on error
            self.con.rollback()
            raise DropItem(f"Error inserting row {item!r} into database: {e}")

        return item

    def close_spider(self, spider):
        # close the database connection
        self.cursor.close()
        self.con.close()
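Unlike Task 2, this pipeline does not create its table, so bankData must already exist in the pycharm database. A sketch of matching DDL (column types assumed to mirror Task 2) that could run once at the end of __init__:

        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS bankData (
                Currency VARCHAR(255),
                TBP VARCHAR(255),
                CBP VARCHAR(255),
                TSP VARCHAR(255),
                CSP VARCHAR(255),
                Time VARCHAR(255)
            );
        """)

As in Task 2, the pipeline also needs a settings entry, e.g. "demo3.pipelines.BankPipeline": 300 in ITEM_PIPELINES.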

Results:
