Data Acquisition and Fusion Technology Practice — Assignment 3

Gitee repository: https://gitee.com/wang-zi-lian20031002/crawl_project

I. Pick a website and crawl all of the images on it, for example the China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement both single-threaded and multi-threaded crawling. Be sure to limit the crawl, e.g. by capping the total number of pages (last two digits of the student ID) and the total number of downloaded images (last three digits of the student ID).

1. Core Code and Screenshots

  • Core code

Wzl1Spider:

from bs4 import UnicodeDammit
import scrapy
from firstBlood.items import ImgItem

class Wzl1Spider(scrapy.Spider):
    name = "wzl1"
    # allowed_domains = ["www.weather.com.cn"]
    start_urls = "http://www.weather.com.cn/"

    def start_requests(self):
        url = Wzl1Spider.start_urls
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response, **kwargs):
        try:
            # Decode the page body, trying utf-8 first and falling back to gbk
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            # Select every <img> element and yield its src attribute as an item
            qixiangs = selector.xpath("//img")
            for qixiang in qixiangs:
                item = ImgItem()
                item["url"] = qixiang.xpath("./@src").extract_first()
                print(item["url"])
                yield item
        except Exception as err:
            print(err)
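
The spider imports ImgItem from firstBlood.items. The items file itself is not shown here, but a minimal sketch consistent with the single field the spider uses (assumed, not the original items.py) would be:

import scrapy

class ImgItem(scrapy.Item):
    # the image URL taken from the <img> src attribute
    url = scrapy.Field()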

Pipelines:

import os
import urllib.request


class FirstbloodPipeline:
    count = 0

    def process_item(self, item, spider):
        # Save images into an "images" directory next to the project package
        Base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        image_path = os.path.join(Base_path, "images")
        if not os.path.exists(image_path):
            os.makedirs(image_path)
        img_url = item['url']

        if img_url:
            FirstbloodPipeline.count += 1
            extension = img_url.split('.')[-1]
            filename = os.path.join(image_path, f"{FirstbloodPipeline.count}.{extension}")
            try:
                urllib.request.urlretrieve(img_url, filename=filename)
                print(f"Downloaded: {filename}")
            except Exception as e:
                print(f"Download failed: {e}")
        return item
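
The assignment also asks to cap the total number of pages and downloaded images. The pipeline already keeps a running count, and the simplest hedged way to enforce the caps is Scrapy's built-in CloseSpider extension; the numbers below are placeholders for the student-ID digits, not the actual submission values:

# settings.py (sketch): stop the crawl once the limits are hit
CLOSESPIDER_PAGECOUNT = 2      # placeholder: last two digits of the student ID
CLOSESPIDER_ITEMCOUNT = 102    # placeholder: last three digits of the student ID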

Settings:

BOT_NAME = "firstBlood"

SPIDER_MODULES = ["firstBlood.spiders"]
NEWSPIDER_MODULE = "firstBlood.spiders"
CONCURRENT_REQUESTS_PER_DOMAIN = 1  # 同时请求同一个域名的请求数量
CONCURRENT_REQUESTS_PER_IP = 1  # 同时请求同一个IP的请求数量
ROBOTSTXT_OBEY = False
LOG_LEVEL = "ERROR"
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 100
CONCURRENT_REQUESTS_PER_IP = 100

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
ITEM_PIPELINES = {
    'firstBlood.pipelines.FirstbloodPipeline': 300,  # 配置你的pipeline
}
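
With the spider, pipeline, and settings above in place, the crawl is launched from the project root using the spider name defined in Wzl1Spider:

scrapy crawl wzl1
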
  • Screenshots of the results

2. Reflections

This exercise made me more familiar with using the Scrapy framework. The single-threaded and multi-threaded versions share essentially the same code; multi-threading is enabled simply by changing the concurrency parameters in settings.py.

II. Become proficient with the serialization and output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage to crawl stock information.

1. Core Code and Screenshots

  • Core code

Wzl3Spider:

import scrapy
import re
import json
import math
from secondBlood.items import SecondbloodItem


class Wzl3Spider(scrapy.Spider):
    name = "wzl3"
    # allowed_domains = ["www.xxx.com"]
    start_urls = [
        'http://31.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112409705185363781139_1602849464971&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1602849464977']

    def parse(self, response):
        try:
            data = response.body.decode()
            # Each {...} block after the first "[" describes one stock
            datas = re.findall("{.*?}", data[re.search(r"\[", data).start():])
            for n in range(len(datas)):
                stock = json.loads(datas[n])  # parse the text into JSON
                item = SecondbloodItem()      # fill in the corresponding fields
                item['code'] = stock['f12']
                item['name'] = stock['f14']
                item['latest_price'] = str(stock['f2'])
                item['range'] = str(stock['f3'])
                item['amount'] = str(stock['f4'])
                item['trading'] = str(stock['f5'])
                yield item
            # Total number of pages (20 stocks per page)
            all_page = math.ceil(int(re.findall(r'"total":(\d+)', response.body.decode())[0]) / 20)
            page = re.findall(r"pn=(\d+)", response.url)[0]  # current page number
            if int(page) < all_page:
                # Move on to the next page and call parse again
                url = response.url.replace("pn=" + page, "pn=" + str(int(page) + 1))
                yield scrapy.Request(url=url, callback=self.parse)
        except Exception as err:
            print(err)
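
items.py for this project is not listed either; a minimal sketch of SecondbloodItem consistent with the fields the spider fills (assumed, not the original file):

import scrapy

class SecondbloodItem(scrapy.Item):
    code = scrapy.Field()          # stock code (f12)
    name = scrapy.Field()          # stock name (f14)
    latest_price = scrapy.Field()  # latest price (f2)
    range = scrapy.Field()         # change percentage (f3)
    amount = scrapy.Field()        # change amount (f4)
    trading = scrapy.Field()       # trading volume (f5)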

Pipelines:

import pymysql

class SecondbloodPipeline:
    conn = None
    cursor = None

    def open_spider(self, spider):
        print("打开数据库连接")
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='123456',
            db='Data_acquisition'
        )
        self.cursor = self.conn.cursor()

        # Create the data table if it does not already exist
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS Stock (
            序号 INT AUTO_INCREMENT,
            代码 VARCHAR(10),
            名称 VARCHAR(50),
            最新价 FLOAT,
            涨跌幅 FLOAT,
            涨跌额 FLOAT,
            成交量 FLOAT,
            PRIMARY KEY (序号)
        )
        """
        self.cursor.execute(create_table_sql)
        self.conn.commit()

    def process_item(self, item, spider):
        try:
            sql = "INSERT INTO Stock (代码, 名称, 最新价, 涨跌幅, 涨跌额, 成交量) VALUES (%s, %s, %s, %s, %s, %s)"
            values = (
                item["code"],
                item["name"],
                item['latest_price'],
                item['range'],
                item['amount'],
                item['trading']
            )
            self.cursor.execute(sql, values)
            self.conn.commit()
        except Exception as err:
            print(err)
        return item

    def close_spider(self, spider):
        print("关闭数据库连接")
        self.cursor.close()
        self.conn.close()

Settings:

BOT_NAME = "secondBlood"

SPIDER_MODULES = ["secondBlood.spiders"]
NEWSPIDER_MODULE = "secondBlood.spiders"
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'secondBlood.pipelines.SecondbloodPipeline': 300,
}
# LOG_LEVEL = "ERROR"
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
MYSQL_DB = 'Data_acquisition'
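
The MYSQL_* values above are not actually read by the pipeline shown earlier, which hard-codes its connection parameters. If the settings are meant to drive the connection, a hedged sketch of an alternative open_spider for SecondbloodPipeline (using Scrapy's standard spider.settings access, table creation omitted) would be:

    def open_spider(self, spider):
        print("Opening database connection")
        # Read the connection parameters from settings.py instead of hard-coding them
        self.conn = pymysql.connect(
            host=spider.settings.get('MYSQL_HOST', 'localhost'),
            port=spider.settings.getint('MYSQL_PORT', 3306),
            user=spider.settings.get('MYSQL_USER', 'root'),
            password=spider.settings.get('MYSQL_PASSWORD', ''),
            db=spider.settings.get('MYSQL_DB', 'Data_acquisition')
        )
        self.cursor = self.conn.cursor()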
  • Screenshots of the results

2. Reflections

This task consolidated my use of the Scrapy framework and the MySQL database, and made me more familiar with how Item and Pipeline data are serialized and output in Scrapy.

III. Become proficient with the serialization and output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage to crawl data from a foreign-exchange website.

1. Core Code and Screenshots

  • Core code

Wzl4Spider:

import scrapy
import time
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from fourBlood.items import FourbloodItem


class Wzl4Spider(scrapy.Spider):
    name = "wzl4"
    # allowed_domains = ["www.xxx.com"]

    def __init__(self, *args, **kwargs):
        super(Wzl4Spider, self).__init__(*args, **kwargs)
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # optional: run Chrome headless
        # Path to the local ChromeDriver executable
        self.driver = webdriver.Chrome(service=Service(r'D:\chrome\chromedriver-win64\chromedriver.exe'), options=chrome_options)

    def start_requests(self):
        url = 'https://www.boc.cn/sourcedb/whpj/'  # page to crawl
        self.driver.get(url)
        time.sleep(1)  # wait for the page to finish loading
        html = etree.HTML(self.driver.page_source)  # parse the rendered HTML
        # The Scrapy request only triggers the callback; the actual HTML comes from Selenium via meta
        yield scrapy.Request(url, self.parse, meta={'html': html})

    def parse(self, response):
        html = response.meta['html']
        # Select every row of the exchange-rate table
        lis = html.xpath('/html/body/div/div[5]/div[1]/div[2]/table/tbody/tr')
        # Skip the first row (table header) and extract the fields from each remaining row
        for link in lis[1:]:
            texts = link.xpath('./td[1]/text()')
            name = texts[0] if texts else ''
            texts = link.xpath('./td[2]/text()')
            TBP = texts[0] if texts else ''
            texts = link.xpath('./td[3]/text()')
            CBP = texts[0] if texts else ''
            texts = link.xpath('./td[4]/text()')
            TSP = texts[0] if texts else ''
            texts = link.xpath('./td[5]/text()')
            CSP = texts[0] if texts else ''
            texts = link.xpath('./td[8]/text()')
            TIME = texts[0] if texts else ''

            item = FourbloodItem()
            item["currency"] = name
            item["TBP"] = TBP
            item["CBP"] = CBP
            item["TSP"] = TSP
            item["CSP"] = CSP
            item["time"] = TIME
            yield item

    def closed(self, reason):
        self.driver.quit()  # shut down the WebDriver when the spider closes
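
As before, items.py is not listed; a minimal sketch of FourbloodItem matching the fields the spider fills (assumed, not the original file):

import scrapy

class FourbloodItem(scrapy.Item):
    currency = scrapy.Field()  # currency name (td[1])
    TBP = scrapy.Field()       # td[2]
    CBP = scrapy.Field()       # td[3]
    TSP = scrapy.Field()       # td[4]
    CSP = scrapy.Field()       # td[5]
    time = scrapy.Field()      # td[8], publish time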

Pipelines:

import pymysql


class FourbloodPipeline:

    def process_item(self, item, spider):
        mydb = None
        mycursor = None
        try:
            # Print the scraped record
            print(item["currency"])
            print(item["TSP"])
            print(item["CSP"])
            print(item["TBP"])
            print(item["CBP"])
            print(item["time"])
            print()

            # Insert the record into the database table
            mydb = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", db="china_bank",
                                   charset="utf8")
            mycursor = mydb.cursor()
            sql = "INSERT INTO currency (currency, TSP, CSP, TBP, CBP, time) VALUES (%s, %s, %s, %s, %s, %s)"
            val = (item["currency"], item["TSP"], item["CSP"], item["TBP"], item["CBP"], item["time"])

            # Print the values being inserted
            print(f"Inserting into currency: {val}")

            mycursor.execute(sql, val)
            mydb.commit()
        except Exception as err:
            print(f"Error: {err}")
        finally:
            # Only close handles that were actually opened
            if mycursor:
                mycursor.close()
            if mydb:
                mydb.close()
        return item
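
The pipeline assumes a currency table already exists in the china_bank database; nothing in the code above creates it. A one-off helper in the same style as the stock pipeline's CREATE TABLE (column types here are an assumption) might look like:

import pymysql

# Create the table the FourbloodPipeline inserts into (run once before crawling)
conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", db="china_bank", charset="utf8")
cursor = conn.cursor()
cursor.execute("""
CREATE TABLE IF NOT EXISTS currency (
    id INT AUTO_INCREMENT PRIMARY KEY,
    currency VARCHAR(50),
    TSP VARCHAR(20),
    CSP VARCHAR(20),
    TBP VARCHAR(20),
    CBP VARCHAR(20),
    time VARCHAR(30)
)
""")
conn.commit()
cursor.close()
conn.close()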

Settings:

BOT_NAME = "fourBlood"

SPIDER_MODULES = ["fourBlood.spiders"]
NEWSPIDER_MODULE = "fourBlood.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "fourBlood (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = "ERROR"
ITEM_PIPELINES = {
    'fourBlood.pipelines.FourbloodPipeline': 1,
}
  • Screenshots of the results

2. Reflections

At first I could not get this one to work: the crawl simply produced no output. After consulting material online and studying similar crawler programs, I finally succeeded in getting the information printed and written into the database.
