Assignment ①:
Experiment content:
Requirements:
Pick a website and crawl all of the images on it, for example the China Weather site (http://www.weather.com.cn/). Use the Scrapy framework to implement the crawl in both a single-threaded and a multi-threaded way.
– Be sure to limit the crawl, e.g. cap the total number of pages (last 2 digits of the student ID) and the total number of downloaded images (last 3 digits of the student ID).
Output: print the URLs of the downloaded images to the console, store the downloaded images in an images subfolder, and provide screenshots.
Gitee folder link: https://gitee.com/tiantianmi/crawl_project/tree/master/作业3/天气图片
Code implementation:
(1) weather_spider: write the spider and use XPath to iterate over all image elements on the page.
import scrapy
from work3.weather_images.weather_images.items import WeatherItem


class MySpider(scrapy.Spider):
    name = "weather_spider"
    start_urls = ['http://www.weather.com.cn/']
    imagePath = r"D:\DATA\images"

    def start_requests(self):
        start_url = "http://www.weather.com.cn/"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 "
                          "Minefield/3.0.2pre"}
        yield scrapy.Request(url=start_url, callback=self.parse, headers=headers)

    def parse(self, response, **kwargs):
        data = response.body.decode(response.encoding)
        selector = scrapy.Selector(text=data)
        # collect the src attribute of every <img> element on the page
        img_urls = selector.xpath('//img/@src').extract()
        for image in img_urls:
            item = WeatherItem()
            item["img_url"] = image
            yield item
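If some of the src attributes are relative paths, the downloads in the pipeline will fail for them; response.urljoin can turn them into absolute URLs first. A minimal sketch of the adjusted loop (the startswith filter is my own addition, not part of the original code):

        for image in img_urls:
            # turn relative paths into absolute URLs before handing them to the pipeline
            full_url = response.urljoin(image)
            if full_url.startswith("http"):  # skip data: URIs and other non-downloadable sources
                item = WeatherItem()
                item["img_url"] = full_url
                yield item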
(2) items: pass the image URLs on to the pipeline.
import scrapy


class WeatherItem(scrapy.Item):
    # define the fields for your item here like:
    # img = scrapy.Field()
    img_url = scrapy.Field()
(3) pipelines: download the images to local disk.
import urllib.request


class WeatherImagePipeline(object):
    count = 0

    def process_item(self, item, spider):
        WeatherImagePipeline.count += 1
        url = item["img_url"]
        try:
            # save each image under an incrementing file name
            imagepath = r'D:\DATA\images1' + '\\' + str(WeatherImagePipeline.count) + ".jpg"
            urllib.request.urlretrieve(url, filename=imagepath)
            print("Downloaded " + url)
        except Exception as err:
            print(err)
        return item
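As an alternative to downloading by hand with urllib, Scrapy ships an ImagesPipeline (it requires Pillow) that handles the downloading and de-duplication itself. A rough sketch of how it could be wired up; the class name and the IMAGES_STORE path below are assumptions for illustration, not the original code:

# pipelines.py — sketch using Scrapy's built-in images pipeline
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class WeatherImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # ImagesPipeline downloads every request yielded here
        yield scrapy.Request(item["img_url"])

# settings.py
# ITEM_PIPELINES = {"weather_images.pipelines.WeatherImagesPipeline": 1}
# IMAGES_STORE = r"D:\DATA\images"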
(4) Multi-threading: for the multi-threaded run, raise the request concurrency in settings.py:
CONCURRENT_REQUESTS = 32
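For the single-threaded run the same project can simply be executed with CONCURRENT_REQUESTS = 1. The page and image caps required by the assignment can also be enforced from settings.py through Scrapy's CloseSpider extension; a sketch, where the concrete numbers are placeholders standing in for the student-ID digits:

# settings.py — switch between single- and multi-threaded runs, and cap the crawl
CONCURRENT_REQUESTS = 1      # single-threaded; set to 32 for the multi-threaded run
CLOSESPIDER_PAGECOUNT = 23   # stop after this many pages (last 2 digits of the student ID)
CLOSESPIDER_ITEMCOUNT = 123  # stop after this many items (last 3 digits of the student ID)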
Run results:
Code run output:
Downloaded images:
Reflections:
The assignment felt quite hard at first, but after working on it for a while and gradually figuring things out, it was not so difficult after all.
Assignment ②
Experiment content:
Requirements: Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL database storage to crawl stock information.
Candidate site: Eastmoney: https://www.eastmoney.com/
Output: MySQL database storage, with the output format as follows:
Column headers are named in English, e.g. 序号 → id, 股票代码 → bStockNo, ..., with the concrete schema designed by each student.
No.  Stock code  Stock name  Latest price  Change %  Change amount  Volume  Amplitude  High  Low  Open  Prev. close
1 688093 N世华 28.47 10.92 26.13万 7.6亿 22.34 32.0 28.08 30.20 17.55
2 ...
Gitee folder link: https://gitee.com/tiantianmi/crawl_project/tree/master/作业3/财富网
Code implementation:
(1) stock_spider:
import re

import requests
import scrapy

from work3.stock.stock.items import StockItem


class MySpider(scrapy.Spider):
    name = "stock_spider"
    start_urls = ['https://www.eastmoney.com/']

    def start_requests(self):
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

    def parse(self, response):
        # Eastmoney serves the quote list through a JSONP API; the rows sit in the
        # "diff" array inside the callback wrapper.
        url = 'http://84.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124015654504524695545_1697702280661&pn=1&pz' \
              '=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,' \
              'm:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,' \
              'f18&_=1697702280662'
        r = requests.get(url=url)
        # pull the content of the "diff" array out of the JSONP wrapper
        pat = r'"diff":\[(.*?)\]'
        data = re.compile(pat, re.S).findall(r.text)
        data = list(eval(data[0]))  # each element is a dict literal keyed f2, f3, f12, ...
        try:
            for i in range(len(data)):
                item = StockItem()
                item['a1'] = i + 1            # serial number
                item['a2'] = data[i]['f12']   # stock code
                item['a3'] = data[i]['f14']   # stock name
                item['a4'] = data[i]['f2']    # latest price
                item['a5'] = data[i]['f3']    # change percent
                item['a6'] = data[i]['f4']    # change amount
                item['a7'] = data[i]['f5']    # volume
                item['a8'] = data[i]['f6']    # turnover
                item['a9'] = data[i]['f7']    # amplitude
                item['a10'] = data[i]['f15']  # high
                item['a11'] = data[i]['f16']  # low
                item['a12'] = data[i]['f17']  # open
                item['a13'] = data[i]['f18']  # previous close
                yield item
        except Exception as err:
            print(err)
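The regex-plus-eval combination works because every element of the diff array is a dict literal, but the JSONP response can also be handled by stripping the callback wrapper and parsing it with json.loads, which avoids eval altogether. A small sketch; the helper name parse_jsonp is mine:

import json
import re


def parse_jsonp(text):
    # drop the "jQuery...(" prefix and ");" suffix, then parse the remaining JSON payload
    payload = re.search(r'\((.*)\)', text, re.S).group(1)
    return json.loads(payload)

# rows = parse_jsonp(r.text)["data"]["diff"]   # list of dicts keyed f2, f3, f12, ...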
(2) items:
import scrapy


class StockItem(scrapy.Item):
    # a1..a13 in order: serial number, code, name, latest price, change percent,
    # change amount, volume, turnover, amplitude, high, low, open, previous close
    a1 = scrapy.Field()
    a2 = scrapy.Field()
    a3 = scrapy.Field()
    a4 = scrapy.Field()
    a5 = scrapy.Field()
    a6 = scrapy.Field()
    a7 = scrapy.Field()
    a8 = scrapy.Field()
    a9 = scrapy.Field()
    a10 = scrapy.Field()
    a11 = scrapy.Field()
    a12 = scrapy.Field()
    a13 = scrapy.Field()
(3) pipelines:
import sqlite3


class StockPipeline:
    def __init__(self):
        # create (or open) the local SQLite database and the stock table
        self.con = sqlite3.connect('stock.db')
        self.cur = self.con.cursor()
        self.cur.execute(
            'CREATE TABLE IF NOT EXISTS stock(serial_no INTEGER, code TEXT, name TEXT, latest_price REAL, '
            'change_percent REAL, change_amount REAL, volume INTEGER, amount REAL, amplitude REAL, '
            'highest REAL, lowest REAL, today_open REAL, yesterday_close REAL)')

    def process_item(self, item, spider):
        try:
            a1 = item['a1']
            a2 = item.get('a2')
            a3 = item.get('a3')
            a4 = item.get('a4')
            a5 = item.get('a5')
            a6 = item.get('a6')
            a7 = item.get('a7')
            a8 = item.get('a8')
            a9 = item.get('a9')
            a10 = item.get('a10')
            a11 = item.get('a11')
            a12 = item.get('a12')
            a13 = item.get('a13')
            # insert one row per stock item
            self.cur.execute(
                'insert into stock(serial_no, code, name, latest_price, change_percent, change_amount, volume, '
                'amount, amplitude, highest, lowest, today_open, yesterday_close) '
                'values(?,?,?,?,?,?,?,?,?,?,?,?,?)',
                (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13))
            self.con.commit()
        except Exception as err:
            print(err)
        return item
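The requirement actually asks for MySQL storage; the same pipeline can target MySQL through pymysql (which was already imported in the spider) with essentially the same SQL, only the connection and the %s placeholders change. A minimal sketch, assuming a local MySQL server with a stocks database and a stock table already created; the connection parameters are placeholders and only a few columns are shown:

import pymysql


class StockMySQLPipeline:
    def open_spider(self, spider):
        # placeholder credentials — adjust to the local MySQL setup
        self.con = pymysql.connect(host="localhost", user="root", password="123456",
                                   database="stocks", charset="utf8mb4")
        self.cur = self.con.cursor()

    def process_item(self, item, spider):
        self.cur.execute(
            "insert into stock(serial_no, code, name, latest_price) values(%s, %s, %s, %s)",
            (item["a1"], item["a2"], item["a3"], item["a4"]))
        self.con.commit()
        return item

    def close_spider(self, spider):
        self.con.close()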
(4) settings:
BOT_NAME = "stock"
SPIDER_MODULES = ["stock.spiders"]
NEWSPIDER_MODULE = "stock.spiders"
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
ITEM_PIPELINES = {
    "stock.pipelines.StockPipeline": 300,
}
Run results:
Viewing stock.db:
Reflections:
When connecting to the database I originally wanted to print the data straight to the console, but I could never get the formatting right, so I downloaded SQLiteStudio and inspected stock.db there instead.
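For anyone who still wants the console output, the rows can be read back from stock.db and printed with fixed-width fields; a quick sketch that only shows a few columns (full-width Chinese names may still throw the alignment off slightly):

import sqlite3

con = sqlite3.connect("stock.db")
print("{:<6}{:<10}{:<12}{:<10}".format("id", "code", "name", "price"))
for no, code, name, price in con.execute(
        "select serial_no, code, name, latest_price from stock"):
    print("{:<6}{:<10}{:<12}{:<10}".format(no, code, name, price))
con.close()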
Assignment ③
Experiment content:
Requirements: Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL database storage to crawl foreign-exchange data from the Bank of China site (https://www.boc.cn/sourcedb/whpj/).
Output: (MySQL database storage and output format)
(screenshot of the required output format)
Gitee folder link: https://gitee.com/tiantianmi/crawl_project/tree/master/作业3/银行/currency
Code implementation:
(1) currency_spider:
import scrapy
from bs4 import BeautifulSoup

from work3.currency.currency.items import CurrentItem


class MySpider(scrapy.Spider):
    name = "currency_spider"
    start_urls = ['https://www.boc.cn/sourcedb/whpj/']

    def start_requests(self):
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

    def parse(self, response):
        try:
            html = BeautifulSoup(response.body, features='lxml')
            # the exchange-rate quotes sit in the second table on the page
            table = html.find_all('table')[1]
            rows = table.find_all('tr')
            rows.pop(0)  # drop the header row
            for row in rows:
                item = CurrentItem()
                column = row.find_all('td')
                item['Currency'] = column[0].text
                item['TBP'] = column[1].text
                item['CBP'] = column[2].text
                item['TSP'] = column[3].text
                item['CSP'] = column[4].text
                item['Time'] = column[6].text
                yield item
        except Exception as err:
            print(err)
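The requirement mentions XPath, and the same rows can be selected directly with response.xpath instead of BeautifulSoup. A sketch of an equivalent parse method; the table index and the header-skip mirror the BeautifulSoup version above and are assumptions about the page layout:

    def parse(self, response):
        # every data row of the second table on the page, skipping the header row
        for row in response.xpath('(//table)[2]//tr[position()>1]'):
            tds = row.xpath('./td')
            item = CurrentItem()
            item['Currency'] = tds[0].xpath('string(.)').get()
            item['TBP'] = tds[1].xpath('string(.)').get()
            item['CBP'] = tds[2].xpath('string(.)').get()
            item['TSP'] = tds[3].xpath('string(.)').get()
            item['CSP'] = tds[4].xpath('string(.)').get()
            item['Time'] = tds[6].xpath('string(.)').get()
            yield item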
(2) items:
import scrapy


class CurrentItem(scrapy.Item):
    # no = scrapy.Field()
    Currency = scrapy.Field()
    TBP = scrapy.Field()  # spot exchange buying price
    CBP = scrapy.Field()  # cash buying price
    TSP = scrapy.Field()  # spot exchange selling price
    CSP = scrapy.Field()  # cash selling price
    Time = scrapy.Field()
(3) pipelines:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import sqlite3


class CurrencyPipeline(object):
    count = 0

    def __init__(self):
        # create (or open) the local SQLite database and the Currency table
        self.con = sqlite3.connect("Currency.db")
        self.cursor = self.con.cursor()
        self.cursor.execute(
            "create table if not exists Currency (Currency varchar, TBP varchar, CBP varchar, "
            "TSP varchar, CSP varchar, Time varchar)")

    def process_item(self, item, spider):
        try:
            Currency = item.get('Currency')
            TBP = item.get('TBP')
            CBP = item.get('CBP')
            TSP = item.get('TSP')
            CSP = item.get('CSP')
            Time = item.get('Time')
            # insert one row per exchange-rate quote
            self.cursor.execute(
                "insert into Currency (Currency, TBP, CBP, TSP, CSP, Time) values(?,?,?,?,?,?)",
                (str(Currency), str(TBP), str(CBP), str(TSP), str(CSP), str(Time)))
            self.con.commit()
        except Exception as err:
            print(err)
        return item
(4) settings:
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
BOT_NAME = "currency"
SPIDER_MODULES = ["currency.spiders"]
# NEWSPIDER_MODULE = "currency.spiders"
ITEM_PIPELINES = {
    "currency.pipelines.CurrencyPipeline": 300,
}
Run results:
Reflections:
This assignment was a little easier than Assignment ②, but it still only went quickly once I actually understood what was going on.