Assignment ①:
Experiment content:
Requirements:
Pick a website and crawl all of the images on it, for example the China Weather site (http://www.weather.com.cn/). Use the Scrapy framework to implement the crawl in both a single-threaded and a multi-threaded way.
– Be sure to limit the crawl, e.g. cap the total number of pages (last 2 digits of the student ID) and the total number of downloaded images (last 3 digits of the student ID).
Output: print the URLs of the downloaded images to the console, store the downloaded images in an images subfolder, and provide screenshots.
Gitee folder link: https://gitee.com/tiantianmi/crawl_project/tree/master/作业3/天气图片
Code implementation:
(1) weather_spider: write the spider and use XPath to iterate over all image elements on the page.
import scrapy
from work3.weather_images.weather_images.items import WeatherItem


class MySpider(scrapy.Spider):
    name = "weather_spider"
    start_urls = ['http://www.weather.com.cn/']
    imagePath = r"D:\DATA\images"

    def start_requests(self):
        start_url = "http://www.weather.com.cn/"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 "
                          "Minefield/3.0.2pre"}
        yield scrapy.Request(url=start_url, callback=self.parse, headers=headers)

    def parse(self, response, **kwargs):
        data = response.body.decode(response.encoding)
        selector = scrapy.Selector(text=data)
        # collect the src attribute of every <img> element on the page
        img_urls = selector.xpath('//img/@src').extract()
        for image in img_urls:
            item = WeatherItem()
            item["img_url"] = image
            yield item
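If some of the src attributes are relative paths, the downloads in the pipeline will fail for them; response.urljoin can turn them into absolute URLs first. A minimal sketch of the adjusted loop (the startswith filter is my own addition, not part of the original code):

        for image in img_urls:
            # turn relative paths into absolute URLs before handing them to the pipeline
            full_url = response.urljoin(image)
            if full_url.startswith("http"):  # skip data: URIs and other non-downloadable sources
                item = WeatherItem()
                item["img_url"] = full_url
                yield item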
(2) items: pass the image URLs on to the pipeline.
import scrapy


class WeatherItem(scrapy.Item):
    # define the fields for your item here like:
    # img = scrapy.Field()
    img_url = scrapy.Field()
(3) pipelines: download the images to local disk.
import urllib.request


class WeatherImagePipeline(object):
    count = 0

    def process_item(self, item, spider):
        WeatherImagePipeline.count += 1
        url = item["img_url"]
        try:
            # save each image under an incrementing file name
            imagepath = r'D:\DATA\images1' + '\\' + str(WeatherImagePipeline.count) + ".jpg"
            urllib.request.urlretrieve(url, filename=imagepath)
            print("Downloaded " + url)
        except Exception as err:
            print(err)
        return item
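As an alternative to downloading by hand with urllib, Scrapy ships an ImagesPipeline (it requires Pillow) that handles the downloading and de-duplication itself. A rough sketch of how it could be wired up; the class name and the IMAGES_STORE path below are assumptions for illustration, not the original code:

# pipelines.py — sketch using Scrapy's built-in images pipeline
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class WeatherImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # ImagesPipeline downloads every request yielded here
        yield scrapy.Request(item["img_url"])

# settings.py
# ITEM_PIPELINES = {"weather_images.pipelines.WeatherImagesPipeline": 1}
# IMAGES_STORE = r"D:\DATA\images"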
(4) Multi-threading: for the multi-threaded run, raise the request concurrency in settings.py:
CONCURRENT_REQUESTS = 32
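For the single-threaded run the same project can simply be executed with CONCURRENT_REQUESTS = 1. The page and image caps required by the assignment can also be enforced from settings.py through Scrapy's CloseSpider extension; a sketch, where the concrete numbers are placeholders standing in for the student-ID digits:

# settings.py — switch between single- and multi-threaded runs, and cap the crawl
CONCURRENT_REQUESTS = 1      # single-threaded; set to 32 for the multi-threaded run
CLOSESPIDER_PAGECOUNT = 23   # stop after this many pages (last 2 digits of the student ID)
CLOSESPIDER_ITEMCOUNT = 123  # stop after this many items (last 3 digits of the student ID)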
Run results:
Code run output:
Downloaded images:
Reflections:
The assignment felt quite hard at first, but after working on it for a while and gradually figuring things out, it was not so difficult after all.
Assignment ②
Experiment content:
Requirements: Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL database storage to crawl stock information.
Candidate site: Eastmoney: https://www.eastmoney.com/
Output: MySQL database storage, with the output format as follows:
Column headers are named in English, e.g. 序号 → id, 股票代码 → bStockNo, ..., with the concrete schema designed by each student.
No.  Stock code  Stock name  Latest price  Change %  Change amount  Volume  Amplitude  High  Low  Open  Prev. close
1 688093 N世华 28.47 10.92 26.13万 7.6亿 22.34 32.0 28.08 30.20 17.55
2 ...
Gitee folder link: https://gitee.com/tiantianmi/crawl_project/tree/master/作业3/财富网
Code implementation:
(1) stock_spider:
import re

import requests
import scrapy

from work3.stock.stock.items import StockItem


class MySpider(scrapy.Spider):
    name = "stock_spider"
    start_urls = ['https://www.eastmoney.com/']

    def start_requests(self):
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

    def parse(self, response):
        # Eastmoney serves the quote list through a JSONP API; the rows sit in the
        # "diff" array inside the callback wrapper.
        url = 'http://84.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124015654504524695545_1697702280661&pn=1&pz' \
              '=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,' \
              'm:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,' \
              'f18&_=1697702280662'
        r = requests.get(url=url)
        # pull the content of the "diff" array out of the JSONP wrapper
        pat = r'"diff":\[(.*?)\]'
        data = re.compile(pat, re.S).findall(r.text)
        data = list(eval(data[0]))  # each element is a dict literal keyed f2, f3, f12, ...
        try:
            for i in range(len(data)):
                item = StockItem()
                item['a1'] = i + 1            # serial number
                item['a2'] = data[i]['f12']   # stock code
                item['a3'] = data[i]['f14']   # stock name
                item['a4'] = data[i]['f2']    # latest price
                item['a5'] = data[i]['f3']    # change percent
                item['a6'] = data[i]['f4']    # change amount
                item['a7'] = data[i]['f5']    # volume
                item['a8'] = data[i]['f6']    # turnover
                item['a9'] = data[i]['f7']    # amplitude
                item['a10'] = data[i]['f15']  # high
                item['a11'] = data[i]['f16']  # low
                item['a12'] = data[i]['f17']  # open
                item['a13'] = data[i]['f18']  # previous close
                yield item
        except Exception as err:
            print(err)
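The regex-plus-eval combination works because every element of the diff array is a dict literal, but the JSONP response can also be handled by stripping the callback wrapper and parsing it with json.loads, which avoids eval altogether. A small sketch; the helper name parse_jsonp is mine:

import json
import re


def parse_jsonp(text):
    # drop the "jQuery...(" prefix and ");" suffix, then parse the remaining JSON payload
    payload = re.search(r'\((.*)\)', text, re.S).group(1)
    return json.loads(payload)

# rows = parse_jsonp(r.text)["data"]["diff"]   # list of dicts keyed f2, f3, f12, ...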
(2) items:
import scrapy


class StockItem(scrapy.Item):
    # a1..a13 in order: serial number, code, name, latest price, change percent,
    # change amount, volume, turnover, amplitude, high, low, open, previous close
    a1 = scrapy.Field()
    a2 = scrapy.Field()
    a3 = scrapy.Field()
    a4 = scrapy.Field()
    a5 = scrapy.Field()
    a6 = scrapy.Field()
    a7 = scrapy.Field()
    a8 = scrapy.Field()
    a9 = scrapy.Field()
    a10 = scrapy.Field()
    a11 = scrapy.Field()
    a12 = scrapy.Field()
    a13 = scrapy.Field()
(3) pipelines:
import sqlite3


class StockPipeline:
    def __init__(self):
        # create (or open) the local SQLite database and the stock table
        self.con = sqlite3.connect('stock.db')
        self.cur = self.con.cursor()
        self.cur.execute(
            'CREATE TABLE IF NOT EXISTS stock(serial_no INTEGER, code TEXT, name TEXT, latest_price REAL, '
            'change_percent REAL, change_amount REAL, volume INTEGER, amount REAL, amplitude REAL, '
            'highest REAL, lowest REAL, today_open REAL, yesterday_close REAL)')

    def process_item(self, item, spider):
        try:
            a1 = item['a1']
            a2 = item.get('a2')
            a3 = item.get('a3')
            a4 = item.get('a4')
            a5 = item.get('a5')
            a6 = item.get('a6')
            a7 = item.get('a7')
            a8 = item.get('a8')
            a9 = item.get('a9')
            a10 = item.get('a10')
            a11 = item.get('a11')
            a12 = item.get('a12')
            a13 = item.get('a13')
            # insert one row per stock item
            self.cur.execute(
                'insert into stock(serial_no, code, name, latest_price, change_percent, change_amount, volume, '
                'amount, amplitude, highest, lowest, today_open, yesterday_close) '
                'values(?,?,?,?,?,?,?,?,?,?,?,?,?)',
                (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13))
            self.con.commit()
        except Exception as err:
            print(err)
        return item
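The requirement actually asks for MySQL storage; the same pipeline can target MySQL through pymysql (which was already imported in the spider) with essentially the same SQL, only the connection and the %s placeholders change. A minimal sketch, assuming a local MySQL server with a stocks database and a stock table already created; the connection parameters are placeholders and only a few columns are shown:

import pymysql


class StockMySQLPipeline:
    def open_spider(self, spider):
        # placeholder credentials — adjust to the local MySQL setup
        self.con = pymysql.connect(host="localhost", user="root", password="123456",
                                   database="stocks", charset="utf8mb4")
        self.cur = self.con.cursor()

    def process_item(self, item, spider):
        self.cur.execute(
            "insert into stock(serial_no, code, name, latest_price) values(%s, %s, %s, %s)",
            (item["a1"], item["a2"], item["a3"], item["a4"]))
        self.con.commit()
        return item

    def close_spider(self, spider):
        self.con.close()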
(4) settings:
BOT_NAME = "stock"
SPIDER_MODULES = ["stock.spiders"]
NEWSPIDER_MODULE = "stock.spiders"
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
ITEM_PIPELINES = {
    "stock.pipelines.StockPipeline": 300,
}
Run results:
Viewing stock.db:
Reflections:
When connecting to the database I originally wanted to print the data straight to the console, but I could never get the formatting right, so I downloaded SQLiteStudio and inspected stock.db there instead.
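For anyone who still wants the console output, the rows can be read back from stock.db and printed with fixed-width fields; a quick sketch that only shows a few columns (full-width Chinese names may still throw the alignment off slightly):

import sqlite3

con = sqlite3.connect("stock.db")
print("{:<6}{:<10}{:<12}{:<10}".format("id", "code", "name", "price"))
for no, code, name, price in con.execute(
        "select serial_no, code, name, latest_price from stock"):
    print("{:<6}{:<10}{:<12}{:<10}".format(no, code, name, price))
con.close()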
Assignment ③
Experiment content:
Requirements: Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL database storage to crawl foreign-exchange data from the Bank of China site (https://www.boc.cn/sourcedb/whpj/).
Output: (MySQL database storage and output format)
(screenshot of the required output format)
Gitee folder link: https://gitee.com/tiantianmi/crawl_project/tree/master/作业3/银行/currency
Code implementation:
(1) currency_spider:
import scrapy
from bs4 import BeautifulSoup

from work3.currency.currency.items import CurrentItem


class MySpider(scrapy.Spider):
    name = "currency_spider"
    start_urls = ['https://www.boc.cn/sourcedb/whpj/']

    def start_requests(self):
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

    def parse(self, response):
        try:
            html = BeautifulSoup(response.body, features='lxml')
            # the exchange-rate quotes sit in the second table on the page
            table = html.find_all('table')[1]
            rows = table.find_all('tr')
            rows.pop(0)  # drop the header row
            for row in rows:
                item = CurrentItem()
                column = row.find_all('td')
                item['Currency'] = column[0].text
                item['TBP'] = column[1].text
                item['CBP'] = column[2].text
                item['TSP'] = column[3].text
                item['CSP'] = column[4].text
                item['Time'] = column[6].text
                yield item
        except Exception as err:
            print(err)
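The requirement mentions XPath, and the same rows can be selected directly with response.xpath instead of BeautifulSoup. A sketch of an equivalent parse method; the table index and the header-skip mirror the BeautifulSoup version above and are assumptions about the page layout:

    def parse(self, response):
        # every data row of the second table on the page, skipping the header row
        for row in response.xpath('(//table)[2]//tr[position()>1]'):
            tds = row.xpath('./td')
            item = CurrentItem()
            item['Currency'] = tds[0].xpath('string(.)').get()
            item['TBP'] = tds[1].xpath('string(.)').get()
            item['CBP'] = tds[2].xpath('string(.)').get()
            item['TSP'] = tds[3].xpath('string(.)').get()
            item['CSP'] = tds[4].xpath('string(.)').get()
            item['Time'] = tds[6].xpath('string(.)').get()
            yield item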
(2) items:
import scrapy


class CurrentItem(scrapy.Item):
    # no = scrapy.Field()
    Currency = scrapy.Field()
    TBP = scrapy.Field()  # spot exchange buying price
    CBP = scrapy.Field()  # cash buying price
    TSP = scrapy.Field()  # spot exchange selling price
    CSP = scrapy.Field()  # cash selling price
    Time = scrapy.Field()
(3) pipelines:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import sqlite3


class CurrencyPipeline(object):
    count = 0

    def __init__(self):
        # create (or open) the local SQLite database and the Currency table
        self.con = sqlite3.connect("Currency.db")
        self.cursor = self.con.cursor()
        self.cursor.execute(
            "create table if not exists Currency (Currency varchar, TBP varchar, CBP varchar, "
            "TSP varchar, CSP varchar, Time varchar)")

    def process_item(self, item, spider):
        try:
            Currency = item.get('Currency')
            TBP = item.get('TBP')
            CBP = item.get('CBP')
            TSP = item.get('TSP')
            CSP = item.get('CSP')
            Time = item.get('Time')
            # insert one row per exchange-rate quote
            self.cursor.execute(
                "insert into Currency (Currency, TBP, CBP, TSP, CSP, Time) values(?,?,?,?,?,?)",
                (str(Currency), str(TBP), str(CBP), str(TSP), str(CSP), str(Time)))
            self.con.commit()
        except Exception as err:
            print(err)
        return item
(4) settings:
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
BOT_NAME = "currency"
SPIDER_MODULES = ["currency.spiders"]
# NEWSPIDER_MODULE = "currency.spiders"
ITEM_PIPELINES = {
    "currency.pipelines.CurrencyPipeline": 300,
}
Run results:
Reflections:
This assignment was a little easier than Assignment ②, but it still only went quickly once I actually understood what was going on.