Data Collection and Fusion Technology: Practical Assignment 3
Gitee repository: https://gitee.com/wang-zi-lian20031002/crawl_project
I. Pick a website and crawl all of its images, e.g. the China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement both a single-threaded and a multi-threaded crawl. Be sure to apply crawl-limiting measures, such as capping the total number of pages (last two digits of the student ID) and the total number of downloaded images (last three digits of the student ID).
1. Core code and screenshots
- Core code
Wzl1Spider:
from bs4 import UnicodeDammit
import scrapy
from firstBlood.items import ImgItem

class Wzl1Spider(scrapy.Spider):
    name = "wzl1"
    # allowed_domains = ["weather.com.cn"]
    start_urls = ["http://www.weather.com.cn/"]

    def start_requests(self):
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

    def parse(self, response, **kwargs):
        try:
            # Detect the page encoding (utf-8 or gbk) before parsing
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            qixiangs = selector.xpath("//img")
            for qixiang in qixiangs:
                src = qixiang.xpath("./@src").extract_first()
                if not src:
                    continue
                item = ImgItem()
                # Resolve relative src paths against the page URL
                item["url"] = response.urljoin(src)
                print(item["url"])
                yield item
        except Exception as err:
            print(err)
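The ImgItem class imported above isn't shown in the post; a minimal sketch of a definition consistent with how the spider uses it (assuming the item only carries the image URL) would be:

import scrapy

class ImgItem(scrapy.Item):
    url = scrapy.Field()  # absolute URL of the image to download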
Pipelines:
import os
import urllib.request

class FirstbloodPipeline:
    count = 0

    def process_item(self, item, spider):
        base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        image_path = os.path.join(base_path, "images")
        # Create the images directory if it does not exist yet
        if not os.path.exists(image_path):
            os.makedirs(image_path)
        img_url = item['url']
        if img_url:
            FirstbloodPipeline.count += 1
            extension = img_url.split('.')[-1]
            filename = os.path.join(image_path, f"{FirstbloodPipeline.count}.{extension}")
            try:
                urllib.request.urlretrieve(img_url, filename=filename)
                print(f"Successfully downloaded: {filename}")
            except Exception as e:
                print(f"Download failed: {e}")
        return item
Settings:
BOT_NAME = "firstBlood"
SPIDER_MODULES = ["firstBlood.spiders"]
NEWSPIDER_MODULE = "firstBlood.spiders"
ROBOTSTXT_OBEY = False
LOG_LEVEL = "ERROR"
# Single-threaded run: one request at a time per domain/IP
# CONCURRENT_REQUESTS_PER_DOMAIN = 1
# CONCURRENT_REQUESTS_PER_IP = 1
# Multi-threaded run: raise the concurrency limits
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 100
CONCURRENT_REQUESTS_PER_IP = 100
ITEM_PIPELINES = {
    'firstBlood.pipelines.FirstbloodPipeline': 300,  # register the pipeline
}
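The assignment's caps on total pages and downloaded images aren't enforced in the code above; a minimal sketch of how they could be enforced with Scrapy's built-in CloseSpider extension settings (the numbers below are placeholders for the actual student-ID digits):

# Stop the crawl automatically once the limits are hit
CLOSESPIDER_PAGECOUNT = 23    # hypothetical: last two digits of the student ID
CLOSESPIDER_ITEMCOUNT = 123   # hypothetical: last three digits of the student ID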
- Screenshots
2. Reflections
This exercise made me much more familiar with the Scrapy framework. The single-threaded and multi-threaded versions share essentially the same code; switching between them is just a matter of changing the concurrency parameters in settings.py.
II. Master the methods for serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage to crawl stock information.
1. Core code and screenshots
- Core code
Wzl3Spider:
import scrapy
import re
import json
import math
from secondBlood.items import SecondbloodItem

class Wzl3Spider(scrapy.Spider):
    name = "wzl3"
    start_urls = [
        'http://31.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112409705185363781139_1602849464971&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1602849464977']

    def parse(self, response):
        try:
            data = response.body.decode()
            # Strip the JSONP wrapper: every {...} after the first '[' is one stock record
            datas = re.findall(r"{.*?}", data[re.search(r"\[", data).start():])
            for n in range(len(datas)):
                stock = json.loads(datas[n])  # parse one record as JSON
                item = SecondbloodItem()
                item['code'] = stock['f12']
                item['name'] = stock['f14']
                item['latest_price'] = str(stock['f2'])
                item['range'] = str(stock['f3'])
                item['amount'] = str(stock['f4'])
                item['trading'] = str(stock['f5'])
                yield item
            # Total pages = total record count / page size (pz=20)
            all_page = math.ceil(int(re.findall(r'"total":(\d+)', data)[0]) / 20)
            page = re.findall(r"pn=(\d+)", response.url)[0]  # current page number
            if int(page) < all_page:
                # Request the next page and call back into parse
                url = response.url.replace("pn=" + page, "pn=" + str(int(page) + 1))
                yield scrapy.Request(url=url, callback=self.parse)
        except Exception as err:
            print(err)
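The SecondbloodItem the spider imports isn't shown in the post; a minimal sketch consistent with the fields assigned above would be:

import scrapy

class SecondbloodItem(scrapy.Item):
    code = scrapy.Field()          # stock code (f12)
    name = scrapy.Field()          # stock name (f14)
    latest_price = scrapy.Field()  # latest price (f2)
    range = scrapy.Field()         # change percentage (f3)
    amount = scrapy.Field()        # change amount (f4)
    trading = scrapy.Field()       # trading volume (f5)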
Pipelines:
import pymysql

class SecondbloodPipeline:
    conn = None
    cursor = None

    def open_spider(self, spider):
        print("Opening database connection")
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='123456',
            db='Data_acquisition'
        )
        self.cursor = self.conn.cursor()
        # Create the table on startup if it does not exist yet
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS Stock (
            序号 INT AUTO_INCREMENT,
            代码 VARCHAR(10),
            名称 VARCHAR(50),
            最新价 FLOAT,
            涨跌幅 FLOAT,
            涨跌额 FLOAT,
            成交量 FLOAT,
            PRIMARY KEY (序号)
        )
        """
        self.cursor.execute(create_table_sql)
        self.conn.commit()

    def process_item(self, item, spider):
        try:
            sql = "INSERT INTO Stock (代码, 名称, 最新价, 涨跌幅, 涨跌额, 成交量) VALUES (%s, %s, %s, %s, %s, %s)"
            values = (
                item["code"],
                item["name"],
                item['latest_price'],
                item['range'],
                item['amount'],
                item['trading']
            )
            self.cursor.execute(sql, values)
            self.conn.commit()
        except Exception as err:
            print(err)
        return item

    def close_spider(self, spider):
        print("Closing database connection")
        self.cursor.close()
        self.conn.close()
Settings:
BOT_NAME = "secondBlood"
SPIDER_MODULES = ["secondBlood.spiders"]
NEWSPIDER_MODULE = "secondBlood.spiders"
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'secondBlood.pipelines.SecondbloodPipeline': 300,
}
# LOG_LEVEL = "ERROR"
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
MYSQL_DB = 'Data_acquisition'
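The MYSQL_* settings above are defined but never read: the pipeline hardcodes the same credentials. A minimal sketch of how the pipeline could pull them from settings instead (an assumption about intent, not code from the repository):

import pymysql

class SecondbloodPipeline:
    def __init__(self, host, port, user, password, db):
        self.host, self.port = host, port
        self.user, self.password, self.db = user, password, db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection parameters from settings.py
        s = crawler.settings
        return cls(s.get('MYSQL_HOST'), s.getint('MYSQL_PORT'),
                   s.get('MYSQL_USER'), s.get('MYSQL_PASSWORD'), s.get('MYSQL_DB'))

    def open_spider(self, spider):
        self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
                                    password=self.password, db=self.db)
        self.cursor = self.conn.cursor()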
- Screenshots
2. Reflections
This part consolidated my use of the Scrapy framework and the MySQL database, and made me more familiar with the serialized output of Item and Pipeline data in Scrapy.
III. Master the methods for serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage to crawl data from a foreign-exchange website.
1. Core code and screenshots
- Core code
Wzl4Spider:
import scrapy
import time
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from fourBlood.items import FourbloodItem

class Wzl4Spider(scrapy.Spider):
    name = "wzl4"

    def __init__(self, *args, **kwargs):
        super(Wzl4Spider, self).__init__(*args, **kwargs)
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # optional: run Chrome without a window
        self.driver = webdriver.Chrome(
            service=Service(r'D:\chrome\chromedriver-win64\chromedriver.exe'),
            options=chrome_options)

    def start_requests(self):
        url = 'https://www.boc.cn/sourcedb/whpj/'  # Bank of China exchange-rate page
        self.driver.get(url)
        time.sleep(1)  # wait for the page to finish rendering
        html = etree.HTML(self.driver.page_source)  # parse the rendered HTML
        yield scrapy.Request(url, self.parse, meta={'html': html})

    def parse(self, response):
        html = response.meta['html']
        # Select every table row; the first row is the header
        lis = html.xpath('/html/body/div/div[5]/div[1]/div[2]/table/tbody/tr')
        for link in lis[1:]:  # skip the header row
            texts = link.xpath('./td[1]/text()')
            name = texts[0] if texts else ''
            texts = link.xpath('./td[2]/text()')
            TBP = texts[0] if texts else ''
            texts = link.xpath('./td[3]/text()')
            CBP = texts[0] if texts else ''
            texts = link.xpath('./td[4]/text()')
            TSP = texts[0] if texts else ''
            texts = link.xpath('./td[5]/text()')
            CSP = texts[0] if texts else ''
            texts = link.xpath('./td[8]/text()')
            TIME = texts[0] if texts else ''
            item = FourbloodItem()
            item["currency"] = name
            item["TBP"] = TBP
            item["CBP"] = CBP
            item["TSP"] = TSP
            item["CSP"] = CSP
            item["time"] = TIME
            yield item

    def closed(self, reason):
        self.driver.quit()  # shut down the WebDriver when the spider closes
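As with the other parts, the FourbloodItem class isn't shown; a minimal sketch consistent with the fields used above would be (the rate-name comments are my reading of the table columns, not from the original post):

import scrapy

class FourbloodItem(scrapy.Item):
    currency = scrapy.Field()  # currency name (td[1])
    TBP = scrapy.Field()       # spot buying rate (td[2])
    CBP = scrapy.Field()       # cash buying rate (td[3])
    TSP = scrapy.Field()       # spot selling rate (td[4])
    CSP = scrapy.Field()       # cash selling rate (td[5])
    time = scrapy.Field()      # publication time (td[8])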
Pipelines:
import pymysql

class FourbloodPipeline:
    def process_item(self, item, spider):
        mydb = None
        mycursor = None
        try:
            print(item["currency"])
            print(item["TSP"])
            print(item["CSP"])
            print(item["TBP"])
            print(item["CBP"])
            print(item["time"])
            print()
            # Insert the record into the database table
            mydb = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                   passwd="123456", db="china_bank", charset="utf8")
            mycursor = mydb.cursor()
            sql = "INSERT INTO currency (currency, TSP, CSP, TBP, CBP, time) VALUES (%s, %s, %s, %s, %s, %s)"
            val = (item["currency"], item["TSP"], item["CSP"], item["TBP"], item["CBP"], item["time"])
            print(f"Inserting into currency: {val}")
            mycursor.execute(sql, val)
            mydb.commit()
        except Exception as err:
            print(f"Error: {err}")
        finally:
            # Guard against the connection never having been opened
            if mycursor:
                mycursor.close()
            if mydb:
                mydb.close()
        return item
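Unlike part two, this pipeline assumes the currency table already exists in the china_bank database. A schema consistent with the INSERT statement above (an assumption; the actual DDL and column types are not in the post) might be:

# Hypothetical DDL, mirroring part two's create-table pattern
create_table_sql = """
CREATE TABLE IF NOT EXISTS currency (
    id INT AUTO_INCREMENT PRIMARY KEY,
    currency VARCHAR(50),
    TSP VARCHAR(20),
    CSP VARCHAR(20),
    TBP VARCHAR(20),
    CBP VARCHAR(20),
    time VARCHAR(20)
)
"""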
Settings:
BOT_NAME = "fourBlood"
SPIDER_MODULES = ["fourBlood.spiders"]
NEWSPIDER_MODULE = "fourBlood.spiders"
ROBOTSTXT_OBEY = False
LOG_LEVEL = "ERROR"
ITEM_PIPELINES = {
    'fourBlood.pipelines.FourbloodPipeline': 1,
}
- Screenshots
2. Reflections
At first this part didn't work: the crawl produced no response at all. After consulting materials online and studying similar crawler programs, I finally got the data written to the database and printed correctly.