2023 Data Collection and Fusion Technology Practice: Assignment 3
Assignment ①:
Requirements:
Pick a website and crawl all of its images, for example the China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement both a single-threaded and a multi-threaded crawl. Be sure to limit the crawl, e.g. cap the total number of pages (last two digits of your student ID) and the total number of downloaded images (last three digits).
Output:
Print the URLs of the downloaded images to the console, save the images in an images subfolder, and provide screenshots.
Gitee link:
https://gitee.com/hong-songyu/crawl_project/tree/master/%E4%BD%9C%E4%B8%9A3/3.1
Single-threaded code:
myspider:
from bs4 import BeautifulSoup
import urllib.parse
import urllib.request

def imageSpider(start_url):
    # Fetch the page, collect every <img> src and download each image once
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        soup = BeautifulSoup(data, "lxml")
        images = soup.select("img")
        for image in images:
            try:
                src = image["src"]
                # urljoin lives in urllib.parse, not urllib.request
                url = urllib.parse.urljoin(start_url, src)
                if url not in urls:
                    print(url)
                    download(url)
                    urls.append(url)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)

def download(url):
    # Download one image and save it under its original file name
    try:
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        file_name = url.split("/")[-1]
        with open(file_name, "wb") as fobj:
            fobj.write(data)
        print("Downloaded:", file_name)
    except Exception as err:
        print(err)

start_url = "http://www.weather.com.cn/weather/101280601.shtml"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
}
imageSpider(start_url)
print("The End")
pipelines:
import os
import urllib.request

class Pipeline:
    count = 1
    urllist = []

    def process_item(self, item, spider):
        Pipeline.count += 1
        try:
            if not os.path.exists('images'):
                os.makedirs('images')
            # Only download URLs that have not been seen before
            if item['url'] not in Pipeline.urllist:
                Pipeline.urllist.append(item['url'])
                data = urllib.request.urlopen(item['url']).read()
                with open('images/' + str(Pipeline.count) + '.jpg', "wb") as f:
                    f.write(data)
        except Exception as err:
            print(err)
        return item
items:
import scrapy

class PictureItem(scrapy.Item):
    # define the fields for your item here like:
    url = scrapy.Field()
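The myspider shown above is still a plain urllib script, while the Pipeline and PictureItem belong to a Scrapy project. A minimal sketch of the Scrapy spider that would feed them is shown below, assuming it lives in the same project as the items.py above; the spider name weatherImages is a placeholder, not the exact code in the repository.

import scrapy
from ..items import PictureItem

class WeatherImageSpider(scrapy.Spider):
    # Placeholder spider name; the start URL matches the script above
    name = "weatherImages"
    start_urls = ["http://www.weather.com.cn/weather/101280601.shtml"]

    def parse(self, response):
        # Resolve every <img src=...> against the page URL and hand it
        # to the image pipeline as a PictureItem
        for src in response.xpath("//img/@src").extract():
            item = PictureItem()
            item["url"] = response.urljoin(src)
            yield item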
Run result (single-threaded):
Multi-threaded code:
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import os
import threading
import urllib.parse
import urllib.request

def imageSpider(start_url):
    global threads
    global count
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        # Let UnicodeDammit guess between utf-8 and gbk before parsing
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, "lxml")
        images = soup.select("img")
        for image in images:
            try:
                src = image["src"]
                # urljoin lives in urllib.parse, not urllib.request
                url = urllib.parse.urljoin(start_url, src)
                if url not in urls:
                    urls.append(url)
                    print(url)
                    count = count + 1
                    # One download thread per image
                    T = threading.Thread(target=download, args=(url, count))
                    T.setDaemon(False)
                    T.start()
                    threads.append(T)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)

def download(url, count):
    try:
        # Keep the original extension if the URL ends with one such as .jpg/.png
        if url[len(url) - 4] == ".":
            ext = url[len(url) - 4:]
        else:
            ext = ""
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        if count == 39:
            # Special case kept from the original run: the 39th image is saved as .png
            fobj = open("images\\" + str(count) + ".png", "wb")
        else:
            fobj = open("images\\" + str(count) + ext, "wb")
        fobj.write(data)
        fobj.close()
        print("downloaded " + str(count) + ext)
    except Exception as err:
        print(err)

start_url = "http://www.weather.com.cn/weather/101280601.shtml"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
}
count = 0
threads = []
if not os.path.exists("images"):
    os.makedirs("images")
imageSpider(start_url)
for t in threads:
    t.join()
print("The End")
Reflections:
Although the target is still the weather site, this was my first time using the Scrapy framework, and I could feel that Scrapy is more convenient and offers good extensibility and flexibility.
I also learned that a multi-threaded crawl can be obtained by using the threading library inside myspider.
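It is also worth noting that Scrapy itself schedules requests concurrently through its Twisted event loop, so the single-threaded versus multi-threaded behaviour asked for here can be controlled purely from settings.py with built-in settings. A sketch (the numeric values are placeholders):

# settings.py
CONCURRENT_REQUESTS = 1        # effectively single-threaded crawling
# CONCURRENT_REQUESTS = 16     # default value, concurrent crawling
CLOSESPIDER_PAGECOUNT = 2      # stop after N pages (last two digits of the student ID)
CLOSESPIDER_ITEMCOUNT = 102    # stop after N items (last three digits)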
Assignment ②
Requirements:
Master the serialized output of data through Scrapy's Item and Pipeline classes; crawl stock information using the Scrapy framework + XPath + MySQL database storage stack.
Candidate site: Eastmoney: https://www.eastmoney.com/
The English column names, e.g. id for the serial number, bStockNo for the stock code, etc., are to be designed by each student.
Gitee link:
https://gitee.com/hong-songyu/crawl_project/tree/master/%E4%BD%9C%E4%B8%9A3/3.2
Code:
myspider:
import scrapy
from ..items import StockItem
import json

class GetStockSpider(scrapy.Spider):
    name = "mySpider"
    start_urls = ["http://19.push2.eastmoney.com/api/qt/clist/get?cb=jQuery11240009917002240502182_1634088844934&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1634088845178"]
    page = 0

    def parse(self, response):
        print(response.url)
        # The API returns JSONP: strip the jQuery...(...) callback wrapper by
        # locating the parentheses instead of hard-coding its length
        text = response.text
        json_str = text[text.find('(') + 1: text.rfind(')')]
        json_data = json.loads(json_str)
        items = json_data['data']['diff']
        for i in items:
            try:
                item = StockItem()
                item["stockcode"] = i["f12"]   # stock code
                item["stockname"] = i["f14"]   # stock name
                item["newprice"] = i["f2"]     # latest price
                item["diefu"] = i["f3"]        # change percent
                item["diee"] = i["f4"]         # change amount
                item["dealnum"] = i["f5"]      # volume
                item["deale"] = i["f6"]        # turnover
                item["zhenfu"] = i["f7"]       # amplitude
                item["most"] = i["f15"]        # day high
                item["least"] = i["f16"]       # day low
                item["today"] = i["f17"]       # today's open
                item["yesterday"] = i["f18"]   # previous close
                yield item
            except Exception as err:
                print(err)
        # Crawl the first 8 pages by rewriting the pn= parameter in the URL
        self.page += 1
        if self.page < 8:
            url = response.url
            l = url.find("pn=")
            r = url.find("&pz=")
            pn = int(url[l + 3:r]) + 1
            next_url = url[:l + 3] + str(pn) + url[r:]
            yield scrapy.Request(url=next_url, callback=self.parse)
items:
import scrapy

class StockItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    stockcode = scrapy.Field()
    stockname = scrapy.Field()
    newprice = scrapy.Field()
    diefu = scrapy.Field()
    diee = scrapy.Field()
    dealnum = scrapy.Field()
    deale = scrapy.Field()
    zhenfu = scrapy.Field()
    most = scrapy.Field()
    least = scrapy.Field()
    today = scrapy.Field()
    yesterday = scrapy.Field()
pipelines:
import sqlite3

class stockPipeline:
    def __init__(self):
        self.connection = None
        self.cursor = None

    def open_spider(self, spider):
        self.connection = sqlite3.connect('stocks.db')
        self.cursor = self.connection.cursor()
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS stocks (
                stockcode TEXT,
                stockname TEXT,
                newprice REAL,
                diefu REAL,
                diee REAL,
                dealnum INTEGER,
                deale REAL,
                zhenfu REAL,
                most REAL,
                least REAL,
                today REAL,
                yesterday REAL
            )
        ''')

    def close_spider(self, spider):
        self.connection.commit()
        self.connection.close()

    def process_item(self, item, spider):
        self.cursor.execute('''
            INSERT INTO stocks (
                stockcode, stockname, newprice, diefu, diee, dealnum,
                deale, zhenfu, most, least, today, yesterday
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (
            item['stockcode'], item['stockname'], item['newprice'],
            item['diefu'], item['diee'], item['dealnum'],
            item['deale'], item['zhenfu'], item['most'],
            item['least'], item['today'], item['yesterday']
        ))
        return item
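The requirement mentions MySQL storage, while the pipeline above writes to SQLite. A sketch of an equivalent pipeline based on pymysql is given below; the connection parameters (host, user, password, database name) are placeholders that would have to match one's own MySQL installation.

import pymysql

class MySQLStockPipeline:
    def open_spider(self, spider):
        # Placeholder credentials and database name
        self.connection = pymysql.connect(
            host="localhost", user="root", password="123456",
            database="stockdb", charset="utf8mb4")
        self.cursor = self.connection.cursor()
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS stocks (
                id INT AUTO_INCREMENT PRIMARY KEY,
                stockcode VARCHAR(16), stockname VARCHAR(32),
                newprice DOUBLE, diefu DOUBLE, diee DOUBLE, dealnum BIGINT,
                deale DOUBLE, zhenfu DOUBLE, most DOUBLE, least DOUBLE,
                today DOUBLE, yesterday DOUBLE)""")

    def close_spider(self, spider):
        self.connection.commit()
        self.connection.close()

    def process_item(self, item, spider):
        # MySQL uses %s placeholders instead of SQLite's ?
        self.cursor.execute(
            """INSERT INTO stocks (stockcode, stockname, newprice, diefu, diee,
               dealnum, deale, zhenfu, most, least, today, yesterday)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
            (item['stockcode'], item['stockname'], item['newprice'],
             item['diefu'], item['diee'], item['dealnum'], item['deale'],
             item['zhenfu'], item['most'], item['least'], item['today'],
             item['yesterday']))
        return item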
Result:
Reflections:
This assignment deepened my experience with crawling in Scrapy, gave me an initial understanding of how Scrapy interacts with SQL, and showed how to save the crawled data in a database and display it.
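To check what was stored, the stocks.db file written by the pipeline can be queried directly; a minimal sketch using the table and column names from above:

import sqlite3

# Open the database created by stockPipeline and print a few rows
connection = sqlite3.connect('stocks.db')
cursor = connection.cursor()
for row in cursor.execute(
        "SELECT stockcode, stockname, newprice, diefu FROM stocks LIMIT 10"):
    print(row)
connection.close()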
Assignment ③:
Requirements:
Master the serialized output of data through Scrapy's Item and Pipeline classes; crawl foreign-exchange data using the Scrapy framework + XPath + MySQL database storage stack.
Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/
Gitee link:
https://gitee.com/hong-songyu/crawl_project/tree/master/%E4%BD%9C%E4%B8%9A3/3.3
Code:
myspider:
import scrapy
from ..items import bStockItem

class MySpider(scrapy.Spider):
    name = 'mySpider'
    start_urls = ['https://www.boc.cn/sourcedb/whpj/']

    def parse(self, response):
        # Each exchange-rate record is one <tr> in the left-aligned table
        data_lists = response.xpath('//table[@align="left"]/tr')
        for data_list in data_lists:
            datas = data_list.xpath('.//td/text()').extract()
            if datas and len(datas) == 7:
                item = bStockItem()
                item['name'] = datas[0]      # 货币名称 (currency name)
                item['TBP'] = datas[1]       # 现汇买入价 (telegraphic buying price)
                item['CBP'] = datas[2]       # 现钞买入价 (cash buying price)
                item['TSP'] = datas[3]       # 现汇卖出价 (telegraphic selling price)
                item['CSP'] = datas[4]       # 现钞卖出价 (cash selling price)
                item['Currency'] = datas[5]  # 中行折算价 (BOC conversion price)
                item['Time'] = datas[6]      # 发布日期 (release date)
                yield item
items:
import scrapy

class bStockItem(scrapy.Item):
    name = scrapy.Field()
    Currency = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    Time = scrapy.Field()
pipelines:
class bStockPipeline:
    count = 0

    def open_spider(self, spider):
        print('%-10s%-10s%-10s%-10s%-10s%-10s%-10s' % (
            '货币名称', '现汇买入价', '现钞买入价', '现汇卖出价', '现钞卖出价', '中行折算价', '发布日期'))

    def process_item(self, item, spider):
        # Print the fields in the same order as the header row above
        print('%-10s%-10s%-10s%-10s%-10s%-10s%-10s' % (
            item['name'], item['TBP'], item['CBP'], item['TSP'], item['CSP'], item['Currency'], item['Time']))
        return item
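For this pipeline to receive items it has to be enabled in settings.py; a sketch is shown below, where the package name boc_exchange is a placeholder for the actual project package. The spider is then started with scrapy crawl mySpider.

# settings.py (sketch; replace boc_exchange with the real project package name)
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    "boc_exchange.pipelines.bStockPipeline": 300,
}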
Result:
Reflections:
I have become more proficient in using Scrapy.