Data Collection and Fusion Technology: Assignment 3
Repository: https://gitee.com/jyppx000/crawl_project
作业①
Requirement: pick a website and crawl all of the images on it, for example the China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement the crawl in both a single-threaded and a multi-threaded configuration.
1.1 Code and Screenshots
Only the core code is listed here.
first.py
```python
import scrapy
from bs4 import UnicodeDammit
from firstBlood.items import FirstbloodItem


class FirstSpider(scrapy.Spider):
    name = "first"
    # Control the range of pages to crawl
    start_urls = [f"http://www.weather.com.cn/page_{i}" for i in range(1, 57)]

    def parse(self, response, **kwargs):
        try:
            # Let UnicodeDammit guess the encoding before building the selector
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            qixiangs = selector.xpath("//img")
            for qixiang in qixiangs:
                item = FirstbloodItem()
                item["url"] = qixiang.xpath("./@src").extract_first()
                if item["url"]:
                    print(item["url"])
                    yield item
        except Exception as err:
            print(err)
```
pipelines.py
```python
import os
import urllib.request


class FirstbloodPipeline:
    count = 0
    # Maximum number of images to download
    max_images = 156

    def process_item(self, item, spider):
        if FirstbloodPipeline.count >= FirstbloodPipeline.max_images:
            spider.crawler.engine.close_spider(spider, "Reached the image limit, stopping the crawl")
            return item
        base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        image_path = os.path.join(base_path, "images")
        if not os.path.exists(image_path):  # make sure the output directory exists
            os.makedirs(image_path)
        img_url = item['url']
        if img_url:
            FirstbloodPipeline.count += 1
            extension = img_url.split('.')[-1]
            filename = os.path.join(image_path, f"{FirstbloodPipeline.count}.{extension}")
            try:
                urllib.request.urlretrieve(img_url, filename=filename)
                print(f"Downloaded: {filename}")
            except Exception as e:
                print(f"Download failed: {e}")
        return item
```
items.py
```python
import scrapy


class FirstbloodItem(scrapy.Item):
    url = scrapy.Field()
```
settings.py
```python
ROBOTSTXT_OBEY = False
LOG_LEVEL = "ERROR"
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 100
CONCURRENT_REQUESTS_PER_IP = 100
ITEM_PIPELINES = {
    'firstBlood.pipelines.FirstbloodPipeline': 300,
}
```
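These settings correspond to the concurrent ("multi-threaded") run of the assignment. Scrapy is driven by an asynchronous event loop rather than OS threads, so the single-threaded run is approximated by allowing only one request in flight at a time. A minimal sketch of that variant, assuming the rest of the project is unchanged:

```python
# settings.py -- single-threaded variant (sketch): only one request in flight at a time
ROBOTSTXT_OBEY = False
LOG_LEVEL = "ERROR"
CONCURRENT_REQUESTS = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 1
ITEM_PIPELINES = {
    'firstBlood.pipelines.FirstbloodPipeline': 300,
}
```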
main.py
```python
import os
import sys
from scrapy import cmdline

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
cmdline.execute(['scrapy', 'crawl', 'first'])
```
1.2 Reflections
- While implementing the required functionality, I noticed how much code structure matters. Separating the spider logic from the data-processing logic kept the code clear and maintainable.
- When handling a large amount of data, organizing and storing it sensibly becomes important. In practice I learned how to process and store images through a Pipeline and how to avoid downloading duplicates along the way (see the sketch after this list), which gave me a deeper understanding of data storage and management.
- Handling the exceptions that can occur during downloads improved my error-handling skills and made me more comfortable dealing with problems in practice.
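For the de-duplication point above, here is a minimal sketch rather than the submitted pipeline: it keeps a `seen_urls` set (a name introduced here purely for illustration) and skips any image URL that has already been downloaded.

```python
import os
import urllib.request


class DedupImagePipeline:
    """Illustrative pipeline: download each image URL at most once."""

    def open_spider(self, spider):
        self.seen_urls = set()  # URLs already downloaded (illustrative state)
        self.count = 0

    def process_item(self, item, spider):
        img_url = item.get("url")
        if not img_url or img_url in self.seen_urls:
            return item  # skip empty or duplicate URLs
        self.seen_urls.add(img_url)
        self.count += 1
        os.makedirs("images", exist_ok=True)
        filename = os.path.join("images", f"{self.count}.{img_url.split('.')[-1]}")
        try:
            urllib.request.urlretrieve(img_url, filename=filename)
        except Exception as err:
            print(f"Download failed: {err}")
        return item
```

To use it, the class would be registered in `ITEM_PIPELINES` just like `FirstbloodPipeline`.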
作业②
Requirement: become proficient with the methods for serialized output of Item and Pipeline data in Scrapy; crawl stock information using the Scrapy framework + XPath + MySQL database storage route.
2.1 Code and Screenshots
second.py
```python
import re

import requests
import scrapy
from secondBlood.items import SecondbloodItem


class SecondSpider(scrapy.Spider):
    name = "second"
    start_url = "http://quote.eastmoney.com/center/gridlist.html#hs_a_board"

    def start_requests(self):
        url = SecondSpider.start_url
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response, **kwargs):
        try:
            stocks = []
            # The stock list is rendered dynamically, so fetch it from Eastmoney's list API instead of the page itself
            url = "https://68.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124009413428787683675_1696660278138&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f12,f14,f2,f3,f4,f5,f6,f7,f15,f16,f17,f18&_=1696660278155"
            r = requests.get(url=url)
            # Cut the JSON payload out of the JSONP response and split it into one record per stock
            data = re.compile(r'"f2":.*').findall(r.text)
            data1 = data[0].split("},{")
            data1[-1] = data1[-1].split("}")[0]
            for record in data1:
                stock0 = record.replace('"', "").split(",")
                # Reorder the raw fields so they line up with the item fields below
                order = [6, 7, 0, 1, 2, 3, 4, 5, 8, 9, 10, 11]
                stocks.append([stock0[j].split(":")[1] for j in order])
            for row in stocks:
                item = SecondbloodItem()
                item["stockname"] = row[0]
                item["name"] = row[1]
                item["newprice"] = row[2]
                item["zhangdiefu"] = row[3]
                item["zhangdieer"] = row[4]
                item["chengjiaoliang"] = row[5]
                item["chengjiaoer"] = row[6]
                item["zhenfu"] = row[7]
                item["zuigao"] = row[8]
                item["zuidi"] = row[9]
                item["jinkai"] = row[10]
                item["zuoshou"] = row[11]
                yield item
        except Exception as err:
            print(err)
```
pipelines.py
```python
import pymysql


class SecondbloodPipeline:
    def __init__(self):
        self.mydb = pymysql.connect(
            host="127.0.0.1",
            port=3306,
            user='root',
            password='123456',
            database="stock",
            charset='utf8'
        )
        self.cursor = self.mydb.cursor()
        self.cursor.execute('''CREATE TABLE IF NOT EXISTS stocks(
            stockname VARCHAR(256),
            name VARCHAR(256),
            newprice VARCHAR(256),
            zhangdiefu VARCHAR(256),
            zhangdieer VARCHAR(256),
            chengjiaoliang VARCHAR(256),
            chengjiaoer VARCHAR(256),
            zhenfu VARCHAR(256),
            zuigao VARCHAR(256),
            zuidi VARCHAR(256),
            jinkai VARCHAR(256),
            zuoshou VARCHAR(256)
        )''')
        self.mydb.commit()

    def process_item(self, item, spider):
        try:
            sql = "INSERT INTO stocks VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            self.cursor.execute(sql, (
                item.get("stockname"), item.get("name"), item.get("newprice"),
                item.get("zhangdiefu"), item.get("zhangdieer"),
                item.get("chengjiaoliang"), item.get("chengjiaoer"),
                item.get("zhenfu"), item.get("zuigao"),
                item.get("zuidi"), item.get("jinkai"),
                item.get("zuoshou")))
            self.mydb.commit()
            print("Successfully inserted:", item)
        except Exception as e:
            print("Error inserting item:", e)
            self.mydb.rollback()  # roll back in case of error
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.mydb.close()
```
items.py
```python
import scrapy


class SecondbloodItem(scrapy.Item):
    stockname = scrapy.Field()
    name = scrapy.Field()
    newprice = scrapy.Field()
    zhangdiefu = scrapy.Field()
    zhangdieer = scrapy.Field()
    chengjiaoliang = scrapy.Field()
    chengjiaoer = scrapy.Field()
    zhenfu = scrapy.Field()
    zuigao = scrapy.Field()
    zuidi = scrapy.Field()
    jinkai = scrapy.Field()
    zuoshou = scrapy.Field()
```
settings.py
```python
ROBOTSTXT_OBEY = False
LOG_LEVEL = "ERROR"
ITEM_PIPELINES = {
    'secondBlood.pipelines.SecondbloodPipeline': 1,
}
```
main.py
```python
import os
import sys
from scrapy import cmdline

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
cmdline.execute(['scrapy', 'crawl', 'second'])
```
2.2 Reflections
- I learned how to use the Scrapy framework to crawl and parse web data, fetching the API response with the `requests` library and extracting the fields with regular expressions.
- While storing the crawled data in MySQL, I learned how to handle data persistence inside a Pipeline (a sketch of serialized output with an exporter follows this list).
- I gained a deeper understanding of the exception-handling mechanism.
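The assignment also asks for familiarity with serialized output of Item data. The project above writes straight to MySQL; as a complementary sketch rather than part of the submitted code, a pipeline built on Scrapy's built-in `JsonItemExporter` could serialize the same items to a JSON file:

```python
from scrapy.exporters import JsonItemExporter


class JsonExportPipeline:
    """Illustrative pipeline: serialize every item into stocks.json via Scrapy's exporter API."""

    def open_spider(self, spider):
        self.file = open("stocks.json", "wb")  # the exporter expects a binary file object
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)  # serialize one item
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()  # close the JSON array
        self.file.close()
```

Registering this class in `ITEM_PIPELINES` alongside `SecondbloodPipeline` would produce both the MySQL table and the JSON file in one run.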
作业③
Requirement: become proficient with the methods for serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL database storage route to crawl the foreign-exchange quotations published by the Bank of China (https://www.boc.cn/sourcedb/whpj/).
3.1 Code and Screenshots
third.py
```python
import time

import scrapy
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from thirdBlood.items import ThirdbloodItem


class ThirdSpider(scrapy.Spider):
    name = "third"

    def __init__(self, *args, **kwargs):
        super(ThirdSpider, self).__init__(*args, **kwargs)
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # run Chrome without opening a window
        self.driver = webdriver.Chrome(
            service=Service(r'D:\tools\package\chromedriver-win64\chromedriver.exe'),
            options=chrome_options)

    def start_requests(self):
        url = 'https://www.boc.cn/sourcedb/whpj/'
        # Render the page with Selenium first, then hand the parsed HTML to parse() via meta
        self.driver.get(url)
        time.sleep(1)
        html = etree.HTML(self.driver.page_source)
        yield scrapy.Request(url, self.parse, meta={'html': html})

    def parse(self, response):
        html = response.meta['html']
        rows = html.xpath('/html/body/div/div[5]/div[1]/div[2]/table/tbody/tr')
        # Skip the first row, which is the table header
        for row in rows[1:]:
            item = ThirdbloodItem()
            texts = row.xpath('./td[1]/text()')
            item["currency"] = texts[0] if texts else ''
            texts = row.xpath('./td[2]/text()')
            item["TBP"] = texts[0] if texts else ''
            texts = row.xpath('./td[3]/text()')
            item["CBP"] = texts[0] if texts else ''
            texts = row.xpath('./td[4]/text()')
            item["TSP"] = texts[0] if texts else ''
            texts = row.xpath('./td[5]/text()')
            item["CSP"] = texts[0] if texts else ''
            texts = row.xpath('./td[8]/text()')
            item["time"] = texts[0] if texts else ''
            yield item

    def closed(self, reason):
        self.driver.quit()
```
pipelines.py
```python
import pymysql


class ThirdbloodPipeline:
    def process_item(self, item, spider):
        mydb = None
        mycursor = None
        try:
            print(item["currency"], item["TSP"], item["CSP"], item["TBP"], item["CBP"], item["time"])
            # Insert the record into the database table
            mydb = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456",
                                   db="china_bank", charset="utf8")
            mycursor = mydb.cursor()
            sql = "INSERT INTO currency (currency, TSP, CSP, TBP, CBP, time) VALUES (%s, %s, %s, %s, %s, %s)"
            val = (item["currency"], item["TSP"], item["CSP"], item["TBP"], item["CBP"], item["time"])
            print(f"Inserting into currency: {val}")
            mycursor.execute(sql, val)
            mydb.commit()
        except Exception as err:
            print(f"Error: {err}")
        finally:
            # Close the cursor and connection only if they were opened
            if mycursor:
                mycursor.close()
            if mydb:
                mydb.close()
        return item
```
items.py
```python
import scrapy


class ThirdbloodItem(scrapy.Item):
    currency = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    time = scrapy.Field()
```
settings.py
```python
ROBOTSTXT_OBEY = False
LOG_LEVEL = "ERROR"
ITEM_PIPELINES = {
    'thirdBlood.pipelines.ThirdbloodPipeline': 1,
}
```
main.py
```python
import os
import sys
from scrapy import cmdline

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
cmdline.execute(['scrapy', 'crawl', 'third'])
```
3.2 Reflections
- I learned how to combine Selenium with Scrapy to crawl data from dynamic pages. Selenium retrieves the content that only exists after JavaScript rendering (a sketch that replaces the fixed `time.sleep(1)` with an explicit wait follows this list).
- While parsing the fetched HTML I became more familiar with XPath selectors. XPath makes it possible to pinpoint exactly the data I need, which improves both crawling efficiency and extraction accuracy.
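One refinement to the wait logic: the spider above sleeps a fixed second before reading `driver.page_source`. A more robust sketch, assuming the same driver setup and that a row of the rate table can be matched by the generic XPath used here, relies on Selenium's explicit waits instead:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def get_rendered_html(driver, url, timeout=10):
    """Load the page and block until at least one table row is present (illustrative helper)."""
    driver.get(url)
    # Wait for the table rows to appear instead of sleeping a fixed amount of time
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, "//table//tr"))
    )
    return driver.page_source


# Hypothetical use inside start_requests(), replacing the driver.get + time.sleep pair:
# html = etree.HTML(get_rendered_html(self.driver, 'https://www.boc.cn/sourcedb/whpj/'))
```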