2023 Data Collection and Fusion Technology: Assignment 3
Task 1
- Requirements
  Crawl all the images on a designated website (Amazon China: https://origin-www.amazon.cn) using the Scrapy framework, implementing both a single-threaded and a multi-threaded version of the crawler.
- Output
  Print the downloaded image URLs to the console, store the downloaded images in an images subfolder, and provide screenshots.
spider.py
import scrapy
from ..items import ScrapyAmazonItem


class AmazonSpider(scrapy.Spider):
    name = "amazonspider"
    allowed_domains = ['origin-www.amazon.cn']
    start_urls = ['https://origin-www.amazon.cn/s?k=礼物&page=1']
    base_url = 'https://origin-www.amazon.cn/s?k=礼物&page='
    page = 1

    def parse(self, response):
        # Each search result sits in a div with this data attribute.
        lists = response.xpath('//div[@data-component-type="s-search-result"]')
        for u in lists:
            src = u.xpath('.//img/@src').extract_first()
            name = u.xpath('.//span[@class="a-size-base-plus a-color-base a-text-normal"]/text()').extract()
            price = u.xpath('.//span[@class="a-offscreen"]/text()').extract_first()
            item = ScrapyAmazonItem(src=src, name=name, price=price)
            yield item
        # Follow the next results page, up to page 10.
        if self.page < 10:
            self.page += 1
            url = self.base_url + str(self.page)
            yield scrapy.Request(url=url, callback=self.parse)
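The `ScrapyAmazonItem` class imported above comes from the project's items.py, which is not shown in this report. A minimal sketch of what it needs to contain, reconstructed from the fields the spider sets (the comments are assumptions):

import scrapy


class ScrapyAmazonItem(scrapy.Item):
    src = scrapy.Field()    # image URL of the product
    name = scrapy.Field()   # product title text
    price = scrapy.Field()  # displayed price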
pipeline.py
import os
import urllib.request
import threading
from queue import Queue


class AmazonDownloadPipeline:
    def __init__(self):
        self.counter = 1
        self.queue = Queue()
        self.pool = []
        self.lock = threading.Lock()
        # Start three worker threads for the multi-threaded variant.
        for _ in range(3):
            thread = threading.Thread(target=self.download_worker, daemon=True)
            self.pool.append(thread)
            thread.start()

    def download(self, item):
        url = item.get('src')
        if not os.path.exists('./pictures/'):
            os.mkdir('./pictures/')
        with self.lock:  # the counter is shared between threads
            number = self.counter
            self.counter += 1
        print('url' + str(number) + '=' + url)
        filename = f'./pictures/{number}' + str(url)[-4:]
        urllib.request.urlretrieve(url=url, filename=filename)

    def download_worker(self):
        # Worker loop: take items off the queue and download them.
        while True:
            item = self.queue.get()
            self.download(item)
            self.queue.task_done()

    def process_item(self, item, spider):
        # Single-threaded variant: download inside the pipeline call.
        self.download(item)
        return item

    def process_item_threaded(self, item, spider):
        # Multi-threaded variant (rename to process_item to let Scrapy call it).
        self.queue.put(item)
        return item
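For Scrapy to actually call this pipeline it has to be enabled in settings.py. A minimal sketch of the relevant settings, assuming the project package is named scrapy_amazon and the pipeline class lives in the project's pipelines module (both names are assumptions):

# settings.py (sketch; the package and module names are assumptions)
ROBOTSTXT_OBEY = False  # Amazon's robots.txt would otherwise block the crawl

ITEM_PIPELINES = {
    # Register the image-download pipeline; the number is its priority.
    'scrapy_amazon.pipelines.AmazonDownloadPipeline': 300,
}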
Results:
Reflections: The multi-threaded version is clearly much faster than the single-threaded one; concurrency really is powerful.
Task 2
- Requirements
  Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage route to crawl stock information from Eastmoney (https://www.eastmoney.com/).
- Output
  Store the data in MySQL and print it in the format shown below; the table headers should be named in English.
spider.py
import scrapy
import re
from ..items import ScrapyStockItem


class StockSpider(scrapy.Spider):
    name = "stockspider"
    allowed_domains = ['84.push2.eastmoney.com']
    start_urls = ["https://84.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112407256076698414418_1697703074753&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1697703074754"]
    base_url = "https://84.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112407256076698414418_1697703074753&pn="
    end_url = "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1697703074754"
    page = 1
    id = 0

    def parse(self, response):
        # The API returns JSONP, so pull the "diff" array out with a regex.
        text = response.text
        pat = r'diff":\[(.*?)\]'
        res = re.compile(pat, re.S).findall(text)
        # Each element of the array is a dict of f-fields for one stock.
        datas = list(eval(res[0]))
        for d in datas:
            self.id += 1
            item = ScrapyStockItem(id=self.id, str=d)
            yield item
        # Request the next page of the listing, up to page 5.
        if self.page < 5:
            self.page += 1
            url = self.base_url + str(self.page) + self.end_url
            print(url)
            yield scrapy.Request(url=url, callback=self.parse)
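As in Task 1, the imported `ScrapyStockItem` is not shown here. Judging from the constructor call above it only needs two fields; this is an assumed reconstruction, not the original items.py:

import scrapy


class ScrapyStockItem(scrapy.Item):
    id = scrapy.Field()   # running row number assigned by the spider
    str = scrapy.Field()  # raw dict of f-fields for one stock record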
pipeline.py
class StockDownloadPipeline:
    def __init__(self):
        self.print_header = True

    def process_item(self, item, spider):
        try:
            d = item.get('str')
            id = item.get('id')
            # Print the column headers once, before the first row.
            if self.print_header:
                print("ID Stock Code Stock Name Latest Price Change Percent Change Amount Volume Turnover Amplitude Highest Lowest Open Price Close Price")
                self.print_header = False
            print("{} {} {} {} {} {} {} {} {} {} {} {} {}".format(
                id, d['f12'], d['f14'], d['f2'], d['f3'], d['f4'], d['f5'],
                d['f6'], d['f7'], d['f15'], d['f16'], d['f17'], d['f18']))
        except Exception as err:
            print(err)
        return item
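The task asks for MySQL storage, but the pipeline above only prints the rows to the console. A minimal sketch of an additional pipeline that writes the same thirteen fields to MySQL with pymysql; the connection parameters, database name stocks, and table name stock are illustrative assumptions, not taken from the original project:

# A hedged sketch, not the original pipeline: store the stock rows in MySQL.
import pymysql


class StockMySQLPipeline:
    def open_spider(self, spider):
        # Connection parameters and schema names are assumptions.
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='123456', database='stocks',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS stock (
                id INT PRIMARY KEY, code VARCHAR(16), name VARCHAR(32),
                latest_price VARCHAR(16), change_percent VARCHAR(16),
                change_amount VARCHAR(16), volume VARCHAR(16),
                turnover VARCHAR(16), amplitude VARCHAR(16),
                highest VARCHAR(16), lowest VARCHAR(16),
                open_price VARCHAR(16), close_price VARCHAR(16)
            )
        """)

    def process_item(self, item, spider):
        d = item.get('str')
        self.cursor.execute(
            "INSERT INTO stock VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (item.get('id'), d['f12'], d['f14'], d['f2'], d['f3'], d['f4'],
             d['f5'], d['f6'], d['f7'], d['f15'], d['f16'], d['f17'], d['f18']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()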
Results:
Reflections: The most troublesome part was that the page URL does not change when paging through the list, so I captured the JSON API requests instead; most of the work was grunt work such as tuning request parameters and finding the right field paths.
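Because the cb= parameter wraps the JSON in a jQuery callback, the spider above digs the diff array out with a regex and eval. A small alternative sketch that strips the callback wrapper and parses the body with json.loads instead of eval (the helper name parse_jsonp is illustrative):

import json
import re


def parse_jsonp(text):
    # Strip the "jQueryxxx(" prefix and the trailing ");" added by the cb= callback.
    body = re.search(r'\((.*)\)', text, re.S).group(1)
    data = json.loads(body)
    # The per-stock records sit under data -> diff in the API response.
    return data['data']['diff']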
Task 3
- Requirements
  Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage route to crawl the foreign exchange quotation data from the Bank of China website (https://www.boc.cn/sourcedb/whpj/).
- Output
  (MySQL database storage and output format)
CurrencySpider.py
import scrapy


class CurrencySpider(scrapy.Spider):
    name = "currencyspider"
    allowed_domains = ['www.boc.cn']
    start_urls = ['http://www.boc.cn/sourcedb/whpj/']
    base_url = 'https://www.boc.cn/sourcedb/whpj/index_'
    end_url = '.html'
    page = 0

    def parse(self, response):
        # Rows 2..28 of the quotation table hold the individual currencies.
        trs = response.xpath('//tr')
        for tr in trs[2:29]:
            yield {
                'Currency': tr.xpath('./td[1]/text()').extract_first() or 'none',
                'TBP': tr.xpath('./td[2]/text()').extract_first() or 'none',
                'CBP': tr.xpath('./td[3]/text()').extract_first() or 'none',
                'TSP': tr.xpath('./td[4]/text()').extract_first() or 'none',
                'CSP': tr.xpath('./td[5]/text()').extract_first() or 'none',
                'Time': tr.xpath('./td[7]/text()').extract_first() or 'none',
            }
        # Follow the next two pages (index_1.html, index_2.html).
        if self.page < 2:
            self.page += 1
            url = self.base_url + str(self.page) + self.end_url
            yield scrapy.Request(url=url, callback=self.parse)
pipeline.py
class CurrencyDownloadPipeline:
    def __init__(self):
        self.print_header = True

    def process_item(self, item, spider):
        try:
            # Print the column headers once, before the first row.
            if self.print_header:
                print("Currency\tTBP\tCBP\tTSP\tCSP\tTime")
                self.print_header = False
            print("{}\t{}\t{}\t{}\t{}\t{}".format(
                item['Currency'], item['TBP'], item['CBP'],
                item['TSP'], item['CSP'], item['Time']))
        except Exception as err:
            print(err)
        return item
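As with Task 2, the MySQL storage the task asks for can be added as a separate pipeline following the same pymysql pattern; here the connection parameters and the table currency (with columns matching the item keys) are assumptions, and the table is assumed to exist already:

# A hedged sketch, not the original code: persist the exchange rates in MySQL.
import pymysql


class CurrencyMySQLPipeline:
    def open_spider(self, spider):
        # Connection parameters are assumptions.
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='123456', database='exchange',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        self.cursor.execute(
            "INSERT INTO currency (Currency, TBP, CBP, TSP, CSP, Time) "
            "VALUES (%s,%s,%s,%s,%s,%s)",
            (item['Currency'], item['TBP'], item['CBP'],
             item['TSP'], item['CSP'], item['Time']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()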
Results:
Reflections: This was good practice with XPath, which is much more pleasant than picking the data out of raw HTML by hand.
Summary
Through these three exercises I gained a deeper understanding of the Scrapy framework and consolidated basic crawling skills such as XPath and packet capture.