Data Collection Lab - Assignment 3

Assignment ①:
Requirement: pick a website and crawl all of its images, for example China Weather Network (http://www.weather.com.cn). Use the scrapy framework to implement both single-threaded and multi-threaded crawling.
– Be sure to limit the crawl, e.g. cap the total number of pages (last 2 digits of the student ID) and the total number of downloaded images (last 3 digits of the student ID).

Output: print each downloaded image URL to the console, save the downloaded images under an images subfolder, and provide screenshots.

Gitee link: https://gitee.com/lian111111/crawl_project/tree/master/数据采集与融合技术第三次作业/作业3.1

The complete code is as follows:

1) Inspecting the page shows that the a tags contain further URLs, so new pages can be reached through each a tag's href attribute.

image

2) Collect the URLs from the a tags

import re
from bs4 import BeautifulSoup

def getUrls(startUrl):
    # seed the list with the start page itself
    urls = ['http://www.weather.com.cn/']
    html = getHtml(startUrl)          # getHtml: fetch the raw HTML (see the sketch below)
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.find_all('a'):
        # keep only links that contain a complete http...html URL
        if 'http' in str(a):
            ls = re.findall(r'http.*?html', str(a))
            if len(ls) == 1:
                urls.append(ls[0])
    return urls
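
The getHtml helper used above is not shown in this post; a minimal sketch, assuming a plain urllib fetch with a browser-like User-Agent and a UTF-8 decode, could look like this:

import urllib.request

def getHtml(url):
    # fetch the page and return its HTML as a string
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req, timeout=10) as resp:
        return resp.read().decode('utf-8', errors='ignore')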

3) Download the images

import re
import bs4
import urllib.request
from bs4 import BeautifulSoup

x = 1          # global counter for downloaded images (starts at 1)
imgs = set()   # URLs already downloaded, used to skip duplicates

def getImg(url, page):
    global x, imgs
    if x < 106:                          # overall download limit
        html = getHtml(url)
        soup = BeautifulSoup(html, 'html.parser')
        # walk the body and collect every img tag
        for li in soup.find('body').children:
            if isinstance(li, bs4.element.Tag):
                for i in li('img'):
                    ls = re.findall(r'//.*?jpg', str(i))
                    filename = 'Images/%s.jpg' % str(x)
                    # filename = 'Images/%s.jpg' % str(x + (page - 1) * 5)
                    if len(ls) != 0:
                        URL = 'http:' + ls[0]
                        if URL not in imgs:
                            if x >= 106:     # stop once the limit is reached
                                break
                            imgs.add(URL)
                            # copy the object referenced by the URL to a local file
                            urllib.request.urlretrieve(URL, filename)
                            print("Image URL: " + URL)
                            x += 1

4) Single-threaded / multi-threaded crawling

import threading

def main():
    threads = []    # one thread per page URL
    urls = getUrls('http://www.weather.com.cn/')
    page = 1
    for i in range(len(urls)):
        t = threading.Thread(target=getImg, args=(urls[i], page))
        threads.append(t)
        page += 1
    # single-threaded: each thread is started and joined before the next one begins
    for t in threads:
        t.start()
        t.join()
    # multi-threaded: start all threads first and join them afterwards
    # (see the sketch below)
    # for t in threads:
    #     t.start()
    # for t in threads:
    #     t.join()
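
For reference, a minimal sketch of the multi-threaded variant mentioned in the comments (using the same getUrls and getImg helpers defined above): all threads are started first and only joined afterwards, so the pages are fetched concurrently.

def main_multithread():
    urls = getUrls('http://www.weather.com.cn/')
    threads = []
    for page, url in enumerate(urls, start=1):
        t = threading.Thread(target=getImg, args=(url, page))
        threads.append(t)
        t.start()      # start immediately; do not wait here
    for t in threads:
        t.join()       # wait for every thread to finish

Note that x and imgs are shared module-level globals, so the counter update in getImg is not thread-safe; wrapping it in a threading.Lock would be a sensible extension.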

5) Results (120 images in total)
image

image

Reflections: this task was essentially a simple use of the scrapy framework; actually saving the images down turned out to be the tricky part, and I only got it working after going back over the earlier assignments.

Assignment ②
Requirement: become proficient with serializing data through scrapy's Item and Pipeline mechanisms; use the scrapy framework + XPath + MySQL storage to crawl stock information.
Candidate site: Eastmoney: https://www.eastmoney.com/
Output: the MySQL storage and output format are as follows:

Gitee link: https://gitee.com/lian111111/crawl_project/tree/master/数据采集与融合技术第三次作业/作业3.2
The complete code is as follows:

Define the Item:

  • In your project, create a new items.py file and define an Item class to hold the data you want to scrape.
import scrapy

class StockItem(scrapy.Item):
    id = scrapy.Field()
    bStockNo = scrapy.Field()
    name = scrapy.Field()
    latest_price = scrapy.Field()
    change_percent = scrapy.Field()
    change_amount = scrapy.Field()
    volume = scrapy.Field()
    amplitude = scrapy.Field()
    high = scrapy.Field()
    low = scrapy.Field()
    opening = scrapy.Field()
    closing = scrapy.Field()

Create the Spider:

  • Create a new spider.py file under the spiders directory and define a Spider class to scrape the data.

import scrapy
from your_project_name.items import StockItem

class StockSpider(scrapy.Spider):
    name = "stock"
    start_urls = ['https://www.eastmoney.com/']

    def parse(self, response):
        # extract the data with XPath selectors
        for stock in response.xpath('//your_xpath_selector'):
            item = StockItem()
            item['id'] = stock.xpath('your_xpath_selector').extract_first()
            item['bStockNo'] = stock.xpath('your_xpath_selector').extract_first()
            # ...repeat for the remaining fields
            yield item
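
The placeholder XPath expressions above have to be replaced with selectors matching the actual page structure; a convenient way to work them out is scrapy shell, which opens an interactive Python session with the downloaded response (the selector below is still only the placeholder, not a real one):

$ scrapy shell "https://www.eastmoney.com/"
>>> response.xpath('//your_xpath_selector')                        # inspect what the selector matches
>>> response.xpath('//your_xpath_selector/text()').extract_first() # first matched text node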

Configure a Pipeline to store the data in MySQL:

In your project, create a new pipelines.py file and define a Pipeline class that processes and stores the data.
 import mysql.connector

 class StockPipeline(object):
     def __init__(self):
         self.conn = mysql.connector.connect(
             host='your_host',
             user='your_user',
             passwd='your_password',
             db='your_database'
         )
         self.cursor = self.conn.cursor()

     def process_item(self, item, spider):
         sql = "INSERT INTO your_table_name (id, bStockNo, name, latest_price, change_percent, change_amount, volume, amplitude, high, low, opening, closing) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
         values = (item['id'], item['bStockNo'], item['name'], item['latest_price'], item['change_percent'], item['change_amount'], item['volume'], item['amplitude'], item['high'], item['low'], item['opening'], item['closing'])
         self.cursor.execute(sql, values)
         self.conn.commit()
         return item

     def close_spider(self, spider):
         self.cursor.close()
         self.conn.close()
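
The pipeline above assumes that your_table_name already exists. A one-off creation script matching the twelve fields used in the INSERT statement could look like the following (the column types are only assumptions; adjust them to the real data):

import mysql.connector

# create the target table once, with columns matching StockPipeline's INSERT statement
conn = mysql.connector.connect(host='your_host', user='your_user',
                               passwd='your_password', db='your_database')
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS your_table_name (
        id VARCHAR(16),
        bStockNo VARCHAR(16),
        name VARCHAR(64),
        latest_price VARCHAR(32),
        change_percent VARCHAR(32),
        change_amount VARCHAR(32),
        volume VARCHAR(32),
        amplitude VARCHAR(32),
        high VARCHAR(32),
        low VARCHAR(32),
        opening VARCHAR(32),
        closing VARCHAR(32)
    )
""")
conn.commit()
cursor.close()
conn.close()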

settings:

BOT_NAME = "stock_scraper"
SPIDER_MODULES = ["stock_scraper.spiders"]
NEWSPIDER_MODULE = "stock_scraper.spiders"
ITEM_PIPELINES = {'stock_scraper.pipelines.StockPipeline': 1}
ROBOTSTXT_OBEY = True
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
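
With the settings in place, the spider can be started from the project root, either with the plain command line (scrapy crawl stock) or with a small run script like the one used for assignment ③:

# run.py - start the 'stock' spider programmatically
from scrapy import cmdline
cmdline.execute("scrapy crawl stock".split())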

Results:
image

Reflections: I learned how to connect to a MySQL database from Python and save the scraped data.

Assignment ③
Lab content
Requirement
Become proficient with serializing data through scrapy's Item and Pipeline mechanisms; use the scrapy framework + XPath + MySQL storage to crawl foreign-exchange data.

Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/
Gitee link: https://gitee.com/lian111111/crawl_project/tree/master/数据采集与融合技术第三次作业/作业3.3
The code is as follows:
myspider

import scrapy
from demo1.items import currencyItem
from bs4 import UnicodeDammit

class MySpider(scrapy.Spider):
    name = "mySpider3"
    start_urls = 'https://www.boc.cn/sourcedb/whpj/'

    def start_requests(self):
        url = MySpider.start_urls
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            # let UnicodeDammit detect the encoding (utf-8 or gbk)
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)

            trs = selector.xpath("//table/tr")
            # drop the header rows at the top and the footer rows at the bottom
            trs = trs[2:len(trs) - 2]
            for tr in trs:
                item = currencyItem()
                # print(tr.xpath("./td[1]/text()").extract())
                item['currency'] = tr.xpath("./td[1]/text()").extract_first()
                item['TBP'] = tr.xpath("./td[2]/text()").extract_first()
                item['CBP'] = tr.xpath("./td[3]/text()").extract_first()
                item['TSP'] = tr.xpath("./td[4]/text()").extract_first()
                item['CSP'] = tr.xpath("./td[5]/text()").extract_first()
                item['Time'] = tr.xpath("./td[8]/text()").extract_first()
                print(item)

                yield item

        except Exception as err:
            print(err)

items

import scrapy

class currencyItem(scrapy.Item):
    currency = scrapy.Field()   # currency name
    TBP = scrapy.Field()        # telegraphic transfer (spot) buying price
    CBP = scrapy.Field()        # cash buying price
    TSP = scrapy.Field()        # telegraphic transfer (spot) selling price
    CSP = scrapy.Field()        # cash selling price
    Time = scrapy.Field()       # publication time

settings

ITEM_PIPELINES = {
   # "demo1.pipelines.pic_urlPipeline": 300,
   # "demo1.pipelines.stockPipeline": 300,
   "demo1.pipelines.currencyPipeline": 300,
}

pipelines

import pymysql

class currencyPipeline:

    def open_spider(self, spider):
        self.mydb = pymysql.connect(
            host="192.168.91.1",
            port=3306,
            user="root",
            password="123456",
            database="spider",
            charset='utf8'
        )
        self.cursor = self.mydb.cursor()

        # create the target table on first run
        self.cursor.execute('''CREATE TABLE IF NOT EXISTS currency
                          (Currency VARCHAR(256),
                          TBP VARCHAR(256),
                          CBP VARCHAR(256),
                          TSP VARCHAR(256),
                          CSP VARCHAR(256),
                          Times VARCHAR(256)
                           )''')
        self.mydb.commit()

    def process_item(self, item, spider):
        print(item.get("currency"))
        sql = "INSERT INTO currency (Currency,TBP,CBP,TSP,CSP,Times) VALUES (%s,%s,%s,%s,%s,%s)"
        self.cursor.execute(sql, (item.get("currency"), item.get("TBP"), item.get("CBP"),
                                  item.get("TSP"), item.get("CSP"), item.get("Time")))
        self.mydb.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.mydb.close()

run

from scrapy import cmdline
cmdline.execute("scrapy crawl mySpider3 -s LOG_ENABLED=False".split())
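
After the spider has run, a quick query can confirm that the rows actually landed in MySQL (assuming the same connection parameters as in open_spider):

import pymysql

# connect with the same parameters used by currencyPipeline and print a few rows
mydb = pymysql.connect(host="192.168.91.1", port=3306, user="root",
                       password="123456", database="spider", charset="utf8")
cursor = mydb.cursor()
cursor.execute("SELECT Currency, TBP, CBP, TSP, CSP, Times FROM currency LIMIT 5")
for row in cursor.fetchall():
    print(row)
cursor.close()
mydb.close()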

Results:
image
image

Reflections: this assignment deepened my understanding of how the whole scrapy framework fits together and runs.
