aiohttp Asynchronous Crawler in Practice
Crawl Target
The target is a book website at https://spa5.scrape.center/ . The site contains information on several thousand books. Its pages are rendered with JavaScript, and the data is loaded through Ajax APIs that have no anti-crawling measures or encrypted parameters.
In addition, because the site holds a fairly large amount of data, an asynchronous approach is used to crawl it.
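Before writing the full crawler, it can help to probe the Ajax endpoints in isolation. The sketch below is a minimal check rather than part of the final script: it assumes the index endpoint /api/book/?limit=18&offset=0 and the detail endpoint /api/book/<id> that the browser's DevTools reveal, and it only prints the fields the later code relies on (results, id, name, authors).

import asyncio
import aiohttp

# Hypothetical quick probe of the Ajax endpoints (not part of the final crawler):
# fetch the first index page and one detail page, and print the fields we care about.
async def probe():
    async with aiohttp.ClientSession() as session:
        async with session.get('https://spa5.scrape.center/api/book/?limit=18&offset=0') as resp:
            index = await resp.json()
        print(len(index.get('results', [])), 'books on the first page')
        first_id = index['results'][0]['id']  # these ids feed the detail endpoint
        async with session.get(f'https://spa5.scrape.center/api/book/{first_id}') as resp:
            detail = await resp.json()
        print(detail.get('name'), detail.get('authors'))

asyncio.run(probe())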
Crawl Steps
- Analyze how the page data is loaded
- Use aiohttp to crawl the Ajax data (the core concurrency pattern is sketched right after this list)
- Save each book's information to a MySQL database
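The full script in the next section builds on the pattern sketched here: one shared aiohttp.ClientSession, an asyncio.Semaphore that caps how many requests are in flight, and asyncio.gather to run all page requests concurrently. The concurrency value and the three-page range below are placeholders for illustration only.

import asyncio
import aiohttp

CONCURRENCY = 5  # placeholder cap; the full script uses 20
semaphore = asyncio.Semaphore(CONCURRENCY)

async def fetch_json(session, url):
    # the semaphore blocks here once CONCURRENCY requests are already in flight
    async with semaphore:
        async with session.get(url) as response:
            return await response.json()

async def demo():
    urls = [f'https://spa5.scrape.center/api/book/?limit=18&offset={18 * i}' for i in range(3)]
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(*(fetch_json(session, url) for url in urls))
    print([len(page.get('results', [])) for page in pages])

asyncio.run(demo())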
Code Example
# -*- coding: utf-8 -*-
"""
@File: aiohttp_p207.py
@Description: Asynchronous crawler for https://spa5.scrape.center/ based on aiohttp; results are saved to MySQL.
@Author: echohye
@Date: 2022/02/01 15:09
"""
import asyncio
import json
import aiohttp
import logging
import pymysql

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

INDEX_URL = 'https://spa5.scrape.center/api/book/?limit=18&offset={offset}'
DETAIL_URL = 'https://spa5.scrape.center/api/book/{ide}'
PAGE_SIZE = 18
PAGE_NUMBER = 60
CONCURRENCY = 20

semaphore = asyncio.Semaphore(CONCURRENCY)  # cap the number of concurrent requests
session = None  # shared aiohttp.ClientSession, created in scrape_main()
# fetch a JSON API response, limited by the semaphore
async def scrape_api(url):
    async with semaphore:
        try:
            logging.info("scraping %s", url)
            async with session.get(url) as response:
                return await response.json()
        except aiohttp.ClientError:
            logging.error("error occurred while scraping %s", url, exc_info=True)


# fetch one page of the index API
async def scrape_index(page):
    url = INDEX_URL.format(offset=PAGE_SIZE * (page - 1))
    return await scrape_api(url)


# fetch the detail API for a single book
async def scrape_detail(ide):
    url = DETAIL_URL.format(ide=ide)
    return await scrape_api(url)
# main crawling coroutine: index pages first, then book details
async def scrape_main():
    global session
    session = aiohttp.ClientSession()
    scrape_index_tasks = [asyncio.ensure_future(scrape_index(page)) for page in range(1, PAGE_NUMBER + 1)]
    results = await asyncio.gather(*scrape_index_tasks)
    logging.info("result %s", json.dumps(results, ensure_ascii=False, indent=2))
    ids = []
    for index_data in results:
        if not index_data:
            continue
        for item in index_data.get('results'):
            ids.append(item.get('id'))
    scrape_detail_tasks = [asyncio.ensure_future(scrape_detail(ide)) for ide in ids]
    books = await asyncio.gather(*scrape_detail_tasks)
    logging.info("book %s", json.dumps(books, ensure_ascii=False, indent=2))
    await session.close()
    return books
# save book information to a MySQL database
def save_data(books: list):
    db = pymysql.Connect(host="localhost", user="root", password="zhy123", port=3306, db="spiders")
    cursor = db.cursor()
    table_sql = 'create table if not exists aiohttp_P207(id varchar(255) not null,name varchar(255) not null,authors varchar(255),' \
                'translators varchar(255),publisher varchar(255),tags varchar(255),url varchar(255),isbn varchar(255),' \
                'cover varchar(255),page_number varchar(255),price varchar(255),score varchar(255),' \
                'published_at varchar(255),updated_at varchar(255))'
    cursor.execute(table_sql)
    for book in books:
        if not book:  # skip details that failed to download
            continue
        try:
            sql = 'insert into aiohttp_P207(id,name,authors,translators,publisher,tags,url,isbn,cover,page_number,price,score,' \
                  f"published_at,updated_at) values(\"{book.get('id')}\",\"{book.get('name')}\",\"{book.get('authors')}\",\"{book.get('translators')}\",\"{book.get('publisher')}\"," \
                  f"\"{book.get('tags')}\",\"{book.get('url')}\",\"{book.get('isbn')}\",\"{book.get('cover')}\",\"{book.get('page_number')}\",\"{book.get('price')}\"," \
                  f"\"{book.get('score')}\",\"{book.get('published_at')}\",\"{book.get('updated_at')}\")"
            cursor.execute(sql)
            db.commit()
            logging.info("DataBase book %s", book.get('id'))
        except Exception as e:
            db.rollback()
            print(e.args)
    db.close()
def main():
    contents = asyncio.get_event_loop().run_until_complete(scrape_main())
    save_data(contents)


if __name__ == '__main__':
    main()
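One design note: save_data() builds the INSERT statement with f-strings, which works for a demo but breaks as soon as a field value contains a double quote and is open to SQL injection in general. A hedged alternative sketch, assuming the same aiohttp_P207 table, passes values as query parameters and serializes the list-valued fields (authors, translators, tags) explicitly:

import json

def save_book(cursor, book: dict):
    # hypothetical parameterized variant of the insert in save_data()
    sql = ('insert into aiohttp_P207(id,name,authors,translators,publisher,tags,url,isbn,'
           'cover,page_number,price,score,published_at,updated_at) '
           'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
    values = (
        book.get('id'), book.get('name'),
        json.dumps(book.get('authors'), ensure_ascii=False),      # list fields stored as JSON text
        json.dumps(book.get('translators'), ensure_ascii=False),
        book.get('publisher'),
        json.dumps(book.get('tags'), ensure_ascii=False),
        book.get('url'), book.get('isbn'), book.get('cover'),
        book.get('page_number'), book.get('price'), book.get('score'),
        book.get('published_at'), book.get('updated_at'),
    )
    cursor.execute(sql, values)

With this variant, the quote escaping used in the original f-strings is no longer needed.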