# requests 爬取数据与 aiohttp 爬取数据对比
# 同步
from datetime import datetime
import requests
from lxml import etree
# Request headers sent with every HTTP call in this script: a desktop
# Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/72.0.3626.121 Safari/537.36"
    ),
}
def get_movie_url(timeout=10):
    """Download the Douban movie chart page and return the links found on it.

    Args:
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout ``requests.get`` can block indefinitely on a
            stalled connection.

    Returns:
        The list of ``href`` values selected by the chart-table XPath
        (empty when nothing on the page matches).
    """
    req_url = "https://movie.douban.com/chart"
    response = requests.get(url=req_url, headers=headers, timeout=timeout)
    html = etree.HTML(response.text)
    # Each chart row is a table cell whose <a> points at a movie page.
    return html.xpath(
        "//*[@id='content']/div/div[1]/div/div/table/tr/td/a/@href")
def get_movie_content(movie_url, timeout=10):
    """Download one movie page and extract its title and credit text.

    Args:
        movie_url: URL of the movie page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout ``requests.get`` can block indefinitely on a
            stalled connection.

    Returns:
        A dict with two keys, ``"name"`` and ``"author"``, each holding the
        list of text nodes matched by the corresponding XPath (an empty list
        when nothing matches).
    """
    response = requests.get(movie_url, headers=headers, timeout=timeout)
    result = etree.HTML(response.text)
    return {
        # Text of the first <span> inside the page heading.
        "name": result.xpath('//*[@id="content"]/h1/span[1]//text()'),
        # Text of the first entry in the info box; presumably the
        # director(s), judging by the sample output - verify against the page.
        "author": result.xpath('//*[@id="info"]/span[1]/span[2]//text()'),
    }
if __name__ == '__main__':
    # Time the whole crawl: one request for the chart page, then one
    # request per movie page, issued strictly one after another.
    start = datetime.now()
    movie_url_list = get_movie_url()
    # Result is a dict keyed by the movie page URL.
    movies = {url: get_movie_content(url) for url in movie_url_list}
    print(movies)
    elapsed = datetime.now() - start
    print("同步用时为:{}".format(elapsed))
# 看一下同步的结果:
#
# E:\venv\spider\Scripts\python.exe E:/python_project/filetest/douban.py
# [{'name': ['小丑 Joker'], 'author': ['托德·菲利普斯']},
# {'name': ['好莱坞往事 Once Upon a Time... in Hollywood'], 'author': ['昆汀·塔伦蒂诺']},
# {'name': ['爱尔兰人 The Irishman'], 'author': ['马丁·斯科塞斯']},
# {'name': ['准备好了没 Ready or Not'], 'author': ['马特·贝蒂内利-奥尔平', ' / ', '泰勒·吉勒特']},
# {'name': ['82年生的金智英 82년생 김지영'], 'author': ['金度英']},
# {'name': ['克劳斯:圣诞节的秘密 Klaus'], 'author': ['塞尔希奥·巴勃罗斯', ' / ', '卡洛斯·马丁内斯·洛佩斯']},
# {'name': ['寄生虫 기생충'], 'author': ['奉俊昊']},
# {'name': ['骡子 The Mule'], 'author': ['克林特·伊斯特伍德']},
# {'name': ['别告诉她 The Farewell'], 'author': ['王子逸']},
# {'name': ['犯罪现场 犯罪現場'], 'author': ['冯志强']}]
# 同步用时为:0:00:08.765342
# Process finished with exit code 0
# 异步
# 异步也很简单,关于异步的文章我还在整理,因为涉及到太多的东西了。先看这个爬虫代码:
import asyncio
from datetime import datetime
import aiohttp
from lxml import etree
# Request headers sent with every HTTP call in this script: a desktop
# Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/72.0.3626.121 Safari/537.36"
    ),
}
async def get_movie_url():
    """Fetch the Douban movie chart page and return the links found on it.

    Returns:
        The list of ``href`` values selected by the chart-table XPath.
    """
    chart_url = "https://movie.douban.com/chart"
    link_xpath = "//*[@id='content']/div/div[1]/div/div/table/tr/td/a/@href"
    # Open a session and issue the GET in one combined context so both are
    # released as soon as the body has been read.
    async with aiohttp.ClientSession(headers=headers) as client, \
            client.get(url=chart_url, headers=headers) as resp:
        page_source = await resp.text()
    return etree.HTML(page_source).xpath(link_xpath)
async def get_movie_content(movie_url):
    """Fetch one movie page and extract its title and credit text.

    Args:
        movie_url: URL of the movie page to fetch.

    Returns:
        A dict with two keys, ``"name"`` and ``"author"``, each holding the
        list of text nodes matched by the corresponding XPath.
    """
    # A dedicated session is opened for this single request.
    async with aiohttp.ClientSession(headers=headers) as client, \
            client.get(url=movie_url, headers=headers) as resp:
        page_source = await resp.text()
    tree = etree.HTML(page_source)
    return {
        "name": tree.xpath('//*[@id="content"]/h1/span[1]//text()'),
        "author": tree.xpath('//*[@id="info"]/span[1]/span[2]//text()'),
    }
async def _main():
    """Fetch the chart page, then fetch every movie page concurrently.

    Returns:
        A list with one result dict per URL returned by ``get_movie_url``,
        in the same order as those URLs.
    """
    movie_url_list = await get_movie_url()
    # gather() schedules all page fetches at once and preserves input order.
    return await asyncio.gather(
        *(get_movie_content(url) for url in movie_url_list))


if __name__ == '__main__':
    start = datetime.now()
    # asyncio.run creates a fresh event loop, runs the coroutine and closes
    # the loop afterwards. Calling asyncio.get_event_loop() with no running
    # loop is deprecated and left the loop unclosed.
    movies = asyncio.run(_main())
    print(movies)
    print("异步用时为:{}".format(datetime.now() - start))
# 看一下结果,你就知道差距了:
#
# E:\venv\spider\Scripts\python.exe E:/python_project/filetest/aio_douban.py
# [{'name': ['小丑 Joker'], 'author': ['托德·菲利普斯']},
# {'name': ['好莱坞往事 Once Upon a Time... in Hollywood'], 'author': ['昆汀·塔伦蒂诺']},
# {'name': ['爱尔兰人 The Irishman'], 'author': ['马丁·斯科塞斯']},
# {'name': ['准备好了没 Ready or Not'], 'author': ['马特·贝蒂内利-奥尔平', ' / ', '泰勒·吉勒特']},
# {'name': ['82年生的金智英 82년생 김지영'], 'author': ['金度英']},
# {'name': ['克劳斯:圣诞节的秘密 Klaus'], 'author': ['塞尔希奥·巴勃罗斯', ' / ', '卡洛斯·马丁内斯·洛佩斯']},
# {'name': ['寄生虫 기생충'], 'author': ['奉俊昊']},
# {'name': ['骡子 The Mule'], 'author': ['克林特·伊斯特伍德']},
# {'name': ['别告诉她 The Farewell'], 'author': ['王子逸']},
# {'name': ['犯罪现场 犯罪現場'], 'author': ['冯志强']}]
# 异步用时为:0:00:02.230956
抟扶摇而上者九万里