Crawler - aiohttp
# -*- coding:utf-8 -*-
import requests
import asyncio
import time
import aiohttp
from lxml import etree
'''
- aiohttp
    - an HTTP request module with async support
    - pip install aiohttp
The requests module does not support async.
'''

urls = ['http://localhost:5000/bobo','http://localhost:5000/tom','http://localhost:5000/jay',]
# async def get_request(url):
#     # requests is a blocking module with no async support,
#     # so wrapping it in a coroutine gains nothing
#     page_text = requests.get(url).text
#     return page_text

async def get_request(url):
    # instantiate a request session object
    async with aiohttp.ClientSession() as sess:
        # call get() to send the request; it returns a response object
        # get/post(url, headers, params/data, proxy="http://ip:port")
        async with sess.get(url=url) as response:
            # text() returns the response body as a string
            # read() returns the response body as bytes
            page_text = await response.text()
            return page_text
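
# A hypothetical variant (not part of the original demo) sketching the optional
# keyword arguments listed above; the header, params, and proxy values below
# are placeholders, not working settings.
async def get_request_with_options(url):
    async with aiohttp.ClientSession() as sess:
        async with sess.get(
            url=url,
            headers={'User-Agent': 'Mozilla/5.0'},  # spoofed UA (placeholder)
            params={'kw': 'demo'},                  # query string (placeholder)
            proxy='http://127.0.0.1:7890',          # proxy address (placeholder)
        ) as response:
            return await response.text()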
# callback that parses each finished task
def parse(t):
    # t.result() holds the page source returned by get_request
    page_text = t.result()
    tree = etree.HTML(page_text)
    parse_text = tree.xpath('//a[@id="feng"]/text()')[0]
    print(parse_text)  # could also be written to a local file

if __name__ == "__main__":
    start = time.time()
    tasks = []
    for url in urls:
        c = get_request(url)
        task = asyncio.ensure_future(c)
        # pass the function itself, not parse(): the event loop calls it
        # with the finished task once the coroutine completes
        task.add_done_callback(parse)
        tasks.append(task)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
    print('Total time: ', time.time() - start)
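
The callback style above works, but the same crawl reads more directly with
asyncio.gather, which collects each coroutine's return value without
add_done_callback. A minimal sketch, assuming the same local test server and
page structure as the script above:

import asyncio
import time
import aiohttp
from lxml import etree

urls = ['http://localhost:5000/bobo','http://localhost:5000/tom','http://localhost:5000/jay',]

async def get_request(url):
    async with aiohttp.ClientSession() as sess:
        async with sess.get(url=url) as response:
            return await response.text()

async def main():
    # gather runs the coroutines concurrently and returns results in input order
    pages = await asyncio.gather(*(get_request(url) for url in urls))
    for page_text in pages:
        tree = etree.HTML(page_text)
        print(tree.xpath('//a[@id="feng"]/text()')[0])

if __name__ == "__main__":
    start = time.time()
    asyncio.run(main())  # asyncio.run creates and closes the event loop (Python 3.7+)
    print('Total time: ', time.time() - start)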

 
