aiohttp使用队列

获取百度的搜索结果,然后将结果中的百度跳转长链接解析为真实的url

import asyncio
import logging
import time
from asyncio import Queue
from itertools import product
from urllib.parse import quote_plus

import aiofiles
import aiohttp
import async_timeout
from lxml import etree

# Number of worker coroutines BaiduSpider.download() starts to drain the URL
# queue. Despite the name, these are asyncio tasks on one event loop, not
# OS threads.
MAX_THREADS = 50


class BaiduSpider:
    """Scrape Baidu result pages and resolve each result's redirect link.

    Search URLs are built from a keyword file and placed on ``self.q``.
    Worker coroutines drain that queue: every result page is fetched and
    parsed, each ``[title, link]`` pair found is placed on ``self.q2``, and
    each link is then requested so the final URL (after redirects) can be
    appended to ``links.txt``.
    """

    # Baidu's ``pn`` query parameter is a result offset, not a page index,
    # so consecutive result pages are 10 apart.
    RESULTS_PER_PAGE = 10
    # Seconds allowed for a single HTTP request before it is abandoned.
    REQUEST_TIMEOUT = 1

    _log = logging.getLogger(__name__)

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
        self.q = Queue()   # search-result page URLs waiting to be fetched
        self.q2 = Queue()  # [title, redirect link] pairs waiting to be resolved
        # Final URLs already written during this run, used for de-duplication.
        self._seen_urls = set()

    def url_generator(self, keyword_file='keyword.txt', pages=5):
        """Yield Baidu search URLs for every keyword in *keyword_file*.

        Args:
            keyword_file: Path of a UTF-8 text file with one keyword per line.
            pages: Number of result pages to request per keyword.

        Yields:
            One URL per (keyword, page), all pages of a keyword first.

        Raises:
            OSError: If *keyword_file* cannot be opened.
        """
        # utf-8-sig also accepts plain UTF-8 but drops a leading BOM, which
        # would otherwise end up inside the first keyword.
        with open(keyword_file, 'r', encoding='utf-8-sig') as f:
            for line in f:
                keyword = line.strip()
                if not keyword:
                    continue  # a blank line would produce an empty search
                # Encode the keyword so spaces, '&', '#', ... cannot break
                # the query string.
                encoded = quote_plus(keyword)
                for page in range(pages):
                    offset = page * self.RESULTS_PER_PAGE
                    yield f"https://www.baidu.com/s?wd={encoded}&pn={offset}"

    async def fetch(self, session, url):
        """Return the body text of *url*, or ``None`` on any failure.

        A response counts as success only with status 200 or 201. Network
        errors, timeouts and decoding errors are logged at debug level and
        reported as ``None`` so that one bad page never stops the crawl.
        """
        try:
            # async-timeout 4+ only supports the ``async with`` form.
            async with async_timeout.timeout(self.REQUEST_TIMEOUT):
                async with session.get(url, headers=self.headers) as resp:
                    if resp.status in (200, 201):
                        return await resp.text()
                    self._log.debug("unexpected status %s for %s", resp.status, url)
        except Exception as exc:  # deliberate best-effort boundary
            self._log.debug("fetch failed for %s: %r", url, exc)
        return None

    async def work(self, session):
        """Drain ``self.q``: fetch and parse pages until the queue is empty.

        Uses the non-blocking ``get_nowait`` so the worker exits as soon as
        the backlog is gone instead of parking on an empty queue.
        """
        while True:
            try:
                url = self.q.get_nowait()
            except asyncio.QueueEmpty:
                return
            try:
                html = await self.fetch(session, url)
                await self.parser(session, html)
            except Exception:
                # One unparsable page must not kill the worker.
                self._log.exception("failed to process %s", url)
            finally:
                # Always balance the get(), even when processing failed.
                self.q.task_done()

    async def parser(self, session, html):
        """Extract result titles and links from *html* and resolve them.

        Every result anchor with both a title and an ``href`` is queued on
        ``self.q2``; the queue is then drained by :meth:`work2`. A falsy
        *html* (failed fetch) is ignored.
        """
        if not html:
            return
        tree = etree.HTML(html)
        if tree is None:  # nothing parsable in the document
            return
        for anchor in tree.xpath('//h3[@class="t"]/a'):
            title = anchor.xpath('string(.)')
            hrefs = anchor.xpath('@href')
            # Skip anchors that cannot be resolved or labelled instead of
            # queueing an empty link that is guaranteed to fail.
            if not title or not hrefs:
                continue
            self.q2.put_nowait([title, hrefs[0]])
        await self.work2(session)

    async def _resolve(self, session, link):
        """Return the final URL *link* redirects to, or ``None`` on failure."""
        try:
            async with async_timeout.timeout(self.REQUEST_TIMEOUT):
                async with session.get(link, headers=self.headers) as resp2:
                    return str(resp2.url)
        except Exception as exc:  # deliberate best-effort boundary
            self._log.debug("could not resolve %s: %r", link, exc)
            return None

    async def work2(self, session):
        """Drain ``self.q2``: resolve each link and append it to links.txt.

        Each resolved result is printed; a line ``title,url`` is written only
        the first time a given final URL is seen in this run.
        """
        while True:
            try:
                title, link = self.q2.get_nowait()
            except asyncio.QueueEmpty:
                return
            try:
                real_url = await self._resolve(session, link)
                if real_url is None:
                    continue
                print(real_url, title)
                if real_url in self._seen_urls:
                    continue
                # Record before awaiting the write so a concurrent worker
                # cannot write the same URL in the meantime.
                self._seen_urls.add(real_url)
                async with aiofiles.open('links.txt', 'a', encoding='utf-8') as fd:
                    await fd.write(f"{title},{real_url}\n")
            finally:
                self.q2.task_done()

    async def download(self):
        """Queue every search URL and run the worker pool to completion."""
        for url in self.url_generator():
            self.q.put_nowait(url)
        # No point starting more workers than there are URLs.
        worker_count = min(MAX_THREADS, self.q.qsize())
        if worker_count == 0:
            return
        # ssl=False disables certificate verification (avoids SSL errors);
        # it replaces the deprecated verify_ssl=False spelling.
        conn = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            workers = [asyncio.ensure_future(self.work(session))
                       for _ in range(worker_count)]
            await asyncio.gather(*workers)

    def run(self):
        """Run the whole crawl synchronously and print the elapsed time."""
        start_time = time.time()
        # asyncio.run creates and closes its own event loop; calling
        # get_event_loop() outside a running loop is deprecated.
        asyncio.run(self.download())
        print(f'全程用时{time.time() - start_time}秒')


if __name__ == '__main__':
    # run() prints its own timing summary and returns None, so there is no
    # result worth binding to a name.
    BaiduSpider().run()

posted @ 2019-04-08 10:26  公众号python学习开发  阅读(697)  评论(0)  编辑  收藏  举报