An aiohttp crawler template, written as a class.

A pool of worker tasks pulls (url, depth, retries) tuples from an asyncio.Queue, fetches each page with aiohttp, extracts links with lxml, and pushes every visited URL onto a second queue that a pool of writer tasks drains into the database. Failed requests are retried up to max_retries times, and links are only followed down to max_depth.

import asyncio
import aiohttp
import async_timeout
from lxml import html
from timeit import default_timer as timer

from db import DBData


class Crawler:
    def __init__(self, **kwargs):
        self.domains = kwargs["domains"]
        self.max_depth = kwargs["max_depth"]
        self.max_retries = 3
        self.max_workers = 10
        self.Q = asyncio.Queue()       # frontier of (url, depth, retries) tuples
        self.db_Q = asyncio.Queue()    # visited URLs waiting to be written to the DB
        self.cache = set()             # URLs that have already been crawled
        self.count = 0                 # number of successful requests
        self.loop = asyncio.get_event_loop()
        self.db_data = DBData()

        # Clear data left over from previous crawls
        self.db_data.clear_crawler()

    async def get(self, url, timeout):
        # async_timeout.timeout() is an async context manager, so use "async with"
        async with async_timeout.timeout(timeout):
            async with self.session.get(url) as response:
                return await response.text()

    async def extract_urls(self, url, timeout=10):
        tree = html.fromstring(await self.get(url, timeout))
        # Collect every link on the page; uncomment the filter below
        # to follow only links inside self.domains.
        return {p for p in tree.xpath("//a/@href")}
        # return {p for p in tree.xpath("//a/@href")
        #         if any(domain in p for domain in self.domains)}

    async def worker(self):
        while True:
            url, depth, retries = await self.Q.get()
            # Already crawled: just forward it to the DB queue
            if url in self.cache:
                self.db_Q.put_nowait(url)
                self.Q.task_done()
                continue
            try:
                new_urls = await self.extract_urls(url)
            except Exception as e:
                # Re-queue the URL until max_retries is exhausted
                if retries <= self.max_retries:
                    self.Q.put_nowait((url, depth, retries + 1))
                else:
                    print("Error in %s: %s" % (url, repr(e)))
            else:
                self.cache.add(url)
                self.count += 1
                self.db_Q.put_nowait(url)
                print("Depth: %s Retry: %s Visited: %s" % (depth, retries, url))
                # Follow the extracted links only while below the depth limit
                if depth + 1 <= self.max_depth:
                    for x in new_urls:
                        self.Q.put_nowait((x, depth + 1, retries))
            self.Q.task_done()

    async def run(self):
        async with aiohttp.ClientSession(loop=self.loop) as session:
            self.session = session
            # One pool of crawler workers plus one pool of DB writers
            workers = [self.worker() for _ in range(self.max_workers)]
            workers += [self.write_to_db() for _ in range(self.max_workers)]
            tasks = [self.loop.create_task(x) for x in workers]
            await asyncio.sleep(5)
            # Wait until both queues are fully processed, then stop the workers
            await self.Q.join()
            await self.db_Q.join()
            for task in tasks:
                task.cancel()

    def start(self):
        for domain in self.domains:
            print("Crawling %s start..." % domain)

            self.Q.put_nowait((domain, 0, 0))
            start_time = timer()
            self.loop.run_until_complete(self.run())
            runtime = timer() - start_time

            print("Crawling %s end. Exec time: %s. Requests: %s" % (
                domain, runtime, self.count))
        # Close the loop only after every domain has been crawled,
        # otherwise the next run_until_complete() would fail
        self.loop.close()

    async def write_to_db(self):
        while True:
            address = await self.db_Q.get()
            # Insert the URL only if it is not in the database yet
            if await self.db_data.check_url(address) is None:
                self.db_data.add_url(address)
                print("Write to DB: %s" % address)
            self.db_Q.task_done()


if __name__ == "__main__":
    options = {
        "domains": ["https://www.yahoo.com/news/"],
        "max_depth": 1
    }
    c = Crawler(**options)
    c.start()
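The DBData class comes from a local db module that is not shown in the post. For completeness, here is a minimal sketch of the interface the crawler expects, saved as db.py next to the script. The sqlite3 backing store, the urls table, and the crawler.db file name are all assumptions for illustration, not part of the original code; note that the crawler awaits check_url() but calls clear_crawler() and add_url() synchronously, so the sketch mirrors that.

# db.py -- hypothetical stand-in for the module imported by the crawler.
# Only the interface matters: clear_crawler(), check_url() (awaited), add_url().
import sqlite3


class DBData:
    def __init__(self, path="crawler.db"):
        self.conn = sqlite3.connect(path)
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS urls (url TEXT PRIMARY KEY)")
        self.conn.commit()

    def clear_crawler(self):
        # Drop everything stored by previous runs
        self.conn.execute("DELETE FROM urls")
        self.conn.commit()

    async def check_url(self, url):
        # Return the stored row, or None if the URL is unknown
        cur = self.conn.execute("SELECT url FROM urls WHERE url = ?", (url,))
        return cur.fetchone()

    def add_url(self, url):
        self.conn.execute("INSERT OR IGNORE INTO urls (url) VALUES (?)", (url,))
        self.conn.commit()

A real implementation would likely use an async database driver (for example aiosqlite or aiomysql) so that check_url() does not block the event loop while the workers are running.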