Using Celery
1. Task scheduling with Celery
# -*- coding: utf-8 -*-
import threading

from bs4 import BeautifulSoup
from celery import Celery
from tornado import httpclient
from tornado.httpclient import HTTPClient

broker = 'redis://localhost:6379'
backend = 'redis://localhost:6379'
app = Celery('tasks', broker=broker, backend=backend)

# URLs that have already been crawled (local to each worker process).
visited = {}


@app.task
def get_html(url):
    """Fetch a page synchronously and return its body as text, or None on error."""
    http_client = HTTPClient()
    try:
        response = http_client.fetch(url, follow_redirects=True)
        # Decode so the result is printable and JSON-serializable.
        return response.body.decode('utf-8', errors='ignore')
    except httpclient.HTTPError:
        return None
    finally:
        http_client.close()


def start(url):
    # Kick off 20 scheduler threads, all seeded with the same start URL.
    threads = []
    for i in range(20):
        t = threading.Thread(target=schedule, args=(url,))
        t.daemon = True
        t.start()
        threads.append(t)
    for thread in threads:
        thread.join()


def process_html(url, html):
    print(url + ": " + html)
    _add_links_to_queue(url, html)


def schedule(url):
    # Hand the URL to a Celery worker instead of crawling it locally.
    print("before call _worker " + url)
    _worker.delay(url)
    print("after call _worker " + url)


def _add_links_to_queue(url, html):
    # Extract every link on the page and schedule it for crawling.
    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a'):
        try:
            _url = link['href']
        except KeyError:
            continue  # skip <a> tags without an href attribute
        if not _url.startswith('http'):
            _url = 'http://' + _url
        print(url + "==>" + _url)
        schedule(_url)


@app.task
def _worker(url):
    print(str(threading.current_thread()) + " running " + url)
    if url in visited:
        return
    result = get_html.delay(url)
    try:
        html = result.get(timeout=5)
    except Exception as e:
        print(url)
        print(e)
        return
    if html is not None:
        process_html(url, html)
    visited[url] = True


if __name__ == '__main__':
    start("http://www.hao123.com/")
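To try the crawler, save the module as tasks.py (an assumed file name, chosen to match the Celery('tasks', ...) app name), start a worker, and then run the script:

celery -A tasks worker --loglevel=info
python tasks.py

Two caveats about the example's design: _worker calls result.get() on another task from inside a task, a pattern the Celery documentation warns against because it can exhaust the worker pool and deadlock, and the visited dict is per-process state, so deduplication is not shared between workers. Both are kept here only to mirror the original example.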
2. Load-balancing design with Celery
Celery provides send_task for dispatching tasks by name, so for load balancing you can apply your own algorithm to decide where each task goes. For reference: http://blog.csdn.net/vintage_1/article/details/47664187
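As a minimal sketch of that idea, assuming hypothetical per-node queues named node1/node2/node3 (one worker consuming each) and a plain round-robin stand-in for the balancing algorithm, a dispatcher could look like this:

# -*- coding: utf-8 -*-
import itertools

from celery import Celery

app = Celery('tasks',
             broker='redis://localhost:6379',
             backend='redis://localhost:6379')

# Hypothetical per-node queues; start one worker per queue, e.g.:
#   celery -A tasks worker -Q node1 --loglevel=info
QUEUES = ['node1', 'node2', 'node3']
_next_queue = itertools.cycle(QUEUES)


def pick_queue(url):
    # Stand-in balancing algorithm: plain round-robin. A real scheme
    # could hash the URL, poll queue lengths, or weight nodes by load.
    return next(_next_queue)


def dispatch(url):
    # send_task routes by task name, so the dispatcher never imports
    # the task function; it only needs a broker connection.
    return app.send_task('tasks.get_html', args=[url],
                         queue=pick_queue(url))


if __name__ == '__main__':
    for u in ['http://www.hao123.com/', 'http://example.com/']:
        print(dispatch(u).id)

Routing could also be configured declaratively with Celery's task_routes setting, but send_task keeps the decision in application code, which is what a custom per-task balancing algorithm requires.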