python_celery: reading tasks and results from Redis
1. execute_tasks.py # builds the task list and pushes each task into the Redis queue
import re

import requests
from lxml import etree

from aqicn import crawl


def get_cities():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36'}
    content = requests.get('http://aqicn.org/city/all/cn/', headers=headers)
    response = content.content.decode('utf-8')
    # Each region's city links sit between two anchor markers on the page,
    # so cut the relevant HTML fragments out with regular expressions first.
    hongkong = re.compile(r'id=\'香港\'></a>(.*)<a id=\'蒙古\'></a>', re.S)
    taiwan = re.compile(r'id=\'台湾\'></a>(.*)<a id=\'新加坡\'></a>', re.S)
    land = re.compile(r'id=\'中国\'></a>(.*)<a id=\'越南\'></a>', re.S)
    hongkong = etree.HTML(hongkong.findall(response)[0])
    taiwan = etree.HTML(taiwan.findall(response)[0])
    land = etree.HTML(land.findall(response)[0])
    city_list = land.xpath('//a') + hongkong.xpath('//a') + taiwan.xpath('//a')
    for i in city_list:
        try:
            url = i.xpath('.//@href')[0]
            name = i.xpath('.//text()')[0].strip()
            if name:
                yield {'url': url, 'city': name}
        except Exception as e:
            print(e)


def task_manager():
    # crawl.delay() serializes each task and pushes it onto the Redis broker queue.
    for data in get_cities():
        crawl.delay(data)


if __name__ == '__main__':
    task_manager()
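After running execute_tasks.py, the pending tasks sit in the broker database (db 1 in the Celery config below). As a quick sanity check, you can measure the queue length directly; this is a minimal sketch assuming Celery's default queue name 'celery' and the Redis address used throughout this post:

import redis

# Connect to the broker database (db 1 in aqicn.py's Celery config).
broker = redis.StrictRedis(host='192.168.4.53', port=6379, db=1)

# The Redis broker keeps pending tasks in a list named after the queue;
# Celery's default queue is called 'celery'.
print(broker.llen('celery'))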
2. aqicn.py # the worker: consumes the queue, parses each page, and stores the result
import requests
from lxml import etree
from celery import Celery

# The broker and result backend are defined here.
# Note that the IP and the trailing database numbers can be adjusted.
app = Celery('aqicn',
             broker='redis://192.168.4.53:6379/1',
             backend='redis://192.168.4.53:6379/3')


# The decorator marks this function as a Celery task implementation.
@app.task
def crawl(data):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Host': 'aqicn.org',
        'Proxy-Connection': 'keep-alive',
        'Referer': 'http://aqicn.org/city/all/cn/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
    url = data['url']
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.content.decode('utf-8'))
    try:
        aqi = html.xpath('//div[@id="aqiwgtvalue"]//text()')[0]  # air quality index
    except IndexError:
        # Fall back to the notice text shown when the AQI widget is missing.
        news = html.xpath('//div[@class="section-content"]/center//h3//text()')
        if news:
            aqi = ''.join(news)
        else:
            aqi = 'page load error'
    data['aqi'] = aqi
    return data
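With aqicn.py in place, start a worker so the queued tasks actually get consumed, e.g. celery -A aqicn worker --loglevel=info. To try a single task end to end without the producer, you can enqueue one call by hand and block on its result; this is a sketch, and the Beijing URL is only an example:

from aqicn import crawl

# .delay() pushes the task onto the broker queue (db 1); .get() then
# polls the result backend (db 3) until the worker has stored the
# return value of crawl().
result = crawl.delay({'url': 'http://aqicn.org/city/beijing/', 'city': '北京'})
print(result.get(timeout=120))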
3. redis_read.py # pulls each value out of the Redis backend and deletes the original; it keeps polling so newly arriving results are read and removed as well. The consumed data can be displayed directly or written on to a MySQL database.
import pickle

import redis

rediscli = redis.StrictRedis(host='192.168.4.53', port=6379, db=3)

while True:
    # Keep pulling result keys out of Redis as the workers write them.
    for i in rediscli.keys():
        key = i.decode('utf-8')
        data = rediscli.get(key)
        try:
            # pickle.loads assumes the worker serializes results with pickle;
            # with Celery's default JSON serializer, use json.loads instead.
            print(pickle.loads(data))
            rediscli.delete(key)
        except Exception as e:
            print(e)
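One caveat with the loop above: KEYS inspects the whole keyspace in a single blocking call, which can stall Redis once many results pile up. A gentler variant, sketched below under the assumptions that the results are pickled and that Celery's Redis backend prefixes result keys with 'celery-task-meta-', walks the keyspace incrementally with SCAN:

import pickle
import time

import redis

rediscli = redis.StrictRedis(host='192.168.4.53', port=6379, db=3)

while True:
    # scan_iter walks the keyspace in small chunks instead of blocking
    # Redis the way keys() does.
    for key in rediscli.scan_iter(match='celery-task-meta-*'):
        data = rediscli.get(key)
        if data is None:
            continue  # key expired or was deleted between SCAN and GET
        try:
            print(pickle.loads(data))
            rediscli.delete(key)
        except Exception as e:
            print(e)
    time.sleep(1)  # avoid a busy loop while the backend is empty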