requests + Redis distributed crawler
```python
# __author__ = ''
# __createTime__ = '2019/1/7 13:49'
# __description__ = ''
# -*- coding:utf-8 -*-
from itertools import chain
from urllib.parse import quote
from concurrent.futures import ThreadPoolExecutor

from redis import Redis
import pymysql
import requests
from lxml import etree

'''redis + requests distributed crawling: the URL producer'''

# the original Redis host/password were redacted; point this at your own instance
redis_connect = Redis.from_url("redis://localhost:6379", decode_responses=True)

db = pymysql.connect(host='193.112.41.49', user='', password="",
                     database='spiders', port=3306, charset='utf8mb4')
cursor = db.cursor()


class Conton_Fair():
    def __init__(self, url):
        self.url = url
        self.headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Cookie': 'ASP.NET_SessionId=u1rolptswy22kite05yuu2dr; Hm_lvt_26d823f5326e82607b28c9dd5bb3276f=1546075438; Hm_lpvt_26d823f5326e82607b28c9dd5bb3276f=1546075438; _gcl_au=1.1.1828690268.1546075439; _ga=GA1.3.682141728.1546075439; _ym_uid=15460754431066088148; _ym_d=1546075443; ASPSESSIONIDSQARTRST=JBKMEFAABPPOIONCBCGLIDOM; cookie-notification=1; ASPSESSIONIDQASDDBCA=ODAOCGMCBGEJAHGFIDCKFJHL; _ctauu_469_1=%7B%22uuid%22%3A%22cp21gbzc66s18asqrg96%22%2C%22vsts%22%3A2%2C%22imps%22%3A%7B%7D%2C%22cvs%22%3A%7B%7D%7D; safedog-flow-item=; WT_FPC=id=2eedfbfb975c7db4e0b1546075438399:lv=1546830767948:ss=1546830613964',
            'Host': 'www.cantonfair.org.cn',
            'Pragma': 'no-cache',
            'Referer': self.url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
        }

    def Get_url(self):
        htmls = requests.get(url=self.url, headers=self.headers)
        html = etree.HTML(htmls.text)
        return self.Save_url(html)

    def Save_url(self, html):
        h4 = html.xpath('//li//h4')
        for Company in h4:
            if Company.xpath('.//text()'):
                link = (Company.xpath('./a/@href')[0]
                        .replace('Product', 'Company')
                        .split('&productid')[0] + '&corptype=1').replace('en', 'cn')
                # push the company link into the Redis cache
                redis_connect.sadd("urls", link)
        # next page
        Next = html.xpath('//a[text()="Next"]/@href')
        if Next:
            self.url = 'http://www.cantonfair.org.cn/en/search/%s' % Next[0]
            self.Get_url()


def main(kw):
    url_datas = quote(kw)
    # base URL reconstructed from the next-page handler above
    url = 'http://www.cantonfair.org.cn/en/search/list.aspx?k=%s&lang=2&len=100' % url_datas
    Class_Conton = Conton_Fair(url)
    Class_Conton.Get_url()


if __name__ == '__main__':
    # while True:
    ssql = """SELECT kw FROM words WHERE status=0 OR status=5 LIMIT 100"""
    cursor.execute(ssql)
    dataAll = cursor.fetchall()
    list_url = list(chain.from_iterable(dataAll))
    with ThreadPoolExecutor(3) as executor:
        for data_url in list_url:
            executor.submit(main, data_url)
            # parameterised query instead of %r string formatting
            cursor.execute("UPDATE words SET status=5 WHERE kw=%s", (data_url,))
            db.commit()
```
Here is my thinking on the distributed crawl: one machine fetches the listing pages and pushes every company URL into the cache. Fetching URLs is always faster than parsing them, and each listing page yields dozens of links, so even if every machine runs at the same speed, one fetching pass produces enough URLs to keep several machines parsing at once.
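The handoff works because a Redis set is shared state that every machine can reach: the fetcher SADDs links in, and each parser SPOPs one out atomically, so no two parsers ever receive the same URL. A minimal sketch of the pattern (the `urls` key matches the code above; the localhost address is a placeholder):

```python
from redis import Redis

# placeholder address: point this at the shared Redis instance
r = Redis.from_url("redis://localhost:6379", decode_responses=True)

# producer side: SADD ignores duplicates, so re-crawled links cost nothing
r.sadd("urls", "http://example.com/a", "http://example.com/b")

# consumer side: SPOP atomically removes and returns one member, so
# concurrent consumers on different machines never get the same URL
url = r.spop("urls")
while url is not None:
    print("would parse:", url)
    url = r.spop("urls")
```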
Next comes the parsing:
Because the data this site serves is loaded dynamically, and my JavaScript is weak and I didn't want to hunt down the functions behind it, I simply render the pages with Splash. It fills much the same role as Selenium, but Splash is faster, so that's what I chose. It runs as an HTTP rendering service (commonly via the `scrapinghub/splash` Docker image, listening on port 8050) and is worth reading up on.
```python
# __author__ = ''
# __createTime__ = '2019/1/7 15:20'
# __description__ = 'consumer: pull URLs from the cache and render them with Splash'
import requests
from redis import Redis

# the original Redis host was redacted; point this at the shared instance
redis_connect = Redis.from_url("redis://localhost:6379", decode_responses=True)


def splash_render(url):
    # Splash's render.html endpoint; assumes Splash is listening locally on 8050
    splash_url = "http://localhost:8050/render.html"
    args = {
        "url": url,
        "timeout": 5,
        "images": 0,  # skip image downloads, they only slow rendering down
    }
    response = requests.get(splash_url, params=args)
    return response.text


if __name__ == '__main__':
    # check whether the cache holds any URLs yet
    if redis_connect.exists("urls"):
        # pop a random URL and remove it; for dedup across runs,
        # consider a Bloom filter
        url = redis_connect.spop("urls")
        html = splash_render(url)
        print(html)
```
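About that dedup comment: before reaching for a Bloom filter, a second Redis set is often enough. `SADD` returns 1 only the first time a member is added, which gives an atomic "have I seen this?" test that works across every machine. A small sketch (the `seen_urls` key is my own name, not from the original code):

```python
def is_new(redis_conn, url):
    """Record the URL atomically; True only the first time it is seen."""
    return redis_conn.sadd("seen_urls", url) == 1

# inside the consumer, guard the expensive render:
url = redis_connect.spop("urls")
if url and is_new(redis_connect, url):
    html = splash_render(url)
    # ... parse html here ...
```

A Bloom filter buys the same check with far less memory at the cost of occasional false positives, which starts to matter at hundreds of millions of URLs; below that, a plain set is simpler.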
This consumer script can be copied to as many machines as you like and run in parallel to parse the results. That said, the above is only a bare-bones version; don't assume this is all distributed crawling takes.
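For example, a popped URL is gone from Redis, so if a worker crashes or Splash times out, that URL is lost for good. One hedged fix, reusing the names from the snippets above: catch the failure and push the URL back so another worker can retry it.

```python
import time

def consume_forever(redis_conn):
    """Pop and render URLs until stopped; requeue on failure so none are lost."""
    while True:
        url = redis_conn.spop("urls")
        if url is None:
            time.sleep(5)  # queue drained for now; wait for the producer
            continue
        try:
            html = splash_render(url)
            # ... parse and store html here ...
        except requests.RequestException:
            # rendering failed: return the URL to the set for another attempt
            redis_conn.sadd("urls", url)
```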
The above is meant as class notes; if any of it duplicates other material, please contact me.