requests + redis distributed crawler

# -*- coding:utf-8 -*-
# __author__ = ''
# __createTime__ = '2019/1/7 13:49'
# __description__ = ''
import random
from itertools import chain
from urllib.parse import quote
from concurrent.futures import ThreadPoolExecutor
from redis import Redis
import pymysql
import requests
from lxml import etree
'''redis + requests distributed crawling'''

redis_connect = Redis.from_url("redis://localhost:6379", decode_responses=True)  # point this at the shared Redis instance
db = pymysql.connect(host='193.112.41.49', user='', password="",
                     database='spiders', port=3306,
                     charset='utf8mb4')
cursor = db.cursor()

class CantonFair:
    def __init__(self, url):
        self.url = url
        self.headers = {
                        'Accept': '*/*',
                        'Accept-Encoding': 'gzip, deflate',
                        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7',
                        'Cache-Control': 'no-cache',
                        'Connection': 'keep-alive',
                        'Cookie': 'ASP.NET_SessionId=u1rolptswy22kite05yuu2dr; Hm_lvt_26d823f5326e82607b28c9dd5bb3276f=1546075438; Hm_lpvt_26d823f5326e82607b28c9dd5bb3276f=1546075438; _gcl_au=1.1.1828690268.1546075439; _ga=GA1.3.682141728.1546075439; _ym_uid=15460754431066088148; _ym_d=1546075443; ASPSESSIONIDSQARTRST=JBKMEFAABPPOIONCBCGLIDOM; cookie-notification=1; ASPSESSIONIDQASDDBCA=ODAOCGMCBGEJAHGFIDCKFJHL; _ctauu_469_1=%7B%22uuid%22%3A%22cp21gbzc66s18asqrg96%22%2C%22vsts%22%3A2%2C%22imps%22%3A%7B%7D%2C%22cvs%22%3A%7B%7D%7D; safedog-flow-item=; WT_FPC=id=2eedfbfb975c7db4e0b1546075438399:lv=1546830767948:ss=1546830613964',
                        'Host': 'www.cantonfair.org.cn',
                        'Pragma': 'no-cache',
                        'Referer':self.url,
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
                        }
    def Get_url(self):
        htmls = requests.get(url=self.url, headers=self.headers)
        html = etree.HTML(htmls.text)
        return self.Save_url(html)

    def Save_url(self, html):
        h4 = html.xpath('//li//h4')
        for Company in h4:
            if Company.xpath('.//text()'):
                link = (Company.xpath('./a/@href')[0]
                        .replace('Product', 'Company')
                        .split('&productid')[0]
                        + '&corptype=1').replace('en', 'cn')
                # push the company page URL into the shared cache
                redis_connect.sadd("urls", link)
        # next page
        Next = html.xpath('//a[text()="Next"]/@href')
        if Next:
            self.url = 'http://www.cantonfair.org.cn/en/search/%s' % Next[0]
            self.Get_url()


def main(kw):
    url_datas = quote(kw)
    url = 'http://www.cantonfair.org.cn/en/search/list.aspx?k=%s&lang=2&len=100' % url_datas
    spider = CantonFair(url)
    spider.Get_url()


if __name__ == '__main__':
    # while True:
    ssql = """SELECT kw FROM words WHERE status=0 OR status=5 LIMIT 100"""
    cursor.execute(ssql)
    dataAll = cursor.fetchall()
    list_url = list(chain.from_iterable(dataAll))
    with ThreadPoolExecutor(3) as executor:
        for data_url in list_url:
            executor.submit(main, data_url)
            # mark the keyword as dispatched; parameterized to avoid quoting bugs
            cursor.execute('''UPDATE words SET status=5 WHERE kw=%s''', (data_url,))
            db.commit()

My thinking for the distributed crawl is this: one machine crawls the listing pages and stores the URLs it finds in the cache. Fetching URLs is always faster than parsing them, and every page yields dozens of links, so even if all machines run at the same speed, one crawling pass collects enough URLs to keep several machines parsing at once.
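
As a minimal sketch of the consumer side of that split (assuming the same "urls" Redis set the crawler above fills; the consume helper and the one-second poll interval are my own illustration), each parsing machine can simply loop and pop:

import time
from redis import Redis

redis_connect = Redis.from_url("redis://localhost:6379", decode_responses=True)

def consume(parse):
    """Pop URLs from the shared set and hand each one to a parse callback."""
    while True:
        url = redis_connect.spop("urls")  # atomic pop, so two workers never get the same URL
        if url is None:
            time.sleep(1)  # set is empty: wait for the producer to refill it
            continue
        parse(url)

Because spop is atomic, any number of these workers can run against the same Redis instance without coordinating with each other.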

Next comes the parsing side:

The data this site needs is loaded dynamically. My JavaScript is fairly weak and I didn't want to hunt down the functions, so I just render the pages with Splash. It is similar to Selenium, but Splash is somewhat faster, so that's what I chose.

It's worth looking into.
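
If you haven't tried Splash: it ships as a Docker image and exposes an HTTP rendering API on port 8050, so a local instance is two commands away:

docker pull scrapinghub/splash
docker run -p 8050:8050 scrapinghub/splash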

# -*- coding:utf-8 -*-
# __author__ = ''
# __createTime__ = '2019/1/7 15:20'
# __description__ = 'brief code description'

import requests
from redis import Redis

redis_connect = Redis.from_url("redis://localhost:6379", decode_responses=True)  # same shared Redis as the crawler

def splash_render(url):
    splash_url = "http://localhost:8050/render.html"  # point this at your Splash instance

    args = {
        "url": url,
        "timeout": 5,
        "images": 0  # skip image downloads to speed up rendering
    }
    response = requests.get(splash_url, params=args)
    return response.text


if __name__ == '__main__':
    # check whether the cache holds any URLs
    if redis_connect.exists("urls"):
        # pop a random URL and remove it; for dedup, consider a Bloom filter
        url = redis_connect.spop("urls")
        html = splash_render(url)
        print(html)
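
On the dedup point in the comment above: a Bloom filter can live in Redis itself as a bitmap, so every worker shares it. A minimal sketch, assuming the redis_connect from the script above (the key name "seen_bloom", the bitmap size, and the double-hashing scheme are my own choices, not from the original post):

import hashlib

BLOOM_KEY = "seen_bloom"
BLOOM_SIZE = 1 << 24   # ~16M bits; size it to your expected URL volume
NUM_HASHES = 5

def _bit_positions(url):
    # derive NUM_HASHES positions from two md5 halves (standard double hashing)
    digest = hashlib.md5(url.encode("utf-8")).hexdigest()
    h1 = int(digest[:16], 16)
    h2 = int(digest[16:], 16)
    return [(h1 + i * h2) % BLOOM_SIZE for i in range(NUM_HASHES)]

def seen_before(url):
    """Return True if url was (probably) seen; otherwise mark it and return False."""
    positions = _bit_positions(url)
    if all(redis_connect.getbit(BLOOM_KEY, p) for p in positions):
        return True  # probably a duplicate (false positives possible, no false negatives)
    for p in positions:
        redis_connect.setbit(BLOOM_KEY, p, 1)
    return False

The crawler side would then call seen_before(link) before sadd-ing a link, skipping anything that comes back True.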

This parsing script can be copied to any number of machines and run at the same time. Of course, the above is only a bare-bones version; don't assume that's all there is to a distributed crawler.
