使用代理处理反爬抓取微信文章

# 使用微信处理反爬抓取微信文章
# 一、引入模块
from pyquery import PyQuery as pq
import requests
from urllib.parse import urlencode
import pymongo
from config import *

# 参数设置
headers = {
    'Cookie':'IPLOC=CN3100; SUID=3F9A2D651620940A00000000593501AD; SUV=1496646179483423; ABTEST=2|1496646063|v1; SNUID=63C771395C590C6C4DA62B9E5DA843E5; weixinIndexVisited=1; ppinf=5|1496647654|1497857254|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTclOEUlOEIlRTUlODUlODklRTYlOUQlQjB8Y3J0OjEwOjE0OTY2NDc2NTR8cmVmbmljazoyNzolRTclOEUlOEIlRTUlODUlODklRTYlOUQlQjB8dXNlcmlkOjQ0Om85dDJsdUJzVzdIYXl4M2tlWFpjZTdWWmNNX2NAd2VpeGluLnNvaHUuY29tfA; pprdig=xFuTU5F3rYPr-GxEdzubwrQZ7jX7ifkrTXkYt2AR7gz17xFKLcIlD5r91dsYOnH_RDub9VxG8vNpHf5fwEjxAs4qFEJTqW96oVvr1UZq3qXq-AhGxJEDqlo8g5O3ZXy_F80B8YndLpUVbWeQDfJFlrwBlQ-3PXME0lxEDeguSyY; sgid=08-28925681-AVk1BibbKicbvQn77cbUV9RKo; SUIR=63C771395C590C6C4DA62B9E5DA843E5; pgv_pvi=3861214208; pgv_si=s8617661440; PHPSESSID=3rs7b393svg890cc66tv5kp942; sct=3; JSESSIONID=aaaTpSE3q_s21beotLIXv; ppmdig=14967350130000006401a6de8aa6584a3ed50839343b064b; seccodeRight=success; successCount=2|Tue, 06 Jun 2017 07:50:44 GMT',
    'Host':'weixin.sogou.com',
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
proxy = None
# 二、请求解析模块
# 1、构造url,进行微信关键词搜索
def get_url(KEYWORDS, page_num):
    base_url = 'http://weixin.sogou.com/weixin?'
    data = {
               'query': KEYWORDS,
                'type': '2',
                'page': page_num
    }
    data = urlencode(data)
    url = base_url + data
    return url

# 2、请求url,得到索引页html
def get_index_html(url):
    print('正在爬取', url)
    global proxy
    try:
        if proxy:
            print('正在使用代理',proxy)
            proxies = {
                'http':'http://' + proxy
            }
            response = requests.get(url, headers = headers, allow_redirects = False,proxies = proxies)
        else:
            response = requests.get(url, headers=headers, allow_redirects=False)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # Need proxy
            print('302')
            proxy = get_proxy()
            if proxy:
                return get_index_html(url)
            else:
                print('请求代理失败')
                return None

    except Exception:
        proxy = get_proxy()
        return get_index_html(url)

# 3、请求url过程中,可能会遇到反爬虫措施,这时候需要开启代理
def get_proxy():
    print('正在请求代理')
    try:
        response = requests.get(POOL_PROXY_URL)
        if response.status_code == 200:
            return response.text
        else:
            print('请求代理失败')
            return None
    except Exception:
        return None

# 4、分析索引页html代码,返回微信详情页url
def get_article_url(index_html):
    doc = pq(index_html)
    lis = doc('.news-box .news-list li').items()
    for item in lis:
        yield item.find('h3 a').attr('href')

# 5、请求微信详情页url,得到详情页html
def get_article_html(article_url):
    try:
        response = requests.get(article_url)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except ConnectionError:
        return None

# 6、分析详情页html代码,得到微信标题、公众号、发布日期,文章内容等信息
def parse_article_html(article_html):
    doc = pq(article_html)
    article_data = {
        'title':doc('#img-content .rich_media_title').text(),
        'nickname':doc('.rich_media_meta_list .rich_media_meta_nickname').text(),
        'date':doc('.rich_media_meta_list #post-date').text(),
        'wechat':doc('#js_profile_qrcode > div > p:nth-child(3) > span').text(),
        'content':doc('.rich_media_content').text()
    }
    # print(article_data)
    return article_data

# 三、存储模块
# 保存到数据库MongoDB
def save_to_mongo(article_data):
    try:
        if db[MONGO_TABLE].insert(article_data):
            print('保存到MONGODB成功',article_data)
    except Exception:
        print('保存到MONGODB失败!',article_data)

# 四、调试模块
def main():
    for page_num in range(1, 101):
        index_url = get_url(KEYWORDS, page_num)
        # print(index_url)
        index_html = get_index_html(index_url)
        if index_html:
            article_urls = get_article_url(index_html)
            for article_url in article_urls:
                # print(article_url)
                article_html = get_article_html(article_url)
                if article_html:
                    article_data = parse_article_html(article_html)
                    print(article_data)
                    # save_to_mongo(article_data)


if __name__ == '__main__':
    main()

#配置文件
MONGO_URL = 'localhost'
MONGO_DB = 'weixin'
MONGO_TABLE = 'article_data'

KEYWORDS= '风景'

POOL_PROXY_URL = 'http://127.0.0.1:5000/get'

 

posted @ 2017-06-06 09:47  道高一尺  阅读(2507)  评论(1编辑  收藏  举报