爬取微信公众号文章
搜狗对微信公众平台的公众号和文章做了整合，因此可以通过搜狗微信搜索来抓取公众号文章；当请求被重定向（302）时，改用代理继续爬取。
spider.py
"""Crawl WeChat official-account articles through Sogou's WeChat search.

Index pages are fetched from ``weixin.sogou.com`` (switching to a proxy
taken from a proxy pool when the site answers with a 302), article links
are extracted from each index page, every article is downloaded and
parsed, and the result is upserted into MongoDB keyed by article title.
"""
from urllib.parse import urlencode

import pymongo
import requests
from lxml.etree import XMLSyntaxError
from pymongo.errors import PyMongoError
from pyquery import PyQuery as pq
from requests.exceptions import ConnectionError, Timeout

from config import KEYWORD, MAX_COUNT, MONGO_DB, MONGO_URI, PROXY_POOL_URL

client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]

base_url = 'http://weixin.sogou.com/weixin?'

# Seconds to wait for any single HTTP request before giving up on it.
# Without a timeout a stalled server or proxy would hang the crawler forever.
REQUEST_TIMEOUT = 10

# Number of search-result pages requested per run.
MAX_PAGE = 100

# NOTE(review): the Cookie below is a captured login session hard-coded into
# the source; it expires and should not be committed to a shared repository.
headers = {
    'Cookie': 'IPLOC=CN1100; SUID=194E796A2E08990A000000005B114E85; SUV=1527860869604056; ABTEST=1|1527860872|v1; SNUID=9FCBFCEF8680EB12510E6A9C86088B29; weixinIndexVisited=1; JSESSIONID=aaaqa95rD87Zu9-CJwlnw; sct=5; ppinf=5|1527862844|1529072444|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTclOEUlOEIlRTclOTAlQjMlRTYlOUQlQjB8Y3J0OjEwOjE1Mjc4NjI4NDR8cmVmbmljazoyNzolRTclOEUlOEIlRTclOTAlQjMlRTYlOUQlQjB8dXNlcmlkOjQ0Om85dDJsdUh5bE5VSDJEVWNuSHBDWnVOVG9sN2tAd2VpeGluLnNvaHUuY29tfA; pprdig=EZE8CVVtoUTqmCoJj6bEWwKngY4di5UpGDFImTA9-1qrMK_tIJEtUyGR9_0Jcv5Xw1EuqLO9BNFvAKQv5DOQvmCWh-jxudk7SGv89NuhCLow7dxPysoOtLSI-keSaKVLKT82Vhg7rDBg0SlQ3y2uiG53lBUWL0wLVw4D_f_7MLg; sgid=17-35315605-AVsRVjwpV4ichpAzPibp6olGY; ppmdig=1527862844000000243bdb95cb03e086685bb1de06087c32',
    'Host': 'weixin.sogou.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.7 Safari/537.36'
}

# Proxy ("host:port") currently used for index requests; None means direct.
proxy = None


def get_proxy():
    """Fetch one proxy address from the proxy pool.

    Returns:
        The pool's response body with surrounding whitespace removed, or
        None when the pool is unreachable, times out, answers with a
        non-200 status, or returns an empty body.
    """
    try:
        response = requests.get(PROXY_POOL_URL, timeout=REQUEST_TIMEOUT)
    except (ConnectionError, Timeout):
        return None
    if response.status_code != 200:
        return None
    # A stray newline would corrupt the proxy URL built in get_html.
    return response.text.strip() or None


def get_html(url, count=1):
    """Download *url*, rotating proxies on 302 responses and network errors.

    Every retry (whether caused by a 302 or by a connection error/timeout)
    increments ``count``, so the number of attempts is always bounded.
    Previously a 302 restarted the counter, which recursed without limit
    when every proxy was answered with a redirect.

    Args:
        url: Address of the page to fetch.
        count: Attempt counter to start from; the function gives up as soon
            as it reaches ``MAX_COUNT``.

    Returns:
        The response body on HTTP 200; None when the retry budget is used
        up, no proxy could be obtained after a 302, or the server answered
        with any other status code.
    """
    global proxy
    while True:
        print('Crawling', url)
        print('Trying Count', count)
        if count >= MAX_COUNT:
            print('Tried Too Many Counts')
            return None

        proxies = {'http': 'http://' + proxy} if proxy else None
        try:
            # Redirects are not followed so that a 302 stays visible here.
            response = requests.get(
                url,
                allow_redirects=False,
                headers=headers,
                proxies=proxies,
                timeout=REQUEST_TIMEOUT,
            )
        except (ConnectionError, Timeout) as e:
            print('Error Occurred', e.args)
            proxy = get_proxy()
            count += 1
            continue

        if response.status_code == 200:
            return response.text
        if response.status_code != 302:
            return None

        # A 302 is treated as the signal that a (different) proxy is needed.
        print('302')
        proxy = get_proxy()
        if not proxy:
            print('Get Proxy Failed')
            return None
        print('Using Proxy', proxy)
        count += 1


def get_index(keyword, page):
    """Fetch one search-result page for *keyword*.

    Args:
        keyword: Search term sent as the ``query`` parameter.
        page: Result page number sent as the ``page`` parameter.

    Returns:
        The page HTML, or None if get_html could not retrieve it.
    """
    data = {
        'query': keyword,
        'type': 2,
        'page': page
    }
    return get_html(base_url + urlencode(data))


def parse_index(html):
    """Yield the article links found on a search-result page.

    Yields nothing when the markup cannot be parsed, and skips anchors
    without an ``href`` so callers never receive None.
    """
    try:
        doc = pq(html)
    except XMLSyntaxError:
        return
    for item in doc('.news-box .news-list li .txt-box h3 a').items():
        href = item.attr('href')
        if href:
            yield href


def get_detail(url):
    """Download an article page.

    Returns:
        The response body on HTTP 200; None on any other status, on a
        connection error, or on a timeout.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except (ConnectionError, Timeout):
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_detail(html):
    """Extract the article fields from an article page.

    Returns:
        A dict with the keys ``title``, ``content``, ``date``, ``nickname``
        and ``wechat`` (each the text of the matching element), or None
        when the markup cannot be parsed.
    """
    try:
        doc = pq(html)
    except XMLSyntaxError:
        return None
    return {
        'title': doc('.rich_media_title').text(),
        'content': doc('.rich_media_content').text(),
        'date': doc('#publish_time').text(),
        'nickname': doc('#js_profile_qrcode > div > strong').text(),
        'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
    }


def save_to_mongo(data):
    """Upsert an article into the ``articles`` collection, keyed by title.

    ``Collection.update`` is deprecated (and removed in PyMongo 4), so
    ``update_one`` with ``upsert=True`` is used instead. Database errors are
    reported rather than propagated so one failed write does not stop the
    crawl.
    """
    try:
        db['articles'].update_one(
            {'title': data['title']}, {'$set': data}, upsert=True
        )
    except PyMongoError as e:
        print('Saved to Mongo Failed', data['title'], e)
    else:
        print('Saved to Mongo', data['title'])


def main():
    """Crawl result pages 1..MAX_PAGE for KEYWORD and store every article."""
    for page in range(1, MAX_PAGE + 1):
        html = get_index(KEYWORD, page)
        if not html:
            continue
        for article_url in parse_index(html):
            article_html = get_detail(article_url)
            if not article_html:
                continue
            article_data = parse_detail(article_html)
            print(article_data)
            if article_data:
                save_to_mongo(article_data)


if __name__ == '__main__':
    main()
config.py
"""Settings for the Sogou/WeChat article spider."""

# Endpoint of the proxy pool; the spider issues a GET to it and uses the
# response body as the proxy address.
PROXY_POOL_URL = 'http://127.0.0.1:5555/random'

# Search keyword submitted to the Sogou WeChat search.
KEYWORD = 'python'

# MongoDB connection target and the database that stores the articles.
MONGO_URI = 'localhost'
MONGO_DB = 'weixin'

# Retry ceiling for index-page downloads: the spider's attempt counter starts
# at 1 and it gives up once the counter reaches this value.
MAX_COUNT = 5