A crawler for Sina Weibo comment data
# -*- coding:utf-8 -*-
import requests
import json
import os
import time
import random
from multiprocessing.dummy import Pool as ThreadPool
from bs4 import BeautifulSoup


class CommentCrawl(object):
    '''
    Crawls the comment data of a Sina Weibo post.
    '''
    headers = {
        'User-Agent': '',  # fill in a browser User-Agent string
        'Cookie': ''}      # fill in a logged-in weibo.com cookie
    ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    all_comment = []

    def __init__(self, url, file_name):
        self.url = url
        self.file_name = file_name

    def base62_decode(self, string, alphabet=ALPHABET):
        # Plain base62 -> integer conversion over the alphabet above.
        base = len(alphabet)
        strlen = len(string)
        num = 0
        for idx, char in enumerate(string):
            power = strlen - (idx + 1)
            num += alphabet.index(char) * (base ** power)
        return num

    def parser_url(self):
        # The last path segment of the post URL (e.g. 'EFdPHe50Z') is the
        # post's mid, base62-encoded in three chunks. Every chunk after the
        # first stands for a 7-digit group of the decimal mid, so it must be
        # zero-padded back to 7 digits before concatenation.
        code = self.url.split('?')[0].split('/')[-1]
        id1 = self.base62_decode(code[0])
        id2 = self.base62_decode(code[1:5])
        id3 = self.base62_decode(code[5:])
        mid = str(id1) + str(id2).zfill(7) + str(id3).zfill(7)
        comment_url = ('http://weibo.com/aj/v6/comment/big?ajwvr=6&id='
                       + mid + '&root_comment_max_id_type=0&page={}')
        return comment_url

    def get_url_page(self):
        # The first page of the comment API reports the total page count.
        r = requests.get(self.parser_url().format(1), headers=self.headers)
        data = json.loads(r.text)
        total_page = data['data']['page']['totalpage']
        return total_page

    def all_urls(self):
        return [self.parser_url().format(i + 1) for i in range(self.get_url_page())]

    def comment_parser(self, html):
        # Each comment sits in a '.WB_text' node as 'username: comment text';
        # keep only the part after the colon.
        soup = BeautifulSoup(html, 'html.parser')
        data = soup.select('.WB_text')
        comment = [i.text.split(':')[-1] for i in data]
        return comment

    def final_text(self, url):
        # Fetch one page of comments and return just that page as a string
        # (joining the accumulated list here would rewrite earlier pages
        # into the file on every call).
        r = requests.get(url, headers=self.headers)
        time.sleep(random.randint(1, 5))  # random delay to avoid rate limiting
        data = json.loads(r.text)
        html = data['data']['html']
        page_comments = self.comment_parser(html)
        self.all_comment += page_comments
        print(len(self.all_comment))  # progress: total comments collected so far
        return ''.join(page_comments)

    def save_file(self, url):
        # Append this page's comments to <file_name>.txt in the working directory.
        file = os.path.join(os.getcwd(), self.file_name + '.txt')
        with open(file, 'a+', encoding='utf-8') as f:
            f.write(self.final_text(url))


if __name__ == "__main__":
    crawler = CommentCrawl(
        'http://weibo.com/2202387347/EFdPHe50Z?from=page_1006062202387347_profile&wvr=6&mod=weibotime',
        '小米6发布会')
    all_link = crawler.all_urls()
    pool = ThreadPool(4)
    pool.map(crawler.save_file, all_link)
    pool.close()
    pool.join()
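The least obvious step above is parser_url: the trailing segment of a weibo.com post URL (e.g. 'EFdPHe50Z') is the post's numeric mid, base62-encoded. Below is a minimal standalone sketch of that decoding, assuming the usual Weibo scheme: the code is split from the right into 4-character chunks, each chunk is base62-decoded, and every chunk except the first is zero-padded to 7 digits before the pieces are concatenated. The helper name url_code_to_mid is made up for illustration.

# A standalone sketch of the mid decoding used by parser_url above,
# generalized to codes of any length.
ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

def base62_decode(s, alphabet=ALPHABET):
    # Horner's method; equivalent to the positional sum in the class above.
    num = 0
    for char in s:
        num = num * len(alphabet) + alphabet.index(char)
    return num

def url_code_to_mid(code):
    # Split from the right into 4-character chunks:
    # 'EFdPHe50Z' -> ['E', 'FdPH', 'e50Z'].
    chunks = [code[max(0, i - 4):i] for i in range(len(code), 0, -4)][::-1]
    parts = [str(base62_decode(chunks[0]))]
    # Chunks after the first represent 7-digit groups of the decimal mid,
    # so their leading zeros must be restored.
    parts += [str(base62_decode(c)).zfill(7) for c in chunks[1:]]
    return ''.join(parts)

print(url_code_to_mid('EFdPHe50Z'))  # 4098246253355873

Without the zfill padding, any chunk that decodes to fewer than 7 digits would silently produce a wrong id, which is why the class version pads id2 and id3 as well.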