Crawling Tencent Weibo Users' Followings and Listeners
This crawler walks users' followings and listeners breadth-first. Tencent Weibo has already shut down and the login code floating around online is all outdated; I couldn't reverse-engineer the login flow myself, so I simply copied the cookies out of a logged-in browser session, which is enough to fetch the content that requires login.
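A minimal sketch of that shortcut, assuming you copy the raw Cookie header out of the browser's DevTools (the string below is a made-up placeholder, not a real Tencent cookie):

# A made-up placeholder Cookie header; in practice, copy it from DevTools.
raw_cookie = "uin=o0123456789; skey=@abcdefg; luin=o0123456789"

cookies = {}
for pair in raw_cookie.split("; "):
    key, _, value = pair.partition("=")
    cookies[key] = value

# requests then sends the session cookies with every call, so no login
# flow is needed as long as the copied session stays valid:
# requests.get(url, headers=headers, cookies=cookies)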
Because the service is shut down, at most 40 pages of each list can be fetched. The output rows have the form [source, target], meaning source follows target. The same edge can be discovered twice: crawling from source finds target in source's following list, and crawling from target finds source in target's listener list, so deduplication is needed.
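The dedup idea, as a minimal sketch (the names here are illustrative; the real code below tracks per-user adjacency lists in self.unique):

seen = set()  # ordered (source, target) pairs already written

def add_edge(source, target, writer):
    # The directed edge source -> target can surface twice: once in
    # source's following list and once in target's listener list.
    # A set of ordered pairs drops the second occurrence.
    if (source, target) not in seen:
        seen.add((source, target))
        writer.writerow([source, target])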
It collects roughly 20,000 records an hour. I just wrote it straight through, single-threaded; no threads.
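If you did want to speed it up, the per-page fetches are the natural thing to parallelize. A hedged sketch with a thread pool (not part of the original code; fetch_page is a hypothetical callback, and the shared csv writers and dicts in the class below would need a lock first):

from concurrent.futures import ThreadPoolExecutor

def fetch_all_pages(fetch_page, pages=range(1, 41), workers=8):
    # fetch_page(p) downloads and parses one page; it should swallow
    # its own errors, as the loops in getFans/getIdol already do.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(fetch_page, pages))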
My trash code follows:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@author: Starry
@file: Tencentweibo.py
@time: 2018/7/15 9:50
'''
import requests
from bs4 import BeautifulSoup
from queue import Queue
import time
import datetime
import json
import csv
import os

# Paste the cookies copied from a logged-in browser session here.
cookies = {
}

headers = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Host": "api.t.qq.com",
    "Pragma": "no-cache",
    "Referer": "http://api.t.qq.com/proxy.html",
    "rf": "http://t.qq.com/anjianbin1979/following?t=1#u=anjianbin1979&t=1&st=1&p=2",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
}


class TencentWeibo:

    def __init__(self, start_name, start_title):
        self.start_name = start_name
        self.start_title = start_title
        self.que = Queue()      # BFS queue of user names to visit
        self.nameToId = {}      # user name -> numeric id
        self.current_num = 1    # highest id assigned so far
        self.visName = []       # users already visited
        self.unique = {}        # id -> list of followed ids, for dedup
        self.init_exe()

    def init_exe(self):
        # information.csv maps ids to user names; data.csv holds the edges.
        # If they already exist, reload them so an interrupted crawl resumes.
        if not os.path.exists('information.csv'):
            self.csv_information = csv.writer(open('information.csv', 'a', newline='', encoding='utf-8'), dialect='excel')
            self.csv_information.writerow(['id', 'user', 'name'])
            self.csv_information.writerow([1, self.start_name, self.start_title])
            self.que.put(self.start_name)
            self.nameToId[self.start_name] = self.current_num
            self.unique[self.nameToId[self.start_name]] = []
        else:
            with open('information.csv', 'r', encoding='utf-8') as f:
                csvFile = csv.reader(f, dialect='excel')
                for index, item in enumerate(csvFile):
                    if index == 0:
                        continue
                    self.que.put(item[1])
                    self.nameToId[item[1]] = int(item[0])
                    self.current_num = int(item[0])
            self.csv_information = csv.writer(open('information.csv', 'a', newline='', encoding='utf-8'), dialect='excel')
        if not os.path.exists('data.csv'):
            self.csv_data = csv.writer(open('data.csv', 'a', newline='', encoding='utf-8'), dialect='excel')
            self.csv_data.writerow(['Source', 'Target'])
        else:
            FLAG = 0
            with open('data.csv', 'r', encoding='utf-8') as f:
                csvFile = csv.reader(f, dialect='excel')
                for index, item in enumerate(csvFile):
                    if index == 0:
                        continue
                    id1, id2 = int(item[0]), int(item[1])
                    if id1 not in self.unique.keys():
                        self.unique[id1] = []
                    if id2 not in self.unique.keys():
                        self.unique[id2] = []
                    self.unique[id1].append(id2)
                    FLAG = min(id1, id2)
            # Skip the users whose edges were already written before the restart.
            while not self.que.empty():
                name = self.que.get()
                id = self.nameToId[name]
                if id == FLAG:
                    break
                else:
                    self.visName.append(name)
            self.csv_data = csv.writer(open('data.csv', 'a', newline='', encoding='utf-8'), dialect='excel')
        print('Starting the crawl!!!')

    def DealHtml(self, html, Flag, name):
        # Flag == 1: html is name's following list; Flag == 2: name's listeners.
        soup = BeautifulSoup(html, 'html.parser')
        li = soup.find_all('div', attrs={"class": "userName"})
        for child in li:
            try:
                id = child.find('a').get('href')[1:]
                title = child.find('a').string
                if id not in self.nameToId.keys():
                    self.current_num += 1
                    self.nameToId[id] = self.current_num
                    self.que.put(id)
                    self.csv_information.writerow([self.current_num, id, title])
                if self.nameToId[id] not in self.unique.keys():
                    self.unique[self.nameToId[id]] = []
                if self.nameToId[name] not in self.unique.keys():
                    # Guard for resumed runs where name has no edge list yet.
                    self.unique[self.nameToId[name]] = []
                if Flag == 1:
                    # name follows id
                    if self.nameToId[id] not in self.unique[self.nameToId[name]]:
                        self.unique[self.nameToId[name]].append(self.nameToId[id])
                        self.csv_data.writerow([self.nameToId[name], self.nameToId[id]])
                elif Flag == 2:
                    # id follows name
                    if self.nameToId[name] not in self.unique[self.nameToId[id]]:
                        self.unique[self.nameToId[id]].append(self.nameToId[name])
                        self.csv_data.writerow([self.nameToId[id], self.nameToId[name]])
            except Exception as e:
                print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e)

    def getFans(self, name):
        # Page through name's listeners (t=2); the dead site returns at most 40 pages.
        for i in range(1, 41):
            try:
                ctime = str(int(time.time() * 1000))
                url = "http://api.t.qq.com/relations/follow_apollo.php?u={0}&t=2&st=1&p={1}&apiType=14&apiHost=http://api.t.qq.com&_r={2}&g_tk=325301840".format(
                    name, str(i), ctime)
                ret = requests.get(url=url, headers=headers, cookies=cookies, timeout=10)
                ret_json = json.loads(ret.text)
                if "info" in ret_json.keys():
                    self.DealHtml(ret_json['info'], 2, name)
                else:
                    break
            except Exception as e:
                print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e)

    def getIdol(self, name):
        # Page through name's followings (t=1), capped at 40 pages as well.
        for i in range(1, 41):
            try:
                ctime = str(int(time.time() * 1000))
                url = "http://api.t.qq.com/relations/follow_apollo.php?u={0}&t=1&st=1&p={1}&apiType=14&apiHost=http://api.t.qq.com&_r={2}&g_tk=325301840".format(
                    name, str(i), ctime)
                ret = requests.get(url=url, headers=headers, cookies=cookies, timeout=10)
                ret_json = json.loads(ret.text)
                if "info" in ret_json.keys():
                    self.DealHtml(ret_json['info'], 1, name)
                else:
                    break
            except Exception as e:
                print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e)

    def start(self):
        # Breadth-first walk: pop a user, then fetch both relation lists.
        while not self.que.empty():
            visiter = self.que.get()
            if visiter not in self.visName:
                self.visName.append(visiter)
                self.getIdol(visiter)
                self.getFans(visiter)


class TencentWeiboArticles:
    # Unfinished stub for crawling posts.
    def __init__(self):
        self.que = Queue()
        self.IdToInformation = {}

    def start(self):
        pass


weibo = TencentWeibo('xie_na', '谢娜')
weibo.start()