QQ空间爬虫--获取好友信息
QQ空间网页版:https://user.qzone.qq.com/
登陆后,进入设置,有一个权限设置,设置“谁能看我的空间”为好友可见,然后构造爬虫。
(1)获取Cookie
两种方式:
第一种:通过chrome F12慢慢找获取
第二种:selenium模拟登陆获取
"""Log in to Qzone with Selenium and save the session cookies as JSON.

Fill in ``QQ_num`` and ``QQ_s`` before running. The cookies are written to
``cookie_dict.txt`` as a flat ``{name: value}`` mapping.
"""
import json
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By

QQ_num = ''  # QQ account number (fill in before running)
QQ_s = ''  # QQ password (fill in before running)

# Upper bound on how many one-second polls to make while waiting for the
# post-login cookies to appear.
MAX_LOGIN_WAIT_SECONDS = 30

driver = webdriver.Firefox()
try:
    driver.get('https://user.qzone.qq.com/' + QQ_num + '/main')
    driver.switch_to.frame('login_frame')

    # Switch the login widget to account/password mode and submit.
    # find_element(By.ID, ...) is used because the find_element_by_* helpers
    # were removed in Selenium 4.3; this form also works on older releases.
    driver.find_element(By.ID, 'switcher_plogin').click()
    driver.find_element(By.ID, 'u').send_keys(QQ_num)
    driver.find_element(By.ID, 'p').send_keys(QQ_s)
    driver.find_element(By.ID, 'login_button').click()

    # Poll for the cookies instead of sleeping once: a single fixed delay can
    # capture the cookies before the login has completed. 'p_skey' is the
    # cookie the friend-list script later reads to compute g_tk, so its
    # presence is used as the "login finished" signal.
    for _ in range(MAX_LOGIN_WAIT_SECONDS):
        sleep(1)
        cookies = driver.get_cookies()
        if any(c.get('name') == 'p_skey' for c in cookies):
            break
    else:
        # Best effort, as before: still save whatever cookies exist, but say so.
        print('Warning: p_skey cookie not found after waiting; '
              'the saved cookies may be incomplete.')
finally:
    # Always release the browser, even if a step above failed.
    driver.quit()

# Save the cookies locally as a flat name -> value mapping.
cookie_dic = {
    c['name']: c['value']
    for c in cookies
    if 'name' in c and 'value' in c
}
with open('cookie_dict.txt', 'w') as f:
    json.dump(cookie_dic, f)
(2)构造链接
这个从知乎上看别人的思路,主要是g_tk的构造
(3)获取所有好友信息(主要是昵称和QQ号)
"""Fetch the logged-in user's Qzone friend list and export it to CSV."""
import csv
import json
import re
import urllib.parse

import requests


class Qzone:
    """Small client for the Qzone friend-list endpoint.

    Args:
        cookie: Mapping of cookie name -> value for a logged-in session. Must
            contain ``p_skey`` and ``ptui_loginuin``. When omitted, the
            module-level ``cookie`` variable is used.
        header: Mapping of HTTP request headers. When omitted, the
            module-level ``header`` variable is used.
    """

    def __init__(self, cookie=None, header=None):
        self._cookie = cookie
        self._header = header

    def _cookies(self):
        """Return the session cookies (explicit argument, else the global)."""
        if self._cookie is not None:
            return self._cookie
        # Fallback kept so a bare ``Qzone()`` that relies on the module-level
        # ``cookie`` variable keeps working.
        return cookie

    def _headers(self):
        """Return the request headers (explicit argument, else the global)."""
        if self._header is not None:
            return self._header
        # Same fallback as for the cookies, using the module-level ``header``.
        return header

    def get_gtk(self):
        """Compute the ``g_tk`` request token from the ``p_skey`` cookie.

        Returns:
            The hash of ``p_skey`` (seed 5381, ``h += (h << 5) + ord(ch)`` per
            character), masked to its low 31 bits.

        Raises:
            KeyError: If the cookies do not contain ``p_skey``.
        """
        h = 5381
        for ch in self._cookies()['p_skey']:
            h += (h << 5) + ord(ch)
        return h & 2147483647

    def get_uin(self):
        """Return the ``ptui_loginuin`` cookie value, sent as ``uin``.

        Raises:
            KeyError: If the cookies do not contain ``ptui_loginuin``.
        """
        return self._cookies()['ptui_loginuin']

    def get_qq(self):
        """Write all friends to ``friends.csv`` and return their QQ numbers.

        Each CSV row is ``[name, uin]``, in the order returned by
        :meth:`get_friend`.

        Returns:
            List with the ``uin`` of every friend.
        """
        friend_list = self.get_friend()

        # ``with`` guarantees the file is closed even if a row fails to write.
        # NOTE(review): the encoding is left at the platform default, as in
        # the original; names outside that encoding will fail to write.
        with open('friends.csv', 'w', newline='') as csvfile:
            csv.writer(csvfile, dialect='excel').writerows(friend_list)

        return [friend[1] for friend in friend_list]

    def get_friend(self):
        """Request the friend list and return it as ``[name, uin]`` pairs.

        Returns:
            List of ``[name, uin]`` lists, one per entry in the response's
            ``data.items_list``.

        Raises:
            ValueError: If the response body does not contain a parenthesised
                payload, or that payload is not valid JSON.
            KeyError: If the decoded payload lacks the expected keys.
        """
        url_friend = 'https://user.qzone.qq.com/proxy/domain/r.qzone.qq.com/cgi-bin/tfriend/friend_ship_manager.cgi?'
        params = {
            'uin': self.get_uin(),
            'do': 1,
            'g_tk': self.get_gtk(),
        }
        url_friend += urllib.parse.urlencode(params)

        # A timeout keeps the script from hanging forever on a stalled server.
        res = requests.get(
            url_friend,
            headers=self._headers(),
            cookies=self._cookies(),
            timeout=10,
        )

        # The JSON payload is taken from between the first "(" and the last
        # ")" of the response body.
        payloads = re.findall(r'\((.*)\)', res.text, re.S)
        if not payloads:
            raise ValueError(
                'Unexpected friend-list response (no wrapped JSON payload); '
                'the cookies may be missing or expired.'
            )
        friend_dict = json.loads(payloads[0])

        # Collect each friend's display name and QQ number as a [name, uin] pair.
        return [
            [friend['name'], friend['uin']]
            for friend in friend_dict['data']['items_list']
        ]


if __name__ == '__main__':
    # Kept at module level so it can be shared conveniently by later steps.
    relationships = []
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0",
        # Fixed header name: the standard request header is "Accept-Language".
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    }
    # Load the cookies saved by the login step.
    with open('cookie_dict.txt', 'r') as f:
        cookie = json.load(f)

    qzone = Qzone(cookie, header)
    # Fetch the friends, write friends.csv, and keep the list of QQ numbers.
    qq_list = qzone.get_qq()