A Multithreaded Crawler Example: Scraping Jiayuan.com (世纪佳缘)
1. Requirements
Scrape the profiles of women on Jiayuan.com who are 20-28 years old, located in Beijing, and have a photo. For each profile, collect the nickname, marital status, height, age, education, work location, self-introduction, partner requirements, profile page link, and profile photo link.
2. Analysis
The site loads its paginated data via JavaScript, so the profile list is not in the page HTML itself. Capturing the traffic with the Fiddler packet-capture tool shows:
URL: https://search.jiayuan.com/v2/search_v2.php
Request method: POST
Form data to submit:
{
    'sex': 'f',
    'key': '',
    'stc': '1:11,2:20.28,23:1',  # search filters; this value appears to encode Beijing, age 20-28, has photo
    'sn': 'default',
    'p': '1',  # page number; change this parameter to get the JSON data of different pages
    'f': '',
    'sv': '1',
    'listStyle': 'bigPhoto',
    'pri_uid': '0',
    'jsversion': 'v5'
}
Return value: the response is only valid JSON after its head and tail are trimmed off.
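You can verify this with a single request before writing the full spider. The following is a minimal sketch that assumes the form data above; the wrapper-stripping regex and the userInfo/nickname field names are the same ones used by the spider in section 3.

import re
import json
import requests

url = 'https://search.jiayuan.com/v2/search_v2.php'
formdata = {
    'sex': 'f', 'key': '', 'stc': '1:11,2:20.28,23:1', 'sn': 'default',
    'p': '1', 'f': '', 'sv': '1', 'listStyle': 'bigPhoto',
    'pri_uid': '0', 'jsversion': 'v5',
}
response = requests.post(url, data=formdata)
# The body is a JSON object wrapped in extra characters at both ends;
# keeping only the outermost {...} leaves a parseable JSON string.
payload = json.loads(re.search(r'{.+}', response.text, re.S).group())
print(len(payload['userInfo']))            # profiles returned on this page
print(payload['userInfo'][0]['nickname'])  # first nickname, to confirm parsing works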
3. Writing the Crawler
jiayuanSpider.py
# coding:utf-8
import requests
import threading
import re
import json
import random
import time
from hashlib import sha1
from queue import Queue


class JiayuanSpider:
    def __init__(self, page_num):
        self.url = 'https://search.jiayuan.com/v2/search_v2.php'
        self.page_num = page_num
        self.q_num = Queue()            # queue of page numbers
        self.q_data = Queue()           # queue of fetched page responses
        self.lock = threading.Lock()    # mutex protecting shared state
        self.duplefilter = set()        # set of data fingerprints, used for deduplication
        self.f = open('jiayuan.json', 'wb')
        self.formdata = {
            'sex': 'f',
            'key': '',
            'stc': '1:11,2:20.28,23:1',
            'sn': 'default',
            'p': '1',  # page number; change this parameter to get the JSON data of different pages
            'f': '',
            'sv': '1',
            'listStyle': 'bigPhoto',
            'pri_uid': '0',
            'jsversion': 'v5'
        }
        self.user_agents = [
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
        ]
    def work(self):
        '''Entry point that starts the crawl.'''
        # fill the page-number queue
        for page in range(1, self.page_num + 1):
            self.q_num.put(page)
        # create and start 10 fetch threads
        get_thread_list = []
        for i in range(10):
            t = threading.Thread(target=self.get_data)
            t.start()
            get_thread_list.append(t)
        for t in get_thread_list:
            t.join()
        # create and start 5 parse threads (after all pages have been fetched)
        parse_thread_list = []
        for i in range(5):
            t = threading.Thread(target=self.parse_data)
            t.start()
            parse_thread_list.append(t)
        for t in parse_thread_list:
            t.join()
        self.f.close()  # close the output file
    def get_data(self):
        '''Fetch pages of data until the page-number queue is empty.'''
        while not self.q_num.empty():
            page = self.q_num.get()
            headers = {
                'User-Agent': random.choice(self.user_agents)
            }
            # formdata is shared between threads, so updating the page number
            # and sending the request happen while holding the lock
            with self.lock:
                self.formdata.update({'p': str(page)})
                response = requests.post(self.url, data=self.formdata, headers=headers)
            if response.status_code != 200:
                break
            self.q_data.put(response)
            time.sleep(1)
    def parse_data(self):
        '''Parse the fetched responses and write the extracted items to the file.'''
        while not self.q_data.empty():
            response = self.q_data.get()
            # the returned data is valid JSON only after trimming it to the outermost {...}
            for user in json.loads(re.search(r'{.+}', response.text, re.S).group())['userInfo']:
                item = {}
                item['nickname'] = user['nickname']                # nickname
                item['marriage'] = user['marriage']                # marital status
                item['height'] = '%scm' % user['height']           # height
                item['age'] = user['age']                          # age
                item['education'] = user['education']              # education
                item['work_location'] = user['work_location']      # work location
                item['shortnote'] = user['shortnote']              # self-introduction
                item['matchCondition'] = user['matchCondition']    # partner requirements
                item['profile_link'] = 'https://www.jiayuan.com/%s?fxly=search_v2' % user['realUid']  # profile page link
                item['image_url'] = user['image']                  # profile photo link
                data = json.dumps(item, ensure_ascii=False).encode()
                fingerprint = sha1(data).hexdigest()  # fingerprint each record
                # check and add under the lock so two threads cannot write the same record
                with self.lock:
                    if fingerprint not in self.duplefilter:  # records already seen are not written again
                        self.duplefilter.add(fingerprint)
                        self.f.write(data + b'\n')
            time.sleep(1)
if __name__ == '__main__':
    num = int(input("Enter the number of pages to crawl: "))
    jiayuan = JiayuanSpider(num)
    jiayuan.work()
The data is saved to a local JSON lines file. To collect more than the search results provide, you can add another request queue holding the profile-page links, crawl each personal profile page, and parse out the extra fields you want, as in the sketch below.
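A minimal sketch of that extension, assuming jiayuan.json has been written by the spider above (one JSON object per line with a profile_link field). The HTML parsing of the profile page is left as a placeholder, since its structure is not analyzed here, and in practice the site may require cookies or a logged-in session, which is not handled.

import json
import threading
import requests
from queue import Queue

q_profile = Queue()
with open('jiayuan.json', encoding='utf-8') as f:
    for line in f:
        q_profile.put(json.loads(line)['profile_link'])

def get_profile():
    while not q_profile.empty():
        url = q_profile.get()
        response = requests.get(url)
        if response.status_code == 200:
            # TODO: parse response.text (e.g. with lxml or BeautifulSoup)
            # and extract the extra fields you want from the profile page
            pass

threads = [threading.Thread(target=get_profile) for _ in range(5)]
for t in threads:
    t.start()
for t in threads:
    t.join()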