Multithreaded Crawler Example: Scraping 世纪佳缘 (jiayuan.com)

1. Requirements

Scrape the profiles of women on jiayuan.com matching the filters (ages 20-28, located in Beijing, with photos), capturing each profile's nickname, marital status, height, age, education, work location, self-introduction, partner requirements, profile-page link, and profile-photo link.

2. Analysis

The site paginates its results via JavaScript. Capturing the traffic with Fiddler shows:

URL: https://search.jiayuan.com/v2/search_v2.php

Request method: POST

Form data to submit (the stc field appears to encode the search filters; 2:20.28 presumably selects ages 20-28):

{
    'sex': 'f',
    'key': '',
    'stc': '1:11,2:20.28,23:1',
    'sn': 'default',
    'p': '1',  # page number; vary this parameter to get each page's JSON
    'f': '',
    'sv': '1',
    'listStyle': 'bigPhoto',
    'pri_uid': '0',
    'jsversion': 'v5'
}

Response: the body is not valid JSON as returned; the extra characters at its head and tail must be stripped first.
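As a quick sanity check before wiring up any threads, one page can be fetched and unwrapped by hand. This is a minimal sketch: the regex simply grabs the outermost {...} span of the body, which is the same trick the full spider below relies on. The bare 'Mozilla/5.0' User-Agent is a placeholder, and the userInfo key is the one the spider below reads from the response.

# coding:utf-8
import json
import re

import requests

url = 'https://search.jiayuan.com/v2/search_v2.php'
formdata = {
    'sex': 'f', 'key': '', 'stc': '1:11,2:20.28,23:1', 'sn': 'default',
    'p': '1', 'f': '', 'sv': '1', 'listStyle': 'bigPhoto',
    'pri_uid': '0', 'jsversion': 'v5'
}

response = requests.post(url, data=formdata,
                         headers={'User-Agent': 'Mozilla/5.0'})
# Strip the non-JSON wrapper: keep only the outermost {...} span
payload = json.loads(re.search(r'{.+}', response.text, re.S).group())
print('profiles on page 1:', len(payload['userInfo']))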

3. Writing the Crawler

jiayuanSpider.py

# coding:utf-8

import requests
import threading
import re
import json
import random
import time

from hashlib import sha1
from queue import Queue

class JiayuanSpider:
    def __init__(self, page_num):
        self.url = 'https://search.jiayuan.com/v2/search_v2.php'
        self.page_num = page_num
        self.q_num = Queue()  # page-number queue
        self.q_data = Queue()  # fetched-response queue
        self.lock = threading.Lock()  # mutex guarding the output file
        self.duplefilter = set()  # set of record fingerprints, used for deduplication
        self.f = open('jiayuan.json', 'wb')  # output file, one JSON object per line
        self.formdata = {
            'sex': 'f',
            'key': '',
            'stc': '1:11,2:20.28,23:1',
            'sn': 'default',
            'p': '1',  # page number; overridden per request in get_data
            'f': '',
            'sv': '1',
            'listStyle': 'bigPhoto',
            'pri_uid': '0',
            'jsversion': 'v5'
        }
        self.user_agents = [
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
    ]

    def work(self):
        '''Entry point: run the crawl.'''

        # Fill the page-number queue
        for page in range(1, self.page_num + 1):
            self.q_num.put(page)

        # Create and start 10 fetch threads
        get_thread_list = []
        for i in range(10):
            t = threading.Thread(target=self.get_data)
            t.start()
            get_thread_list.append(t)
        for t in get_thread_list:
            t.join()

        # The fetch threads are joined before parsing starts, so q_data is
        # fully populated and the parsers' empty() check below is safe.
        # Create and start 5 parse threads
        parse_thread_list = []
        for i in range(5):
            t = threading.Thread(target=self.parse_data)
            t.start()
            parse_thread_list.append(t)
        for t in parse_thread_list:
            t.join()

        self.f.close()  # close the output file

    def get_data(self):
        '''Fetch pages of search results.'''
        while not self.q_num.empty():
            page = self.q_num.get()
            headers = {
                'User-Agent': random.choice(self.user_agents)
            }
            # Build a per-thread copy of the form data so threads neither
            # race on the shared dict nor serialize their HTTP requests
            # behind a lock.
            formdata = dict(self.formdata, p=str(page))
            response = requests.post(self.url, data=formdata, headers=headers)
            if response.status_code != 200:
                continue  # skip a failed page rather than stopping the thread
            self.q_data.put(response)
            time.sleep(1)

    def parse_data(self):
        '''Parse fetched responses.'''
        while not self.q_data.empty():
            response = self.q_data.get()
            # The body only becomes valid JSON after trimming: grab the
            # outermost {...} span and parse it
            for user in json.loads(re.search(r'{.+}', response.text, re.S).group())['userInfo']:
                item = {}
                item['nickname'] = user['nickname']  # nickname
                item['marriage'] = user['marriage']  # marital status
                item['height'] = user['height'] + 'cm'  # height
                item['age'] = user['age']  # age
                item['education'] = user['education']  # education
                item['work_location'] = user['work_location']  # work location
                item['shortnote'] = user['shortnote']  # self-introduction
                item['matchCondition'] = user['matchCondition']  # partner requirements
                item['profile_link'] = 'https://www.jiayuan.com/%s?fxly=search_v2' % user['realUid']  # profile-page link
                item['image_url'] = user['image']  # profile-photo URL
                data = json.dumps(item, ensure_ascii=False).encode()
                fingerprint = sha1(data).hexdigest()  # SHA-1 fingerprint of the record
                if fingerprint not in self.duplefilter:  # skip records already written
                    self.duplefilter.add(fingerprint)
                    with self.lock:
                        self.f.write(data + b'\n')
            time.sleep(1)

if __name__ == '__main__':
    num = int(input("Number of pages to crawl: "))
    jiayuan = JiayuanSpider(num)
    jiayuan.work()

The records are saved to a local JSON-lines file. To collect more data, you can add a second request queue holding the profile links, crawl each profile page, and parse out whatever extra fields you want, as sketched below.
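A minimal sketch of that extension, following the same queue-plus-threads pattern as the spider above. The class, queue, and method names here (ProfileStage, q_profile, parse_profile) are hypothetical placeholders, and the actual field extraction depends on the profile page's HTML, which this post doesn't analyze.

# Hypothetical extension: a second queue of profile URLs fed by
# parse_data, drained by dedicated profile threads.
import threading
from queue import Queue

import requests

class ProfileStage:
    def __init__(self):
        self.q_profile = Queue()  # profile-page URL queue

    def enqueue(self, url):
        # parse_data would call this for each item['profile_link']
        self.q_profile.put(url)

    def crawl_profiles(self):
        while not self.q_profile.empty():
            url = self.q_profile.get()
            resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
            if resp.status_code == 200:
                self.parse_profile(resp.text)

    def parse_profile(self, html):
        # Extract whatever extra fields you need from the profile HTML,
        # e.g. with re or an HTML parser; details depend on the markup.
        pass

    def run(self, workers=5):
        threads = [threading.Thread(target=self.crawl_profiles)
                   for _ in range(workers)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()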
