datawhale爬虫实训4

DataWhale-Task4(爬取丁香园2)

任务:使用lxml爬取帖子相关的回复与部分用户信息(用户名,头像地址,回复详情)

难点:需要登录才能看到所有回复
浏览器登录上去,查看cookies信息,复制,通过requests.get()的cookies参数传入标识登录身份的cookies,这样便能请求到所有回复(直接请求帖子主页得到的只是html,需要向对应的api发起请求才能看到回帖数据)

    # Build the cookie dict passed to requests from the raw "Cookie" header
    # string copied out of the browser.
    cookies = {}
    temp = "DXY_USER_GROUP=49; __auc=f5db5ffc1693f4415a8b9b324af; _ga=GA1.2.406327704.1551544823; _gid=GA1.2.832234072.1551600247; __utma=1.406327704.1551544823.1551575932.1551655682.5; __utmz=1.1551655682.5.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utmz=3004402.1551676197.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); JUTE_BBS_DATA=59a1c912e729f883a4072343bbd3cf3ce120b0d2a9073d1f98b4b4abb6b976855a9d60443f3910a91a2f4acd5ba5b796cb23f957a410053aaeca64aaa758f7468c5f3b6e4f5e8b3afdde9ab9a36c4e7c599039e6942142f476034f89445921cdfdac46fbcd62e2b2d57ebc2c50c50d8e1b14d314431af16b; __utmc=3004402; JUTE_SESSION_ID=d8ff12d6-4a18-49b7-a793-3cec155e2871; JUTE_TOKEN=364c5b97-0a5e-479b-bcb2-4fa6e665aa55; JSESSIONID=0D8D5058CEC7915AFFE1E95EEB7ECDF1; __utma=3004402.406327704.1551544823.1551693466.1551704245.3; __utmt=1; __utmb=3004402.1.10.1551704245; JUTE_SESSION=e8ecebb9b808ddb678837312dda5b1b477f72176e200de7dd3f4858315fb204c21184bb31cedc24a7c1f7c4dcccee51ab23a4595b8e44787b9fd92479d0a34424ab9ce058850dba8"
    for pair in temp.split(';'):
        # Split on the first '=' only: values such as __utmz contain '='
        # themselves and would otherwise be truncated.
        name, sep, value = pair.strip().partition('=')
        if sep:  # skip empty or malformed segments instead of raising
            cookies[name] = value

完整代码

import requests
import json
import re
from lxml import etree


def display(topic):
    """Print a topic's subject followed by all of its collected replies.

    Args:
        topic: dict with two keys:
            ``'topic'``   -- the subject of the thread;
            ``'comment'`` -- a list of single-entry dicts, each mapping a
            name to a dict holding ``'avatar'`` and ``'body'``.
    """
    subject = topic['topic']
    print("主题:\n", subject)
    print("主题评论:\n")
    for reply in topic['comment']:
        for name in reply:
            details = reply[name]
            print(name, '\n')
            print('\t头像:', details['avatar'], '\n')
            print('\t评论:', details['body'], '\n')


def main(url, headers, cookies):
    """Fetch every reply page of a topic from the API and print the result.

    Args:
        url: API URL template containing one ``{}`` placeholder that is
            filled with the page number.
        headers: HTTP headers sent with every request.
        cookies: cookies sent with every request (they identify the
            logged-in session).
    """
    # Format the placeholder for the first request as well; previously the
    # literal "{}" was sent as the page number.
    first_page = requests.get(
        url.format(1), headers=headers, cookies=cookies).json()

    # Per the original comment this is the total number of reply pages.
    # NOTE(review): confirm against the API that 'total' is a page count and
    # not a reply count.
    maxpage = first_page['pageBean']['total']

    topic = {
        'topic': first_page['subject'],  # subject of the thread
        'comment': [],                   # collected replies
    }

    # Pages are 1-based and the last page is included (the original loop
    # stopped one page early and fetched nothing for a single-page topic).
    for index in range(1, maxpage + 1):
        if index == 1:
            # Page 1 was already downloaded above; do not request it twice.
            payload = first_page
        else:
            payload = requests.get(
                url.format(index), headers=headers, cookies=cookies).json()
        for item in payload['items']:
            topic['comment'].append({
                item['nickname']: {'avatar': item['user']['avatar'],
                                   'body': item["body"]}
            })
    display(topic)


def parse_cookies(raw):
    """Convert a raw ``Cookie`` header string into a dict.

    Args:
        raw: string of ``name=value`` pairs separated by ``;``.

    Returns:
        dict mapping cookie names to values. When a name occurs more than
        once, the last occurrence wins. Segments without ``=`` are skipped.
    """
    parsed = {}
    for pair in raw.split(';'):
        # Split on the first '=' only: values such as __utmz contain '='
        # themselves and would otherwise be truncated.
        name, sep, value = pair.strip().partition('=')
        if sep:
            parsed[name] = value
    return parsed


if __name__ == '__main__':
    headers = {
        'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64)'
                       ' AppleWebKit/537.36 (KHTML, like Gecko)'
                       ' Chrome/68.0.3440.106 Safari/537.36')
    }
    # temp holds the cookies copied from a logged-in browser session.
    # NOTE(review): these are session credentials hard-coded in source;
    # consider loading them from an environment variable or a local file.
    temp = "DXY_USER_GROUP=49; __auc=f5db5ffc1693f4415a8b9b324af; _ga=GA1.2.406327704.1551544823; _gid=GA1.2.832234072.1551600247; __utma=1.406327704.1551544823.1551575932.1551655682.5; __utmz=1.1551655682.5.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utmz=3004402.1551676197.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); JUTE_BBS_DATA=59a1c912e729f883a4072343bbd3cf3ce120b0d2a9073d1f98b4b4abb6b976855a9d60443f3910a91a2f4acd5ba5b796cb23f957a410053aaeca64aaa758f7468c5f3b6e4f5e8b3afdde9ab9a36c4e7c599039e6942142f476034f89445921cdfdac46fbcd62e2b2d57ebc2c50c50d8e1b14d314431af16b; __utmc=3004402; JUTE_SESSION_ID=d8ff12d6-4a18-49b7-a793-3cec155e2871; JUTE_TOKEN=364c5b97-0a5e-479b-bcb2-4fa6e665aa55; JSESSIONID=0D8D5058CEC7915AFFE1E95EEB7ECDF1; __utma=3004402.406327704.1551544823.1551693466.1551704245.3; __utmt=1; __utmb=3004402.1.10.1551704245; JUTE_SESSION=e8ecebb9b808ddb678837312dda5b1b477f72176e200de7dd3f4858315fb204c21184bb31cedc24a7c1f7c4dcccee51ab23a4595b8e44787b9fd92479d0a34424ab9ce058850dba8"
    cookies = parse_cookies(temp)
    # "{}" is the page-number placeholder filled in by main().
    url = ("http://3g.dxy.cn/bbs/bbsapi/mobile?"
           "s=view_topic&checkUserAction=1&with"
           "Good=1&order=0&size=20&id=509959&page={}")
    main(url, headers, cookies)


结果

posted @ 2019-03-04 21:39  朝行  阅读(234)  评论(0编辑  收藏  举报