Scraping Short Zhihu Answers with a Crawler

Thanks to 马哥python说 for the guidance.

I've been studying this topic recently. If you're worried about putting load on the server, you can increase the sleep interval so each request waits a bit longer.
The goal is to skip the long-winded essays from marketing accounts and the people writing novels in their answers, and keep only the short ones.
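For example, to make the pause between requests both longer and slightly randomized, something like the sketch below can replace the fixed time.sleep(5) used later in the script (polite_sleep is a made-up name and the bounds are arbitrary):

import random
import time

def polite_sleep(min_seconds=5, max_seconds=10):
    # Sleep for a random interval so requests are not fired at a fixed rate.
    # Raise the bounds if you are worried about load on the server.
    time.sleep(random.uniform(min_seconds, max_seconds))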




# Send the request. Each page only returns a handful of answers,
# but the response also contains the URL of the next page.
# On the last page "data" is an empty list, so exit the loop once data == [].
r = requests.get(url, headers=headers)
# We got a response back, so the request itself went through

# Make sure the status code is 200, i.e. the request succeeded
if r.status_code == 200:
    # Parse the returned JSON
    j_data = r.json()

    # The answers for this page live under the "data" key
    answer_list = j_data["data"]
    print(len(j_data["data"]))
    title = answer_list[0]['target']['question']['title']
    for answer in answer_list:
        # Answer content
        try:
            # The answer body is HTML, so parse it with BeautifulSoup
            answer_content = answer['target']['content']
            soup = BeautifulSoup(answer_content, 'html.parser')
            # .get_text() strips the tags and keeps only the plain text
            useful_text = soup.get_text()

            # Answer time
            timestamp = answer['target']['updated_time']
            # Convert the Unix timestamp into a datetime object
            dt_object = datetime.fromtimestamp(int(timestamp))
            # Format the datetime as a date string
            # (%#m / %#d only work on Windows; on other platforms %-m / %-d is the usual equivalent)
            formatted_date = dt_object.strftime("%Y/%#m/%#d")
        except Exception:
            useful_text = ''
            formatted_date = ''
        answer_content_list.append(useful_text)
        answer_time_list.append(formatted_date)
    Write2Csv(
        {
            '回答时间': answer_time_list,
            '回答内容': answer_content_list,
        }, title)
    print(j_data["paging"]["next"])
else:
    print("Request failed:", r.status_code)
v_question_id = 'session_id=1691645872201785699'
# Request URL (the question id 556648354 is hard-coded in the path;
# the trailing &{} placeholder is filled with v_question_id, i.e. the session_id)
url = 'https://www.zhihu.com/api/v4/questions/556648354/feeds?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info%2Cpaid_info_content%2Creaction_instruction%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cvip_info%2Cbadge%5B*%5D.topics%3Bdata%5B*%5D.settings.table_of_content.enabled&offset=&limit=3&order=default&platform=desktop&{}'.format(v_question_id)
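Before the full script, here is the same pagination written as a plain loop instead of recursion. It is only a sketch under the assumptions stated in the comments above (each response has a "data" list and a "paging"["next"] URL, and an empty "data" marks the last page); fetch_all_pages and max_pages are made-up names, not part of the original script, and it reuses the requests, headers and time already used in this post.

def fetch_all_pages(start_url, max_pages=100):
    # Collect the raw answer objects from every page of the feed
    answers = []
    current_url = start_url
    for _ in range(max_pages):
        resp = requests.get(current_url, headers=headers)
        if resp.status_code != 200:
            print("Request failed:", resp.status_code)
            break
        page = resp.json()
        if not page.get("data"):              # empty "data" means we reached the last page
            break
        answers.extend(page["data"])
        current_url = page["paging"]["next"]  # URL of the next page
        time.sleep(5)                         # be polite to the server
    return answers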

The complete code:

import requests
import time
import pandas as pd
import os
import re
import random
from datetime import datetime
from bs4 import BeautifulSoup
# Request headers (if you need more data and run into anti-scraping measures, try adding a cookie and other request headers)
# For the small amount of data needed here, this header alone is enough
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
}
v_question_id = 'session_id=1691645872201785699'
# Request URL (the question id 556648354 is hard-coded in the path;
# the trailing &{} placeholder is filled with v_question_id, i.e. the session_id)
url = 'https://www.zhihu.com/api/v4/questions/556648354/feeds?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info%2Cpaid_info_content%2Creaction_instruction%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cvip_info%2Cbadge%5B*%5D.topics%3Bdata%5B*%5D.settings.table_of_content.enabled&offset=&limit=3&order=default&platform=desktop&{}'.format(v_question_id)


# Empty lists for the parsed data (fields I don't need are left unfilled):
author_name_list = []  # author nickname
author_gender_list = []  # author gender
follower_count_list = []  # author follower count
author_url_list = []  # author profile URL
headline_list = []  # author headline
answer_id_list = []  # answer id
answer_time_list = []  # answer time
answer_content_list = []  # answer content
comment_count_list = []  # comment count
voteup_count_list = []  # upvote count
thanks_count_list = []  # thanks count

def Write2Csv(List, title):
    dataframe = pd.DataFrame(List)
    print('Choose the output format: 1 for xlsx, 2 for csv. Enter 1 or 2:')
    typeinput = input()
    if typeinput == '1':
        filepath = title + '.xlsx'
        dataframe.to_excel(filepath)
    elif typeinput == '2':
        filepath = title + '.csv'
        # utf_8_sig keeps the Chinese text readable when the file is opened in Excel;
        # sep='\n' is kept from the original: it puts each field on its own line instead of comma-separating them
        dataframe.to_csv(filepath, index=False, sep='\n', header=False, encoding='utf_8_sig')

title = ""
def whileurl(urls):
    global title
    print("Fetching one page of answers")
    # Send the request. Each page only returns a handful of answers,
    # but the response also contains the URL of the next page.
    # On the last page "data" is an empty list, so stop once data == [].
    r = requests.get(urls, headers=headers)

    # Make sure the status code is 200, i.e. the request succeeded
    if r.status_code == 200:
        # Parse the returned JSON
        j_data = r.json()
        # Stop when the current page has no answers, or after 100 pages as a safety limit
        if len(j_data["data"]) == 0 or j_data["paging"]["page"] == 100:
            Write2Csv({'回答时间': answer_time_list, '回答内容': answer_content_list}, title)
            return  # end the recursion
        # The answers for this page live under the "data" key
        answer_list = j_data["data"]

        title = answer_list[0]['target']['question']['title']
        for answer in answer_list:
            # Answer content
            try:
                # The answer body is HTML, so parse it with BeautifulSoup
                answer_content = answer['target']['content']
                soup = BeautifulSoup(answer_content, 'html.parser')
                # .get_text() strips the tags and keeps only the plain text
                useful_text = soup.get_text()

                # Answer time
                timestamp = answer['target']['updated_time']
                # Convert the Unix timestamp into a datetime object
                dt_object = datetime.fromtimestamp(int(timestamp))
                # Format the datetime as a date string
                # (%#m / %#d only work on Windows; on other platforms %-m / %-d is the usual equivalent)
                formatted_date = dt_object.strftime("%Y/%#m/%#d")
            except Exception:
                useful_text = ''
                formatted_date = ''
            # Keep only the short answers (200 characters or fewer)
            if len(useful_text) <= 200:
                answer_content_list.append(useful_text)
                answer_time_list.append(formatted_date)

        # URL of the next page of answers
        urls = j_data["paging"]["next"]
        # Wait between requests; make this longer if you're worried about load on the server
        time.sleep(5)
        whileurl(urls)
    else:
        print("Request failed:", r.status_code)





# Kick off the crawl from the first page
whileurl(url)
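One last note: the question id (556648354) is hard-coded in the URL path above. If you want to point the script at a different question, only that path segment needs to change. A minimal sketch (build_feeds_url is a made-up helper, not part of the original script; it reuses the query string from the url already defined):

def build_feeds_url(question_id, query_string):
    # query_string is everything after the '?' in the url defined above
    return 'https://www.zhihu.com/api/v4/questions/{}/feeds?{}'.format(question_id, query_string)

# For example (the id below is a placeholder, replace it with a real question id):
# new_url = build_feeds_url(123456789, url.split('?', 1)[1])
# whileurl(new_url)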

