Crawling Baidu Tieba post pages
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Note: for learning and reference only; if this infringes any rights, the content will be removed immediately
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Development notes
version_2
Purpose: crawl the posts of a given tieba name (e.g. "李毅") and save json files of the extracted fields (in this version the raw html is passed between threads in memory rather than written to disk)
Modules: next_page_url.py: extracts the next_page_url from a page
page_key_info.py: extracts the key fields from the current page
settings.py: stores the settings (done)
request_response.py: handles the http requests
tiezi_total.py: implements the main logic and performs the crawl
main_spider.py: the entry point for running the program (not enabled yet)
Files: jsonfiletotal stores the extracted json data
Techniques: multithreading, inter-thread communication (queues), recursive fetching of the next-page url, producer-consumer model (a minimal sketch follows this block)
bug_1: in this version the overall flow is correct, but parsing the second page raises a UnicodeDecodeError; I could not solve it at the time, and pointers from anyone with experience would be appreciated.
bug_1 fix: solved and the code is updated; the root cause was a small detail: the next-page url was not pieced together correctly before being requested.
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
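The producer-consumer model listed under the techniques above is the backbone of version_2. A minimal standalone sketch of the same pattern follows: a Queue for hand-off, daemon worker threads, and queue.join() for shutdown (the produce/consume names are illustrative, not part of this repo).

# producer_consumer_sketch.py: minimal illustration of the version_2 threading pattern
from queue import Queue
from threading import Thread

q = Queue()

def produce():
    for i in range(5):
        q.put(i)              # the producer puts work items on the queue

def consume():
    while True:
        item = q.get()        # blocks until an item is available
        print("consumed", item)
        q.task_done()         # decrement the unfinished-task counter

for target in (produce, consume):
    t = Thread(target=target)
    t.daemon = True           # daemon threads die together with the main thread
    t.start()

q.join()                      # returns once every put() is matched by a task_done()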
Source code
# settings.py
# User-Agent pool  # TODO
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
}
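The TODO above refers to the planned User-Agent pool. One possible sketch, assuming you supply the additional UA strings yourself:

# ua_pool_sketch.py: a possible User-Agent pool (hypothetical, not part of this repo)
import random

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
    # add more real User-Agent strings here
]

def random_headers():
    """Pick a random User-Agent per request so the traffic looks less uniform."""
    return {"User-Agent": random.choice(USER_AGENTS)}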
# tiezi_total.py
from urllib import parse
import re
import json
from queue import Queue
from threading import Thread

from settings import HEADERS
from next_page_url import NextPageUrl
from page_key_info import PageKeyInfo
from request_response import RequestResponse


class MainProcess():
    """Takes the arguments: tieba_name, start url."""

    def __init__(self, tieba_name, url):
        self.tieba_name = tieba_name
        self.url = url.format(parse.quote(tieba_name))
        self.url_queue = Queue()
        self.rawhtml_queue = Queue()
        self.content_queue = Queue()

    def __make_url_and_rawhtml(self, url):
        """Produce urls and rawhtml."""
        html_str = RequestResponse(url).run()
        next_page_url = NextPageUrl(html_str).run()
        print(next_page_url)
        # Put the html string on the queue
        self.rawhtml_queue.put(html_str)
        if next_page_url:
            self.url_queue.put(next_page_url)
            return self.__make_url_and_rawhtml(next_page_url)

    def __make_key_info(self):
        """Consume urls and rawhtml, produce content."""
        # Queue.not_empty (checked here originally) is a Condition object and is
        # always truthy, so loop forever; the daemon flag handles shutdown instead
        while True:
            # Take a matching url/rawhtml pair off the queues
            url = self.url_queue.get()
            html_str = self.rawhtml_queue.get()
            item_list = PageKeyInfo(html_str).run()
            # Attach the current page url to the extracted data
            item = dict(current_page_url=url)
            item_list.append(item)
            # Put the extracted data on the queue
            self.content_queue.put(item_list)
            # Progress report
            print("Extracting info from {}".format(url))
            # Decrement the queue counters
            self.url_queue.task_done()
            self.rawhtml_queue.task_done()

    def __save_json_file(self):
        """Consume content and save it as json files."""
        while True:
            # Take data off the queue
            content = self.content_queue.get()
            # Build the filename from the url's query string
            url = content[-1]["current_page_url"]
            filename = parse.unquote(
                re.split(pattern=r'\?', string=url)[-1])+".json"
            with open("./jsonfiletotal/"+filename, 'w', encoding='utf8') as f:
                f.write(json.dumps(content, ensure_ascii=False, indent=4))
            print("Saved " + filename)
            # Decrement the queue counter
            self.content_queue.task_done()

    def run(self):
        # Seed self.url_queue with the first url
        self.url_queue.put(self.url)
        # Build the thread list
        thread_list = list()
        make_url_and_rawhtml_thread = Thread(
            target=self.__make_url_and_rawhtml, args=(self.url,))
        thread_list.append(make_url_and_rawhtml_thread)
        make_key_info_thread = Thread(target=self.__make_key_info)
        thread_list.append(make_key_info_thread)
        save_json_file_thread = Thread(target=self.__save_json_file)
        thread_list.append(save_json_file_thread)
        for t in thread_list:
            # fixed: "t.setDaemon = True" only shadowed the method and
            # never actually marked the thread as a daemon
            t.daemon = True
            t.start()
        # Block until every queue has been drained
        self.url_queue.join()
        self.rawhtml_queue.join()
        self.content_queue.join()


# Test case
if __name__ == "__main__":
    tieba_name = "李毅"
    first_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn=0"
    obj = MainProcess(tieba_name, first_url)
    obj.run()
# request_response.py
import requests

from settings import HEADERS


class RequestResponse():
    """Takes a request url and returns the raw response string."""

    def __init__(self, url):
        self.url = url

    def __get_request(self, url):
        """
        General-purpose fetch helper; takes a url.
        The only anti-anti-crawling measure so far is the headers.
        Returns the raw, unprocessed response string.
        """
        response = requests.get(url, headers=HEADERS)
        print("Response status code:", response.status_code)
        response_ = response.content.decode()
        return response_

    def run(self):
        return self.__get_request(self.url)
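Note that requests.get above runs without a timeout, so a stalled connection can hang the producer thread indefinitely. A hardened fetch helper might look like the following sketch (the retry count and timeout are arbitrary choices, not project settings):

# Sketch: fetch helper with a timeout and simple retries
import time
import requests
from settings import HEADERS

def get_html(url, retries=3, timeout=10):
    """Return the decoded body, retrying a few times on network errors."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
            return response.content.decode()
        except requests.RequestException as e:
            print("request failed ({}), retry {}/{}".format(e, attempt + 1, retries))
            time.sleep(1)
    raise RuntimeError("giving up on " + url)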
# next_page_url.py
import re


class NextPageUrl():
    """Takes the raw string produced by a response."""

    def __init__(self, html_str):
        """Initialise the arguments."""
        self.html_str = html_str

    def __info_str(self, html_str):
        """Cut the useful part out of html_str (Tieba hides the thread list in a <code> block)."""
        html_ = re.findall(
            r'<code class="pagelet_html" id="pagelet_html_frs-list/pagelet/thread_list" style="display:none;">(.*?)</code>',
            html_str, re.S)[0]
        return html_

    def __parse_next_url(self, html_):
        # Extract the div that holds the next-page link
        div_content = re.findall(
            r'<div class="thread_list_bottom clearfix">(.*?)-->', html_, re.S)[0]
        # Implicitly returns None when there is no next page
        # The link cannot be located directly, so collect every <a> tag
        next_url_list = re.findall(r'<a(.*?)>', div_content, re.S)
        for i in next_url_list:
            if "next pagination-item" in i:
                # The href is protocol-relative ("//tieba.baidu.com/..."), so
                # prefix the scheme; this concatenation is what bug_1 was about
                next_page_url = "https:" + re.findall(r'href="(.*?)"', i, re.S)[0]
                return next_page_url

    def run(self):
        """The public interface."""
        __html_ = self.__info_str(self.html_str)
        __next_page_url = self.__parse_next_url(__html_)
        return __next_page_url
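The "https:" + href concatenation works because Tieba's next-page href is protocol-relative (it starts with //), and a missing piece of exactly this concatenation was bug_1. The standard-library urljoin handles the same case generically, as this sketch shows (the example href is illustrative):

# Sketch: the same fix via urljoin, which also copes with relative hrefs
from urllib.parse import urljoin

base = "https://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85&ie=utf-8&pn=0"
href = "//tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85&ie=utf-8&pn=50"  # protocol-relative
print(urljoin(base, href))  # -> https://tieba.baidu.com/f?kw=...&pn=50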
# page_key_info.py
import re


class PageKeyInfo():
    """Takes the raw string produced by a response."""

    def __init__(self, html_str):
        """Initialise the arguments."""
        self.html_str = html_str

    def __info_str(self, html_str):
        """Cut the useful part out of html_str."""
        html_ = re.findall(
            r'<code class="pagelet_html" id="pagelet_html_frs-list/pagelet/thread_list" style="display:none;">(.*?)</code>',
            html_str, re.S)[0]
        return html_

    def __get_usefulinfo_by_one(self, ul_one):
        one_tiezi_info = dict()
        # Title and link (fixed: the closing quote after the title capture was missing)
        title_and_href = re.findall(
            r'j_th_tit ">.*?<a rel="noreferrer" href="(.*?)" title="(.*?)" target="_blank"',
            ul_one, re.S)
        title_and_href = title_and_href[0] if len(title_and_href) > 0 else None
        if title_and_href:
            title_href_ = "https://tieba.baidu.com" + title_and_href[0]
            title_ = title_and_href[1]
        else:
            title_href_ = None
            title_ = None
        # Author name and id
        author_name = re.findall(r'<span class="tb_icon_author ".*?title="主题作者: (.*?)"', ul_one, re.S)
        author_name = author_name[0] if len(author_name) > 0 else None
        author_id = re.findall(r'title="主题作者.*?".*?data-field=\'{"user_id":(.*?)}\' >', ul_one, re.S)
        author_id = author_id[0] if len(author_id) > 0 else None
        author_home = re.findall(r'class="frs-author-name j_user_card " href="(.*?)" target="_blank">', ul_one, re.S)
        author_home = "https://tieba.baidu.com" + author_home[0] if len(author_home) > 0 else None
        # Post abstract and images
        content = re.findall(r'<div class="threadlist_abs threadlist_abs_onlyline ">(.*?)</div>', ul_one, re.S)
        content = content[0] if len(content) > 0 else None
        image = re.findall(r'bpic="(.*?)" class="threadlist_pic j_m_pic', ul_one, re.S)
        # Collect the fields into a dict
        one_tiezi_info["title"] = title_
        one_tiezi_info["title_href"] = title_href_
        one_tiezi_info["author_name"] = author_name
        one_tiezi_info["author_id"] = author_id
        one_tiezi_info['author_home'] = author_home
        one_tiezi_info['content'] = content
        one_tiezi_info['image'] = image
        return one_tiezi_info

    def __ul_content(self, html_):
        # Split the current page into per-thread chunks
        ul_content_list = re.findall(
            r'li class=" j_thread_list clearfix"(.*?)<li class=" j_thread_list clearfix"', html_, re.S)
        return ul_content_list

    def __get_content(self, html_):
        item_list = list()
        # Get the list of single-post chunks
        ul_content_list = self.__ul_content(html_)
        for ul_one in ul_content_list:
            item = self.__get_usefulinfo_by_one(ul_one)
            item_list.append(item)
        return item_list

    def run(self):
        # Slice out the useful string
        __html_ = self.__info_str(self.html_str)
        # Extract the key fields
        __item_list = self.__get_content(__html_)
        return __item_list
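A convenient way to test PageKeyInfo without hitting the network is to feed it a page saved by version_1; the sketch below assumes such a file exists under htmlfiletotal (the filename matches version_1's naming scheme but is illustrative):

# Sketch: exercise PageKeyInfo offline against a saved page
from page_key_info import PageKeyInfo

with open("./htmlfiletotal/kw=李毅&ie=utf-8&pn=0.html", encoding='utf8') as f:
    html_str = f.read()

for item in PageKeyInfo(html_str).run():
    print(item["title"], item["title_href"])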
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Development notes
version_1
Usage: python main_spider.py 李毅
Purpose: crawl the posts of a given tieba name (e.g. "李毅"), saving the raw html pages as fetched and json files containing the extracted fields
Modules: settings.py: stores the settings
tieziparse.py: parses a single page and extracts the useful info
main_spider.py: runs the main logic
tieba_title_total.py: takes the tieba name and url and processes the single-page string; pending refactoring
Files: htmlfiletotal stores the fetched html pages
jsonfiletotal stores the extracted json data
Note: single-threaded; it runs as-is, see version_2 for the optimized variant
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
Source code
# settings.py: holds the main parameters
# User-Agent pool  # TODO
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
}
# main_spider.py
from tieba_title_total import TitleTotal
import sys


class ProcessMain():
    def __init__(self, tieba_name, first_url):
        self.tieba_name = tieba_name
        self.first_url = first_url

    def tieba_current_pagecontent(self, tieba_name, url):
        """Fetch every post on the current page, then recurse into the next page."""
        obj = TitleTotal(tieba_name, url)
        next_page_url = obj.run()
        if next_page_url:
            return self.tieba_current_pagecontent(tieba_name, next_page_url)

    def run(self):
        self.tieba_current_pagecontent(self.tieba_name, self.first_url)


if __name__ == "__main__":
    tieba_name = sys.argv[1]
    # tieba_name = "李毅"
    first_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn=0"
    obj = ProcessMain(tieba_name, first_url)
    obj.run()
# tieba_title_total.py
import requests
from urllib import parse
import re
import json

from settings import HEADERS
from tieziparse import TieziParse


class TitleTotal():
    """
    Fetches the post info from one page:
    title: title, title_href
    author: author_name, author_id, author_home
    body: content, image
    next page: next_page_url
    """

    def __init__(self, tieba_name, url):
        self.tieba_name = tieba_name
        self.url = url.format(parse.quote(tieba_name))

    def __get_response(self, url):
        """
        General-purpose fetch helper; takes a url.
        The only anti-anti-crawling measure so far is the headers.
        """
        response = requests.get(url, headers=HEADERS)
        print("Response status code:", response.status_code)
        response_ = response.content.decode()
        return response_

    def __save_raw_file(self, content, filename):
        """Save the raw html file."""
        with open("./htmlfiletotal/"+filename, 'w', encoding='utf8') as f:
            f.write(content)
        print("Raw file saved")

    def __save_json_file(self, content, filename):
        """Save the json file."""
        with open("./jsonfiletotal/"+filename, 'w', encoding='utf8') as f:
            f.write(json.dumps(content, ensure_ascii=False, indent=4))
        print("Json file saved")

    def __parse_html_str(self, html_str):
        current_page_info = dict()
        instantiation_ = TieziParse(html_str)
        next_page_url, item_list = instantiation_.run()
        current_page_info["next_page_url"] = next_page_url
        current_page_info["content_list"] = item_list
        return current_page_info

    def run(self):
        """Main logic."""
        print("Start url:", self.url)
        # Fetch the response
        html_str = self.__get_response(self.url)
        # Save the raw page as fetched
        filename = parse.unquote(re.split(pattern=r'\?', string=self.url)[-1])+".html"
        self.__save_raw_file(content=html_str, filename=filename)
        # Extract the key fields from the raw page
        key_info_dict = self.__parse_html_str(html_str)
        # Save the key fields to a json file
        filename = parse.unquote(re.split(r'\?', self.url)[-1])+".json"
        self.__save_json_file(content=key_info_dict, filename=filename)
        print("Next page url:", key_info_dict["next_page_url"])
        return key_info_dict["next_page_url"]


# Test case
if __name__ == "__main__":
    tieba_name = "李毅"
    first_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn=0"
    obj = TitleTotal(tieba_name, first_url)
    obj.run()
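The filename logic in run() turns the url's query string back into a readable name. A worked example of that derivation, using the starting url:

# Sketch: how the filename is derived from the url
from urllib import parse
import re

url = "https://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85&ie=utf-8&pn=0"
filename = parse.unquote(re.split(r'\?', url)[-1]) + ".html"
print(filename)  # -> kw=李毅&ie=utf-8&pn=0.html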
# tieziparse.py
import re


class TieziParse():
    """
    Processes the passed-in html_str containing all posts on a single page.
    title: title, title_href
    author: author_name, author_id, author_home
    body: content, image
    next page: next_page_url
    """

    def __init__(self, html_str):
        """Initialise the arguments."""
        self.html_str = html_str

    def __info_str(self):
        """Cut the useful part out of html_str."""
        html_ = re.findall(
            r'<code class="pagelet_html" id="pagelet_html_frs-list/pagelet/thread_list" style="display:none;">(.*?)</code>',
            self.html_str, re.S)[0]
        return html_

    def __parse_next_url(self, html_):
        # Extract the div that holds the next-page link
        div_content = re.findall(
            r'<div class="thread_list_bottom clearfix">(.*?)-->', html_, re.S)[0]
        # Implicitly returns None when there is no next page
        # The link cannot be located directly, so collect every <a> tag
        next_url_list = re.findall(r'<a(.*?)>', div_content, re.S)
        for i in next_url_list:
            if "next pagination-item" in i:
                # The href is protocol-relative, so prefix the scheme
                next_page_url = "https:" + re.findall(r'href="(.*?)"', i, re.S)[0]
                return next_page_url

    def __get_usefulinfo_by_one(self, ul_one):
        one_tiezi_info = dict()
        # Title and link (fixed: the closing quote after the title capture was missing)
        title_and_href = re.findall(
            r'j_th_tit ">.*?<a rel="noreferrer" href="(.*?)" title="(.*?)" target="_blank"',
            ul_one, re.S)
        title_and_href = title_and_href[0] if len(title_and_href) > 0 else None
        if title_and_href:
            title_href_ = "https://tieba.baidu.com" + title_and_href[0]
            title_ = title_and_href[1]
        else:
            title_href_ = None
            title_ = None
        # Author name and id
        author_name = re.findall(r'<span class="tb_icon_author ".*?title="主题作者: (.*?)"', ul_one, re.S)
        author_name = author_name[0] if len(author_name) > 0 else None
        author_id = re.findall(r'title="主题作者.*?".*?data-field=\'{"user_id":(.*?)}\' >', ul_one, re.S)
        author_id = author_id[0] if len(author_id) > 0 else None
        author_home = re.findall(r'class="frs-author-name j_user_card " href="(.*?)" target="_blank">', ul_one, re.S)
        author_home = "https://tieba.baidu.com" + author_home[0] if len(author_home) > 0 else None
        # Post abstract and images
        content = re.findall(r'<div class="threadlist_abs threadlist_abs_onlyline ">(.*?)</div>', ul_one, re.S)
        content = content[0] if len(content) > 0 else None
        image = re.findall(r'bpic="(.*?)" class="threadlist_pic j_m_pic', ul_one, re.S)
        # Collect the fields into a dict
        one_tiezi_info["title"] = title_
        one_tiezi_info["title_href"] = title_href_
        one_tiezi_info["author_name"] = author_name
        one_tiezi_info["author_id"] = author_id
        one_tiezi_info['author_home'] = author_home
        one_tiezi_info['content'] = content
        one_tiezi_info['image'] = image
        return one_tiezi_info

    def __ul_content(self, html_):
        # Split the current page into per-thread chunks
        ul_content_list = re.findall(
            r'li class=" j_thread_list clearfix"(.*?)<li class=" j_thread_list clearfix"', html_, re.S)
        return ul_content_list

    def __get_content(self, html_):
        item_list = list()
        # Get the list of single-post chunks
        ul_content_list = self.__ul_content(html_)
        for ul_one in ul_content_list:
            item = self.__get_usefulinfo_by_one(ul_one)
            item_list.append(item)
        return item_list

    def run(self):
        """
        The public interface.
        Returns next_page_url first and item_list second.
        """
        # Slice the useful part out of the incoming html_str
        __html_ = self.__info_str()
        # Get the next page's url
        __next_page_url = self.__parse_next_url(__html_)
        # Get all posts on the current page
        __item_list = self.__get_content(__html_)
        return __next_page_url, __item_list