Crawling Baidu Tieba Post Page Content

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

Note: for learning and reference only. If anything here infringes on someone's rights, it will be removed immediately.

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

Development Notes

version_2

Purpose: crawl the posts of the tieba whose name you pass in (e.g. "李毅"), save the raw html that is fetched, and save the processed key fields as json files.

Modules: next_page_url.py, responsible for fetching next_page_url

   page_key_info.py, responsible for extracting the key information from the current page

   settings.py (done), holds the settings

   request_response.py, handles requests and responses

   tiezi_total.py, implements the main logic and drives the crawl

   main_spider.py, the entry point for running the program (not enabled yet)


Files: jsonfiletotal holds the extracted json data

Techniques: multithreading, inter-thread communication (queues), recursively fetching the next-page address, and the producer-consumer model (a minimal sketch follows below).
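
For readers new to the pattern, here is a minimal, self-contained producer-consumer sketch; the names are illustrative, not project code:

from queue import Queue
from threading import Thread

q = Queue()

def producer():
    for i in range(5):
        q.put(i)  # hand work items to the consumer

def consumer():
    while True:
        item = q.get()   # blocks until an item is available
        print("got", item)
        q.task_done()    # decrement the unfinished-task counter

Thread(target=producer).start()
t = Thread(target=consumer)
t.daemon = True          # the consumer dies with the main thread
t.start()
q.join()                 # blocks until every put() has been task_done()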

bug_1: in this version the overall flow is basically correct, but a UnicodeDecodeError is raised when parsing the second page. I was not able to solve it myself at the time; if you have experience with this, any pointers would be appreciated.

bug_1 fix: the problem is solved and the code has been corrected. It came down to a detail: the next-page URL was not being pieced back together correctly before it was requested (see the sketch below).
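
For reference, urllib.parse.urljoin is a robust way to stitch Tieba's scheme-relative "next" href back onto a full URL; a minimal sketch, with an illustrative href rather than one captured from a live page:

from urllib.parse import urljoin

base = "https://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85&ie=utf-8&pn=0"
# the "next" link comes back scheme-relative, e.g. //tieba.baidu.com/f?...
href = "//tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85&ie=utf-8&pn=50"
next_page_url = urljoin(base, href)
print(next_page_url)  # https://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85&ie=utf-8&pn=50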

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

Source code

# settings.py


# User-Agent pool
# TODO: still to be implemented (a sketch follows the HEADERS block below)

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
}
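
A minimal sketch of what the TODO'd User-Agent pool could look like, assuming a small hand-picked list; the extra UA strings here are illustrative:

import random

USER_AGENT_POOL = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
]

def random_headers():
    """Pick a User-Agent at random for each request."""
    return {"User-Agent": random.choice(USER_AGENT_POOL)}
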
# tiezi_total.py

import requests
from urllib import parse
import re
import json
from queue import Queue
from threading import Thread
import time
from settings import HEADERS
from next_page_url import NextPageUrl
from page_key_info import PageKeyInfo
from request_response import RequestResponse


class MainProcess():
    """
    Accepts: tieba_name, start_url
    """

    def __init__(self, tieba_name, url):
        self.tieba_name = tieba_name
        self.url = url.format(parse.quote(tieba_name))
        self.url_queue = Queue()
        self.rawhtml_queue = Queue()
        self.content_queue = Queue()

    def __make_url_and_rawhtml(self, url):
        """Produce url and rawhtml"""
        # self.url_queue.put(url)
        html_str = RequestResponse(url).run()
        next_page_url = NextPageUrl(html_str).run()
        print(next_page_url)
        # Put the html string into the queue
        self.rawhtml_queue.put(html_str)
        # Recurse for as long as there is a next page
        if next_page_url:
            self.url_queue.put(next_page_url)
            return self.__make_url_and_rawhtml(next_page_url)

    def __make_key_info(self):
        """Consume url and rawhtml, produce content"""
        # Run forever; the thread is a daemon, so it exits with the main thread
        while True:
            # Take the matching url and rawhtml out of the queues
            url = self.url_queue.get()
            html_str = self.rawhtml_queue.get()
            item_list = PageKeyInfo(html_str).run()
            # Attach the current page url to the returned data
            item = dict(current_page_url=url)
            item_list.append(item)
            # Put the extracted data into the queue
            self.content_queue.put(item_list)
            # Report progress
            print("Extracting info from {}".format(url))
            # Decrement the queues' unfinished-task counters
            self.url_queue.task_done()
            self.rawhtml_queue.task_done()

    def __save_json_file(self):
        """Consume content, save the extracted data as json files"""
        while True:
            # Take data from the queue
            content = self.content_queue.get()
            # Build the filename from the query part of the page url
            url = content[-1]["current_page_url"]
            filename = parse.unquote(
                re.split(pattern=r'\?', string=url)[-1]) + ".json"
            with open("./jsonfiletotal/" + filename, 'w', encoding='utf8') as f:
                f.write(json.dumps(content, ensure_ascii=False, indent=4))
            print("Saved " + filename + " successfully")
            # Decrement the queue's unfinished-task counter
            self.content_queue.task_done()

    def run(self):
        # Seed self.url_queue with the first url
        self.url_queue.put(self.url)
        # Build the thread list
        thread_list = list()
        make_url_and_rawhtml_thread = Thread(
            target=self.__make_url_and_rawhtml, args=(self.url,))
        thread_list.append(make_url_and_rawhtml_thread)
        make_key_info_thread = Thread(target=self.__make_key_info)
        thread_list.append(make_key_info_thread)
        save_json_file_thread = Thread(target=self.__save_json_file)
        thread_list.append(save_json_file_thread)
        for t in thread_list:
            # fixed: was `t.setDaemon = True`, which only rebinds an attribute
            t.daemon = True
            t.start()
        # Block until every queue has been drained and acknowledged
        self.url_queue.join()
        self.rawhtml_queue.join()
        self.content_queue.join()


# Test case
if __name__ == "__main__":
    tieba_name = "李毅"
    first_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn=0"  # fixed: was ie=urf-8
    obj = MainProcess(tieba_name, first_url)
    obj.run()
# request_response.py


import requests
from settings import HEADERS


class RequestResponse():
    """传入一个请求url,返回一个原始字符串"""
    def __init__(self,url):
        self.url = url

    def __get_request(self, url):
        """
        Fetch the response for a given url; a general-purpose helper.
        Currently carries the anti-anti-crawler setting (the UA header).
        Returns the raw, unprocessed response string.
        """
        response = requests.get(url, headers=HEADERS)
        print("Response status code:", response.status_code)
        response_ = response.content.decode()
        return response_

    def run(self):
        return self.__get_request(self.url)
# next_page_url.py

import re


class NextPageUrl():
    """传入的参数为response产生的原始字符串"""
    def __init__(self,html_str):
        """初始化参数"""
        self.html_str = html_str
    def __info_str(self,html_str):
        """将传入的html_str分解,提取有用的内容"""
        html_ = re.findall(r'<code class=\"pagelet_html\" id=\"pagelet_html_frs-list/pagelet/thread_list\" style=\"display:none;\">(.*?)</code>', html_str, re.S)[0]
        return html_

    def __parse_next_url(self, html_):
        # Extract the div on this page that contains the next-page link
        div_content = re.findall(r'<div class=\"thread_list_bottom clearfix\">(.*?)-->', html_, re.S)[0]
        # No match for "next" means there is no next page (the method then returns None)
        # The link cannot be located directly, so collect every <a> tag into a list
        next_url_list = re.findall(r'<a(.*?)>', div_content, re.S)
        for i in next_url_list:
            if "next pagination-item" in i:
                next_page_url = "https:" + re.findall(r'href="(.*?)"', i, re.S)[0]
                return next_page_url

    def run(self):
        """提供主要的对外接口"""
        __html_ = self.__info_str(self.html_str)
        __next_page_url = self.__parse_next_url(__html_)
        return __next_page_url
# page_key_info.py


import re


class PageKeyInfo():
    """传入的参数为response产生的原始字符串"""
    def __init__(self, html_str):
        """初始化参数"""
        self.html_str = html_str

    def __info_str(self, html_str):
        """将传入的html_str分解,提取有用的内容"""
        html_ = re.findall(r'<code class=\"pagelet_html\" id=\"pagelet_html_frs-list/pagelet/thread_list\" style=\"display:none;\">(.*?)</code>', html_str, re.S)[0]
        return html_

    def __get_usefulinfo_by_one(self, ul_one):
        one_tiezi_info = dict()
        # Get the title and its link
        title_and_href = re.findall(r'j_th_tit ">.*?<a rel="noreferrer" href="(.*?)" title="(.*?) target="_blank"', ul_one, re.S)
        title_and_href = title_and_href[0] if len(title_and_href) > 0 else None
        if title_and_href:
            title_href_ = "https://tieba.baidu.com"+title_and_href[0]
            title_ = title_and_href[1]
        else:
            title_href_ = None
            title_ = None
        # Get the author name and author id
        author_name = re.findall(r'<span class="tb_icon_author ".*?title="主题作者: (.*?)"', ul_one, re.S)
        author_name = author_name[0] if len(author_name) > 0 else None
        author_id = re.findall(r'title="主题作者.*?".*?data-field=\'{&quot;user_id&quot;:(.*?)}\' >', ul_one, re.S)
        author_id = author_id[0] if len(author_id) > 0 else None
        author_home = re.findall(r'class="frs-author-name j_user_card " href="(.*?)" target="_blank">', ul_one, re.S)
        author_home = "https://tieba.baidu.com" + author_home[0] if len(author_home) > 0 else None
        # Get the content abstract and images
        content = re.findall(r'<div class="threadlist_abs threadlist_abs_onlyline ">(.*?)</div>', ul_one, re.S)
        content = content[0] if len(content) > 0 else None
        image = re.findall(r'bpic="(.*?)" class="threadlist_pic j_m_pic', ul_one, re.S)
        # Store the fields in a dict
        one_tiezi_info["title"] = title_
        one_tiezi_info["title_href"] = title_href_
        one_tiezi_info["author_name"] = author_name
        one_tiezi_info["author_id"] = author_id
        one_tiezi_info['author_home'] = author_home
        one_tiezi_info['content'] = content
        one_tiezi_info['image'] = image
        return one_tiezi_info

    def __ul_content(self, html_):
        # Get every thread block on the current page
        ul_content_list = re.findall(r'li class=\" j_thread_list clearfix\"(.*?)<li class=\" j_thread_list clearfix\"', html_, re.S)
        return ul_content_list

    def __get_content(self, html_):
        item_list = list()
        # Get the list of single-thread blocks
        ul_content_list = self.__ul_content(html_)
        for ul_one in ul_content_list:
            item = self.__get_usefulinfo_by_one(ul_one)
            item_list.append(item)
        return item_list

    def run(self):
        # Pre-process the string
        __html_ = self.__info_str(self.html_str)
        # Extract the key fields
        __item_list = self.__get_content(__html_)
        return __item_list

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

Development Notes

version_1
Usage: python main_spider.py 李毅
Purpose: crawl the posts of the tieba whose name you pass in (e.g. "李毅"), save the raw html pages that are fetched, and save the extracted key fields as json files.
Modules: settings.py, holds the settings
   tieziparse.py, parses a single page and extracts the useful information
   main_spider.py, drives the main logic
   tieba_title_total.py, handles the tieba name, url, and single-page string parsing; still to be reworked
Files: htmlfiletotal holds the fetched html pages
   jsonfiletotal holds the extracted json data

Note: single-threaded; it runs as-is. See version_2 for the optimized follow-up.

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

Source code

# settings.py, holds the main parameters
# User-Agent pool
# TODO: still to be implemented

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
}
# main_spider.py

from tieba_title_total import TitleTotal
import sys

class ProcessMain():
    def __init__(self, tieba_name, first_url):
        self.tieba_name = tieba_name
        self.first_url = first_url

    # Fetch every post on the current page, then recurse into the next page
    def tieba_current_pagecontent(self, tieba_name, url):
        obj = TitleTotal(tieba_name, url)
        next_page_url = obj.run()
        if next_page_url:
            return self.tieba_current_pagecontent(tieba_name, next_page_url)

    def run(self):
        self.tieba_current_pagecontent(self.tieba_name, self.first_url)

if __name__=="__main__":
    tieba_name = sys.argv[1]
    # tieba_name = "李毅"
    first_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn=0"
    obj = ProcessMain(tieba_name,first_url)
    obj.run()
# tieba_title_total.py


import requests
from urllib import parse
import re
from settings import HEADERS
import json
from tieziparse import TieziParse

class TitleTotal():
    """
    Fetches the thread content:
    title: title, title_href
    author: author_name, author_id, author_home
    content: content, image
    next page address: next_page_url
    """
    def __init__(self, tieba_name, url):
        self.tieba_name = tieba_name
        self.url = url.format(parse.quote(tieba_name))
        
    def __get_response(self, url):
        """
        Fetch the response for a given url; a general-purpose helper.
        Currently carries the anti-anti-crawler setting (the UA header).
        """
        response = requests.get(url, headers=HEADERS)
        print("Status code:", response.status_code)
        response_ = response.content.decode()
        return response_
    
    def __save_raw_file(self, content, filename):
        """Save the raw html file"""
        with open("./htmlfiletotal/" + filename, 'w', encoding='utf8') as f:
            f.write(content)
            print("Wrote the raw file successfully")

    def __save_json_file(self, content, filename):
        """Save the json file"""
        with open("./jsonfiletotal/" + filename, 'w', encoding='utf8') as f:
            f.write(json.dumps(content, ensure_ascii=False, indent=4))
            print("Saved the json file successfully")

    def __parse_html_str(self, html_str):
        current_page_info = dict()
        instantiation_ = TieziParse(html_str)
        next_page_url, item_list = instantiation_.run()
        current_page_info["next_page_url"] = next_page_url
        current_page_info["content_list"] = item_list
        return current_page_info

    def run(self):
        """
        Implements the main logic
        """
        print("Start url:", self.url)
        # Fetch the response
        html_str = self.__get_response(self.url)

        # Save the raw fetched content
        filename = parse.unquote(re.split(pattern=r'\?', string=self.url)[-1]) + ".html"
        self.__save_raw_file(content=html_str, filename=filename)
        # Extract the detailed information from the raw content
        key_info_dict = self.__parse_html_str(html_str)
        # Save the extracted key information to a json file
        filename = parse.unquote(re.split(r'\?', self.url)[-1]) + ".json"
        self.__save_json_file(content=key_info_dict, filename=filename)
        print("Next page url:", key_info_dict["next_page_url"])
        return key_info_dict["next_page_url"]

# Test case
if __name__ == "__main__":
    tieba_name = "李毅"
    first_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn=0"
    obj = TitleTotal(tieba_name, first_url)
    obj.run()
# tieziparse.py


import re

class TieziParse():
    """
    包含单页中所有帖子内容的字符串
    处理传入的htnl_str
    标题:title、title_href
    作者:author_name、author_id、author_home
    内容:content、image
    下一页地址:next_page_url
    """

    def __init__(self, html_str):
        """初始化参数"""
        self.html_str = html_str

    def __info_str(self):
        """将传入的html_str分解,提取有用的内容"""
        html_ = re.findall(r'<code class=\"pagelet_html\" id=\"pagelet_html_frs-list/pagelet/thread_list\" style=\"display:none;\">(.*?)</code>', self.html_str, re.S)[0]
        return html_

    def __parse_next_url(self, html_):
        # Extract the div on this page that contains the next-page link
        div_content = re.findall(r'<div class=\"thread_list_bottom clearfix\">(.*?)-->', html_, re.S)[0]
        # No match for "next" means there is no next page (the method then returns None)
        # The link cannot be located directly, so collect every <a> tag into a list
        next_url_list = re.findall(r'<a(.*?)>', div_content, re.S)
        for i in next_url_list:
            if "next pagination-item" in i:
                next_page_url = "https:" + re.findall(r'href="(.*?)"', i, re.S)[0]
                return next_page_url


    def __get_usefulinfo_by_one(self, ul_one):
        one_tiezi_info = dict()
        # Get the title and its link
        title_and_href = re.findall(r'j_th_tit ">.*?<a rel="noreferrer" href="(.*?)" title="(.*?) target="_blank"', ul_one, re.S)
        title_and_href = title_and_href[0] if len(title_and_href) > 0 else None

        if title_and_href:
            title_href_ = "https://tieba.baidu.com"+title_and_href[0]
            title_ = title_and_href[1]
        else:
            title_href_ = None
            title_ = None

        # Get the author name and author id
        author_name = re.findall(r'<span class="tb_icon_author ".*?title="主题作者: (.*?)"', ul_one, re.S)
        author_name = author_name[0] if len(author_name) > 0 else None

        author_id = re.findall(r'title="主题作者.*?".*?data-field=\'{&quot;user_id&quot;:(.*?)}\' >', ul_one, re.S)
        author_id = author_id[0] if len(author_id) > 0 else None

        author_home = re.findall(r'class="frs-author-name j_user_card " href="(.*?)" target="_blank">', ul_one, re.S)
        author_home = "https://tieba.baidu.com" + author_home[0] if len(author_home) > 0 else None
        # Get the content abstract and images
        content = re.findall(r'<div class="threadlist_abs threadlist_abs_onlyline ">(.*?)</div>', ul_one, re.S)
        content = content[0] if len(content) > 0 else None
        image = re.findall(r'bpic="(.*?)" class="threadlist_pic j_m_pic', ul_one, re.S)
        # Store the fields in a dict
        one_tiezi_info["title"] = title_
        one_tiezi_info["title_href"] = title_href_
        one_tiezi_info["author_name"] = author_name
        one_tiezi_info["author_id"] = author_id
        one_tiezi_info['author_home'] = author_home
        one_tiezi_info['content'] = content
        one_tiezi_info['image'] = image
        return one_tiezi_info

    def __ul_content(self, html_):
        # Get every thread block on the current page
        ul_content_list = re.findall(r'li class=\" j_thread_list clearfix\"(.*?)<li class=\" j_thread_list clearfix\"', html_, re.S)
        return ul_content_list

    def __get_content(self, html_):
        item_list = list()
        # Get the list of single-thread blocks
        ul_content_list = self.__ul_content(html_)
        for ul_one in ul_content_list:
            item = self.__get_usefulinfo_by_one(ul_one)
            item_list.append(item)
        return item_list

    def run(self):
        """
        向外提供的接口
        返回的第一个参数是next_page_url
        返回的第二个参数是item_list
        """
        # 分解传入的html_str,提取有用信息
        __html_ = self.__info_str()
        # 获取下一页的url
        __next_page_url = self.__parse_next_url(__html_)
        # 获取当前页中所有的帖子等信息
        __item_list = self.__get_content(__html_)
        return __next_page_url, __item_list
