Crawling Baidu Tieba post pages
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Note: for learning and reference only; if this infringes any rights, the content will be removed immediately
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Development notes
version_2
Purpose: crawl the posts of a given tieba name (e.g. "李毅") and save json files of the extracted fields (in this version the raw html is passed between threads in memory rather than written to disk)
Modules: next_page_url.py: extracts the next_page_url from a page
page_key_info.py: extracts the key fields from the current page
settings.py: stores the settings (done)
request_response.py: handles the http requests
tiezi_total.py: implements the main logic and performs the crawl
main_spider.py: the entry point for running the program (not enabled yet)
Files: jsonfiletotal stores the extracted json data
Techniques: multithreading, inter-thread communication (queues), recursive fetching of the next-page url, producer-consumer model (a minimal sketch follows this block)
bug_1: in this version the overall flow is correct, but parsing the second page raises a UnicodeDecodeError; I could not solve it at the time, and pointers from anyone with experience would be appreciated.
bug_1 fix: solved and the code is updated; the root cause was a small detail: the next-page url was not pieced together correctly before being requested.
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
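The producer-consumer model listed under the techniques above is the backbone of version_2. A minimal standalone sketch of the same pattern follows: a Queue for hand-off, daemon worker threads, and queue.join() for shutdown (the produce/consume names are illustrative, not part of this repo).

# producer_consumer_sketch.py: minimal illustration of the version_2 threading pattern
from queue import Queue
from threading import Thread

q = Queue()

def produce():
    for i in range(5):
        q.put(i)              # the producer puts work items on the queue

def consume():
    while True:
        item = q.get()        # blocks until an item is available
        print("consumed", item)
        q.task_done()         # decrement the unfinished-task counter

for target in (produce, consume):
    t = Thread(target=target)
    t.daemon = True           # daemon threads die together with the main thread
    t.start()

q.join()                      # returns once every put() is matched by a task_done()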
Source code
# settings.py
# User-Agent pool  # TODO
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
}
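The TODO above refers to the planned User-Agent pool. One possible sketch, assuming you supply the additional UA strings yourself:

# ua_pool_sketch.py: a possible User-Agent pool (hypothetical, not part of this repo)
import random

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
    # add more real User-Agent strings here
]

def random_headers():
    """Pick a random User-Agent per request so the traffic looks less uniform."""
    return {"User-Agent": random.choice(USER_AGENTS)}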
# tiezi_total.py
from urllib import parse
import re
import json
from queue import Queue
from threading import Thread

from settings import HEADERS
from next_page_url import NextPageUrl
from page_key_info import PageKeyInfo
from request_response import RequestResponse


class MainProcess():
    """Takes the arguments: tieba_name, start url."""

    def __init__(self, tieba_name, url):
        self.tieba_name = tieba_name
        self.url = url.format(parse.quote(tieba_name))
        self.url_queue = Queue()
        self.rawhtml_queue = Queue()
        self.content_queue = Queue()

    def __make_url_and_rawhtml(self, url):
        """Produce urls and rawhtml."""
        html_str = RequestResponse(url).run()
        next_page_url = NextPageUrl(html_str).run()
        print(next_page_url)
        # Put the html string on the queue
        self.rawhtml_queue.put(html_str)
        if next_page_url:
            self.url_queue.put(next_page_url)
            return self.__make_url_and_rawhtml(next_page_url)

    def __make_key_info(self):
        """Consume urls and rawhtml, produce content."""
        # Queue.not_empty (checked here originally) is a Condition object and is
        # always truthy, so loop forever; the daemon flag handles shutdown instead
        while True:
            # Take a matching url/rawhtml pair off the queues
            url = self.url_queue.get()
            html_str = self.rawhtml_queue.get()
            item_list = PageKeyInfo(html_str).run()
            # Attach the current page url to the extracted data
            item = dict(current_page_url=url)
            item_list.append(item)
            # Put the extracted data on the queue
            self.content_queue.put(item_list)
            # Progress report
            print("Extracting info from {}".format(url))
            # Decrement the queue counters
            self.url_queue.task_done()
            self.rawhtml_queue.task_done()

    def __save_json_file(self):
        """Consume content and save it as json files."""
        while True:
            # Take data off the queue
            content = self.content_queue.get()
            # Build the filename from the url's query string
            url = content[-1]["current_page_url"]
            filename = parse.unquote(
                re.split(pattern=r'\?', string=url)[-1])+".json"
            with open("./jsonfiletotal/"+filename, 'w', encoding='utf8') as f:
                f.write(json.dumps(content, ensure_ascii=False, indent=4))
            print("Saved " + filename)
            # Decrement the queue counter
            self.content_queue.task_done()

    def run(self):
        # Seed self.url_queue with the first url
        self.url_queue.put(self.url)
        # Build the thread list
        thread_list = list()
        make_url_and_rawhtml_thread = Thread(
            target=self.__make_url_and_rawhtml, args=(self.url,))
        thread_list.append(make_url_and_rawhtml_thread)
        make_key_info_thread = Thread(target=self.__make_key_info)
        thread_list.append(make_key_info_thread)
        save_json_file_thread = Thread(target=self.__save_json_file)
        thread_list.append(save_json_file_thread)
        for t in thread_list:
            # fixed: "t.setDaemon = True" only shadowed the method and
            # never actually marked the thread as a daemon
            t.daemon = True
            t.start()
        # Block until every queue has been drained
        self.url_queue.join()
        self.rawhtml_queue.join()
        self.content_queue.join()


# Test case
if __name__ == "__main__":
    tieba_name = "李毅"
    first_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn=0"
    obj = MainProcess(tieba_name, first_url)
    obj.run()
# request_response.py
import requests

from settings import HEADERS


class RequestResponse():
    """Takes a request url and returns the raw response string."""

    def __init__(self, url):
        self.url = url

    def __get_request(self, url):
        """
        General-purpose fetch helper; takes a url.
        The only anti-anti-crawling measure so far is the headers.
        Returns the raw, unprocessed response string.
        """
        response = requests.get(url, headers=HEADERS)
        print("Response status code:", response.status_code)
        response_ = response.content.decode()
        return response_

    def run(self):
        return self.__get_request(self.url)
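Note that requests.get above runs without a timeout, so a stalled connection can hang the producer thread indefinitely. A hardened fetch helper might look like the following sketch (the retry count and timeout are arbitrary choices, not project settings):

# Sketch: fetch helper with a timeout and simple retries
import time
import requests
from settings import HEADERS

def get_html(url, retries=3, timeout=10):
    """Return the decoded body, retrying a few times on network errors."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
            return response.content.decode()
        except requests.RequestException as e:
            print("request failed ({}), retry {}/{}".format(e, attempt + 1, retries))
            time.sleep(1)
    raise RuntimeError("giving up on " + url)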
# next_page_url.py
import re


class NextPageUrl():
    """Takes the raw string produced by a response."""

    def __init__(self, html_str):
        """Initialise the arguments."""
        self.html_str = html_str

    def __info_str(self, html_str):
        """Cut the useful part out of html_str (Tieba hides the thread list in a <code> block)."""
        html_ = re.findall(
            r'<code class="pagelet_html" id="pagelet_html_frs-list/pagelet/thread_list" style="display:none;">(.*?)</code>',
            html_str, re.S)[0]
        return html_

    def __parse_next_url(self, html_):
        # Extract the div that holds the next-page link
        div_content = re.findall(
            r'<div class="thread_list_bottom clearfix">(.*?)-->', html_, re.S)[0]
        # Implicitly returns None when there is no next page
        # The link cannot be located directly, so collect every <a> tag
        next_url_list = re.findall(r'<a(.*?)>', div_content, re.S)
        for i in next_url_list:
            if "next pagination-item" in i:
                # The href is protocol-relative ("//tieba.baidu.com/..."), so
                # prefix the scheme; this concatenation is what bug_1 was about
                next_page_url = "https:" + re.findall(r'href="(.*?)"', i, re.S)[0]
                return next_page_url

    def run(self):
        """The public interface."""
        __html_ = self.__info_str(self.html_str)
        __next_page_url = self.__parse_next_url(__html_)
        return __next_page_url
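The "https:" + href concatenation works because Tieba's next-page href is protocol-relative (it starts with //), and a missing piece of exactly this concatenation was bug_1. The standard-library urljoin handles the same case generically, as this sketch shows (the example href is illustrative):

# Sketch: the same fix via urljoin, which also copes with relative hrefs
from urllib.parse import urljoin

base = "https://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85&ie=utf-8&pn=0"
href = "//tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85&ie=utf-8&pn=50"  # protocol-relative
print(urljoin(base, href))  # -> https://tieba.baidu.com/f?kw=...&pn=50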
# page_key_info.py
import re


class PageKeyInfo():
    """Takes the raw string produced by a response."""

    def __init__(self, html_str):
        """Initialise the arguments."""
        self.html_str = html_str

    def __info_str(self, html_str):
        """Cut the useful part out of html_str."""
        html_ = re.findall(
            r'<code class="pagelet_html" id="pagelet_html_frs-list/pagelet/thread_list" style="display:none;">(.*?)</code>',
            html_str, re.S)[0]
        return html_

    def __get_usefulinfo_by_one(self, ul_one):
        one_tiezi_info = dict()
        # Title and link (fixed: the closing quote after the title capture was missing)
        title_and_href = re.findall(
            r'j_th_tit ">.*?<a rel="noreferrer" href="(.*?)" title="(.*?)" target="_blank"',
            ul_one, re.S)
        title_and_href = title_and_href[0] if len(title_and_href) > 0 else None
        if title_and_href:
            title_href_ = "https://tieba.baidu.com" + title_and_href[0]
            title_ = title_and_href[1]
        else:
            title_href_ = None
            title_ = None
        # Author name and id
        author_name = re.findall(r'<span class="tb_icon_author ".*?title="主题作者: (.*?)"', ul_one, re.S)
        author_name = author_name[0] if len(author_name) > 0 else None
        author_id = re.findall(r'title="主题作者.*?".*?data-field=\'{"user_id":(.*?)}\' >', ul_one, re.S)
        author_id = author_id[0] if len(author_id) > 0 else None
        author_home = re.findall(r'class="frs-author-name j_user_card " href="(.*?)" target="_blank">', ul_one, re.S)
        author_home = "https://tieba.baidu.com" + author_home[0] if len(author_home) > 0 else None
        # Post abstract and images
        content = re.findall(r'<div class="threadlist_abs threadlist_abs_onlyline ">(.*?)</div>', ul_one, re.S)
        content = content[0] if len(content) > 0 else None
        image = re.findall(r'bpic="(.*?)" class="threadlist_pic j_m_pic', ul_one, re.S)
        # Collect the fields into a dict
        one_tiezi_info["title"] = title_
        one_tiezi_info["title_href"] = title_href_
        one_tiezi_info["author_name"] = author_name
        one_tiezi_info["author_id"] = author_id
        one_tiezi_info['author_home'] = author_home
        one_tiezi_info['content'] = content
        one_tiezi_info['image'] = image
        return one_tiezi_info

    def __ul_content(self, html_):
        # Split the current page into per-thread chunks
        ul_content_list = re.findall(
            r'li class=" j_thread_list clearfix"(.*?)<li class=" j_thread_list clearfix"', html_, re.S)
        return ul_content_list

    def __get_content(self, html_):
        item_list = list()
        # Get the list of single-post chunks
        ul_content_list = self.__ul_content(html_)
        for ul_one in ul_content_list:
            item = self.__get_usefulinfo_by_one(ul_one)
            item_list.append(item)
        return item_list

    def run(self):
        # Slice out the useful string
        __html_ = self.__info_str(self.html_str)
        # Extract the key fields
        __item_list = self.__get_content(__html_)
        return __item_list
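A convenient way to test PageKeyInfo without hitting the network is to feed it a page saved by version_1; the sketch below assumes such a file exists under htmlfiletotal (the filename matches version_1's naming scheme but is illustrative):

# Sketch: exercise PageKeyInfo offline against a saved page
from page_key_info import PageKeyInfo

with open("./htmlfiletotal/kw=李毅&ie=utf-8&pn=0.html", encoding='utf8') as f:
    html_str = f.read()

for item in PageKeyInfo(html_str).run():
    print(item["title"], item["title_href"])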
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Development notes
version_1
Usage: python main_spider.py 李毅
Purpose: crawl the posts of a given tieba name (e.g. "李毅"), saving the raw html pages as fetched and json files containing the extracted fields
Modules: settings.py: stores the settings
tieziparse.py: parses a single page and extracts the useful info
main_spider.py: runs the main logic
tieba_title_total.py: takes the tieba name and url and processes the single-page string; pending refactoring
Files: htmlfiletotal stores the fetched html pages
jsonfiletotal stores the extracted json data
Note: single-threaded; it runs as-is, see version_2 for the optimized variant
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
Source code
# settings.py: holds the main parameters
# User-Agent pool  # TODO
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
}
# main_spider.py
from tieba_title_total import TitleTotal
import sys


class ProcessMain():
    def __init__(self, tieba_name, first_url):
        self.tieba_name = tieba_name
        self.first_url = first_url

    def tieba_current_pagecontent(self, tieba_name, url):
        """Fetch every post on the current page, then recurse into the next page."""
        obj = TitleTotal(tieba_name, url)
        next_page_url = obj.run()
        if next_page_url:
            return self.tieba_current_pagecontent(tieba_name, next_page_url)

    def run(self):
        self.tieba_current_pagecontent(self.tieba_name, self.first_url)


if __name__ == "__main__":
    tieba_name = sys.argv[1]
    # tieba_name = "李毅"
    first_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn=0"
    obj = ProcessMain(tieba_name, first_url)
    obj.run()
# tieba_title_total.py
import requests
from urllib import parse
import re
import json

from settings import HEADERS
from tieziparse import TieziParse


class TitleTotal():
    """
    Fetches the post info from one page:
    title: title, title_href
    author: author_name, author_id, author_home
    body: content, image
    next page: next_page_url
    """

    def __init__(self, tieba_name, url):
        self.tieba_name = tieba_name
        self.url = url.format(parse.quote(tieba_name))

    def __get_response(self, url):
        """
        General-purpose fetch helper; takes a url.
        The only anti-anti-crawling measure so far is the headers.
        """
        response = requests.get(url, headers=HEADERS)
        print("Response status code:", response.status_code)
        response_ = response.content.decode()
        return response_

    def __save_raw_file(self, content, filename):
        """Save the raw html file."""
        with open("./htmlfiletotal/"+filename, 'w', encoding='utf8') as f:
            f.write(content)
        print("Raw file saved")

    def __save_json_file(self, content, filename):
        """Save the json file."""
        with open("./jsonfiletotal/"+filename, 'w', encoding='utf8') as f:
            f.write(json.dumps(content, ensure_ascii=False, indent=4))
        print("Json file saved")

    def __parse_html_str(self, html_str):
        current_page_info = dict()
        instantiation_ = TieziParse(html_str)
        next_page_url, item_list = instantiation_.run()
        current_page_info["next_page_url"] = next_page_url
        current_page_info["content_list"] = item_list
        return current_page_info

    def run(self):
        """Main logic."""
        print("Start url:", self.url)
        # Fetch the response
        html_str = self.__get_response(self.url)
        # Save the raw page as fetched
        filename = parse.unquote(re.split(pattern=r'\?', string=self.url)[-1])+".html"
        self.__save_raw_file(content=html_str, filename=filename)
        # Extract the key fields from the raw page
        key_info_dict = self.__parse_html_str(html_str)
        # Save the key fields to a json file
        filename = parse.unquote(re.split(r'\?', self.url)[-1])+".json"
        self.__save_json_file(content=key_info_dict, filename=filename)
        print("Next page url:", key_info_dict["next_page_url"])
        return key_info_dict["next_page_url"]


# Test case
if __name__ == "__main__":
    tieba_name = "李毅"
    first_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn=0"
    obj = TitleTotal(tieba_name, first_url)
    obj.run()
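The filename logic in run() turns the url's query string back into a readable name. A worked example of that derivation, using the starting url:

# Sketch: how the filename is derived from the url
from urllib import parse
import re

url = "https://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85&ie=utf-8&pn=0"
filename = parse.unquote(re.split(r'\?', url)[-1]) + ".html"
print(filename)  # -> kw=李毅&ie=utf-8&pn=0.html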
# tieziparse.py
import re


class TieziParse():
    """
    Processes the passed-in html_str containing all posts on a single page.
    title: title, title_href
    author: author_name, author_id, author_home
    body: content, image
    next page: next_page_url
    """

    def __init__(self, html_str):
        """Initialise the arguments."""
        self.html_str = html_str

    def __info_str(self):
        """Cut the useful part out of html_str."""
        html_ = re.findall(
            r'<code class="pagelet_html" id="pagelet_html_frs-list/pagelet/thread_list" style="display:none;">(.*?)</code>',
            self.html_str, re.S)[0]
        return html_

    def __parse_next_url(self, html_):
        # Extract the div that holds the next-page link
        div_content = re.findall(
            r'<div class="thread_list_bottom clearfix">(.*?)-->', html_, re.S)[0]
        # Implicitly returns None when there is no next page
        # The link cannot be located directly, so collect every <a> tag
        next_url_list = re.findall(r'<a(.*?)>', div_content, re.S)
        for i in next_url_list:
            if "next pagination-item" in i:
                # The href is protocol-relative, so prefix the scheme
                next_page_url = "https:" + re.findall(r'href="(.*?)"', i, re.S)[0]
                return next_page_url

    def __get_usefulinfo_by_one(self, ul_one):
        one_tiezi_info = dict()
        # Title and link (fixed: the closing quote after the title capture was missing)
        title_and_href = re.findall(
            r'j_th_tit ">.*?<a rel="noreferrer" href="(.*?)" title="(.*?)" target="_blank"',
            ul_one, re.S)
        title_and_href = title_and_href[0] if len(title_and_href) > 0 else None
        if title_and_href:
            title_href_ = "https://tieba.baidu.com" + title_and_href[0]
            title_ = title_and_href[1]
        else:
            title_href_ = None
            title_ = None
        # Author name and id
        author_name = re.findall(r'<span class="tb_icon_author ".*?title="主题作者: (.*?)"', ul_one, re.S)
        author_name = author_name[0] if len(author_name) > 0 else None
        author_id = re.findall(r'title="主题作者.*?".*?data-field=\'{"user_id":(.*?)}\' >', ul_one, re.S)
        author_id = author_id[0] if len(author_id) > 0 else None
        author_home = re.findall(r'class="frs-author-name j_user_card " href="(.*?)" target="_blank">', ul_one, re.S)
        author_home = "https://tieba.baidu.com" + author_home[0] if len(author_home) > 0 else None
        # Post abstract and images
        content = re.findall(r'<div class="threadlist_abs threadlist_abs_onlyline ">(.*?)</div>', ul_one, re.S)
        content = content[0] if len(content) > 0 else None
        image = re.findall(r'bpic="(.*?)" class="threadlist_pic j_m_pic', ul_one, re.S)
        # Collect the fields into a dict
        one_tiezi_info["title"] = title_
        one_tiezi_info["title_href"] = title_href_
        one_tiezi_info["author_name"] = author_name
        one_tiezi_info["author_id"] = author_id
        one_tiezi_info['author_home'] = author_home
        one_tiezi_info['content'] = content
        one_tiezi_info['image'] = image
        return one_tiezi_info

    def __ul_content(self, html_):
        # Split the current page into per-thread chunks
        ul_content_list = re.findall(
            r'li class=" j_thread_list clearfix"(.*?)<li class=" j_thread_list clearfix"', html_, re.S)
        return ul_content_list

    def __get_content(self, html_):
        item_list = list()
        # Get the list of single-post chunks
        ul_content_list = self.__ul_content(html_)
        for ul_one in ul_content_list:
            item = self.__get_usefulinfo_by_one(ul_one)
            item_list.append(item)
        return item_list

    def run(self):
        """
        The public interface.
        Returns next_page_url first and item_list second.
        """
        # Slice the useful part out of the incoming html_str
        __html_ = self.__info_str()
        # Get the next page's url
        __next_page_url = self.__parse_next_url(__html_)
        # Get all posts on the current page
        __item_list = self.__get_content(__html_)
        return __next_page_url, __item_list