爬取贴吧中的html,并保存到相对应的文件夹中
功能:输入要爬取的贴吧名称,起始页和终止页即可。
# -*- coding: utf-8 -*-
"""Crawl the raw HTML listing pages of a Baidu Tieba forum and save them
to a ``baidutieba/`` folder next to this script.

Usage: run the script and enter the forum name, the start page and the
end page when prompted.
"""
import os
import urllib.parse
import urllib.request


class BaiduSpider:
    """Downloads Tieba forum listing pages and stores each one as <page>.html."""

    def __init__(self):
        # baseurl is kept for backward compatibility; workOn() builds the
        # actual per-page URLs via _page_url().
        self.baseurl = ""
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"}

    # Request a page and return its content.
    def getPage(self, url):
        """Fetch *url* with a browser User-Agent and return the body as UTF-8 text."""
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode("utf-8")
        print(res.getcode())  # print HTTP status code for quick diagnostics
        return html

    def _page_url(self, quoted_title, page):
        """Return the listing URL for 1-based *page* of forum *quoted_title*.

        *quoted_title* must already be percent-encoded. Tieba paginates with
        the ``pn`` query parameter in steps of 50 entries per page, so page 1
        is pn=0, page 2 is pn=50, and so on.
        """
        # BUG FIX: the original appended "&pn=" to a base URL that already
        # ended in "&", yielding a malformed "...&ie=utf-8&&pn=..." URL.
        pn = (page - 1) * 50
        return "https://tieba.baidu.com/f?kw=" + quoted_title + "&ie=utf-8&pn=" + str(pn)

    # Save a fetched page to disk.
    def writePage(self, x, html):
        """Write *html* to <script dir>/baidutieba/<x>.html, creating the folder on demand."""
        print("===")
        # Store output next to this script, not in the current working directory.
        path = os.path.join(os.path.dirname(__file__), "baidutieba")
        if not os.path.exists(path):
            os.makedirs(path)
            # BUG FIX: os.chmod() requires a numeric mode; the original passed
            # the string "rw", which raised TypeError on the first run.
            os.chmod(path, 0o755)
        with open(os.path.join(path, str(x) + ".html"), "w", encoding="utf-8") as f:
            f.write(html)
        print("保存成功")

    # Main driver: prompt the user, then fetch and save each page in the range.
    def workOn(self):
        """Prompt for forum name and page range, then download and save every page."""
        title = urllib.parse.quote(input("请输入你要搜索的贴吧名:"))
        start = int(input("请输入起始页:"))
        end = int(input("请输入终止页:"))
        for x in range(start, end + 1):
            # The original special-cased x == 1 but then unconditionally
            # rebuilt the URL, so that branch was dead code; pn=0 already
            # yields page 1, so a single code path is correct.
            url = self._page_url(title, x)
            html = self.getPage(url)
            self.writePage(x, html)
            print("ok")


if __name__ == "__main__":
    # Create the spider and start the interactive crawl.
    spider = BaiduSpider()
    spider.workOn()
运行结果:
请输入你要搜索的贴吧名:海贼王
请输入起始页:1
请输入终止页:5
200
===
保存成功
ok