爬取百度贴吧的 HTML 页面,并保存到相对应的文件夹中。

功能:输入要爬取的贴吧名称,起始页和终止页即可。

# -*- coding: utf-8 -*-
import urllib.request
import urllib.parse
import os

class BaiduSpider:
    """Download Baidu Tieba list pages and save each one as an HTML file.

    Typical use is ``BaiduSpider().workOn()``, which prompts for a forum
    name and a page range, fetches every page in that range and writes it
    to ``<page number>.html``.
    """

    def __init__(self):
        # Kept for backward compatibility; no method of this class reads it.
        self.baseurl = ""
        # Browser-like User-Agent sent with every request.
        self.headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"}

    def getPage(self, url):
        """Fetch *url* and return the response body decoded as UTF-8.

        The HTTP status code is printed as a progress indicator.

        Raises:
            urllib.error.URLError: if the request fails.
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        req = urllib.request.Request(url, headers=self.headers)
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(req) as res:
            status = res.getcode()
            html = res.read().decode("utf-8")
        print(status)
        return html

    def writePage(self, x, html, directory=None):
        """Write *html* to ``<directory>/<x>.html`` using UTF-8.

        Args:
            x: Page identifier used as the file name stem.
            html: Text to write.
            directory: Target directory. Defaults to a ``baidutieba`` folder
                next to this source file. It is created if missing.
        """
        print("===")
        if directory is None:
            directory = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "baidutieba"
            )
        # exist_ok avoids the check-then-create race. No chmod is needed:
        # the directory is created with the process's default permissions,
        # and os.chmod requires an integer mode (the old "rw" string raised
        # TypeError the first time the directory was created).
        os.makedirs(directory, exist_ok=True)
        target = os.path.join(directory, str(x) + ".html")
        with open(target, "w", encoding="utf-8") as f:
            f.write(html)
        print("保存成功")

    @staticmethod
    def _buildUrl(title, page):
        """Return the list-page URL for an already URL-quoted *title*.

        Page numbers are 1-based; the ``pn`` offset advances by 50 per page
        (presumably the number of threads Tieba lists per page — confirm).
        """
        offset = (page - 1) * 50
        return "https://tieba.baidu.com/f?kw=" + title + "&ie=utf-8&pn=" + str(offset)

    def workOn(self):
        """Prompt for a forum name and page range, then fetch and save each page.

        Raises:
            ValueError: if a page number entered by the user is not an integer.
        """
        title = urllib.parse.quote(input("请输入你要搜索的贴吧名:"))
        start = int(input("请输入起始页:"))
        end = int(input("请输入终止页:"))
        for x in range(start, end + 1):
            html = self.getPage(self._buildUrl(title, x))
            self.writePage(x, html)
            print("ok")
    
if __name__ == "__main__":
    # Script entry point: build a spider and start the interactive crawl.
    BaiduSpider().workOn()

运行结果:

请输入你要搜索的贴吧名:海贼王

请输入起始页:1

请输入终止页:5
200
===
保存成功
ok

posted on 2018-11-23 10:10  zengsf  阅读(258)  评论(0)  编辑  收藏  举报

导航