Baidu Tieba Post Scraper

Posted on 2024-04-26 21:14 by Capterlliar

I wanted to scrape some threads I used to like. The code was archived on 2024-04-26; no idea how long it will keep working.

import requests
from lxml import etree


# Remove link and image tags from a post body
def removeTag(text):
    tree = etree.fromstring(text)
    for bad in tree.xpath("//a | //img"):
        parent = bad.getparent()
        # lxml drops an element's tail text when the element is removed,
        # so reattach the tail to the previous sibling or the parent first
        if bad.tail:
            prev = bad.getprevious()
            if prev is not None:
                prev.tail = (prev.tail or "") + bad.tail
            else:
                parent.text = (parent.text or "") + bad.tail
        parent.remove(bad)
    return etree.tostring(tree, encoding="utf-8").decode('utf-8')


# Fetch one page of the thread and clean up each post body
def getContent(url):
    r = requests.get(url=url)
    r.encoding = "utf-8"
    html = etree.HTML(r.text)
    # note the trailing space in the class value; it matches Tieba's markup
    res1 = html.xpath("//div[@class='d_post_content j_d_post_content ']")
    res2 = []
    for ele in res1:
        s = etree.tostring(ele, encoding="utf-8").decode('utf-8')
        s = removeTag(s)
        # cut off the opening <div ...> tag and any leading whitespace
        pos = s.find(">")
        s = s[pos + 1:].lstrip()
        s = s.replace("</div>", "")

        s = s.replace("<br/>", "\n")
        s = s.replace("<strong>", "")
        s = s.replace("</strong>", "")
        s = s.replace("</span>", "")
        s = s.replace('''<span class="edit_font_color">''', "")
        s = s.replace("&lt;", "")
        s = s.replace("&gt;", "")
        # s = s.replace("</strong>", "")
        # s = s.replace("</span>", "")
        res2.append(s)
    return res2


# Append the cleaned posts to a txt file
def writePage(text, filename):
    with open(filename, "a", encoding="utf-8") as f:
        for ele in text:
            f.write(ele + '\n')


if __name__ == "__main__":
    # Things to fill in
    # Thread URL; remember to delete the page number after pn=
    url = "https://tieba.baidu.com/p/xxxxxxx?see_lz=1&pn="
    # Output file name
    filename = "xxx.txt"
    # Start and end pages; the pages the scraper fetches run to about
    # 2x the page count the browser shows
    st = 1
    ed = 10

    for i in range(st, ed + 1):
        url2 = url + str(i)
        text = getContent(url2)
        if text == "null":
            break
        writePage(text, filename)
    print("end.")

upd 6.1

A quick-and-dirty script for grabbing the images as well:

import requests
from lxml import etree


# Extract the image URLs from a post body
def geturl(text):
    tree = etree.fromstring(text)
    res = []
    for img in tree.xpath("//img"):
        url = img.attrib.get('src')
        if url:  # skip <img> tags that have no src attribute
            res.append(url)
    return res


# Fetch one page of the thread and download every image in it
def getContent(url, page, directory):
    r = requests.get(url=url)
    r.encoding = "utf-8"
    html = etree.HTML(r.text)
    res1 = html.xpath("//div[@class='d_post_content j_d_post_content ']")
    cnt = 1
    for ele in res1:
        s = etree.tostring(ele, encoding="utf-8").decode('utf-8')
        res2 = geturl(s)
        for img_url in res2:
            # name files p<page>(<n>).jpg so they group by thread page
            path = directory + "p" + str(page) + "(" + str(cnt) + ").jpg"
            downloadImg(img_url, path)
            cnt = cnt + 1

# Download a single image to disk
def downloadImg(url, filename):
    response = requests.get(url)
    # the response body is the image's raw binary data
    img = response.content
    # open with 'wb' (write binary) to copy those bytes to a local file
    with open(filename, 'wb') as f:
        f.write(img)


if __name__ == "__main__":
    # Things to fill in
    # Thread URL; remember to delete the page number after pn=
    url = ""
    # Directory to save the images into
    directory = "\\"
    # Start and end pages; the pages the scraper fetches run to about
    # 2x the page count the browser shows
    st = 3
    ed = 28

    for i in range(st, ed + 1):
        url2 = url + str(i)
        getContent(url2, i, directory)
    print("end.")

Tieba died once back in 2017 and came back afterwards, but plenty of posts were never restored. For the threads I idled away on in middle school, I can still picture the first-post images with my eyes closed; then I look up and realize I'm about to graduate from college.