I wanted to pull down some old threads I used to like. Code archived on 2024.4.26; no idea how long it will keep working.
import requests
from lxml import etree

# Strip <a> and <img> tags from a post fragment
def removeTag(text):
    tree = etree.fromstring(text)
    for bad in tree.xpath("//a"):
        bad.getparent().remove(bad)
    for bad in tree.xpath("//img"):
        bad.getparent().remove(bad)
    return etree.tostring(tree, encoding="utf-8").decode("utf-8")

# Fetch one page and clean up the post bodies
def getContent(url):
    r = requests.get(url=url)
    r.encoding = "utf-8"
    html = etree.HTML(r.text)
    res1 = html.xpath("//div[@class='d_post_content j_d_post_content ']")
    res2 = []
    for ele in res1:
        s = etree.tostring(ele, encoding="utf-8").decode("utf-8")
        s = removeTag(s)
        # drop the opening <div ...> tag, then any leading spaces
        # (lstrip instead of the index loop, which crashed on all-space strings)
        pos = s.find(">")
        s = s[pos + 1:].lstrip(" ")
        s = s.replace("</div>", "")
        s = s.replace("<br/>", "\n")
        s = s.replace("<strong>", "")
        s = s.replace("</strong>", "")
        s = s.replace("</span>", "")
        s = s.replace('<span class="edit_font_color">', "")
        s = s.replace("<", "")
        s = s.replace(">", "")
        res2.append(s)
    return res2

# Append the cleaned posts to a txt file
def writePage(text, filename):
    with open(filename, "a", encoding="utf-8") as f:
        for ele in text:
            f.write(ele + "\n")

if __name__ == "__main__":
    # Things to fill in:
    # thread URL -- remember to delete the number after pn=
    url = "https://tieba.baidu.com/p/xxxxxxx?see_lz=1&pn="
    # output file name
    filename = "xxx.txt"
    # start and end pages; the page count the crawler sees should be
    # about twice what the browser shows
    st = 1
    ed = 10
    for i in range(st, ed + 1):
        url2 = url + str(i)
        text = getContent(url2)
        if not text:  # getContent returns a list, never the string "null";
            break     # an empty list means we ran past the last page
        writePage(text, filename)
    print("end.")
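If Tieba starts rejecting bare requests, or the hand-rolled replace() surgery breaks on new markup, a variant along these lines may hold up better. This is an untested sketch: getContentPolite, the User-Agent value, and the one-second delay are my own additions rather than the archived code; it reuses the same XPath as above and lets lxml drop the tags instead of string-replacing them.

import time
import requests
from lxml import etree

# Browser-like header; an assumption -- some sites serve stripped pages to bare clients.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# Hypothetical hardened fetcher: same XPath as getContent above,
# plus a timeout and a polite delay between page fetches.
def getContentPolite(url, delay=1.0):
    r = requests.get(url, headers=HEADERS, timeout=10)
    r.encoding = "utf-8"
    html = etree.HTML(r.text)
    posts = html.xpath("//div[@class='d_post_content j_d_post_content ']")
    res = []
    for ele in posts:
        # Mirror removeTag: drop links and images entirely.
        for bad in ele.xpath(".//a | .//img"):
            bad.getparent().remove(bad)
        # Turn each <br/> into a newline by prepending "\n" to its tail text,
        # then join all remaining text nodes; the tags vanish automatically.
        for br in ele.xpath(".//br"):
            br.tail = "\n" + (br.tail or "")
        res.append("".join(ele.itertext()).strip())
    time.sleep(delay)  # be gentle with the server
    return res

Swapping getContent(url2) for getContentPolite(url2) in the main loop is the only change needed to try it.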
upd 6.1
A throwaway script for grabbing the images too:
import requests
from lxml import etree

# Collect the src URL of every <img> in a post fragment
def geturl(text):
    tree = etree.fromstring(text)
    res = []
    for img in tree.xpath("//img"):
        url = img.attrib.get("src")
        res.append(url)
    return res

# Fetch one page and download every image it contains
def getContent(url, page, directory):
    r = requests.get(url=url)
    r.encoding = "utf-8"
    html = etree.HTML(r.text)
    res1 = html.xpath("//div[@class='d_post_content j_d_post_content ']")
    cnt = 1
    for ele in res1:
        s = etree.tostring(ele, encoding="utf-8").decode("utf-8")
        res2 = geturl(s)
        for url in res2:
            path = directory + "p" + str(page) + "(" + str(cnt) + ").jpg"
            downloadImg(url, path)
            cnt = cnt + 1

# Save one image to disk
def downloadImg(url, filename):
    response = requests.get(url)
    # response.content is the raw bytes of the image
    img = response.content
    # "wb" opens the file for writing in binary mode
    with open(filename, "wb") as f:
        f.write(img)

if __name__ == "__main__":
    # Things to fill in:
    # thread URL -- remember to delete the number after pn=
    url = ""
    # output directory (must already exist and end with a path separator)
    directory = "\\"
    # start and end pages; the page count the crawler sees should be
    # about twice what the browser shows
    st = 3
    ed = 28
    for i in range(st, ed + 1):
        url2 = url + str(i)
        getContent(url2, i, directory)
    print("end.")
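Two fragile spots in the image script: the path is built by raw string concatenation (so directory must already exist and end in a separator), and the <img> XPath also picks up Tieba emoticons whose src may not be a real photo URL. A hedged hardening of downloadImg; the name downloadImgSafe and the startswith filter are my own guesses, not the archived code.

import os
import requests

# Hypothetical safer variant of downloadImg above.
def downloadImgSafe(url, filename):
    # Skip empty or non-http src values (emoticons, data URIs).
    if not url or not url.startswith("http"):
        return
    # Create the target directory if it does not exist yet.
    os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
    response = requests.get(url, timeout=10)
    with open(filename, "wb") as f:
        f.write(response.content)

Building the path with os.path.join(directory, "p%d(%d).jpg" % (page, cnt)) instead of concatenation would also remove the trailing-backslash requirement on directory.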
Tieba died once back in 2017 and later came back, but plenty of threads were never restored. I can still picture, eyes closed, the header images of those threads I goofed around in back in middle school; I look up, and I'm about to graduate from college.