"""爬取百度贴吧中的图片以及视频,将爬取下来的内容保存到本地。

Crawl images and embedded videos from Baidu Tieba threads and save the
downloaded content to the local "图片" directory.
"""
import os
import random
import re
import time
# FIX: `import urllib` alone does not guarantee the `urllib.parse` submodule
# is loaded in Python 3 (it only worked by accident because `requests`
# imports it internally) — import the submodule explicitly.
import urllib.parse

import requests
from lxml import etree


class ImmgeSpider:
    """Scrape image and video URLs from Baidu Tieba threads and download them.

    NOTE(review): the class name keeps the original (misspelled) `ImmgeSpider`
    so any existing callers are not broken.
    """

    def __init__(self):
        # Old IE user-agent: Tieba serves a simpler, regex-friendly page
        # variant to legacy browsers.
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)"
        }

    def getPageUrl(self, url):
        """Fetch one Tieba list page and crawl every thread linked on it."""
        res = requests.get(url, headers=self.headers)
        print(url)
        res.encoding = "utf-8"
        html = res.text
        # Extract thread hrefs with a regex; the XPath variant (seen in the
        # original, commented out) is unreliable here because Tieba emits the
        # thread-list markup inside HTML comments.
        p = re.compile(
            '<div class="threadlist_title pull_left j_th_tit ">.*?'
            '<a rel="noreferrer" href="(.*?)" title',
            re.S,
        )
        t_list = p.findall(html)
        print(t_list)
        for t_link in t_list:
            t_url = "http://tieba.baidu.com" + t_link
            self.getImageUrl(t_url)

    def getImageUrl(self, t_url):
        """Fetch a thread page and download each image / embedded video in it."""
        res = requests.get(t_url, headers=self.headers)
        res.encoding = "utf-8"
        parseHtml = etree.HTML(res.text)
        # Post images, plus <embed data-video=...> video sources.
        img_list = parseHtml.xpath(
            '//*[@class="d_post_content j_d_post_content clearfix"]/img/@src | '
            '//embed/@data-video'
        )
        print(img_list)
        for img_link in img_list:
            self.writeImage(img_link)

    def writeImage(self, img_link):
        """Download a single resource and save it into the 图片/ directory."""
        # Random delay so we don't hammer the server and get blocked.
        time.sleep(random.randint(2, 5))
        res = requests.get(img_link, headers=self.headers)
        # (Removed the original `res.encoding = "utf-8"` here — encoding is
        # meaningless for the binary `res.content` we save below.)

        # Local file name = last URL path segment, query string stripped.
        filename = img_link.split("/")[-1]
        if "?" in filename:
            # FIX: keep the part *before* the "?" — the original
            # `split("?")[-1]` saved the query string as the file name.
            filename = filename.split("?")[0]
        # FIX: create the target directory on first use instead of crashing
        # with FileNotFoundError when it does not exist yet.
        os.makedirs("图片", exist_ok=True)
        with open("图片/" + filename, 'wb') as f:
            f.write(res.content)
        print(filename, "下载成功")

    def workOn(self):
        """Prompt for a Tieba name and a page range, then crawl each page."""
        name = input("请输入贴吧名:")
        start = int(input("请输入起始页:"))
        end = int(input("请输入结束页:"))
        for page in range(start, end + 1):
            # Tieba paginates with pn = (page - 1) * 50.
            offset = (page - 1) * 50
            kw = urllib.parse.urlencode({"kw": name})
            fullurl = "http://tieba.baidu.com/f?" + kw + "&ie=utf-8&pn=" + str(offset)
            self.getPageUrl(fullurl)


if __name__ == '__main__':
    spider = ImmgeSpider()
    spider.workOn()