学习笔记(爬虫):爬取任意百度贴吧帖子中的图片和视频 2
续上篇:https://www.cnblogs.com/maxxu11/p/12626007.html
1、代码:
# -*- coding: utf-8 -*-
import os
from urllib import parse

import requests
from lxml import etree


class BtcSpider(object):
    """Crawler that downloads images and videos from Baidu Tieba posts.

    Given a forum keyword and a page range, it walks the forum's thread
    list, opens every thread, and saves each inline image / embedded
    video to a local directory named after the keyword.
    """

    def __init__(self):
        # Forum list URL template: kw is the forum keyword, pn selects
        # the page (pn=0 -> page 1, pn=50 -> page 2, ...).
        self.url = 'https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        }
        # Output directory; assigned in run() once the keyword is known.
        self.dir = None

    def get_response(self, url):
        """Fetch *url* and return the response body decoded as text."""
        response = requests.get(url, headers=self.headers)
        return response.text

    def get_data(self, url):
        """Fetch *url* and return the raw response bytes (for media files)."""
        return requests.get(url, headers=self.headers).content

    def get_xpath(self, html, pattern):
        """Parse *html* and return the list of matches for the XPath *pattern*."""
        tree = etree.HTML(html)
        return tree.xpath(pattern)

    def download_src(self, url):
        """Collect every thread on one forum-list page and download its media."""
        html = self.get_response(url)
        # Tieba ships the thread list inside an HTML comment; stripping the
        # comment opener makes it visible to the parser.
        html = html.replace("<!--", "")
        pattern1 = '//div[@class="threadlist_title pull_left j_th_tit "]/a/@href'
        thread_hrefs = self.get_xpath(html, pattern1)
        for href in thread_hrefs:
            # Build the full thread URL from the relative href.
            tie_url = "http://tieba.baidu.com" + href
            tie_html = self.get_response(tie_url)
            # Matches both inline post images and embedded video sources.
            pattern2 = ('//img[@class="BDE_Image"]/@src'
                        ' | //div[@class="video_src_wrapper"]/embed/@data-video')
            media_urls = self.get_xpath(tie_html, pattern2)
            self.save_data(media_urls)

    def save_data(self, url_lists):
        """Download each media URL in *url_lists* into the output directory."""
        for url in url_lists:
            img_data = self.get_data(url)
            # NOTE(review): the last 10 characters of the URL serve as the
            # file name; video URLs may contain characters that are not
            # valid in file names — consider sanitizing before writing.
            file_name = url[-10:]
            print('正在下载图片:', file_name)
            # os.path.join keeps the path portable (the original
            # hard-coded a Windows '\\' separator).
            file_path = os.path.join(self.dir, file_name)
            with open(file_path, 'wb') as f:
                f.write(img_data)

    def run(self):
        """Prompt for a keyword and page range, then crawl every page."""
        word = input("请输入关键词:")
        begin = int(input("请输入起始页:"))
        end = int(input("请输入终止页:"))
        # Percent-encode the keyword for use in the URL.
        name = parse.quote(word)
        os.makedirs(word, exist_ok=True)
        self.dir = word
        # end + 1 so the user-supplied last page is included
        # (the original range(begin, end) silently skipped it).
        for page in range(begin, end + 1):
            pn = (page - 1) * 50
            url = self.url.format(name, pn)
            self.download_src(url)


if __name__ == "__main__":
    spider = BtcSpider()
    spider.run()
学习、思考、阅读、旅游、爱人—— 一生所求