python之爬取网页贴吧图片
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author:Momo time:2018/6/29
"""Download all user-posted images from the first N threads of a Baidu Tieba forum."""

import os
import re
import urllib.request

from lxml import etree


def get_html_code(url):
    """Fetch *url* and return the raw response body as bytes."""
    # Context manager guarantees the HTTP connection is closed even if
    # read() raises (the original leaked the response object).
    with urllib.request.urlopen(url) as html_page:
        return html_page.read()


def getArticleLinks(url):
    """Return the absolute URL of every thread listed on forum page *url*."""
    selector = etree.HTML(get_html_code(url))
    # Each thread entry carries a relative href such as "/p/1234567890".
    suffixes = selector.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
    return ['http://tieba.baidu.com' + suffix for suffix in suffixes]


def get_img(url):
    """Download every post image of thread *url* into the current directory.

    Images are saved as pic_0.jpg, pic_1.jpg, ... in page order.
    A single broken image link is reported and skipped instead of
    aborting the whole thread (the original crashed on the first failure).
    """
    selector = etree.HTML(get_html_code(url))
    # class="BDE_Image" marks images embedded in posts (not avatars or ads).
    img_url_list = selector.xpath('//*[@class="BDE_Image"]/@src')
    for pic_name, img_url in enumerate(img_url_list):
        try:
            urllib.request.urlretrieve(img_url, 'pic_%s.jpg' % pic_name)
        except OSError as err:
            # Best effort: keep downloading the remaining images.
            print('skip %s (%s)' % (img_url, err))


def download_img(url_list, page):
    """Download the images of the first *page* threads in *url_list*.

    Each thread gets its own sub-directory under ./downloads, named from
    its URL. If *page* exceeds the number of threads found, only the
    available threads are processed (the original raised IndexError).
    """
    # exist_ok avoids the racy "check then mkdir" of the original.
    os.makedirs('downloads', exist_ok=True)
    root_path = os.getcwd()
    # Slicing clamps to the number of threads actually scraped.
    for link in url_list[:page]:
        # Drop the 'http://tieba.baidu.com/' prefix (23 chars) and the
        # remaining slashes to form a filesystem-safe directory name.
        img_dir = os.path.join('downloads', link[23:].replace('/', ''))
        os.makedirs(img_dir, exist_ok=True)
        os.chdir(img_dir)
        try:
            get_img(link)
        finally:
            # Always return to the original cwd, even on failure, so the
            # next iteration builds its path relative to the right place.
            os.chdir(root_path)


if __name__ == '__main__':
    print('-----贴吧图片爬取装置2.0-----')
    print('请输入贴吧地址:',)
    targetUrl = input('')
    if not targetUrl:
        print('---没有地址输入正在使用默认地址(baidu壁纸吧)---')
        targetUrl = 'http://tieba.baidu.com/f?kw=%E5%A3%81%E7%BA%B8&ie=utf-8'

    page = ''
    while True:
        print('请输入你要下载的帖子数:',)
        page = input('')
        # Accept only a positive integer (the regex rejects '' and '0').
        if re.findall(r'^[0-9]*[1-9][0-9]*$', page):
            page = int(page)
            break
    print('----------正在下载图片---------')
    ArticleLinks = getArticleLinks(targetUrl)
    download_img(ArticleLinks, page)
    print('-----------下载成功-----------')
    input('Press Enter to exit')