爬虫_斗图啦_表情包下载
为下一个多线程练练手
import os
import re
import time
from urllib import request

import requests
from lxml import etree

# Characters removed from an image's alt text before it is used as a file
# name: path separators and other characters that common file systems
# reject, the punctuation the original script stripped, and line breaks.
_INVALID_NAME_CHARS = re.compile(r'[\\/:*?"<>|？,，。\r\n\t]')

# Directory (relative to the working directory) that receives the images.
_IMAGE_DIR = 'images'


def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns the integer ``0`` when the request fails (connection error,
    timeout, or a non-2xx status), so callers can test ``html == 0`` and
    retry.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'}
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return 0

    response.encoding = 'utf-8'
    return response.text


def _build_file_name(name, url):
    """Return a file name built from the alt text *name* and the extension of *url*."""
    stem, tail = os.path.splitext(url)
    cleaned = _INVALID_NAME_CHARS.sub('', name or '')
    if not cleaned:
        # No usable alt text: fall back to the last path segment of the URL.
        cleaned = os.path.basename(stem)
    return cleaned + tail


def parse_html(html):
    """Download every non-GIF image referenced in a listing page.

    Selects ``<img>`` elements whose class is not ``"gif"`` inside the
    ``page-content text-center`` container, reads each image URL from the
    ``data-original`` attribute, and saves it under ``images/`` using the
    cleaned ``alt`` text plus the URL's extension as the file name.
    Images without a URL are skipped; a failed download is reported and
    does not stop the remaining downloads.
    """
    if not html:
        return
    html_element = etree.HTML(html)
    if html_element is None:
        return

    imgs = html_element.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
    if not imgs:
        return

    # Create the target directory on demand instead of requiring it up front.
    os.makedirs(_IMAGE_DIR, exist_ok=True)

    for img in imgs:
        url = img.get('data-original')
        if not url:
            continue
        file_name = _build_file_name(img.get('alt'), url)
        try:
            request.urlretrieve(url, os.path.join(_IMAGE_DIR, file_name))
        except (OSError, ValueError) as err:
            # URLError/HTTPError derive from OSError; ValueError covers URLs
            # urllib cannot interpret. Keep going with the next image.
            print('skip %s: %s' % (url, err))


def main(first_page=1, last_page=49, max_retries=5, retry_delay=2):
    """Crawl listing pages ``first_page``..``last_page`` (inclusive).

    Each page is fetched up to *max_retries* times, sleeping *retry_delay*
    seconds between attempts; a page that still fails is reported and
    skipped so one unreachable page cannot stall the crawl forever.
    """
    for page in range(first_page, last_page + 1):
        url = 'http://www.doutula.com/photo/list/?page=%d' % page
        html = get_html(url)
        attempts = 1
        while html == 0 and attempts < max_retries:
            time.sleep(retry_delay)
            html = get_html(url)
            attempts += 1
        if html == 0:
            print('giving up on %s after %d attempts' % (url, attempts))
            continue
        parse_html(html)


if __name__ == '__main__':
    main()
不多说了，《沙海》开始了。