#!/usr/bin/python3
"""umei-spider: download the thumbnail images listed on umei.cc tag pages."""

import os
import time
import uuid
from contextlib import closing

import requests
from bs4 import BeautifulSoup

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 10

# Default directory the downloaded images are written to.
DEFAULT_SAVE_DIR = 'D:/python/imgs'


class SevenOneSixZero:
    """Scraper that collects and downloads images from umei.cc listing pages.

    ``url`` is the listing page currently being processed; the driver code
    reassigns it for every page before calling :meth:`get_img_src_list`.
    """

    def __init__(self):
        self.photo_id = []
        self.url = 'http://www.umei.cc/tags/xiezhen_1.htm'
        self.urls = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }

    def get_ids(self):
        """Request the current listing page.

        NOTE(review): the response is discarded and nothing is returned;
        this looks like an unfinished stub. Behaviour is kept as-is.
        """
        requests.get(url=self.url, headers=self.headers, timeout=REQUEST_TIMEOUT)

    def get_page_content(self, url):
        """Fetch ``url`` and return its body decoded with the detected charset.

        :param url: address of the page to download
        :return: the page text
        """
        res = requests.get(url, headers=self.headers, timeout=REQUEST_TIMEOUT)
        # ``res.encoding`` is guessed from the HTTP headers only; when the
        # header carries no charset it defaults to ISO-8859-1.
        # ``res.apparent_encoding`` is detected from the body itself
        # (``requests.utils.get_encodings_from_content`` is another way to
        # read the charset declared inside the page).  Decoding with the
        # detected encoding directly avoids the encode/decode round trip,
        # which fails when ``res.encoding`` is None or detection is imperfect.
        res.encoding = res.apparent_encoding
        return res.text

    def get_img(self):
        """Return the ``<img>`` tags of the thumbnails on the current page."""
        page = self.get_page_content(self.url)
        soup = BeautifulSoup(page, 'lxml')
        return soup.select('body > div.wrap > div.TypeList > ul > li > a > img')

    def get_img_src_list(self):
        """Return one ``{'src': <image url>}`` dict per thumbnail on the page."""
        return [{'src': img.get('src')} for img in self.get_img()]

    def download_img(self, img_list, save_dir=DEFAULT_SAVE_DIR):
        """Download every image in ``img_list`` into ``save_dir``.

        Each file is stored as ``<uuid1>.jpg``.  Entries without a ``src``
        and responses with a non-success HTTP status are skipped.

        :param img_list: list of ``{'src': url}`` dicts, as produced by
            :meth:`get_img_src_list`
        :param save_dir: target directory; created if it does not exist
        :return: None
        """
        os.makedirs(save_dir, exist_ok=True)
        for i, img_dic in enumerate(img_list, start=1):
            # time.sleep(0.1)
            src = img_dic['src']
            if not src:
                # An <img> tag without a src attribute yields None here.
                continue
            # ``headers`` must be passed by keyword: the second positional
            # parameter of ``requests.get`` is ``params`` (the query string).
            # ``stream=True`` lets iter_content write the body chunk by chunk
            # instead of buffering the whole image in memory first.
            res = requests.get(
                src,
                headers=self.headers,
                stream=True,
                timeout=REQUEST_TIMEOUT,
            )
            with closing(res) as r:
                if not r.ok:
                    # Do not save an error page under a .jpg name.
                    print('下载失败,已跳过第{}张图:{} (HTTP {})'.format(i, src, r.status_code))
                    continue
                name = uuid.uuid1()
                path = os.path.join(save_dir, '{}.jpg'.format(name))
                with open(path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
            print('成功下载第{}张图:{}.jpg'.format(i, name))

    def get_url_list(self, start, end):
        """Build the listing-page URLs for pages ``start`` .. ``end - 1``.

        The URLs are appended to ``self.urls``, so repeated calls accumulate.

        :param start: first page number (inclusive)
        :param end: last page number (exclusive)
        :return: ``self.urls`` after the new URLs have been appended
        """
        self.urls.extend(
            'http://www.umei.cc/tags/xiezhen_{}.htm'.format(i)
            for i in range(start, end)
        )
        return self.urls


def main():
    """Crawl listing pages 22-29 and download every thumbnail found."""
    seven = SevenOneSixZero()

    for url in seven.get_url_list(22, 30):
        seven.url = url
        img_list = seven.get_img_src_list()
        seven.download_img(img_list)


if __name__ == '__main__':
    main()