# 爬取图片low版 — simple ("low-effort") image crawler
import os
import random
import re
import time

import requests


class GetPhoto(object):
    """Crawl wallpaper thumbnails from a listing page and download the images.

    Workflow: read the pagination links of the listing page, collect each
    picture-item page, walk that item's own pagination, and save every image
    under ``<class_>/<series name>/``.  Failed requests are appended to
    ``error.log`` and the crawl pauses briefly before continuing.
    """

    def __init__(self, url1, class_):
        # File with one User-Agent string per line; rotated per request to
        # look less like a bot.
        self.agent = "user_agents.txt"
        self.head_url_list = {}
        self.pic_item_url = []   # list of [(item_url, title), ...] per page
        self.pic_url_list = []
        self.page_url_list = []  # listing page + its pagination links
        # BUG FIX: this assignment was commented out in the original, yet
        # get_page_urllist() reads self.url — it only worked via the module
        # global `url`.  Restore it so the instance is self-contained.
        self.url = url1
        self.class_ = class_     # top-level output directory name
        with open(self.agent, 'r', encoding='utf-8') as f:
            agents = f.readlines()
        self.agents = list(map(lambda x: {"user-agent": x.strip()}, agents))

    def chartset(self, rsp):
        """Fix mojibake: when the server claims ISO-8859-1 (requests'
        default for missing charset), sniff the real encoding from the
        page content instead.

        :param rsp: a ``requests.Response``; its ``encoding`` may be mutated.
        :return: None
        """
        _chart = requests.utils.get_encoding_from_headers(rsp.headers)
        if _chart == 'ISO-8859-1':
            rsp.encoding = requests.utils.get_encodings_from_content(rsp.text)[0]

    def get_page_urllist(self):
        """Collect the listing page plus all pagination URLs into
        ``self.page_url_list``."""
        print('开始获取首页下方链接列表url!')
        rsp = requests.get(self.url, headers=random.choice(self.agents))
        self.chartset(rsp)
        content = rsp.text
        # Pagination block at the bottom of the listing page.
        pattern = r'<div class="page both">.*?</div>'
        com = re.compile(pattern, re.S)
        content_info = com.findall(content)
        pattern2 = r"href='(.*?)'"
        com2 = re.compile(pattern2, re.S)
        self.page_url_list.append(self.url)
        # BUG FIX: the original lambda referenced the module-global `url`;
        # use self.url so the instance works with any constructor argument.
        self.page_url_list.extend(list(map(
            lambda x: self.url.rsplit("/", 1)[0] + "/" + x,
            com2.findall(content_info[0]))))
        print("首页下方链接列表url获取完成!")
        print(self.page_url_list)

    def get_sub_url(self):
        """Visit (at most) the first 10 listing pages and collect each
        picture item's ``(url, title)`` pairs into ``self.pic_item_url``."""
        self.get_page_urllist()
        print("开始获取整页图片项目链接:")
        for i in self.page_url_list[:10]:
            try:
                rsp = requests.get(i, headers=random.choice(self.agents))
            except Exception as e:
                with open('error.log', 'a', encoding='utf-8') as f:
                    new_time = time.strftime('%Y-%m-%d %H:%M:%S')
                    # BUG FIX: original wrote '{} {} {}+' (stray '+');
                    # use '\n' like every other log write.
                    f.write('{} {} {}\n'.format(new_time, i, e.args[0]))
                print('访问失败,暂停10s!')
                time.sleep(10)
                return
            self.chartset(rsp)
            content = rsp.text
            pattern = r'<div class="imgList2">.*?</div>'
            com = re.compile(pattern, re.S)
            content_info = com.findall(content)
            pattern2 = r'href="(?P<url>http://(?:[\w]+\.){1,2}[a-zA-Z]{2,6}.*?)".*?target="_blank" title="(.*?)">'
            com2 = re.compile(pattern2, re.S)
            sub_url_list = com2.findall(content_info[0])
            self.pic_item_url.append(sub_url_list)
            print(sub_url_list)
            time.sleep(random.randint(1, 3))  # 延时1-3秒, polite crawl delay
        print("获取所有页面图片项目完成!")

    def get_pic_url(self, url2):
        """Return the per-image page URLs of one picture item by parsing its
        pagination block ('#' anchors are skipped by the regex).

        :param url2: URL of a picture-item page.
        :return: list of absolute page URLs, one per image in the series.
        """
        rsp = requests.get(url2, headers=random.choice(self.agents))
        self.chartset(rsp)
        content = rsp.text
        pattern = r'<div class="page".*?</div>'
        com = re.compile(pattern, re.S)
        content_info = com.findall(content)
        pattern2 = r"<a href='([^#]+?)'>"
        com2 = re.compile(pattern2, re.S)
        pic_url_list = list(map(
            lambda x: url2.rsplit("/", 1)[0] + "/" + x,
            com2.findall(content_info[0])))
        return pic_url_list

    def get_pic(self, pic_page_url, dirname, count):
        """Download the single image embedded in *pic_page_url* and save it
        as ``<class_>/<dirname>/<dirname><count>.jpg``.

        Failures are logged to error.log and skipped after a 30 s pause.
        """
        try:
            rsp = requests.get(pic_page_url, headers=random.choice(self.agents))
        except Exception as e:
            with open('error.log', 'a', encoding='utf-8') as f:
                new_time = time.strftime('%Y-%m-%d %H:%M:%S')
                f.write('{} {} {}\n'.format(new_time, pic_page_url, e.args[0]))
            print('访问失败,暂停30s!')
            time.sleep(30)
            print('下载继续')
            return
        self.chartset(rsp)
        content = rsp.text
        # Image URL inside the centered <p> of the detail page.
        pattern = '<p align="center">.*?src="(.*?)"'
        com = re.compile(pattern, re.S)
        pic_url = com.findall(content)
        image_name = dirname + str(count) + '.jpg'
        if pic_url:
            try:
                pic_rsp = requests.get(pic_url[0], headers=random.choice(self.agents))
            except Exception as e:
                with open('error.log', 'a', encoding='utf-8') as f:
                    new_time = time.strftime('%Y-%m-%d %H:%M:%S')
                    f.write('{} {} {}\n'.format(new_time, pic_url[0], e.args[0]))
                print('访问失败,暂停30s!')
                time.sleep(30)
                print('下载继续')
                return
            img_dirpath = os.path.join(self.class_, dirname)
            if not os.path.exists(img_dirpath):
                os.makedirs(img_dirpath)
            img_path = os.path.join(img_dirpath, image_name)
            with open(img_path, 'wb') as f:
                f.write(pic_rsp.content)
            print('{}下载完成!'.format(image_name))
        else:
            print('{}下载失败!'.format(image_name))
        time.sleep(random.randint(1, 3))  # polite crawl delay

    def download(self):
        """Entry point: crawl (at most) the first 3 collected pages and
        download every image series found on them."""
        self.get_sub_url()
        for i in self.pic_item_url[:3]:
            for z in i:
                pic_item, dir_name = z
                lst = self.get_pic_url(pic_item)
                print("开始下载{}系列!".format(dir_name))
                for index, j in enumerate(lst, 1):
                    self.get_pic(j, dir_name, index)
                print("{}系列下载完成!".format(dir_name))


if __name__ == '__main__':
    url = 'http://www.5442.com/youxi/'
    pic = GetPhoto(url, '游戏壁纸')
    pic.download()