欢迎来到Louis的博客

人生三种境界:昨夜西风凋碧树,独上高楼,望尽天涯路。 衣带渐宽终不悔,为伊消得人憔悴。 众里寻他千百度,蓦然回首,那人却在灯火阑珊处。
扩大
缩小

python 爬取5566图库图片

python 爬取5566图库图片
  1 import requests
  2 import random
  3 import re
  4 import time
  5 import os
  6 from bs4 import BeautifulSoup
  7 
  8 
  9 class GetGirlsPhoto(object):
 10     def __init__(self, head_url, repository_name):
 11         self.url = head_url
 12         self.list_url = []
 13         self.list_pic_url = dict()
 14         self.header_file = 'user_agents.txt'
 15         self.path = repository_name
 16 
 17     #编码问题解决
 18     def chartset(self, rsp):
 19         _chart = requests.utils.get_encoding_from_headers(rsp.headers)
 20         if _chart == 'ISO-8859-1':
 21             rsp.encoding = requests.utils.get_encodings_from_content(rsp.text)
 22 
 23     #随机User-Agent
 24     def get_header(self):
 25         with open(self.header_file, 'r') as f:
 26             headers = f.readlines()
 27             header = random.choice(headers).strip()
 28             header = {'User-Agent': header}
 29             return header
 30 
 31     #获取首页下方页码列表的链接,存入list_url
 32     def get_url_list(self):
 33         rsp = requests.get(self.url, headers=self.get_header())
 34         self.chartset(rsp)
 35         tg_bf = BeautifulSoup(rsp.text, 'lxml')
 36         tag = tg_bf.find_all('a', target='_self')
 37         res_url = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
 38         link = re.findall(res_url, str(tag), re.I | re.S | re.M)
 39         for i in link[1:-3]:
 40             url = self.url+i
 41             self.list_url.append(url)
 42         print('获取\“%s\”子链接成功' % self.url)
 43 
 44     #根据list_url,获取每页的图片入口链接,存入list_pic_url(所有的图片入口链接)
 45     def get_pic_link(self):
 46         self.get_url_list()
 47         for url in self.list_url:
 48             rsp = requests.get(url, headers=self.get_header())
 49             self.chartset(rsp)
 50             tag_bf = BeautifulSoup(rsp.text, 'lxml')
 51             a_tag = tag_bf.find_all('a', class_='picLink')
 52             for i in a_tag:
 53                 self.list_pic_url[i.get('title')] = i.get('href')
 54             time.sleep(1)
 55             print('获取\“%s\”子链接成功!' % url)
 56 
 57     #根据list_pic_url获取图片详细页的连接,然后分析出图片地址,最后进行下载
 58     def get_pic(self):
 59         self.get_pic_link()
 60         for title, url in self.list_pic_url.items():
 61             print('开始下载%s系列' % title)
 62             rsp = requests.get(url, headers=self.get_header()).text
 63             tag_bf = BeautifulSoup(rsp, 'lxml')
 64             tag = tag_bf.find('div', class_='pages')
 65             res_url = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
 66             link = re.findall(res_url, str(tag), re.I | re.S | re.M)
 67             dir_path = self.path+'/'+title
 68             is_exist = os.path.exists(dir_path)
 69             if not is_exist:
 70                 os.makedirs(dir_path)
 71             for index, i in enumerate(link[1:-1]):
 72                 real_url = url.rsplit('/', 1)[0]+'/'+i
 73                 if i == "#":
 74                     rsp = requests.get(url+i, headers=self.get_header())
 75                 else:
 76                     rsp = requests.get(real_url, headers=self.get_header())
 77                 self.chartset(rsp)
 78                 a_bf = BeautifulSoup(rsp.text, 'lxml')
 79                 img = a_bf.find('div', class_='articleBody')
 80                 res_url = r"(?<=src=\").+?(?=\")|(?<=src=\').+?(?=\')"
 81                 img_url = re.findall(res_url, str(img), re.I | re.S | re.M)
 82                 pic_rsp = requests.get(img_url[0], headers=self.get_header())
 83                 img_name = title+str(index+1)+'.jpg'
 84                 img_path = dir_path+'/'+img_name
 85                 with open(img_path, 'wb') as f:
 86                     f.write(pic_rsp.content)
 87                     f.flush()
 88                 f.close()
 89                 print('%s下载完成!' % img_name)
 90                 time.sleep(3)
 91             print("*" * 30)
 92 
 93 
 94 if __name__ == '__main__':
 95     urls = ['http://www.55156.com/a/Mygirl',
 96             'http://www.55156.com/a/Beautyleg']
 97     for i in urls:
 98         url = i
 99         path_name = i.rsplit('/', 1)[1]
100         print(i, path_name)
101         pd = GetGirlsPhoto(head_url=url, repository_name=path_name)
102         pd.get_pic()
103         time.sleep(120)

 

posted on 2018-04-04 10:54  Louiszj  阅读(866)  评论(1)  编辑  收藏  举报

导航