爬虫下载校花网美女信息-lxml (Crawler: download student profile info from xiaohuar.com using lxml)
#!/usr/bin/env python
# coding=utf-8
'''
author: dangxusheng
desc  : download personal info from xiaohuar.com: name / school / image url / like-count
date  : 2018-08-29
'''
# stdlib imports
import os
import re
import json

# third-party imports
import requests
from lxml import etree

# Global constants
home_url = "http://www.xiaohuar.com/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    "Referer": home_url
}
# directory where downloaded images are stored
SAVE_DIR = './xiaohua'


def _first(seq, default):
    """Return the first element of *seq*, or *default* when it is empty.

    xpath() returns a (possibly empty) list; indexing [0] directly would
    raise IndexError for profiles missing that field.
    """
    return seq[0] if seq else default


def one_page_info(page_index=0):
    """Fetch and parse one listing page of the site.

    :param page_index: zero-based page number (maps to list-1-<n>.html)
    :return: list of dicts with keys 'name', 'school', 'img_url', 'dianzan'
    """
    url = home_url + "list-1-" + str(page_index) + ".html"
    r = requests.get(url, headers=headers)
    # the site serves GBK-encoded pages; ignore stray undecodable bytes
    html = etree.HTML(r.content.decode('gbk', errors='ignore'))
    div_list = html.xpath('//div[@class="item masonry_brick"]')
    info_list = []
    for div in div_list:
        name = _first(div.xpath('.//span[@class="price"]/text()'), '暂无名字')
        school = _first(div.xpath('.//a[@class="img_album_btn"]/text()'), '暂无学校')
        img_url = _first(div.xpath('./div[1]/div[1]/a[1]/img[1]/@src'), '暂无图片')
        # some urls are site-relative, e.g. /d/file/20180907/....jpg — prepend host
        if str(img_url).find('http') == -1:
            img_url = home_url[0:-1] + img_url
        # BUG FIX: original stored the whole result list; take the first
        # text node (the like-count), defaulting to '0' when absent.
        dianz = _first(div.xpath('.//em[1]/text()'), '0')
        info_list.append({'name': name, 'school': school,
                          'img_url': img_url, 'dianzan': dianz})
    return info_list


def donwload_jpg_2_file(info_list):
    """Download every image in *info_list* into SAVE_DIR.

    Creates SAVE_DIR on first use and sanitizes names so they are
    valid filenames on Windows and POSIX.
    """
    os.makedirs(SAVE_DIR, exist_ok=True)  # original crashed if dir was missing
    for info in info_list:
        url = info['img_url']
        # strip characters that are illegal in filenames
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', info['name'])
        r = requests.get(url, headers=headers, stream=True)
        with open(os.path.join(SAVE_DIR, '%s.jpg' % safe_name), 'wb') as file:
            # stream the body in 1 KiB chunks to keep memory bounded
            for chunk in r.iter_content(1024):
                file.write(chunk)
        print('%s 下载成功' % info['name'])


# entry point: crawl the first 50 listing pages
if __name__ == '__main__':
    for i in range(50):
        ls = one_page_info(i)
        donwload_jpg_2_file(ls)