爬虫下载校花网美女信息-lxml

# coding=utf-8
# !/usr/bin/env python
'''
    author: dangxusheng
    desc  :  下载校花网上的个人信息:名字-学校-图片地址-点赞数
    date  : 2018-08-29
'''

# Imports: stdlib first, then third-party (requests, lxml).
import json
import os
import re

import requests
from lxml import etree

#准备全局变量
home_url = "http://www.xiaohuar.com/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    "Referer": home_url
}

# 定义单页解析方法
def one_page_info(page_index=0):
    url = home_url + "list-1-" + str(page_index) + ".html"
    r = requests.get(url, headers=headers)
    html = r.content.decode('gbk')
    # print(html)
    # exit(1)
    html = etree.HTML(html)
    div_list = html.xpath('//div[@class="item masonry_brick"]')
    info_list = []
    for div in div_list:
        name = div.xpath('.//span[@class="price"]/text()')[0]
        name = name if name != None else '暂无名字'

        school = div.xpath('.//a[@class="img_album_btn"]/text()')[0]
        school = school if school != None else '暂无学校'

        img_url = div.xpath('./div[1]/div[1]/a[1]/img[1]/@src')[0]
        img_url = img_url if img_url != None else '暂无图片'
        # 有些url需要补全:/d/file/20180907/075025972927c8e7541b09e272afe5cc.jpg
        if str(img_url).find('http') == -1:
            img_url = home_url[0:-1] + img_url
        else:
            pass

        dianz = div.xpath('.//em[1]/text()')
        dianz = dianz if dianz != None else '0'
        info_list.append({'name': name, 'school': school, 'img_url': img_url, 'dianzan': dianz})
    return info_list
    # print(info_list)

# 遍历列表并按照URL下载保存到文件
def donwload_jpg_2_file(info_list):
    for info in info_list:
        url = info['img_url']
        r = requests.get(url, headers=headers, stream=True)
        with open('./xiaohua/%s.jpg' % info['name'], 'wb') as file:
            # 分字节下载
            for i in r.iter_content(1024):
                file.write(i)
        print('%s 下载成功' % info['name'])


# 入口函数
if __name__ == '__main__':
    for i in range(50):
        ls = one_page_info(i)
        donwload_jpg_2_file(ls)

 

posted @ 2018-11-09 21:33  dangxusheng  阅读(277)  评论(0编辑  收藏  举报