爬虫下载校花网美女信息-lxml

复制代码
# coding=utf-8
# !/usr/bin/env python
'''
    author: dangxusheng
    desc  :  下载校花网上的个人信息:名字-学校-图片地址-点赞数
    date  : 2018-08-29
'''

# 导入模块
import requests
from lxml import etree
import json

# Module-level settings shared by every HTTP request in this script.
home_url = "http://www.xiaohuar.com/"
headers = {
    # Browser-like User-Agent string sent with each request.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.100 Safari/537.36"
    ),
    # The site's home page is sent as the Referer.
    "Referer": home_url,
}

# Parse a single listing page.
def _first_or_default(values, default):
    """Return the first item of *values*, or *default* when it is empty."""
    return values[0] if values else default


def one_page_info(page_index=0, timeout=10):
    """Fetch one listing page and extract the data shown on each card.

    Args:
        page_index: Number substituted into the ``list-1-<n>.html`` URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one dict per matched card, holding the keys ``name``,
        ``school``, ``img_url`` and ``dianzan`` (like count). Every value is
        a string; a field whose node is missing from the card gets its
        placeholder ('暂无名字', '暂无学校', '暂无图片' or '0').

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    from urllib.parse import urljoin

    url = home_url + "list-1-" + str(page_index) + ".html"
    # Without an explicit timeout a stalled server would block forever.
    r = requests.get(url, headers=headers, timeout=timeout)
    # The page is decoded as GBK; 'replace' keeps one stray byte from
    # aborting the whole page.
    tree = etree.HTML(r.content.decode('gbk', 'replace'))

    info_list = []
    for div in tree.xpath('//div[@class="item masonry_brick"]'):
        # xpath() returns a (possibly empty) list, so the fallback has to be
        # chosen before indexing rather than by comparing with None.
        name = _first_or_default(
            div.xpath('.//span[@class="price"]/text()'), '暂无名字')
        school = _first_or_default(
            div.xpath('.//a[@class="img_album_btn"]/text()'), '暂无学校')
        dianz = _first_or_default(div.xpath('.//em[1]/text()'), '0')

        img_src = div.xpath('./div[1]/div[1]/a[1]/img[1]/@src')
        if img_src:
            # Some src values are site-relative, e.g.
            # /d/file/20180907/075025972927c8e7541b09e272afe5cc.jpg;
            # urljoin completes those and leaves absolute URLs untouched.
            img_url = urljoin(home_url, img_src[0])
        else:
            img_url = '暂无图片'

        info_list.append({
            'name': name,
            'school': school,
            'img_url': img_url,
            'dianzan': dianz,
        })
    return info_list

# Walk the list and save each image URL to a file.
def donwload_jpg_2_file(info_list):
    """Download the image of every entry into ``./xiaohua/<name>.jpg``.

    Args:
        info_list: Iterable of dicts providing at least ``name`` and
            ``img_url``.

    Entries whose ``img_url`` is not an http(s) URL are skipped, and a
    failed download is reported and does not stop the remaining ones.
    The output directory is created on demand.
    """
    import os
    import re

    out_dir = './xiaohua'
    for info in info_list:
        name = info['name']
        url = info['img_url']
        if not str(url).startswith(('http://', 'https://')):
            print('%s 下载失败: 无效的图片地址 %s' % (name, url))
            continue

        # The name comes from a web page: replace characters that are path
        # separators or otherwise not allowed in file names.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', str(name)).strip() or 'unnamed'
        path = os.path.join(out_dir, '%s.jpg' % safe_name)
        try:
            # The with-block releases the streamed connection in every case.
            with requests.get(url, headers=headers, stream=True,
                              timeout=10) as r:
                # Do not save an HTTP error page as a .jpg file.
                r.raise_for_status()
                os.makedirs(out_dir, exist_ok=True)
                with open(path, 'wb') as file:
                    # Write the body in 1 KiB chunks.
                    for chunk in r.iter_content(1024):
                        if chunk:
                            file.write(chunk)
        except (requests.RequestException, OSError) as err:
            print('%s 下载失败: %s' % (name, err))
            continue
        print('%s 下载成功' % name)


# Script entry point: scrape listing pages 0..49 and download each page's
# images before moving on to the next page.
if __name__ == '__main__':
    for page_no in range(0, 50):
        page_items = one_page_info(page_no)
        donwload_jpg_2_file(page_items)
复制代码

 

posted @   dangxusheng  阅读(285)  评论(0)  编辑  收藏  举报
编辑推荐:
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
阅读排行:
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· .NET10 - 预览版1新功能体验(一)
点击右上角即可分享
微信分享提示