【爬虫】python+urllib+BeautifulSoup爬取花瓣网美女图片

爬取花瓣网的图片

#!/usr/bin/env python
# -*- encoding:utf-8 -*-

import urllib2
from bs4 import BeautifulSoup
import re
import requests

# Address of the listing page that requestMain() fetches.
url = 'http://huaban.com/favorite/beauty/'


def requestMain():
    """Fetch the listing page at the module-level ``url``.

    Sends browser-like ``User-Agent`` and ``Accept`` headers and prints the
    HTTP status code of the response as a side effect.

    Returns:
        The open response object produced by ``urllib2.urlopen``. It is not
        read or closed here; that is left to the caller.

    Raises:
        urllib2.URLError: propagated from ``urllib2.urlopen`` when the
            request fails.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    }

    # Hand the headers to the constructor instead of assigning
    # ``request.headers`` directly. The constructor routes every entry
    # through ``add_header``, which stores the key in its normalised form
    # ("User-agent"). With a raw "User-Agent" key the opener does not notice
    # that a user agent is already set and adds its default
    # "Python-urllib/x.y" header as well, which can end up replacing ours.
    request = urllib2.Request(url, headers=headers)

    html_doc = urllib2.urlopen(request)

    print(html_doc.getcode())
    return html_doc


def getPins(save_dir=r'C:\Users\wuzhi_000\Desktop\Python\py_scrapy\image'):
    """Download the images of every pin linked from the listing page.

    The listing page is fetched with ``requestMain()``. Every ``<a>`` whose
    ``href`` matches ``/pins/<digits>/`` is followed once, and each ``<img>``
    found inside a ``div.image-holder`` on that pin page is downloaded with
    ``requests`` and written to ``save_dir``.

    Progress is printed as the running pin number followed by each image
    link that is about to be downloaded.

    Args:
        save_dir: Directory the images are written to. It is created when it
            does not exist yet. Defaults to the path the script has always
            used.

    Returns:
        None.
    """
    import os  # function-scope import: nothing else in the script needs it

    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    listing = requestMain()
    try:
        html_doc = listing.read()
    finally:
        listing.close()

    soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8')

    huaban = 'http://huaban.com'
    seen = set()
    i = 0
    for pin in soup.find_all('a', href=re.compile(r"/pins/\d+/")):
        href = pin['href']
        # A pin that is linked more than once is only fetched the first time.
        if href in seen:
            continue
        seen.add(href)

        resp = urllib2.urlopen(urllib2.Request(huaban + href))
        try:
            pin_soup = BeautifulSoup(resp.read(), 'html.parser',
                                     from_encoding='utf-8')
        finally:
            resp.close()

        i = i + 1
        print(i)

        saved = 0  # images written for this pin so far
        for tag in pin_soup.find_all('div', class_="image-holder"):
            img = tag.find('img')
            src = img.get('src') if img is not None else None
            if not src:
                # Holder without an <img> or without a src: nothing to fetch.
                continue

            # Only protocol-relative links ("//host/path") need a scheme.
            link = 'http:' + src if src.startswith('//') else src
            print(link)

            a = requests.get(link)
            if not a.ok:
                # Do not store an error page under a .jpg name.
                continue

            # The first image of a pin keeps the historical "<pin>.jpg" name;
            # further images get a suffix so they no longer overwrite it.
            saved += 1
            imgname = str(i) if saved == 1 else '%d_%d' % (i, saved)
            with open(os.path.join(save_dir, '%s.jpg' % imgname), 'wb') as pic:
                pic.write(a.content)
               

if __name__ == '__main__':
    # getPins() returns nothing, so run it for its side effects instead of
    # printing its result (which would only ever show "None").
    getPins()


# print (soup.prettify())

# print soup.title
# print soup.title.name
#
# print soup.title.string
#
# print soup.p
#
# print soup.p['class']
#
# print soup.a
#
# print soup.find_all('img')
#
# print ('\r\n')
#
# print soup.find(href="/pins/1147154763/")
#
# print ('\r\n')
#
# for img in soup.find_all('img'):
#     print (img.get('src'))

 

posted @ 2017-05-17 17:26  wuzhiyi  阅读(1378)  评论(0)  编辑  收藏  举报