下载百度贴吧图片

使用正则表达式

# -*- coding: utf-8 -*-
import urllib
import re


def get_content(url):
    """Fetch ``url`` and return the response body.

    Works with ``urllib.urlopen`` on Python 2 and ``urllib.request.urlopen``
    on Python 3.

    :param url: address of the page to download.
    :return: the body exactly as returned by the response's ``read()``
        (it is not decoded here).
    """
    # Imported locally so the function does not depend on which ``urllib``
    # layout the running interpreter provides.
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # ``closing`` releases the connection even when read() raises.
    with closing(urlopen(url)) as response:
        return response.read()


def get_images(info):
    """Download every ``BDE_Image`` JPEG referenced in ``info``.

    Files are saved as ``images/0.jpg``, ``images/1.jpg``, ... in the order
    the links appear in the page. The ``images`` directory is created when
    it does not exist yet.

    :param info: HTML source of the page, as text or as the undecoded bytes
        read from the response.
    """
    import os
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # On Python 3 a response body is ``bytes`` and cannot be searched with a
    # text pattern. This test is never true on Python 2, where bytes is str.
    # errors="replace" keeps undecodable bytes from aborting the scan.
    if isinstance(info, bytes) and not isinstance(info, str):
        info = info.decode("utf-8", "replace")

    # Sample of the markup being matched (wrapped here for readability):
    #   <img class="BDE_Image" src="http://imgsrc.baidu.com/forum/w%3D580/
    #   sign=269396684d4a20a4311e3ccfa0539847/0aa95edf8db1cb132cd1f269df54564e92584b15.jpg"
    #   pic_ext="jpeg" width="510" height="765">
    # [^"] keeps each match inside a single src attribute; a bare ``.+?``
    # would run past the closing quote of a non-.jpg image and swallow the
    # markup up to the next ``.jpg"``.
    image_urls = re.findall(r'class="BDE_Image" src="([^"]+?\.jpg)"', info)
    if not image_urls:
        return

    # urlretrieve does not create missing directories.
    if not os.path.isdir("images"):
        os.makedirs("images")

    for index, image_url in enumerate(image_urls):
        urlretrieve(image_url, "images/%s.jpg" % index)


# Script entry: these statements run as soon as the file is executed
# (there is no ``__main__`` guard).
url = "http://tieba.baidu.com/p/2772656630"  # page to scrape
info = get_content(url)  # page contents returned by get_content
get_images(info)  # download the images referenced in those contents

使用BeautifulSoup

# -*- coding: utf-8 -*-
import urllib
from bs4 import BeautifulSoup


def get_content(url):
    """Fetch ``url`` and return the response body.

    Works with ``urllib.urlopen`` on Python 2 and ``urllib.request.urlopen``
    on Python 3.

    :param url: address of the page to download.
    :return: the body exactly as returned by the response's ``read()``
        (it is not decoded here).
    """
    # Imported locally so the function does not depend on which ``urllib``
    # layout the running interpreter provides.
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # ``closing`` releases the connection even when read() raises.
    with closing(urlopen(url)) as response:
        return response.read()


def get_images(info):
    """Download every element with class ``BDE_Image`` found in ``info``.

    Files are saved as ``images/bs4-001.jpg``, ``images/bs4-002.jpg``, ...
    numbered from 1 in document order, and each URL is printed before it is
    downloaded. The ``images`` directory is created when it does not exist.

    :param info: HTML source of the page.
    """
    import os
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(info, "html.parser")

    # Elements without a src attribute have nothing to download; passing
    # None on to urlretrieve would fail.
    sources = [tag.get('src') for tag in soup.find_all(class_="BDE_Image")]
    sources = [src for src in sources if src]
    if not sources:
        return

    # urlretrieve does not create missing directories.
    if not os.path.isdir("images"):
        os.makedirs("images")

    for number, image_add in enumerate(sources, 1):
        # A single pre-formatted string prints the same on Python 2 and 3.
        print("%s -- %s" % (number, image_add))
        # %03d zero-pads to three digits: 001 ... 099, 100, ...
        urlretrieve(image_add, "images/bs4-%03d.jpg" % number)


# Script entry: these statements run as soon as the file is executed
# (there is no ``__main__`` guard).
url = "http://tieba.baidu.com/p/2772656630"  # page to scrape
info = get_content(url)  # page contents returned by get_content
get_images(info)  # download the images referenced in those contents
posted @ 2016-10-29 22:05  你我之间  阅读(250)  评论(0)  编辑  收藏  举报