使用正则表达式
# -*- coding: utf-8 -*-
import urllib
import re
def get_content(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address of the page to download.

    Returns:
        The unparsed body exactly as read from the connection
        (``str`` on Python 2, ``bytes`` on Python 3).

    Raises:
        IOError: If the resource cannot be opened or read. On Python 3 this
            includes ``urllib.error.HTTPError`` for HTTP error statuses.
    """
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # closing() guarantees the handle is released even when read() raises;
    # the Python 2 response object is not a context manager on its own.
    with closing(urlopen(url)) as response:
        return response.read()
def get_images(info):
    """Download every ``.jpg`` referenced by a ``BDE_Image`` tag in *info*.

    Files are written to ``images/0.jpg``, ``images/1.jpg``, ... relative to
    the current working directory, numbered in the order the tags appear in
    the markup. The ``images`` directory is created when it is missing.

    Args:
        info: HTML source of the page. Text is used as-is; raw bytes (what
            a Python 3 download yields) are decoded before matching.
    """
    import os
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # Example of the markup being matched (wrapped here for readability):
    #   <img class="BDE_Image" src="http://imgsrc.baidu.com/forum/w%3D580/
    #   sign=269396684d4a20a4311e3ccfa0539847/0aa95edf8db1cb132cd1f269df54564e92584b15.jpg"
    #   pic_ext="jpeg" width="510" height="765">
    pattern = re.compile(r'class="BDE_Image" src="(.+?\.jpg)"')

    # On Python 3 a text pattern cannot scan bytes. The test below is false
    # for Python 2 ``str`` (where bytes is str), so that path is unchanged.
    if isinstance(info, bytes) and not isinstance(info, str):
        # Only the ASCII URLs matter, so undecodable bytes are replaced.
        info = info.decode("utf-8", "replace")

    image_urls = pattern.findall(info)
    if image_urls and not os.path.isdir("images"):
        # urlretrieve does not create missing parent directories.
        os.makedirs("images")

    for index, image_url in enumerate(image_urls):
        urlretrieve(image_url, "images/%s.jpg" % index)
# Script driver for the regex variant. These statements run at import time
# (there is no __main__ guard): download the page at `url`, then save every
# image that get_images() finds in it.
url = "http://tieba.baidu.com/p/2772656630"
info = get_content(url)  # raw page body as returned by get_content()
get_images(info)
使用BeautifulSoup
# -*- coding: utf-8 -*-
import urllib
from bs4 import BeautifulSoup
def get_content(url):
    """Download the page at *url* and return its body unparsed.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as read from the connection
        (``str`` on Python 2, ``bytes`` on Python 3).

    Raises:
        IOError: If the resource cannot be opened or read. On Python 3 this
            includes ``urllib.error.HTTPError`` for HTTP error statuses.
    """
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # The connection is closed on every exit path, including a failed read;
    # closing() is needed because the Python 2 response object has no
    # context-manager support of its own.
    with closing(urlopen(url)) as response:
        return response.read()
def get_images(info):
    """Download the ``src`` of every element with class ``BDE_Image``.

    Files are written to ``images/bs4-001.jpg``, ``images/bs4-002.jpg``, ...
    relative to the current working directory, numbered from 1 in document
    order and zero-padded to three digits. Each download is announced on
    stdout as ``<number> -- <url>``. The ``images`` directory is created
    when it is missing.

    Args:
        info: HTML source of the page, as text or raw bytes.
    """
    import os
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # Naming the parser keeps results independent of which optional parsers
    # happen to be installed and avoids bs4's "no parser specified" warning.
    soup = BeautifulSoup(info, "html.parser")

    # A matching element without a src attribute has nothing to download.
    sources = [tag.get('src') for tag in soup.find_all(class_="BDE_Image")]
    sources = [src for src in sources if src]

    if sources and not os.path.isdir("images"):
        # urlretrieve does not create missing parent directories.
        os.makedirs("images")

    for index, image_add in enumerate(sources, 1):
        # A single pre-formatted argument prints identically whether print
        # is a statement (Python 2) or a function (Python 3).
        print("%s -- %s" % (index, image_add))
        # %03d pads to the same 001 / 010 / 100 names the script has always
        # produced, without switching the counter between int and str.
        urlretrieve(image_add, "images/bs4-%03d.jpg" % index)
# Script driver for the BeautifulSoup variant. These statements run at
# import time (there is no __main__ guard): download the page at `url`,
# then save every image that get_images() finds in it.
url = "http://tieba.baidu.com/p/2772656630"
info = get_content(url)  # raw page body as returned by get_content()
get_images(info)