1. 使用正则表达式匹配
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    # coding:utf-8
     
    import  re
    import urllib
     
    def get_content(url):
        """Fetch a URL and return the response body as a string.

        url: address of the page to download.
        Returns the raw page content; the connection is always closed,
        even if read() raises.
        """
        html = urllib.urlopen(url)
        try:
            content = html.read()
        finally:
            # Python 2 urlopen objects predate the `with` protocol, so
            # close explicitly in a finally block to avoid leaking the
            # connection on error paths.
            html.close()
        return content
     
    def get_images(info):
        """" Download Baidu pictures.
        <img class="BDE_Image" src="http:*****">
        """
        regex = r' class="BDE_Image" src="(.+?\.jpg)" '
        pat = re.compile(regex)
        images_code = re.findall(pat, info)
     
        i = 0
        for image_url in images_code:
            print image_url
            urllib.urlretrieve(image_url, '%s.jpg' % i)
            i = i +1
        print len(images_code)
     
    info = get_content("http://tieba.baidu.com/p/2299704181")
    print get_images(info)

      

  2. 使用第三方库BeautifulSoup匹配
    1
    # 安装 sudo pip install beautifulsoup4

      

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    # coding:utf-8
     
    import urllib
    from bs4 import BeautifulSoup
     
    def get_content(url):
        """Fetch a URL and return the response body as a string.

        url: address of the page to download.
        Returns the raw page content; the connection is closed even if
        read() raises.
        """
        html = urllib.urlopen(url)
        try:
            content = html.read()
        finally:
            # Guarantee the handle is released on error paths too;
            # Python 2 urlopen objects do not support `with`.
            html.close()
        return content
     
     
    def get_images(info):
        """
         使用BeautifulSoup在网页源码中匹配图片地址
        """
        soup = BeautifulSoup(info)
        all_img = soup.find_all('img', class_="BDE_Image" )
     
        i = 1
     
        for img in all_img:
            print img['src']
            urllib.urlretrieve(img['src'], '%s.jpg' % i)
            i = i +1
        print "一共下载了 ", len(all_img), "张图片"
     
     
    info = get_content("http://tieba.baidu.com/p/3368845086")
     
    print get_images(info)