# 百度贴吧图片抓取 (Baidu Tieba image scraper)
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import re
import os
import urllib2
import urllib

def _fetch_soup(url):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree."""
    page = urllib2.urlopen(url)
    try:
        html = page.read()
    finally:
        # Always release the HTTP connection, even if the read fails.
        page.close()
    return BeautifulSoup(html, 'html.parser')


def download_img(urls, k):
    """Download every post image of a multi-page Baidu Tieba thread.

    The thread's first page is fetched to locate the "last page" link
    (anchor text '尾页'); the ``pn=<number>`` value in that link's href is
    taken as the total page count.  Each page ``1..total`` is then fetched
    and every ``<img class="BDE_Image">`` on it is saved as
    ``./images/<k>-<page>-<index>.jpg``.

    Args:
        urls: Base URL of the thread, e.g. "http://tieba.baidu.com/p/4807867791".
        k: Prefix for the saved file names (the caller passes the thread id).

    Returns:
        False when no last-page link (or no page number in it) is found, in
        which case nothing is downloaded; None otherwise.
    """
    last_page_link = _fetch_soup(urls).find('a', string='尾页')
    if last_page_link is None:
        return False

    # Read the whole page number, not just the final character of the href,
    # so threads with 10 or more pages are handled correctly.
    match = re.search(r'[?&]pn=(\d+)', last_page_link.get('href') or '')
    if match is None:
        return False
    total_pages = int(match.group(1))

    # urlretrieve does not create missing directories.
    if not os.path.isdir('./images'):
        os.makedirs('./images')

    # +1 so that the last page itself is included.
    for j in range(1, total_pages + 1):
        url = urls + "?pn=" + str(j)
        soup = _fetch_soup(url)
        print(url)
        images = soup.find_all('img', class_="BDE_Image")
        for i, img in enumerate(images, 1):
            src = img.get('src')
            if not src:
                # Nothing to download for an <img> without a source.
                continue
            filename = str(k) + "-" + str(j) + '-' + str(i)
            print(filename)
            urllib.urlretrieve(src, './images/%s.jpg' % filename)
# Example run: scrape the images of one hard-coded Tieba thread.
url = "http://tieba.baidu.com/p/4807867791"
k = 4807867791  # passed as the file-name prefix for the saved images
download_img(url, k)

 

# posted @ 2016-11-30 15:01  brady-wang  阅读(434)  评论(0)  编辑  收藏  举报