GET请求/百度贴吧 有bug
# -*- coding:utf-8 -*-
"""Download the .jpg images posted in the threads of a Baidu Tieba forum.

The script asks for a forum name and a page range, walks every list page in
that range, opens each thread linked from it, and saves every matching image
into ``./images/``.
"""
import os
import re
import sys
import urllib
from collections import OrderedDict
from contextlib import closing

try:  # Python 2 module layout (what this script was originally written for).
    import urllib2
    _Request = urllib2.Request
    _urlopen = urllib2.urlopen
    _urlencode = urllib.urlencode
except ImportError:  # Python 3 moved the same callables into submodules.
    import urllib.parse
    import urllib.request
    _Request = urllib.request.Request
    _urlopen = urllib.request.urlopen
    _urlencode = urllib.parse.urlencode

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3

# Thread links on a list page look like /p/1111111111.
_THREAD_PATH_RE = re.compile(r"/p/\d+")
# Quotes, whitespace and angle brackets are excluded so that a match can never
# run from one URL across the surrounding markup into a later ".jpg".
_IMAGE_URL_RE = re.compile(r"https://imgsa\.baidu[^\s\"'<>]+?\.jpg")


def _as_text(data):
    """Return *data* as a native ``str`` so the ``str`` patterns can scan it.

    Only Python 3 ``bytes`` is decoded; on Python 2 ``bytes is str`` and the
    value is returned untouched, exactly as the original script used it.
    """
    if isinstance(data, bytes) and not isinstance(data, str):
        return data.decode("utf-8", "ignore")
    return data


class Cuzz(object):
    """Crawler for one forum and one inclusive range of list pages.

    Attributes:
        url: List-page URL prefix; the thread offset is appended to it.
        header: Dict of HTTP headers sent with every request.
        start_page: First list page to fetch (1-based).
        end_page: Last list page to fetch (inclusive).
    """

    # Directory the downloaded images are written to.
    image_dir = "./images/"

    def __init__(self, url, header, start_page, end_page):
        self.url = url
        self.header = header
        self.start_page = start_page
        self.end_page = end_page

    def _fetch(self, url):
        """Return the raw body of *url*, always closing the response."""
        request = _Request(url, headers=self.header)
        with closing(_urlopen(request)) as response:
            return response.read()

    @staticmethod
    def _thread_urls(htmltext):
        """Return the absolute thread URLs linked from a list page.

        Duplicates are dropped (first occurrence wins) so a thread that is
        linked several times on the page is downloaded only once.
        """
        paths = _THREAD_PATH_RE.findall(_as_text(htmltext))
        return ["http://tieba.baidu.com" + path
                for path in OrderedDict.fromkeys(paths)]

    @staticmethod
    def _image_urls(htmltext):
        """Return the distinct image URLs found in a thread page, in order."""
        links = _IMAGE_URL_RE.findall(_as_text(htmltext))
        return list(OrderedDict.fromkeys(links))

    @staticmethod
    def _image_filename(url):
        """Return the last path segment of *url*, e.g. ``abc.jpg``.

        The original ``url[-10:-1]`` slice cut off the final character, which
        produced a ``.jp`` extension and made unrelated images collide.
        """
        return url.rsplit("/", 1)[-1]

    def deal_url(self):
        """Fetch every list page in the range and process its threads.

        Page ``i`` is requested with offset ``50 * (i - 1)`` appended to
        ``self.url`` (presumably 50 threads per list page -- confirm against
        the site). Nothing is fetched when ``start_page > end_page``.
        """
        for page in range(self.start_page, self.end_page + 1):
            offset = 50 * (page - 1)
            self.load_images(self._fetch(self.url + str(offset)))

    def load_images(self, htmltext):
        """Open each thread linked from the list page *htmltext*.

        Args:
            htmltext: Body of a list page, as text or raw bytes.
        """
        for thread_url in self._thread_urls(htmltext):
            self.save_images(self._fetch(thread_url))

    def save_images(self, htmltext1):
        """Download every image linked from the thread page *htmltext1*.

        Args:
            htmltext1: Body of a thread page, as text or raw bytes.

        Files are written in binary mode into ``image_dir``, which is created
        on demand. The image bytes are no longer echoed to stdout.
        """
        image_urls = self._image_urls(htmltext1)
        if not image_urls:
            return
        if not os.path.isdir(self.image_dir):
            os.makedirs(self.image_dir)
        for image_url in image_urls:
            data = self._fetch(image_url)
            path = os.path.join(self.image_dir,
                                self._image_filename(image_url))
            # "wb": image data is binary; text mode can corrupt it.
            with open(path, "wb") as image_file:
                image_file.write(data)


def main():
    """Prompt for the forum name and page range, then run the crawler."""
    header = {"User-Agent":"Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1"}
    title = _read_line("请输入您要下载的贴吧:")
    # urlencode turns the dict into "kw=..." with the (possibly non-ASCII)
    # forum name percent-encoded for use in a URL.
    keyword = _urlencode({"kw": title})

    url = "https://tieba.baidu.com/f?" + keyword + "&ie=utf-8&pn="

    start_page = int(_read_line("请输入起始页面:"))
    end_page = int(_read_line("请输入截止页面:"))

    cuzz = Cuzz(url, header, start_page, end_page)
    cuzz.deal_url()


# The original defined main() but never called it, so running the script did
# nothing. NOTE(review): if another part of the file already calls main(),
# drop this guard.
if __name__ == "__main__":
    main()