GET请求/百度贴吧 有bug

 1 #  -*- coding:utf-8 -*-
import os
import re
import sys
import urllib
import urllib2
 5 
 6 class Cuzz():
 7     """这是一个类"""
 8     def __init__(self, url, header, start_page, end_page):
 9         self.url = url 
10         self.header = header
11         self.start_page = start_page
12         self.end_page = end_page
13 
14     def deal_url(self):
15         """处理url"""
16         for i in range(self.start_page, self.end_page+1):
17             num = 50*(i-1)
18             url = self.url+str(num)
19             request = urllib2.Request(url, headers=self.header)
20             response = urllib2.urlopen(request)
21             htmltext = response.read()
22             self.load_images(htmltext)
23 
24 
25     def load_images(self, htmltext):
26         """下载图片"""
27         # 找出这一页的所有帖子类似这样的/p/1111111111
28         pattern = re.findall(r"/p/\d+", htmltext)        
29         for temp in pattern:
30             url = "http://tieba.baidu.com" + temp
31             request = urllib2.Request(url, headers=self.header)
32             response = urllib2.urlopen(request)
33             htmltext1 = response.read()
34             self.save_images(htmltext1)
35 
36 
37     def save_images(self, htmltext1):
38         """保存到本地"""
39         image_links = re.findall(r"https://imgsa\.baidu.+?\.jpg",htmltext1)    
40         for url in image_links:
41             request = urllib2.Request(url, headers=self.header)
42             response = urllib2.urlopen(request)
43             htmltext1 = response.read()
44             print htmltext1
45             with open("./images/"+str(url[-10:-1]), "w") as f:
46                 f.write(htmltext1)
47 
def main():
    """Prompt for a forum name and a page range, then run the crawler.

    Reads three values from standard input: the forum (tieba) name, the
    first page number and the last page number. Raises ValueError if a page
    number is not an integer.
    """
    header = {"User-Agent":"Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1"}
    title = raw_input("请输入您要下载的贴吧:")

    # urlencode percent-escapes the forum name (which may contain Chinese
    # characters) into a "kw=..." query fragment.
    query = urllib.urlencode({"kw": title})
    url = "https://tieba.baidu.com/f?" + query + "&ie=utf-8&pn="

    start_page = int(raw_input("请输入起始页面:"))
    end_page = int(raw_input("请输入截止页面:"))

    cuzz = Cuzz(url, header, start_page, end_page)
    cuzz.deal_url()


# Run the crawler only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()

 

posted @ 2017-10-09 22:03  cuzz_z  阅读(217)  评论(0)  编辑  收藏  举报