第一个Python小爬虫
这个爬虫是参考http://python.jobbole.com/81353/这篇文章写的
这篇文章可能年代过于久远,所以有些代码会报错,然后我自己稍微修改了一下,增加了一个getContentAll的方法
1 # -*- coding:utf-8 -*- 2 3 __author__ = 'HK' 4 5 import urllib 6 import urllib2 7 import re 8 9 class Tool: 10 #去除img标签,7位长空格 11 removeImg = re.compile('<img.*?>| {7}|') 12 #删除超链接标签 13 removeAddr = re.compile('<a.*?>|</a>') 14 #把换行的标签换为\n 15 replaceLine = re.compile('<tr>|<div>|</div>|</p>') 16 #将表格制表<td>替换为\t 17 replaceTD= re.compile('<td>') 18 #把段落开头换为\n加空两格 19 replacePara = re.compile('<p.*?>') 20 #将换行符或双换行符替换为\n 21 replaceBR = re.compile('<br><br>|<br>') 22 #将其余标签剔除 23 removeExtraTag = re.compile('<.*?>') 24 def replace(self,x): 25 x = re.sub(self.removeImg,"",x) 26 x = re.sub(self.removeAddr,"",x) 27 x = re.sub(self.replaceLine,"\n",x) 28 x = re.sub(self.replaceTD,"\t",x) 29 x = re.sub(self.replacePara,"\n ",x) 30 x = re.sub(self.replaceBR,"\n",x) 31 x = re.sub(self.removeExtraTag,"",x) 32 #strip()将前后多余内容删除 33 return x.strip() 34 35 class BDTB: 36 37 #initalizing,get the base url,and set parments is only see up 38 def __init__(self, baseURL, seeup): 39 self.baseURL = baseURL 40 self.seeup = str(seeup) 41 #self.user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 42 #self.referer = 'http://tieba.baidu.com/p/4899608185?see_lz=1&pn=1' 43 #self.Host = 'tieba.baidu.com' 44 45 #iput the index, get the pageindex current post 46 def getPage(self, pageNum): 47 try: 48 #header = {'User-Agent': self.user_agent, 'Host': self.Host, 'Referer': self.referer} 49 url = self.baseURL + '?' 
+ 'pn=' + str(pageNum) 50 print url 51 request = urllib2.Request(url) 52 response = urllib2.urlopen(request) 53 content = response.read() 54 return content 55 except urllib2.URLError, e: 56 if hasattr(e, "reason"): 57 print u'链接百度贴吧失败,错误原因', e.reason 58 return None 59 60 def getTitel(self): 61 page = self.getPage(1) 62 pattern = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>', re.S) 63 result = re.search(pattern, page) 64 if result: 65 print result.group(1).strip() 66 else: 67 print 'there is no content catch the re' 68 return None 69 70 def getContent(self, pageIndex): 71 pattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S) 72 items = re.findall(pattern, self.getPage(pageIndex)) 73 tool = Tool() 74 txt = "" 75 for item in items: 76 txt += '\t' + tool.replace(str(item)) + '\r\n' 77 return txt 78 79 def getPageNum(self): 80 pattern = re.compile('<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>',re.S) 81 result = re.search(pattern, bdtb.getPage(1)) 82 if result: 83 return result.group(1).strip() 84 else: 85 return None 86 87 def getContentAll(self): 88 pageMax = self.getPageNum() 89 txtlog = open('txtContent.txt', 'wb+') 90 txtlog.seek(0) 91 for index in range(1, int(pageMax)+1): 92 txtlog.write(str(index) + self.getContent(index)) 93 txtlog.flush() 94 txtlog.close() 95 print 'Over' 96 97 baseURL = 'http://tieba.baidu.com/p/4899608185' 98 bdtb = BDTB(baseURL, 1) 99 bdtb.getTitel() 100 print bdtb.getPageNum() 101 bdtb.getContentAll()
直接运行就能看到结果