正弦

第一个Python小爬虫

这个爬虫是参考http://python.jobbole.com/81353/这篇文章写的

这篇文章可能年代过于久远,所以有些代码会报错,然后我自己稍微修改了一下,增加了一个getContentAll的方法

  1 # -*- coding:utf-8 -*-
  2 
  3 __author__ = 'HK'
  4 
  5 import urllib
  6 import urllib2
  7 import re
  8 
  9 class Tool:
 10     #去除img标签,7位长空格
 11     removeImg = re.compile('<img.*?>| {7}|')
 12     #删除超链接标签
 13     removeAddr = re.compile('<a.*?>|</a>')
 14     #把换行的标签换为\n
 15     replaceLine = re.compile('<tr>|<div>|</div>|</p>')
 16     #将表格制表<td>替换为\t
 17     replaceTD= re.compile('<td>')
 18     #把段落开头换为\n加空两格
 19     replacePara = re.compile('<p.*?>')
 20     #将换行符或双换行符替换为\n
 21     replaceBR = re.compile('<br><br>|<br>')
 22     #将其余标签剔除
 23     removeExtraTag = re.compile('<.*?>')
 24     def replace(self,x):
 25         x = re.sub(self.removeImg,"",x)
 26         x = re.sub(self.removeAddr,"",x)
 27         x = re.sub(self.replaceLine,"\n",x)
 28         x = re.sub(self.replaceTD,"\t",x)
 29         x = re.sub(self.replacePara,"\n    ",x)
 30         x = re.sub(self.replaceBR,"\n",x)
 31         x = re.sub(self.removeExtraTag,"",x)
 32         #strip()将前后多余内容删除
 33         return x.strip()
 34 
 35 class BDTB:
 36 
 37     #initalizing,get the base url,and set parments is only see up
 38     def __init__(self, baseURL, seeup):
 39         self.baseURL = baseURL
 40         self.seeup = str(seeup)
 41         #self.user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
 42         #self.referer = 'http://tieba.baidu.com/p/4899608185?see_lz=1&pn=1'
 43         #self.Host = 'tieba.baidu.com'
 44 
 45     #iput the index, get the pageindex current post
 46     def getPage(self, pageNum):
 47         try:
 48             #header = {'User-Agent': self.user_agent, 'Host': self.Host, 'Referer': self.referer}
 49             url = self.baseURL + '?' + 'pn=' + str(pageNum)
 50             print url
 51             request = urllib2.Request(url)
 52             response = urllib2.urlopen(request)
 53             content = response.read()
 54             return content
 55         except urllib2.URLError, e:
 56             if hasattr(e, "reason"):
 57                 print u'链接百度贴吧失败,错误原因', e.reason
 58                 return None
 59 
 60     def getTitel(self):
 61         page = self.getPage(1)
 62         pattern = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
 63         result = re.search(pattern, page)
 64         if result:
 65             print result.group(1).strip()
 66         else:
 67             print 'there is no content catch the re'
 68             return None
 69 
 70     def getContent(self, pageIndex):
 71         pattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)
 72         items = re.findall(pattern, self.getPage(pageIndex))
 73         tool = Tool()
 74         txt = ""
 75         for item in items:
 76             txt += '\t' + tool.replace(str(item)) + '\r\n'
 77         return txt
 78 
 79     def getPageNum(self):
 80         pattern = re.compile('<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>',re.S)
 81         result = re.search(pattern, bdtb.getPage(1))
 82         if result:
 83             return result.group(1).strip()
 84         else:
 85             return None
 86 
 87     def getContentAll(self):
 88         pageMax = self.getPageNum()
 89         txtlog = open('txtContent.txt', 'wb+')
 90         txtlog.seek(0)
 91         for index in range(1, int(pageMax)+1):
 92             txtlog.write(str(index) + self.getContent(index))
 93             txtlog.flush()
 94         txtlog.close()
 95         print 'Over'
 96 
 97 baseURL = 'http://tieba.baidu.com/p/4899608185'
 98 bdtb = BDTB(baseURL, 1)
 99 bdtb.getTitel()
100 print bdtb.getPageNum()
101 bdtb.getContentAll()

直接运行就能看到结果

posted on 2016-12-20 16:28  HKplus  阅读(299)  评论(0编辑  收藏  举报

导航