# 贴吧测试 (Baidu Tieba scraper test)
# coding=utf-8
"""Download the posts of a Baidu Tieba thread and save them to a text file.

The script fetches every page of a thread (optionally only the posts of the
thread starter), strips the HTML markup from each post and writes the text,
floor label and reply time to ``<thread title>.txt`` in the working
directory.  It runs on both Python 2 and Python 3.
"""
import io
import os
import re

try:  # Python 3
    from urllib.error import URLError
    from urllib.request import Request, urlopen
except ImportError:  # Python 2, which the script was originally written for
    from urllib2 import Request, URLError, urlopen

try:
    _readInput = raw_input  # Python 2
except NameError:
    _readInput = input  # Python 3

__author__ = 'Jeffery Gao'


class Tool(object):
    """Turn the HTML body of a post into plain text."""

    # <img> tags and runs of seven spaces are dropped entirely.
    removeImg = re.compile('<img.*?>| {7}')
    # Hyperlink tags are dropped; the link text itself is kept.
    removeAddr = re.compile('<a.*?>|</a>')
    # Tags that end a visual line become a newline.
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Table cells become a tab.
    replaceTD = re.compile('<td>')
    # Paragraph openers become a newline followed by a space.
    replacePara = re.compile('<p.*?>')
    # Double line breaks become a single newline.
    replaceBR = re.compile('<br><br>|</br>')
    # Whatever tag is still left is removed.
    removeExtraTag = re.compile('<.*?>')

    def repalce(self, x):
        """Return ``x`` with HTML markup removed and outer whitespace stripped.

        The substitutions run in a fixed order: specific tags are converted
        first, and the catch-all tag remover runs last so it only sees what
        the earlier rules left behind.
        """
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        x = self.replacePara.sub("\n ", x)
        x = self.replaceBR.sub("\n", x)
        x = self.removeExtraTag.sub("", x)
        return x.strip()

    # Correctly spelt alias; ``repalce`` stays so existing callers keep working.
    replace = repalce


class BDTB(object):
    """Scraper for a single Baidu Tieba thread."""

    # Thread title, taken from the ``title`` attribute of the <h3> heading.
    _titlePattern = re.compile(r'<h3 class=".*?" title="([\s\S]*?)".*?>')
    # Group 1: number of reply posts, group 2: number of pages.
    _totalPattern = re.compile(
        r'<li class="l_reply_num".*?>[\s\S]*?<span class=.*?>(.*?)</span>'
        r'[\s\S]*?<span class="red">(.*?)</span>')
    # Group 1: post body, group 2: floor label, group 3: reply time.
    _contentPattern = re.compile(
        r'<div id="post_content.*?>([\s\S]*?)</div>[\s\S]*?'
        r'<span class="tail_info">(.*?)</span>[\s\S]*?'
        r'<span class="tail_info">(.*?)</span>')
    # Characters that are not allowed in file names on common file systems.
    _unsafeFileChars = re.compile(r'[\\/:*?"<>|\r\n]+')
    # Shared HTML cleaner for post bodies.
    _tool = Tool()

    def __init__(self, baseURL='', seeLz=1):
        """Remember which thread to fetch.

        baseURL -- address of the thread; a sample thread is used when empty.
        seeLz   -- 1 to fetch only the thread starter's posts, 0 for all.
        """
        if baseURL:
            self.baseURL = baseURL
        else:
            self.baseURL = 'https://tieba.baidu.com/p/3138733512'
        self.seeLz = '?see_lz=' + str(seeLz)
        self.defaultTitle = 'NewPost'
        self.cutOffRule = u'*' * 60
        # [number of reply posts, number of pages]; filled by getTotalPage.
        self.postsAndPageNum = [0, 0]
        # Output file; opened by setFileName.
        self.fileFP = None

    def _buildUrl(self, pageNum):
        """Return the address of page ``pageNum`` of the thread."""
        return self.baseURL + self.seeLz + '&pn=' + str(pageNum)

    def getPageCode(self, pageNum):
        """Download page ``pageNum`` and return its HTML as text.

        Returns None (after printing the reason) when the request fails.
        """
        url = self._buildUrl(pageNum)
        try:
            response = urlopen(Request(url))
            try:
                return response.read().decode('utf-8')
            finally:
                response.close()
        except URLError as e:
            # ``reason`` may be an exception object, so convert it explicitly.
            print('Get Page Code Failure:' + str(getattr(e, 'reason', e)))
            return None

    def getTitle(self, pageCode):
        """Return the thread title found in ``pageCode``, or None."""
        match = self._titlePattern.search(pageCode) if pageCode else None
        if match is None:
            print('Get Title Failure !')
            return None
        return match.group(1).strip()

    def getTotalPage(self, pageCode):
        """Return ``[reply posts, pages]`` (as strings) found in ``pageCode``.

        The result is also stored in ``self.postsAndPageNum``.  Returns None
        and leaves the stored value untouched when nothing is found.
        """
        if not pageCode:
            return None
        match = self._totalPattern.search(pageCode)
        if match is None:
            return None
        # total reply posts and page number
        postsAndPageNum = [match.group(1), match.group(2)]
        self.postsAndPageNum = postsAndPageNum
        return postsAndPageNum

    def _parseContents(self, pageCode):
        """Extract ``[content, floor, time]`` for every post in ``pageCode``.

        The content is cleaned of HTML markup.  An empty list is returned
        when the page is missing or contains no recognisable post.
        """
        items = self._contentPattern.findall(pageCode) if pageCode else []
        if not items:
            print('Re Module Error: getContents')
        # content---floor number----time
        return [[self._tool.repalce(content), floor, replyTime]
                for content, floor, replyTime in items]

    def getContents(self, pageIndex):
        """Download page ``pageIndex`` and return its posts.

        Each entry is ``[content, floor, time]``; see ``_parseContents``.
        """
        return self._parseContents(self.getPageCode(pageIndex))

    def _safeFileName(self, title):
        """Return ``title`` with characters illegal in file names replaced."""
        cleaned = self._unsafeFileChars.sub('_', title).strip()
        return cleaned if cleaned else self.defaultTitle

    def setFileName(self, title=''):
        """Open the output file, write its header and return the file object.

        The file is named after ``title`` (``self.defaultTitle`` when the
        title is empty) and is written as UTF-8.  ``title`` is expected to
        be text, as returned by ``getTitle``.
        """
        posts, pages = self.postsAndPageNum[0], self.postsAndPageNum[1]
        if title:
            fileName = self._safeFileName(title) + '.txt'
            heading = title
        else:
            fileName = self.defaultTitle + '.txt'
            heading = u'This Post is None'
        # An explicit encoding keeps non-ASCII titles and posts writable
        # regardless of the platform default.
        self.fileFP = io.open(fileName, 'w+', encoding='utf-8')
        self.fileFP.write(heading + u'\n')
        self.fileFP.write(
            u'Post total {0} pages, and {1} replies\n'.format(pages, posts))
        return self.fileFP

    def witreData(self, contents=None):
        """Append posts to the file opened by ``setFileName``.

        ``contents`` is a list of ``[content, floor, time]`` entries; each
        post is preceded by a separator line.
        """
        for item in contents or []:
            self.fileFP.write(self.cutOffRule + u'\n')
            self.fileFP.write(item[0] + u'\n')
            self.fileFP.write(item[1] + u' ' + item[2] + u'\n')

    # Correctly spelt alias; ``witreData`` stays so existing callers keep working.
    writeData = witreData

    def start(self):
        """Fetch the whole thread and save it to a text file."""
        pageCode = self.getPageCode(1)
        if not pageCode:
            # Without the first page there is no title, page count or content.
            return
        title = self.getTitle(pageCode)
        postsAndPageNum = self.getTotalPage(pageCode) or self.postsAndPageNum
        try:
            totalPage = int(postsAndPageNum[1])
        except (TypeError, ValueError):
            totalPage = 1
        self.setFileName(title)
        try:
            print('this post total {0} pages and {1} reply'.format(
                postsAndPageNum[1], postsAndPageNum[0]))
            print('Now is loading page 1')
            # Page 1 is already downloaded; parse it rather than fetch again.
            self.witreData(self._parseContents(pageCode))
            for pageNum in range(2, totalPage + 1):
                print('page {0} is finished! Now is loading page {1}'.format(
                    pageNum - 1, pageNum))
                self.witreData(self.getContents(pageNum))
        finally:
            self.fileFP.close()
        print('All Finished')


def main():
    """Ask for the thread address and the see-starter-only flag, then run."""
    baseURL = _readInput("输入帖子网址:").strip()
    # An empty answer falls back to the class default (thread starter only).
    seeLz = _readInput("是否选择只看楼主(0否1是):").strip() or 1
    bdtb = BDTB(baseURL, seeLz)
    bdtb.start()


if __name__ == '__main__':
    main()