# coding=utf-8
# Tieba test: fetch a Baidu Tieba thread and save it to a text file.
# NOTE: the coding declaration must stay on line 1 or 2 to take effect; this
# file contains non-ASCII string literals.

__author__ = 'Jeffery Gao'

import io
import os
import re
import urllib2

class Tool:
    """Convert an HTML fragment into plain text.

    The patterns are applied in a fixed order by ``repalce``: specific tags
    are first removed or turned into whitespace, then any remaining tag is
    stripped.
    """

    # <img ...> tags and runs of seven spaces are dropped.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Opening and closing anchor tags are dropped (the link text is kept).
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that become a line break.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become a tab.
    replaceTD = re.compile(r'<td>')
    # Opening paragraph tags become a newline plus a four-space indent.
    replacePara = re.compile(r'<p.*?>')
    # Line-break tags become one newline. '<br><br>' is listed first so a
    # double break collapses to a single newline; a lone '<br>' is handled
    # too, otherwise the catch-all below would delete it and merge two lines.
    replaceBR = re.compile(r'<br><br>|<br>|</br>')
    # Any tag still left is removed.
    removeExtraTag = re.compile(r'<.*?>')

    def repalce(self, x):
        """Return ``x`` with HTML markup removed, stripped of outer whitespace.

        Args:
            x: HTML fragment as a string.

        Returns:
            The plain-text rendering of ``x``.
        """
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        x = self.replacePara.sub("\n    ", x)
        x = self.replaceBR.sub("\n", x)
        x = self.removeExtraTag.sub("", x)
        return x.strip()

    # Correctly spelled alias; the original misspelled name is kept so that
    # existing callers continue to work.
    replace = repalce


class BDTB:
    """Download a Baidu Tieba thread page by page and save it to a text file.

    Attributes:
        baseURL: thread URL without the query string.
        seeLz: query-string fragment carrying the ``see_lz`` flag.
        defaultTitle: file-name stem used when no title is available.
        cutOffRule: divider line written before every post.
        postsAndPageNum: ``[reply count, page count]`` as set by
            ``getTotalPage`` (``[0, 0]`` until then).
        fileFP: output file object, set by ``setFileName``.

    NOTE: post content is stored as raw HTML; the ``Tool`` class defined in
    this file can be used to strip the markup.
    """

    DEFAULT_URL = 'https://tieba.baidu.com/p/3138733512'

    # Thread title is read from the title attribute of the <h3> heading.
    _TITLE_RE = re.compile(r'<h3 class=".*?"\s+title="([\s\S]*?)".*?>')
    # Group 1: reply count, group 2: page count.
    _TOTAL_RE = re.compile(
        r'<li class="l_reply_num".*?>[\s\S]*?<span class=.*?>(.*?)</span>'
        r'[\s\S]*?<span class="red">(.*?)</span>')
    # Group 1: post content, group 2: floor number, group 3: reply time.
    _POST_RE = re.compile(
        r'<div id="post_content.*?>([\s\S]*?)</div>[\s\S]*?'
        r'<span class="tail_info">(.*?)</span>[\s\S]*?'
        r'<span class="tail_info">(.*?)</span>')
    # Characters that are not allowed in file names on common file systems.
    _BAD_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')

    def __init__(self, baseURL='', seeLz=1):
        """Store the thread URL and the ``see_lz`` flag.

        Args:
            baseURL: thread URL; an empty value selects ``DEFAULT_URL``.
            seeLz: value sent as the ``see_lz`` query parameter.
        """
        self.baseURL = baseURL if baseURL else self.DEFAULT_URL
        self.seeLz = '?see_lz=' + str(seeLz)
        self.defaultTitle = 'NewPost'
        self.cutOffRule = '*' * 60
        self.postsAndPageNum = [0, 0]
        self.fileFP = None

    def getPageCode(self, pageNum):
        """Fetch one page of the thread.

        Args:
            pageNum: page number, sent as the ``pn`` query parameter.

        Returns:
            The page HTML decoded as UTF-8, or None when the request fails.
        """
        # '&pn=' is appended exactly once (it used to be added twice).
        url = self.baseURL + self.seeLz + '&pn=' + str(pageNum)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                return response.read().decode('utf-8')
            finally:
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, 'reason'):
                # reason may be an exception object rather than a string.
                print('Get Page Code Failure:' + str(e.reason))
            return None

    def getTitle(self, pageCode):
        """Extract the thread title from the page HTML.

        Returns:
            The stripped title, or None when ``pageCode`` is empty or no
            title is found.
        """
        if pageCode:
            found = self._TITLE_RE.search(pageCode)
            if found:
                return found.group(1).strip()
        print('Get Title Failure !')
        return None

    def getTotalPage(self, pageCode):
        """Extract the reply count and page count from the page HTML.

        On success the result is also stored in ``self.postsAndPageNum``.

        Returns:
            ``[reply count, page count]`` as strings, or None when nothing
            could be extracted.
        """
        if pageCode:
            found = self._TOTAL_RE.search(pageCode)
            if found:
                postsAndPageNum = [found.group(1), found.group(2)]
                self.postsAndPageNum = postsAndPageNum
                return postsAndPageNum
        return None

    def getContents(self, pageIndex, pageCode=None):
        """Extract every post of one page.

        Args:
            pageIndex: page number to fetch when ``pageCode`` is not given.
            pageCode: already-downloaded HTML of that page; passing it
                avoids a second request.

        Returns:
            A list of ``[content, floor number, time]`` lists, or None when
            the page could not be fetched or no post matched.
        """
        if pageCode is None:
            pageCode = self.getPageCode(pageIndex)
        if not pageCode:
            print('Re Module Error: getContents')
            return None
        items = self._POST_RE.findall(pageCode)
        if not items:
            print('Re Module Error: getContents')
            return None
        # Collect all posts; returning inside the loop kept only the first.
        return [[item[0], item[1], item[2]] for item in items]

    def _write(self, text):
        """Write ``text`` to the output file, decoding byte strings first."""
        # The file is opened in text mode; on Python 2 plain str literals
        # are bytes and must be decoded before writing.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        self.fileFP.write(text)

    def setFileName(self, title=''):
        """Open the output file and write its header.

        The file is created in the current directory as ``<title>.txt``,
        with characters that are invalid in file names replaced by ``_``;
        ``defaultTitle`` is used when ``title`` is empty.

        Returns:
            The opened file object (also stored in ``self.fileFP``).
        """
        if title:
            stem = self._BAD_FILENAME_CHARS.sub('_', title)
            heading = title
        else:
            stem = self.defaultTitle
            heading = 'This Post is None'
        self.fileFP = io.open(stem + '.txt', 'w', encoding='utf-8')
        self._write(heading)
        self._write('\n')
        # postsAndPageNum is [reply count, page count].
        self._write('Post total {0} pages, and {1} replies\n'.format(
            self.postsAndPageNum[1], self.postsAndPageNum[0]))
        return self.fileFP

    def witreData(self, contents=None):
        """Append posts to the output file opened by ``setFileName``.

        Args:
            contents: iterable of ``[content, floor number, time]`` items;
                None or an empty value writes nothing.
        """
        for item in contents or []:
            self._write(self.cutOffRule + '\n')
            self._write(item[0] + '\n')
            self._write(item[1] + '  ' + item[2] + '\n')

    # Correctly spelled alias; the original misspelled name is kept so that
    # existing callers continue to work.
    writeData = witreData

    def start(self):
        """Download the whole thread and write it to the output file."""
        pageCode = self.getPageCode(1)
        if pageCode is None:
            print('Get Page Code Failure: nothing to save')
            return
        title = self.getTitle(pageCode)
        postsAndPageNum = self.getTotalPage(pageCode)
        totalPage = 1
        if postsAndPageNum is not None:
            try:
                # The page count is extracted as a string.
                totalPage = int(postsAndPageNum[1])
            except ValueError:
                totalPage = 1
            print('this post total {0} pages and {1} reply'.format(
                postsAndPageNum[1], postsAndPageNum[0]))
        self.setFileName(title)
        try:
            print('Now is loading page 1')
            # Reuse the HTML already downloaded for page 1.
            self.witreData(self.getContents(1, pageCode))
            for pageNum in range(2, totalPage + 1):
                print('page {0} is finished! Now is loading page {1}'.format(
                    pageNum - 1, pageNum))
                self.witreData(self.getContents(pageNum))
        finally:
            self.fileFP.close()
        print('All Finished')


def main():
    """Ask for the thread URL and the see_lz flag, then run the scraper."""
    url = raw_input("输入帖子网址:")
    lzFlag = raw_input("是否选择只看楼主(0否1是):")
    # Both answers are passed on exactly as typed.
    BDTB(url, lzFlag).start()


if __name__ == "__main__":
    main()

 

# posted @ 2018-01-19 15:34  v_keys  阅读(161)  评论(0)  编辑  收藏  举报