Baidu Tieba

    #coding:utf-8

    import urllib
    import urllib2

    def loadPage(url, filename):
        '''
            Purpose: send a request to the given url and return the server response
            url: the url to crawl
            filename: name of the file being processed (used for logging only)
        '''
        print "Downloading " + filename
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
        request = urllib2.Request(url, headers = headers)
        return urllib2.urlopen(request).read()
    
    
    def writePage(html, filename):
        '''
            Purpose: write the html content to a local file
            html: the response body returned by the server
            filename: name of the file to write
        '''
        print "Saving " + filename
        with open(filename, "w") as f:
            f.write(html)
        print "_" * 30
    
    def tiebaSpider(url, beginPage, endPage):
        '''
            Purpose: spider scheduler; builds and processes the url for each page
            url: the fixed prefix of the url
            beginPage: first page to fetch
            endPage: last page to fetch
        '''

        for page in range(beginPage, endPage + 1):
            # Tieba shows 50 posts per page, so the pn offset is (page - 1) * 50
            pn = (page - 1) * 50
            filename = "page" + str(page) + ".html"
            fullurl = url + "&pn=" + str(pn)
            html = loadPage(fullurl, filename)
            writePage(html, filename)
        print "Done. Thanks for using this spider."
    
    if __name__ == "__main__":
        kw = raw_input("请输入需要爬取的贴吧名:")
        beginPage = int(raw_input("请输入起始页:"))
        endPage = int(raw_input("请输入结束页:"))
    
        url = "http://tieba.baidu.com/f?"
        key = {"kw" : kw}
        key = urllib.urlencode(key)
        fullurl = url + key
        tiebaSpider(fullurl, beginPage, endPage)
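
The listing above is Python 2 only: urllib2 and raw_input no longer exist in Python 3, where the relevant pieces live in urllib.request and urllib.parse, and input() replaces raw_input(). As a rough guide, here is a minimal Python 3 sketch of the same spider, assuming Tieba's kw/pn query pattern is unchanged:

    # Python 3 sketch of the same spider (assumes the same kw/pn query pattern)
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen

    HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}

    def load_page(url):
        # urlopen returns bytes in Python 3; decode before writing as text
        request = Request(url, headers=HEADERS)
        return urlopen(request).read().decode("utf-8", errors="replace")

    def tieba_spider(base_url, begin_page, end_page):
        for page in range(begin_page, end_page + 1):
            pn = (page - 1) * 50  # 50 posts per page
            full_url = base_url + "&pn=" + str(pn)
            filename = "page" + str(page) + ".html"
            print("Downloading " + filename)
            html = load_page(full_url)
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html)

    if __name__ == "__main__":
        kw = input("Enter the name of the Tieba forum to crawl: ")
        begin_page = int(input("Enter the first page: "))
        end_page = int(input("Enter the last page: "))
        full_url = "http://tieba.baidu.com/f?" + urlencode({"kw": kw})
        tieba_spider(full_url, begin_page, end_page)

For example, entering "python" with pages 1 to 2 requests http://tieba.baidu.com/f?kw=python&pn=0 and then &pn=50, writing page1.html and page2.html to the current directory.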

     

posted @ 2018-01-21 19:56  zy--