#!/usr/bin/env python
# -*- coding: utf-8 -*-
# time: 09-01-2017 15:39
# version: 1.0
# __author__ = 'lijin'
import requests
from lxml import etree
import urllib.parse  # "import urllib" alone does not reliably expose urllib.parse

class Spider_tieba:
    def __init__(self):
        self.tieba_name = input("Enter the tieba (forum) name to crawl: ")
        self.start_page = int(input("Enter the start page: "))
        self.end_page = int(input("Enter the end page: "))
        self.base_url = "https://tieba.baidu.com/f?"

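    # Note (an assumption on my part, not in the original): Baidu sometimes
    # serves a reduced page to the default python-requests User-Agent. If the
    # XPath queries below come back empty, passing a browser-like header to
    # each requests.get call is a common workaround:
    #
    #   headers = {"User-Agent": "Mozilla/5.0"}
    #   requests.get(url, headers=headers)
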
    def load_page(self, page):
        '''
        :param page: page number of the forum to crawl
        :return: the links of all posts on this page, collected into a list
        '''
        # Tieba shows 50 posts per page; "pn" is the zero-based offset of
        # the first post on the requested page.
        pn = (page - 1) * 50
        url_dict = {"kw": self.tieba_name, "pn": pn}
        url = self.base_url + urllib.parse.urlencode(url_dict)
        response = requests.get(url).content.decode("utf-8", "ignore")
        selector = etree.HTML(response)
        links = selector.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
        return links
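
    # Illustrative example (hypothetical forum name): load_page(2) for the
    # "python" tieba requests
    #   https://tieba.baidu.com/f?kw=python&pn=50
    # and returns the relative paths of the posts listed on that page.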

    def print_page(self, link):
        '''
        :param link: relative link of a post inside the forum
        :return: the decoded HTML response of that post
        '''
        # Strip "/f?" from base_url to recover the site root, then append
        # the post's relative path to build the full URL.
        url = self.base_url.split("/f?")[0] + link
        response = requests.get(url).content.decode("utf-8", "ignore")
        return response
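
    # The href values returned by load_page are site-relative paths such as
    # "/p/1234567890" (illustrative value), which is why they are joined onto
    # the scheme and host recovered above.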

    def save_data(self, response):
        '''
        :param response: the HTML fetched from a post
        :return: extracts the text, prints it, and saves it to a file
        '''
        selector = etree.HTML(response)
        # The trailing space inside the class value is deliberate: XPath
        # compares @class literally, and Tieba's markup includes that space.
        contents = selector.xpath("//div[@class='d_post_content j_d_post_content ']")

        for content in contents:
            print("==========")
            # string() concatenates every descendant text node of the div.
            text = content.xpath("string()")
            print(text)
            self.savefile(text)

    def savefile(self, item):
        # Open in append mode so posts from all pages accumulate in one file.
        with open("tieba.txt", "a", encoding="utf-8") as f:
            f.write("\n====================\n")
            f.write(item)

    def print_tieba(self, links):
        for link in links:
            response = self.print_page(link)
            self.save_data(response)
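
    # Optional courtesy (my addition, not in the original): a short pause
    # between posts reduces the chance of being rate limited, e.g.
    #
    #   import time
    #   time.sleep(0.5)  # after each save_data call in the loop above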

    def spider_main(self):
        for page in range(self.start_page, self.end_page + 1):
            print("Crawling page {}".format(page))
            self.savefile("Page {}".format(page))
            links = self.load_page(page)  # links of every post on this page
            self.print_tieba(links)


if __name__ == "__main__":
    spider = Spider_tieba()
    spider.spider_main()
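
# Example session (hypothetical filename and inputs):
#   $ python tieba_spider.py
#   Enter the tieba (forum) name to crawl: python
#   Enter the start page: 1
#   Enter the end page: 1
# The text of every post on the requested pages is printed and appended
# to tieba.txt.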