无法解决的问题

学习python时做了一个爬虫爬取百度贴吧的内容,但是用BeautifulSoup得到的结果使用find_all函数却无法获取。

 

getCommentInfo.py:

 1 from urllib import request
 2 import requests
 3 from bs4 import BeautifulSoup
 4 from mylog import MyLog as mylog
 5 import random
 6 
 7 class Item(object):
 8     title = None    #帖子标题
 9     firstAuthor = None  #创建者
10     firstTime = None    #创建时间
11     reNum = None    #总回复数
12     content = None  #最后回复内容
13     lastAuthor = None   #最后回复者
14     lastTime = None     #最后回复时间
15 
16 class GetTiebaInfo(object):
17     def __init__(self,url):
18         self.url = url
19         self.log = mylog()
20         self.pageSum = 5
21         self.urls = self.getUrls(self.pageSum)
22         self.items = self.spider(self.urls)
23         self.pipelines(self.items)
24 
25     def getUrls(self,pageSum):
26         urls = []
27         pns = [str(i*50) for i in range(pageSum)]
28         ul = self.url.split('=')
29         for pn in pns:
30             ul[-1] = pn
31             url = '='.join(ul)
32             urls.append(url)
33         self.log.info(u"获取URLS成功 ")
34         return urls
35 
36     def spider(self,urls):
37         items = []
38         for url in urls:
39             htmlContent = self.getResponseContent(url)
40             with open("content.html","w",encoding='utf-8') as f:
41                 f.write(htmlContent)
42             soup = BeautifulSoup(htmlContent,'lxml')
43             with open('soup.txt','w',encoding='utf-8') as fp:
44                 fp.write(soup.text)
45 
46             tagsli = soup.find_all('li',attrs={'class':'j_thread_list clearfix'})
47             for tag in tagsli:
48                 item = Item()
49                 item.title = tag.find('a',attrs={'class':'j_th_tit '}).get_text().strip()
50                 item.firstAuthor = tag.find('span',attrs={'class':'frs-author-name-wrap'}).a.get_text().strip()
51                 item.firstTime = tag.find('span',attrs={'title':u'创建时间'.encode('utf-8')}).get_text().strip()
52                 item.reNum = tag.find('span',attrs={'title':u'回复'.encode('utf-8')}).get_text().strip()
53                 item.content = tag.find('div',attrs={'class':'threadlist_abs threadlist_abs_onlyline '}).get_text().strip()
54                 item.lastAuthor = tag.find('span',attrs={'class':'tb_icon_author_rely j_replyer'}).a.get_text().strip()
55                 item.lastTime = tag.find('span',attrs={'title':u'最后回复时间'.encode('utf-8')}).get_text().strip()
56                 items.append(item)
57                 self.log.info(u'获取标题为<<%s>>的项成功 ...' %item.title)
58         return items
59 
60     def pipelines(self,items):
61         fileName = u'百度贴吧_权力的游戏.txt'.encode('utf-8')
62         with open(fileName,'w') as fp:
63             for item in items:
64                 fp.write('title:%s \t author:%s \t firstTime:%s \n content:%s \n return:%s \n lastAuthor:%s \t lastTime:%s \n\n\n\n'
65                          %(item.title.encode('utf-8'),item.firstAuthor.encode('utf-8'),item.firstTime.encode('utf-8'),item.content.encode('utf-8'),item.lastTime.encode('utf-8')))
66                 self.log.info(u'标题为<<%s>>的项输入到"%s"成功' %(item.title,fileName.decode('utf-8')))
67 
68     def getResponseContent(self,url):
69         header = {
70             'Accept': 'text/heml,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
71             'Accept-Encoding': 'gzip,deflate,sdch', 'Accept-Language': 'zh_CN,zh;q=0.8',
72             'Connect': 'keep-alive',
73             'User-Agent': 'Mozilla/5.0(Windows NT 6.3;WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/43.0.235'
74         }
75         timeout = random.choice(range(80, 180))
76         try:
77             response = requests.get(url,headers = header,timeout = timeout)
78             response.encoding = 'utf-8'
79         except:
80             self.log.error(u'Python 返回 URL:%s 数据失败' %url)
81         else:
82             self.log.info(u'Python 返回URL:%s 数据成功' %url)
83             return response.content.decode('utf-8')
84 if __name__=='__main__':
85     url = u'http://tieba.baidu.com/f?kw=权力的游戏&ie=utf-8&pn=50'
86     GTI = GetTiebaInfo(url)
View Code

 

mylog.py

 1 import logging
 2 import getpass
 3 import sys
 4 
 5 #定义MyLog类
 6 class MyLog(object):
 7     def __init__(self):
 8         self.user = getpass.getuser()
 9         self.logger = logging.getLogger(self.user)
10         self.logger.setLevel(logging.DEBUG)
11 
12         #日志文件名
13         self.logFile = sys.argv[0][0:-3] + '.log'
14         self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')
15 
16         #文件显示到屏幕并输出到日志文件
17         self.logHand = logging.FileHandler(self.logFile,encoding='utf-8')
18         self.logHand.setFormatter(self.formatter)
19         self.logHand.setLevel(logging.DEBUG)
20 
21         self.logHandSt = logging.StreamHandler()
22         self.logHand.setFormatter(self.formatter)
23         self.logHandSt.setLevel(logging.DEBUG)
24 
25         self.logger.addHandler(self.logHand)
26         self.logger.addHandler(self.logHandSt)
27 
28     def debug(self,msg):
29         self.logger.debug(msg)
30 
31     def info(self,msg):
32         self.logger.info(msg)
33 
34     def warn(self,msg):
35         self.logger.warning(msg)
36 
37     def error(self,msg):
38         self.logger.error(msg)
39 
40     def critical(self,msg):
41         self.logger.critical(msg)
42 
43 # if __name__=='__main__':
44 # #     mylog = MyLog()
45 # #     mylog.debug(u"I'm debug 测试中文")
46 # #     mylog.info("I'm info")
47 # #     mylog.warn("I'm warn")
48 # #     mylog.error(u"I'm error 测试中文")
49 # #     mylog.critical("I'm critical")
View Code

 

错误:

  在getCommentInfo.py中40行左右的htmlContent可得到原html的正确内容,但经BeautifulSoup后,返回的soup内容变化,导致无法爬取结果。可从两个调式文件content.html和soup.txt得知。

posted @ 2019-04-19 23:41  吉光一片羽  阅读(146)  评论(0编辑  收藏  举报