Python Crawlers (Part 2)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# author: Momo  time: 2018/6/30

"""
Target site:    http://tieba.baidu.com/p/3522395718
Target content: reply author, reply content, reply time
Knowledge involved:
    Requests  - fetching the page
    XPath     - extracting the content
    map       - driving a multithreaded crawler
Skills to master: page extraction with XPath,
                  multithreaded crawling with map
"""

from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import urllib.request
import json

# Python 2 leftover ("save the copied tieba source as utf-8"); on Python 3
# strings are already Unicode, so this block stays commented out:
# from imp import reload
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')

def towrite(contentdict):
    # Append one reply record to the shared output file f (opened in __main__).
    f.writelines('Reply time: ' + str(contentdict['topic_reply_time']) + '\n')
    f.writelines('Reply content: ' + contentdict['topic_reply_content'] + '\n')
    f.writelines('Replied by: ' + str(contentdict['user_name']) + '\n\n')

def spider(url):
    html_page = urllib.request.urlopen(url)
    html_code = html_page.read().decode('utf-8')
    selector = etree.HTML(html_code)
    # Every post lives in a div with this class (note the trailing space).
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright "]')
    item = {}
    for each in content_field:
        # data-field carries the post metadata as JSON; it must keep its
        # double quotes, so don't strip them before json.loads.
        reply_info = json.loads(each.xpath('@data-field')[0])
        author = reply_info['author']['user_name']
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/'
                             'div[@class="d_post_content j_d_post_content clearfix"]/text()')[0]
        reply_time = reply_info['content']['date']
        print(content)
        print(reply_time)
        print(author)
        item['user_name'] = author
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_time
        towrite(item)

if __name__ == '__main__':
    pool = ThreadPool(4)
    f = open('content.txt', 'a', encoding='utf-8')
    page = []
    for i in range(1, 21):
        newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
        page.append(newpage)

    results = pool.map(spider, page)
    pool.close()
    pool.join()
    f.close()
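The docstring lists Requests among the tools, but the script actually fetches with urllib.request. For reference, here is a minimal sketch of what the fetch step would look like with Requests; the fetch name, the User-Agent string, and the timeout are my assumptions, not part of the original script.

# Hypothetical Requests-based replacement for the urlopen() call in spider().
import requests

def fetch(url):
    resp = requests.get(url,
                        headers={'User-Agent': 'Mozilla/5.0'},  # assumed UA
                        timeout=10)                             # assumed timeout
    resp.raise_for_status()      # fail loudly on HTTP errors
    resp.encoding = 'utf-8'      # the tieba pages declare utf-8
    return resp.text             # decoded HTML, ready for etree.HTML()

Inside spider(), html_code = fetch(url) would then replace the two urllib lines.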
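To see what the two XPath queries and the data-field JSON actually do, here is a self-contained sketch that runs them against a hard-coded snippet shaped like a tieba post. The snippet and its values (demo_user, hello tieba) are invented for illustration; only the class names and the path mirror the page structure the spider targets.

# Invented one-post snippet mimicking the markup the spider expects.
from lxml import etree
import json

SNIPPET = '''
<div class="l_post j_l_post l_post_bright "
     data-field='{"author": {"user_name": "demo_user"},
                  "content": {"date": "2018-06-30 12:00"}}'>
  <div class="d_post_content_main">
    <div><cc><div class="d_post_content j_d_post_content clearfix">hello tieba</div></cc></div>
  </div>
</div>
'''

selector = etree.HTML(SNIPPET)
for each in selector.xpath('//div[@class="l_post j_l_post l_post_bright "]'):
    reply_info = json.loads(each.xpath('@data-field')[0])   # attribute -> dict
    print(reply_info['author']['user_name'])                # demo_user
    print(reply_info['content']['date'])                    # 2018-06-30 12:00
    print(each.xpath('div[@class="d_post_content_main"]/div/cc/'
                     'div[@class="d_post_content j_d_post_content clearfix"]/text()')[0])  # hello tieba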
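One caveat with pool.map here: four threads funnel their records through the same file handle, so lines from different posts can interleave in content.txt. A minimal hardening sketch, assuming a module-level lock (write_lock is my name, not the author's):

# Sketch: serialize writes so each three-line record stays together.
import threading

write_lock = threading.Lock()   # hypothetical; not in the original script

def towrite(contentdict):
    with write_lock:            # only one thread writes a record at a time
        f.writelines('Reply time: ' + str(contentdict['topic_reply_time']) + '\n')
        f.writelines('Reply content: ' + contentdict['topic_reply_content'] + '\n')
        f.writelines('Replied by: ' + str(contentdict['user_name']) + '\n\n')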