Scrapy spider code for crawling Zhihu by keyword
Below is the spider part of the code. Crawling Zhihu requires being logged in; supplying a cookie is enough. If you only plan to crawl a modest number of pages, do not use a high level of concurrency, or the account will be banned quickly and you will have to wait a dozen or so hours before it works again. Compared with losing those hours, even a single thread can fetch plenty of pages, so aggressive concurrency is not worth it.
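For example, the throttling and the login cookie can be set in settings.py. This is only a minimal sketch: the user agent and cookie values below are placeholders you would replace with the headers copied from your own logged-in browser session. Disabling the cookies middleware makes Scrapy send the raw Cookie header unchanged on every request.

# settings.py -- minimal sketch; header values are placeholders,
# paste the Cookie header from your own logged-in Zhihu session
CONCURRENT_REQUESTS = 1     # keep concurrency low to avoid an account ban
DOWNLOAD_DELAY = 2          # seconds to wait between requests
COOKIES_ENABLED = False     # send the raw Cookie header below as-is
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',  # placeholder UA
    'Cookie': 'z_c0=PASTE_YOUR_LOGIN_COOKIE_HERE',              # placeholder cookie
}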
Zhihu's anti-crawling is account-based, so rotating user agents and IPs does not help. If you need high concurrency, you have to crawl with a pool of several dozen accounts.
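One way to spread requests across several accounts is a small downloader middleware that attaches a random account's cookies to each request. This is a hypothetical sketch, not part of the original project: RandomCookieMiddleware and COOKIE_POOL are illustrative names, and the middleware has to be registered with a priority lower than the built-in CookiesMiddleware (700) so the assignment takes effect before the cookie header is built.

# middlewares.py -- hypothetical sketch of rotating logged-in accounts;
# COOKIE_POOL and RandomCookieMiddleware are made-up names, not from the original code
import random

COOKIE_POOL = [
    {'z_c0': 'login_token_of_account_1'},  # placeholder values
    {'z_c0': 'login_token_of_account_2'},
]

class RandomCookieMiddleware(object):
    """Attach a random account's cookies to every outgoing request."""
    def process_request(self, request, spider):
        request.cookies = random.choice(COOKIE_POOL)

# settings.py -- run it before the built-in CookiesMiddleware (priority 700):
# DOWNLOADER_MIDDLEWARES = {'zhihu.middlewares.RandomCookieMiddleware': 600}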
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from scrapy import log
from urllib import quote
from zhihu.items import ZhihuItem
from scrapy_redis.spiders import RedisSpider
import re
import json
import time


class BaoxianSpider(RedisSpider):  # distributed crawling via scrapy-redis

    name = "baoxian"
    allowed_domains = ["zhihu.com"]
    # redis_key = 'baoxian:start_urls'
    keywords = '软件测试'  # the keyword to search for
    urlencode_keywords = quote(keywords)

    # e.g. https://www.zhihu.com/r/search?q=%E4%BF%9D%E9%99%A9&type=content&offset=0
    start_urls = ['https://www.zhihu.com/r/search?q=' + urlencode_keywords + '&type=content&offset=0']

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        # body looks like: {"paging":{"next":"\/r\/search?q=...&type=content&offset=50"},"htmls":...}
        body = response.body

        # extract question links from the escaped HTML fragments, which look like:
        # data-type=\"Answer\"><div class=\"title\"><a target=\"_blank\" href=\"\/question\/22316395\"
        question_href_reg = r'<div class=\\"title\\"><a target=\\"_blank\\" href=\\"\\/question\\/(.*?)\\"'
        all_question_href = re.findall(question_href_reg, body)
        print 'all_question_href:', all_question_href
        for aqh in all_question_href:
            question_href = 'https://www.zhihu.com/question/' + str(aqh)
            yield Request(url=question_href, callback=self.parse_question, dont_filter=True)
            print question_href
            log.msg("question_href:%s \n list_question_page:%s" % (question_href, response.url), level=log.INFO)

        # extract the next-page link
        reg = r'{"paging":{"next":"(\\/r\\/search\?q=.*?&type=content&offset=.*?)"},"htmls"'
        next_page = re.findall(reg, body)
        print 'next page:', next_page
        if len(next_page):
            # e.g. https://www.zhihu.com/r/search?q=%E4%BF%9D%E9%99%A9&type=content&offset=10
            next_page_url = 'https://www.zhihu.com' + next_page[0].replace('\\', '')
            print 'next_page_url:', next_page_url
            yield Request(url=next_page_url, callback=self.parse, dont_filter=True)
            log.msg("next_page_url:%s" % next_page_url, level=log.INFO)

    def parse_question(self, response):  # question detail page
        print 'response.url:', response.url
        title = response.xpath('//h1[@class="QuestionHeader-title"]/text()').extract_first()
        print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        print 'title:', title
        # the question description is embedded as: editableDetail":"...","visitCount"
        reg = r'editableDetail":"([\s\S]*?)","visitCount"'
        content_match = re.findall(reg, response.body)
        if content_match:
            content = content_match[0]
        else:
            content = ''  # the question may have no description
        print 'content:', content

        question = {}
        question['url'] = response.url
        question['title'] = title
        question['content'] = content
        question['comment'] = []

        # answers come from the v4 JSON API; the URL is built below,
        # e.g. https://www.zhihu.com/api/v4/questions/20214716/answers?...
        answer_json = ('https://www.zhihu.com/api/v4/questions/'
                       + re.findall(r'(\d+)', response.url)[0]
                       + '/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=20&offset=0')
        print 'answer_json:', answer_json
        yield Request(url=answer_json, callback=self.parse_json, meta=question, dont_filter=False)

    def parse_json(self, response):  # answer list from the JSON API
        meta = response.meta
        result = json.loads(response.body)
        print 'dict to json:', json.dumps(result, ensure_ascii=False)
        comment_list = meta['comment']
        for data in result['data']:  # result['data'] is a list of dicts, one per answer
            try:
                comment_dict = {}
                comment_dict['comment_content'] = data['content']
                if data['author']['name']:
                    comment_dict['author'] = data['author']['name']
                else:
                    comment_dict['author'] = ''
                comment_dict['voteup_count'] = data['voteup_count']
                comment_dict['comment_count'] = data['comment_count']
                comment_dict['comment_time'] = time.strftime('%Y-%m-%d', time.localtime(data['created_time']))
                comment_list.append(comment_dict)
            except Exception as e:
                print e
        meta['comment'] = comment_list
        meta['answer_num'] = result['paging']['totals']

        if not result['paging']['is_end']:  # auto-paginate through the answers
            yield Request(url=result['paging']['next'], callback=self.parse_json, meta=meta, dont_filter=False)
        else:
            print 'last:', meta['title'], meta['url'], meta['content'], meta['answer_num'], len(meta['comment'])
            item = ZhihuItem()
            item['title'] = meta['title']
            item['url'] = meta['url']
            item['content'] = meta['content']
            item['answer_num'] = meta['answer_num']
            item['comment'] = meta['comment']
            yield item
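The spider imports ZhihuItem from zhihu.items, but that file is not shown in the post. A minimal definition consistent with the five fields the spider fills would look like this (a sketch, not the author's original file):

# items.py -- minimal sketch matching the fields the spider assigns
import scrapy

class ZhihuItem(scrapy.Item):
    title = scrapy.Field()        # question title
    url = scrapy.Field()          # question URL
    content = scrapy.Field()      # question description
    answer_num = scrapy.Field()   # total number of answers
    comment = scrapy.Field()      # list of answer dicts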
Here are the run results; MongoDB is used for storage.
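The pipeline that writes to MongoDB is not shown in the post either. A minimal sketch with pymongo might look like the following, where the database and collection names are assumptions:

# pipelines.py -- hypothetical MongoDB pipeline; database/collection names are assumed
import pymongo

class MongoPipeline(object):
    def __init__(self):
        client = pymongo.MongoClient('localhost', 27017)
        self.collection = client['zhihu']['question']  # assumed names

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))  # one document per question
        return item

# settings.py: ITEM_PIPELINES = {'zhihu.pipelines.MongoPipeline': 300}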
The contents of the comment field: