# 糗事百科 Python 爬虫 (Qiushibaike crawler in Python)
# -*- coding: utf-8 -*-
#
# Interactive console reader for the "hot" pages of qiushibaike.com.
# Every time the user presses Enter one story is printed; entering Q quits.
# The code is written to run unchanged on Python 2 and Python 3.
import re
import time
import urllib

# NOTE: urllib, thread and time are not used by the code below; they are
# kept only because the original file imported them.
try:  # Python 2
    import thread
except ImportError:  # Python 3 renamed the module
    import _thread as thread

try:  # Python 2
    import urllib2
    from urllib2 import URLError
except ImportError:  # Python 3 split urllib2 into urllib.request / urllib.error
    import urllib.request as urllib2
    from urllib.error import URLError

try:  # Python 2
    _read_line = raw_input
except NameError:  # Python 3
    _read_line = input


class QSBK(object):
    """Fetch, parse and interactively print stories from qiushibaike.com.

    Attributes:
        pageIndex: number of the next page to download (starts at 1).
        user_agent: value sent in the User-Agent request header.
        header: dict of HTTP headers sent with every request.
        store: queue of parsed pages; each page is a list of
            [author, content] pairs.
        enable: True while the reader loop should keep running.
    """

    # Captures (author, content) for each story block. Compiled once, as a
    # raw string so that ``\s`` is a regex escape and not a string escape.
    _STORY_PATTERN = re.compile(
        r'<div class="author.*?<a.*?<img.*?</a>.*?<a.*?<h2>(.*?)</h2>'
        r'.*?class="content.*?<span>\s*(.*?)\s*</span>',
        re.S)

    def __init__(self):
        self.pageIndex = 1
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.header = {'User-Agent': self.user_agent}
        self.store = []
        self.enable = False

    def getPage(self, pageIndex):
        """Download one "hot" page and return its HTML decoded as UTF-8.

        Returns None (after printing a message) when the request fails with
        a URLError.
        """
        url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
        request = urllib2.Request(url, headers=self.header)
        try:
            response = urllib2.urlopen(request)
            try:
                return response.read().decode('utf-8')
            finally:
                # Always release the connection, even if read/decode fails.
                response.close()
        except URLError as e:
            # ``reason`` may be a string or another exception instance, so
            # it has to be converted before concatenation.
            print('链接网络失败' + str(getattr(e, 'reason', e)))
            return None

    def _parseStories(self, page):
        """Return the [author, content] pairs found in the HTML text *page*."""
        return [[author, content]
                for author, content in self._STORY_PATTERN.findall(page)]

    def getPageItem(self, pageIndex):
        """Fetch page *pageIndex* and return its stories.

        Returns a list of [author, content] pairs (possibly empty), or None
        when the page could not be downloaded.
        """
        page = self.getPage(pageIndex)
        if page is None:
            print("页面获得失败")
            return None
        return self._parseStories(page)

    def loadPage(self):
        """Prefetch the next page while fewer than two pages are queued.

        Does nothing unless the reader is enabled. A page that failed to
        download or that contains no stories is not queued and does not
        advance ``pageIndex``.
        """
        if not self.enable or len(self.store) >= 2:
            return
        pageStories = self.getPageItem(self.pageIndex)
        if pageStories:
            self.store.append(pageStories)
            self.pageIndex += 1

    def getOneStory(self, pageStories):
        """Print the stories of one page, one per line of user input.

        Entering ``Q`` (or closing stdin) disables the reader and returns
        immediately; any other input prints the next story.
        """
        for story in pageStories:
            try:
                command = _read_line()
            except EOFError:
                # stdin was closed: there is no way to continue, so quit.
                command = 'Q'
            if command == 'Q':
                self.enable = False
                return
            # Top up the queue so the next page is ready when this one ends.
            self.loadPage()
            print(u'%s %s' % (story[0], story[1]))

    def start(self):
        """Run the interactive loop until the user quits or no page loads."""
        print(u"正在读取糗事百科的数据,按Q退出")
        self.enable = True
        while self.enable:
            if not self.store:
                self.loadPage()
                if not self.store:
                    # Nothing could be loaded; stop instead of spinning
                    # forever on an empty queue.
                    print(u"没有可显示的内容,程序退出")
                    self.enable = False
                    break
            self.getOneStory(self.store.pop(0))


spider = QSBK()

if __name__ == '__main__':
    # Only start the interactive loop when run as a script, not on import.
    spider.start()