
Spider class

# -*- coding:utf-8 -*-
import re
import time
import types
import tool
import mysql
import requests
from bs4 import BeautifulSoup

class Spider:

    # Initialize page count and helper objects
    def __init__(self):
        self.total_num = 30
        self.tool = tool.Tool()
        self.mysql = mysql.Mysql()

    # Get the current time as a formatted timestamp
    def getCurrentTime(self):
        return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime(time.time()))

    # Fetch a page's HTML by its page number
    def getPageByNum(self, num):
        url = self.url + str(num)
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        try:
            response = requests.get(url, headers=headers)
        except Exception, e:
            # Report whatever detail the exception carries, then give up on this page
            if hasattr(e, "code"):
                print self.getCurrentTime(), "Failed to fetch page, error code", e.code
            elif hasattr(e, "reason"):
                print self.getCurrentTime(), "Failed to fetch page, reason", e.reason
            else:
                print self.getCurrentTime(), "Failed to fetch page:", e
            return None
        else:
            return response.text

    # Extract one qiushi's content and publish time
    def getQiushiInfo(self, qiushi):
        if not type(qiushi) is types.StringType:
            qiushi = str(qiushi)
        # The publish time sits in an HTML comment right after the text body
        pattern = re.compile(u'<div class="content">(.*?)<!--(.*?)-->.*?</div>', re.S)
        match = re.search(pattern, qiushi)
        if match:
            content = self.tool.replace(match.group(1))
            pub_time = match.group(2)
            return [content, pub_time]
        else:
            return None

    # Fetch one page and store every qiushi on it
    def getScandal(self, num):
        # Fetch the HTML; bail out if the request failed
        page = self.getPageByNum(num)
        if page is None:
            return
        soup = BeautifulSoup(page, "html.parser")
        # Collect all qiushi blocks
        qiushis = soup.select("div.content")
        # Walk through them one by one
        for qiushi in qiushis:
            # Extract content and time; skip blocks the regex cannot parse
            info = self.getQiushiInfo(qiushi)
            if info is None:
                continue
            good_ans_dict = {
                "content": info[0],
                "time": info[1],
            }
            self.mysql.insertData("qiushia", good_ans_dict)

    # Entry point: pick the section to crawl, then walk the pages
    def main(self, page_type):
        if page_type == 'new':
            self.url = 'http://www.qiushibaike.com/textnew/page/'
        else:
            self.url = 'http://www.qiushibaike.com/text/page/'

        print self.getCurrentTime(), "Spider starting, beginning to crawl qiushi"

        for x in range(1, self.total_num + 1):
            print self.getCurrentTime(), "Crawling page", x
            try:
                self.getScandal(x)
            except Exception, e:
                print self.getCurrentTime(), "Fetching or parsing a page failed, reason:", e

spider = Spider()
spider.main('hot')
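
The pattern in getQiushiInfo captures the story text and the publish time that the page embeds in an HTML comment right after it. A quick standalone check of that regex (the sample snippet below is made up, shaped like the markup the pattern expects):

# -*- coding:utf-8 -*-
import re

# Hypothetical snippet: text body, then a unix timestamp inside an HTML comment
sample = '<div class="content">Some joke text<!--1439345645--></div>'
pattern = re.compile(u'<div class="content">(.*?)<!--(.*?)-->.*?</div>', re.S)
match = re.search(pattern, sample)
if match:
    print match.group(1)   # Some joke text
    print match.group(2)   # 1439345645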

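mysql class

The mysql module imported by the spider is not reproduced in this section; the code above only relies on a Mysql class exposing insertData(table, dict). A minimal sketch of that interface, assuming the MySQLdb driver and made-up connection settings (host, user, password, database name), might look like this:

# -*- coding:utf-8 -*-
import MySQLdb

class Mysql:
    # Assumed connection settings; adjust host/user/passwd/db for your environment
    def __init__(self):
        self.db = MySQLdb.connect(host="localhost", user="root",
                                  passwd="password", db="qiushi", charset="utf8")

    # Build an INSERT from the dict's keys, with parameterized values, and commit
    def insertData(self, table, my_dict):
        cursor = self.db.cursor()
        cols = ', '.join(my_dict.keys())
        placeholders = ', '.join(['%s'] * len(my_dict))
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % (table, cols, placeholders)
        try:
            cursor.execute(sql, my_dict.values())
            self.db.commit()
        except MySQLdb.Error, e:
            self.db.rollback()
            print 'Insert failed:', e
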
Tool class

#-*- coding:utf-8 -*-
import re

# Page-tag cleanup class
class Tool:
    # Strip hyperlink-ad blocks
    removeADLink = re.compile('<div class="link_layer.*?</div>')
    # Remove img tags, runs of 1-7 spaces, and &nbsp;
    removeImg = re.compile('<img.*?>| {1,7}|&nbsp;')
    # Strip anchor tags
    removeAddr = re.compile('<a.*?>|</a>')
    # Turn line-breaking tags into \n
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Turn table cells <td> into \t
    replaceTD = re.compile('<td>')
    # Turn single or double <br> into \n
    replaceBR = re.compile('<br><br>|<br>')
    # Strip any remaining tags
    removeExtraTag = re.compile('<.*?>')
    # Collapse runs of blank lines
    removeNoneLine = re.compile('\n+')

    def replace(self,x):
        x = re.sub(self.removeADLink,"",x)
        x = re.sub(self.removeImg,"",x)
        x = re.sub(self.removeAddr,"",x)
        x = re.sub(self.replaceLine,"\n",x)
        x = re.sub(self.replaceTD,"\t",x)
        x = re.sub(self.replaceBR,"\n",x)
        x = re.sub(self.removeExtraTag,"",x)
        x = re.sub(self.removeNoneLine,"\n",x)
        # strip() removes leading and trailing whitespace
        return x.strip()
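
A quick check of the cleanup chain (the sample HTML is made up for illustration; note that removeImg also eats runs of 1-7 spaces, which is harmless for the mostly-Chinese text this spider handles):

# -*- coding:utf-8 -*-
from tool import Tool

tool = Tool()
raw = '<div>first<br><br><a href="#">second</a>&nbsp;<img src="x.png"></div>'
print tool.replace(raw)
# Prints:
# first
# second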