自定义爬虫基本框架

class HttpRequest:
    """Bundle a socket with the host it targets and the callback for its response.

    Because the class defines fileno(), instances can be handed to
    select.select() in place of the raw socket.
    """

    def __init__(self, sk, host, callback):
        """Store the socket ``sk``, the target ``host`` and the response ``callback``."""
        self.socket, self.host, self.callback = sk, host, callback

    def fileno(self):
        """Return the file descriptor of the wrapped socket."""
        descriptor = self.socket.fileno()
        return descriptor

class HttpResponse:
    """Parsed view of a raw HTTP response.

    The raw bytes are split at the first blank line into a header section
    and a body. Header lines of the form ``Name: value`` are collected into
    ``header_dict``; lines without a colon (such as the status line) are
    skipped. A header that appears more than once keeps only its last value.
    """

    def __init__(self, recv_data):
        """Parse ``recv_data`` (the raw response bytes) immediately."""
        self.recv_data = recv_data  # raw response bytes exactly as received
        self.header_dict = {}       # header name (str) -> header value (str)
        self.body = None            # bytes following the first blank line
        self.initialize()

    def initialize(self):
        """Fill ``body`` and ``header_dict`` from ``recv_data``."""
        # partition() splits on the FIRST blank line only (the body may itself
        # contain blank lines) and, unlike unpacking the result of split(),
        # does not raise when the separator is missing: an empty or truncated
        # response then yields an empty body instead of a ValueError.
        head, _, body = self.recv_data.partition(b'\r\n\r\n')
        self.body = body
        for raw_line in head.split(b'\r\n'):
            # errors='replace' keeps a stray non-UTF-8 byte in one header from
            # aborting the whole parse.
            line = str(raw_line, encoding='utf-8', errors='replace')
            name, colon, value = line.partition(':')
            if colon:  # lines without a colon (e.g. the status line) are ignored
                # Whitespace around the value is optional padding, not content.
                self.header_dict[name.strip()] = value.strip()


class AsyncRequest:
    """Minimal select()-based event loop that performs HTTP GET requests concurrently.

    Every request queued with add_request() is tracked in two lists:
    ``conn`` holds requests whose response has not been handled yet, and
    ``connection`` holds requests whose GET request has not been sent yet.
    """

    def __init__(self):
        self.conn = []          # requests still waiting for their response
        self.connection = []    # requests not yet reported writable (request not sent)

    def add_request(self, host, callback):
        """Start a non-blocking connection to ``host`` on port 80 and queue it.

        ``callback`` is invoked by run() with the HttpResponse built from the
        bytes received for this host.

        Raises:
            OSError: if the connection attempt fails immediately (for example
                the host name cannot be resolved). The socket is closed first.
        """
        # Imported locally because the module has no top-level import of socket.
        import socket

        sk = socket.socket()
        sk.setblocking(False)
        try:
            sk.connect((host, 80))
        except BlockingIOError:
            # Expected for a non-blocking connect: the handshake continues in
            # the background and select() reports the socket writable later.
            pass
        except OSError:
            sk.close()  # do not leak the descriptor on a hard failure
            raise
        request = HttpRequest(sk, host, callback)
        self.conn.append(request)
        self.connection.append(request)

    def run(self):
        """Drive all queued requests until every response has been handled.

        A request is finished when the server closes the connection (or the
        connection errors out); its accumulated bytes are then wrapped in an
        HttpResponse and passed to the request's callback. A server that
        never closes the connection keeps this loop running.

        Raises:
            OSError: if sending the GET request fails (for example the
                connection was refused).
        """
        # Imported locally because the module has no top-level import of select.
        import select

        # Bytes received so far per request. Keyed by id() - the request
        # objects stay referenced from self.conn while they have an entry.
        partial = {}
        while self.conn:  # event loop; nothing to wait for once conn is empty
            readable, writable, _ = select.select(
                self.conn, self.connection, self.conn, 0.05)
            for w in writable:
                print(w.host, '连接成功...')
                # Writable means the connect has finished, so the request
                # can be sent. Wrapping the socket in HttpRequest is what
                # makes the host name available here.
                tpl = "GET / HTTP/1.0\r\nHost:%s\r\n\r\n" % (w.host,)
                w.socket.send(bytes(tpl, encoding='utf-8'))
                self.connection.remove(w)  # request sent; stop watching for writability
            for r in readable:
                buf = partial.setdefault(id(r), bytearray())
                if not self._read_available(r.socket, buf):
                    continue  # more data may follow; wait for the next readable event
                del partial[id(r)]
                self.conn.remove(r)  # no need to watch this socket any longer
                if r in self.connection:
                    # Never hand a closed socket back to select().
                    self.connection.remove(r)
                try:
                    r.callback(HttpResponse(bytes(buf)))
                finally:
                    r.socket.close()  # close even if the callback raises

    @staticmethod
    def _read_available(sock, buf):
        """Append all currently readable bytes from ``sock`` to ``buf``.

        Returns True when the response is finished (the peer closed the
        connection or a socket error occurred) and False when the socket
        simply has no more data right now.
        """
        while True:
            try:
                chunk = sock.recv(8096)
            except BlockingIOError:
                return False
            except OSError:
                # e.g. connection reset: deliver whatever was received.
                return True
            if not chunk:
                # recv() returns b'' at end of stream instead of raising;
                # without this check the loop would spin forever.
                return True
            buf += chunk

def f1(response):
    """Example callback: print the response's header dict with a 'save to file' label.

    Nothing is written to a file here; the function only prints.
    """
    headers = response.header_dict
    print('保存到文件', headers)

def f2(response):
    """Example callback: print the response's header dict with a 'save to database' label.

    Nothing is written to a database here; the function only prints.
    """
    headers = response.header_dict
    print('保存到数据库', headers)

# Hosts to fetch. Each entry also names the callback that will handle that
# host's response, so the caller chooses how every result is processed.
url_list = [
    {'host': 'www.baidu.com', 'callback': f1},
    {'host': 'cn.bing.com', 'callback': f2},
    {'host': 'www.cnblogs.com', 'callback': f2},
]

req = AsyncRequest()
for item in url_list:
    # The dict keys match add_request's parameter names (host, callback).
    req.add_request(**item)

# Run the event loop until all queued requests have been handled.
req.run()

  

posted @ 2018-08-17 16:47  心平万物顺  阅读(296)  评论(0)  编辑  收藏  举报