自定义异步IO爬虫

"""
##########浏览器的本质#############
sk=socket.socket(socket.AF_INET,socket.SOCK_STREAM)
#连接 IO阻塞
sk.connect(('www.baidu.com',80))
print('连接成功')
#连接成功后发送消息 GET请求
sk.send(b'GET / HTTP/1.0\r\nHost: www.baidu.com\r\n\r\n')
#POST请求发送数据
# sk.send(b'POST / HTTP/1.0\r\nHost: www.baidu.com\r\n\r\nk1=v1&k2=v2')
#等待服务器响应 IO阻塞
data=sk.recv(8096)
print(data)
sk.close()
"""
#select 实际监听对象的fileno()方法
#所以对象必须有: fileno()方法,并返回一个文件描述符

#IO多路复用;实际监听多个socket对象
#异步:-非阻塞的socket+IO多路复用
import socket
import select


class SocketResponce:
    """Parsed view of a raw HTTP response.

    Attributes:
        data: The raw response bytes exactly as passed in.
        header_dict: Maps header name to header value (both ``str``).
            Lines without a colon, such as the status line, are skipped.
        body: The bytes after the first blank line (``\\r\\n\\r\\n``), or
            ``b''`` when the response contains no such separator.
    """

    def __init__(self, data):
        """Store *data* (bytes) and parse it immediately."""
        self.data = data
        self.header_dict = {}
        self.body = None
        # The misspelled method name is kept so existing callers still work.
        self.iniitialize()

    def iniitialize(self):
        """Split ``self.data`` into ``header_dict`` and ``body``.

        Never raises on truncated or empty input: a response without the
        header/body separator is treated as headers only.
        """
        # partition() always yields three parts, so a missing separator
        # cannot blow up the unpacking the way split(..., 1) does.
        head, _, body = self.data.partition(b'\r\n\r\n')
        self.body = body
        for raw_line in head.split(b'\r\n'):
            # One stray non-UTF-8 byte must not discard the whole response.
            line = raw_line.decode('utf-8', errors='replace')
            name, colon, value = line.partition(':')
            if colon:
                # Whitespace around a field value is not part of the value.
                self.header_dict[name] = value.strip()


class SocketRequest:
    """Pair a socket with the host it targets and the callback for its reply.

    ``fileno()`` forwards to the wrapped socket, which is what lets an
    instance be handed to ``select.select`` in place of the socket itself.
    """

    def __init__(self, sk, host, callback):
        """Remember the socket, the host name and the completion callback."""
        self.socket, self.host, self.callback = sk, host, callback

    def fileno(self):
        """Return the file descriptor of the wrapped socket."""
        descriptor = self.socket.fileno()
        return descriptor


class AsyncioRequest:
    """Fetch ``/`` from several hosts at once using non-blocking sockets and select().

    Call ``add_request`` once per host, then ``run``. Each callback receives
    a ``SocketResponce`` once its server has closed the connection (the
    request is HTTP/1.0, so end-of-response is signalled by EOF).

    Attributes:
        conn: Requests still in flight; watched for readability.
        connection: Requests whose TCP connect has not completed yet;
            watched for writability.
        failed: ``(host, exception)`` pairs for requests that were abandoned
            because of a network or parse error. Their callbacks are not
            invoked.
    """

    def __init__(self):
        self.conn = []
        self.connection = []  # used to detect whether the connect has completed
        self.failed = []
        # Per-request list of received chunks, so a response that arrives in
        # several pieces is only delivered once it is complete.
        self._buffers = {}

    def add_request(self, host, callback):
        """Start a non-blocking connect to ``host:80`` and queue the request.

        Args:
            host: Host name to connect to and to send in the Host header.
            callback: Called with the parsed response once it is complete.

        A host that cannot be resolved or connected to is recorded in
        ``self.failed`` instead of being queued.
        """
        sk = socket.socket()
        sk.setblocking(False)
        try:
            sk.connect((host, 80))
        except (BlockingIOError, InterruptedError):
            # Expected for a non-blocking socket: the connect is in progress
            # and select() reports the socket writable when it finishes.
            pass
        except OSError as exc:
            # A real failure (e.g. DNS lookup); do not queue a dead socket.
            sk.close()
            self.failed.append((host, exc))
            return
        socketreq = SocketRequest(sk, host, callback)
        self.conn.append(socketreq)
        self.connection.append(socketreq)

    def run(self):
        """Drive all queued requests until each has completed or failed."""
        # Looping on self.conn also avoids calling select() with three empty
        # lists when nothing was queued.
        while self.conn:
            rlist, wlist, elist = select.select(
                self.conn, self.connection, self.connection)

            # Some platforms report a failed connect as an exceptional
            # condition rather than as writability.
            for req in elist:
                self._connect_ok(req)

            for req in wlist:
                if req not in self.connection:
                    continue  # already abandoned in this pass
                if not self._connect_ok(req):
                    continue
                self.connection.remove(req)
                self._send_request(req)

            for req in rlist:
                if req in self.conn and self._drain(req):
                    self._deliver(req)

    def _connect_ok(self, req):
        """Return True if no socket error is pending; otherwise abandon *req*."""
        err = req.socket.getsockopt(socket.SOL_SOCKET, socket.SO_ERROR)
        if err:
            self._abandon(
                req, OSError(err, 'connection to %s failed' % (req.host,)))
            return False
        return True

    def _send_request(self, req):
        """Send the GET request for *req*, abandoning it on a socket error."""
        tcp = 'GET / HTTP/1.0\r\nHost: %s\r\n\r\n' % (req.host,)
        try:
            req.socket.send(bytes(tcp, encoding='utf-8'))
        except OSError as exc:
            self._abandon(req, exc)

    def _drain(self, req):
        """Read everything currently available; return True once EOF is seen."""
        chunks = self._buffers.setdefault(req, [])
        while True:
            try:
                data = req.socket.recv(8096)
            except (BlockingIOError, InterruptedError):
                # Nothing more for now; wait for the next readable event.
                return False
            except OSError as exc:
                self._abandon(req, exc)
                return False
            if not data:
                return True
            chunks.append(data)

    def _deliver(self, req):
        """Close *req*, parse its buffered bytes and invoke its callback."""
        raw = b''.join(self._buffers.pop(req, []))
        self.conn.remove(req)
        if req in self.connection:
            self.connection.remove(req)
        # Release the socket before calling user code so that a raising
        # callback cannot leave it open or still registered.
        req.socket.close()
        try:
            response = SocketResponce(raw)
        except ValueError as exc:
            # Empty or malformed response that the parser rejected.
            self.failed.append((req.host, exc))
            return
        req.callback(response)

    def _abandon(self, req, exc):
        """Forget *req*, close its socket and record why it failed."""
        for pool in (self.conn, self.connection):
            if req in pool:
                pool.remove(req)
        self._buffers.pop(req, None)
        req.socket.close()
        self.failed.append((req.host, exc))


def recv_data(responce):
    """Print the response body, then its header dictionary, one per line."""
    for attribute in ('body', 'header_dict'):
        print(getattr(responce, attribute))


# Hosts to fetch, each paired with the function that receives its response.
url_list = [
    {'host': 'www.baidu.com', 'callback': recv_data},
    {'host': 'www.cnblogs.com', 'callback': recv_data},
]

# Queue one request per entry (the dict keys match add_request's parameter
# names), then run the select() loop.
request = AsyncioRequest()
for url in url_list:
    request.add_request(**url)

request.run()

 

posted @ 2019-01-11 16:59  lujiacheng-python  阅读(164)  评论(0)  编辑  收藏  举报