[ceph] async messenger message path

send message

AsyncMessenger::send_message(Message *m, ……);
AsyncMessenger::_send_message(Message *m, ……);
AsyncMessenger::submit_message(Message *m, ……);

AsyncConnection::send_message(Message *m)

out_q[m->get_priority()].emplace_back(std::move(bl), m);
center->dispatch_event_external(write_handler);
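A minimal sketch of what dispatch_event_external does (simplified; the real EventCenter also skips the wakeup when called from the event-loop thread itself):

void EventCenter::dispatch_event_external(EventCallbackRef e)
{
    external_lock.lock();
    external_events.push_back(e);   // consumed later by process_events()
    external_num_events++;          // atomic counter the loop checks
    external_lock.unlock();
    wakeup();                       // poke the event fd so epoll returns
}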


write_handler = new C_handle_write(this); // this = AsyncConnection

void C_handle_write::do_request(int fd)
{
    conn->handle_write(); //AsyncConn->handle_write();
}


……
int EventCenter::process_events(int timeout_microseconds)

{
    ……
    if (external_num_events.load())
    {
        external_lock.lock();
        deque<EventCallbackRef> cur_process;
        cur_process.swap(external_events);
        external_num_events.store(0);
        external_lock.unlock();
        while (!cur_process.empty())
        {
            EventCallbackRef e = cur_process.front();
            ldout(cct, 20) << __func__ << " do " << e << dendl;
            e->do_request(0);
            cur_process.pop_front();
            numevents++;
        }
    }

    e->do_request ==> handle_write


    void AsyncConnection::handle_write()
    {

        ……

        while (1)
        {
            bufferlist data;
            Message *m = _get_next_outgoing(&data);

            ……

            r = write_message(m, data, _has_next_outgoing());

            ……
        }

        uint64_t left = ack_left.read();
        if (left)
        {
            ……
            ack_left.sub(left);
            left = ack_left.read();
            r = _try_send(left);
        }
        else if (is_queued())
        {
            r = _try_send();
        }


    }


    ssize_t AsyncConnection::write_message(Message * m, bufferlist & bl, bool more)
    {

        …… // prepare the message

        ssize_t rc = _try_send(more);

        return rc;
    }



    ConnectedSocket cs

    ssize_t AsyncConnection::_try_send(bool more)
    {
        ……
        ssize_t r = cs.send(outcoming_bl, more);
        ……
    }


    ssize_t send(bufferlist & bl, bool more)
    {
        return _csi->send(bl, more);
    }

    RDMAConnectedSocketImpl is the concrete impl behind _csi (held as std::unique_ptr<ConnectedSocketImpl> _csi;)
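    The send above is only a thin forwarding wrapper; a simplified sketch of the pimpl layering (after msg/async/Stack.h) shows how cs.send() lands in RDMAConnectedSocketImpl::send:

    class ConnectedSocketImpl {
     public:
      virtual ~ConnectedSocketImpl() {}
      virtual ssize_t send(bufferlist &bl, bool more) = 0;
    };

    class ConnectedSocket {
      std::unique_ptr<ConnectedSocketImpl> _csi;
     public:
      explicit ConnectedSocket(std::unique_ptr<ConnectedSocketImpl> csi)
        : _csi(std::move(csi)) {}
      ssize_t send(bufferlist &bl, bool more) {
        return _csi->send(bl, more);  // virtual dispatch: RDMA or posix impl
      }
    };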

    ssize_t RDMAConnectedSocketImpl::send(bufferlist & bl, bool more)
    {
        ……
        ssize_t r = submit(more);
        ……
    }

    ssize_t RDMAConnectedSocketImpl::submit(bool more)
    {
        ……


        int r = post_work_request(tx_buffers);
        ……
    }




    int RDMAConnectedSocketImpl::post_work_request(std::vector<Chunk *> &tx_buffers)
    {

        if (ibv_post_send(qp->get_qp(), iswr, &bad_tx_work_request))
        {
            ……
        }
        ……
    }
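    For context, a hedged sketch of what one work request posted by post_work_request looks like at the plain verbs level (helper name and parameters are illustrative, not the Ceph ones):

    #include <cstdint>
    #include <cstring>
    #include <infiniband/verbs.h>

    // Post one SEND work request for a registered buffer; returns 0 on success.
    int post_one_send(ibv_qp *qp, void *buf, uint32_t len, uint32_t lkey)
    {
        ibv_sge sge = {};
        sge.addr   = (uintptr_t)buf;        // buffer must come from ibv_reg_mr
        sge.length = len;
        sge.lkey   = lkey;                  // local key of the registered MR

        ibv_send_wr wr = {};
        wr.wr_id      = (uintptr_t)buf;     // echoed back in the tx completion
        wr.sg_list    = &sge;
        wr.num_sge    = 1;
        wr.opcode     = IBV_WR_SEND;
        wr.send_flags = IBV_SEND_SIGNALED;  // request a CQE on the tx CQ

        ibv_send_wr *bad_wr = nullptr;
        return ibv_post_send(qp, &wr, &bad_wr);
    }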

Messenger::send_message(Message *m, dest)
AsyncMessenger::send_message(Message *m, dest)
--|AsyncMessenger::_send_message(Message *m, dest)
----|conn=_lookup_conn(dest.addr)   ===============================AsyncMessenger::conns <--accept_conn()    <--bind<--accepting_conns[]<--add_accept() or create_connect
----|AsyncMessenger::submit_message(Message *m,conn,dest,...)
------|conn->send_message(m) # AsyncConnection::send_message(Message *m)
--------|out_q[priority].emplace_back(std::move(bl),m)  # enqueue on the out queue
--------|EventCenter::dispatch_event_external(write_handler)        # push the callback (write_handler = new C_handle_write(this)) into the event center and wake up the worker thread to run it
----------|external_events.push_back(write_handler)
----------|wakeup()
             |
             |
 w->center.process_events
             |
         cb = event->read_cb;
         cb->do_request()
             |C_handle_write
             |
			 |
--|write_handler = new C_handle_write(this)
C_handle_write::do_request(int fd)
--|conn->handle_write()   # AsyncConnection::handle_write()
----|bufferlist data;m=_get_next_outgoing(&data); #out_q
----|AsyncConnection::write_message(m,data,more)
------|AsyncConnection::outcoming_bl <--bl
------|AsyncConnection::_try_send(bool more)
--------|AsyncConnection::ConnectedSocket cs; cs.send(outcoming_bl,more)
----------|ConnectedSocket::_csi->send(outcoming_bl,more)  #std::unique_ptr<ConnectedSocketImpl> _csi;
ConnectedSocketImpl:: virtual ssize_t send(bl,more) <=== RDMAConnectedSocketImpl::send
----------|RDMAConnectedSocketImpl::send(outcoming_bl,more)#----------------------------------------------------RDMA send entry point
------------|RDMAConnectedSocketImpl::pending_bl <-bl
------------|RDMAConnectedSocketImpl::_submit_by_write(more)
--------------|RDMAConnectedSocketImpl::_submit_send_and_write(more,is_worker)
               or
			  |RDMAConnectedSocketImpl::write_data_to_raddrs(more,is_worker)
              or
              |RDMAConnectedSocketImpl::_submit_send_data(is_worker)
----------------|msg = get_send_msg_worker()/get_send_msg_polling()
----------------|RDMAConnectedSocketImpl::post_send_msg(msg)
------------------|pending_bl-->msg.data-->wr.id=&msg or pending_bl-->write_res->bl-->send_bl[i]
------------------|ibv_post_send(qp,&wr,&bad_wr)
                |
				|
				qp
				|
				|


AsyncConnection::AsyncConnection()
AsyncMessenger::accept_conn(Worker *w,ConnectedSocket cli_socket, addr,flag)



messenger-->create-->new AsyncMessenger()
--lookup_or_create_singleton_object<StackSingleton>(single,...)
----new StackSingleton
--single->ready()
-----stack=NetworkStack::create(cct,type,name,dev_info)
-------return std::make_shared<RDMAStack>(c,t,n,dev_info) #RDMAStack.cc:783
---------:NetworkStack(cct,t,n)
------------w=create_worker(cct,type,name)
---------:ib(cct,dev_info)
------------get_rdma_device_info("public")
---------:dispatcher(cct,this)                            # the RDMAStack constructor creates the dispatcher and assigns it to rdma_dispatcher






messenger-->bind
--for i:processor
  processor->bind()
----f = worker->listen(listen_addr,opts,listen_socket)
    worker->center.submit_to(f,...)
    RDMAWorker::listen
-----get_stack()->get_infiniband().init()
-----dispatcher->polling_start()
-----p= new RDMAConnectedSocketImpl(cct,&get_stack()->get_infiniband(),&get_infiniband()->get_dispatcher(),this)   # both connect and accept create a new one of these
--------qp=infiniband->create_queue_pair(cct,s->get_tx_cq(),s->get_rx_cq(),...)




RDMADispatcher::polling_start()
|--|tx_cc=get_stack()->get_infiniband().create_comp_channel(cct);
|--|rx_cc=...
|--|tx_cq=...
|--|--|cq=new Infiniband::CompletionQueue(cct,*this,CQ_DEPTH,cc);
|--|--|cq->init();
|--|--|--|cq=ibv_create_cq(...)
|--|rx_cq=...
|--|ceph_pthread_setname(...)
|--|t = std::thread(&RDMADispatcher::polling,this);










AsyncConnection::process()
--AsyncConnection::process_connection()

AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQueue *q,
                                 Worker *w, bool m2, bool local)


AsyncMessenger::create_connect(entity_addr_t &addr,int type,bool is_separate_wk)
--w=AsyncMessenger::stack->get_worker();
--conn = new AsyncConnection(cct,this,dispatch_queue,w,is_separate_wk,logArray)
----:dispatch_queue(q),async_msgr(m)
----|read_handler = new C_handle_read(this);
----|write_handler = new C_handle_write(this);
----|write_callback_handler = new C_handle_write_callback(this);
----|wakeup_handler = new C_time_wakeup(this);
----|tick_handler = new C_tick_wakeup(this);
--conn->connect(addr,type);
----set_peer_type(type);
----set_peer_addrs(addrs);
----_connect();
------center->dispatch_event_external(read_handler);---------------->EventCenter-->worker runs read_handler -> the switch() state machine -> _process_connection
--conns[addr] = conn;
--return conn


class C_handle_read : public EventCallback {
  AsyncConnectionRef conn;

 public:
  explicit C_handle_read(AsyncConnectionRef c): conn(c) {}
  void do_request(uint64_t fd_or_id) override {
    conn->process();
  }
};




_process_connection
--switch(state) state machine
----worker->connect(get_peer_addr(),opts,&cs)----------------------->RDMAWorker->connect(entity_addr_t &addr,SocketOptions &opts,ConnectedSocket *socket);
// after the connection succeeds, the socket fd is added to epoll for management
----center->create_file_event(cs.fd, EVENT_READABLE, read_handler);
----state = STATE_CONNECTING_RE;
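On the epoll backend, create_file_event(fd, EVENT_READABLE, handler) amounts to registering the fd with the worker's epoll instance and remembering the callback for the readable mask; a rough sketch, not the exact Ceph code:

#include <sys/epoll.h>

// Roughly what the epoll driver does for an EVENT_READABLE registration.
int add_readable(int epfd, int fd)
{
    epoll_event ee = {};
    ee.events  = EPOLLIN;   // EVENT_READABLE maps to EPOLLIN
    ee.data.fd = fd;        // the loop looks up the registered handler by fd
    return epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ee);
}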






RDMAWorker::connect(&addr,SocketOptions &opts,ConnectedSocket *socket)
--get_stack()->get_infiniband().init();
--dispatcher->polling_start()
--p = new RDMAConnectedSocketImpl(cct,&get_stack()->get_infiniband(),&get_stack()->get_dispatcher(),this)
--r=p->try_connect(entity_addr_t &addr,SocketOptions &opts);
--std::unique_ptr<RDMAConnectedSocketImpl> csi(p);
--*socket = ConnectedSocket(std::move(csi));






--get_stack()->get_infiniband().init();
--device = device_list->get_device(device_name.c_str());
--device->binding_port(cct, ib_port,gid_index);
--pd = new ProtectionDomain(cct, device);
----ProtectionDomain::pd=ibv_alloc_pd(device->ctxt)
---->infiniband::pd =ProtectionDomain::pd
--infiniband::sr_info.create_tx_queue(tx_queue_len);
--if(_support_srq){srq=create_shared_receive_queue(rx_queue_len,MAX_SHARED_RX_SGE_COUNT);}
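At the verbs level those init steps come down to roughly the following (a sketch; the helper name is illustrative):

#include <cstring>
#include <infiniband/verbs.h>

// Allocate a protection domain and, when SRQ is supported, a shared
// receive queue on it (mirrors the Infiniband::init() trace above).
ibv_srq *setup_pd_and_srq(ibv_context *ctxt, ibv_pd *&pd,
                          uint32_t rx_queue_len, uint32_t max_sge)
{
    pd = ibv_alloc_pd(ctxt);           // ctxt comes from ibv_open_device
    if (!pd)
        return nullptr;

    ibv_srq_init_attr sia;
    memset(&sia, 0, sizeof(sia));
    sia.attr.max_wr  = rx_queue_len;   // receive queue depth
    sia.attr.max_sge = max_sge;        // MAX_SHARED_RX_SGE_COUNT
    return ibv_create_srq(pd, &sia);
}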



--dispatcher->polling_start()
RDMADispatcher::polling_start()
|--|tx_cc=get_stack()->get_infiniband().create_comp_channel(cct);
|--|rx_cc=...
|--|tx_cq=...
|--|--|cq=new Infiniband::CompletionQueue(cct,*this,CQ_DEPTH,cc);
|--|--|cq->init();
|--|--|--|cq=ibv_create_cq(...)
|--|rx_cq=...
|--|ceph_pthread_setname(&RDMADispatcher::polling,this);
|--|t = std::thread(&RDMADispatcher::polling,this);
|--|--polling()
|--|----while(true)
|--|------|r = poll(channel_poll,2,cct->_conf->ms_async_rdma_poll_timeout_ms)
|--|------|while(r >0 && tx_cq->get_cq_event())
|--|------|--{handle_tx_event()}
|--|------|while(r >0 && rx_cq->get_cq_event())
|--|------|--{handle_rx_event()}
|--|------|if(!r)
|--|------|--{handle_async_event()}
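A hedged sketch of what one rx iteration of that loop does with the raw verbs API (the Ceph code wraps this in Infiniband::CompletionChannel/CompletionQueue):

#include <infiniband/verbs.h>

// Wait for a CQ event on the completion channel, re-arm, then drain CQEs.
void drain_rx_once(ibv_comp_channel *cc)
{
    ibv_cq *cq; void *ctx;
    if (ibv_get_cq_event(cc, &cq, &ctx))   // what get_cq_event() wraps
        return;
    ibv_ack_cq_events(cq, 1);              // every event must be acked
    ibv_req_notify_cq(cq, 0);              // re-arm before polling

    ibv_wc wc[32];
    int n;
    while ((n = ibv_poll_cq(cq, 32, wc)) > 0) {
        for (int i = 0; i < n; ++i) {
            // wc[i].wr_id identifies the posted buffer/msg;
            // wc[i].status != IBV_WC_SUCCESS flags an error on the QP
        }
    }
}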



--r=p->try_connect(entity_addr_t &addr,SocketOptions &opts);
----tcp_fd = net.connect(peer_addr, opts.connect_bind_addr);
----worker->center.create_file_event(tcp_fd, EVENT_READABLE, con_handler);<---RDMAConnectedSocketImpl():con_handler(new C_handle_connection(this))

  class C_handle_connection : public EventCallback {
    RDMAConnectedSocketImpl *csi;
    bool active;
   public:
    C_handle_connection(RDMAConnectedSocketImpl *w): csi(w), active(true) {}
    void do_request(int fd) {
      if (active)
        csi->handle_connection();
    }
    void close() {
      active = false;
    }
  };


RDMAConnectedSocketImpl::RDMAConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
						 RDMAWorker *w):
cct(cct), 
connected(0),
error(0),
infiniband(ib),
dispatcher(s), 
worker(w), 
lock("RDMAConnectedSocketImpl::lock"),
is_server(false), 
con_handler(new C_handle_connection(this)),
active(false), detached(false)
--qp=infiniband->create_queue_pair(cct,s->get_tx_cq(),s->get_rx_cq(),...)
----Infiniband::QueuePair *qp = new QueuePair(
      cct, *this, type, ib_physical_port, srq, tx, rx, max_send_wr, max_recv_wr);
----qp->init()
------qp  = ibv_create_qp(pd,&qpia);   // ibv_qp_init_attr qpia
------ret = ibv_modify_qp(qp,&qpa,mask);
------if(!srq){infiniband.post_receives_to_rq(max_recv_wr,this)}
--infiniband->SetNewItem(my_msg.qpn,bitmap_index,stamp_item_index);
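The qp->init() steps correspond to the standard verbs QP bring-up; a sketch of create plus the INIT transition (RTR/RTS follow during the connection handshake; names mirror the trace, the helper itself is illustrative):

#include <cstring>
#include <infiniband/verbs.h>

// Create an RC queue pair on the PD and move it to INIT (sketch).
ibv_qp *create_rc_qp(ibv_pd *pd, ibv_cq *tx_cq, ibv_cq *rx_cq, ibv_srq *srq,
                     uint32_t max_send_wr, uint32_t max_recv_wr, uint8_t port)
{
    ibv_qp_init_attr qpia;
    memset(&qpia, 0, sizeof(qpia));
    qpia.send_cq = tx_cq;
    qpia.recv_cq = rx_cq;
    qpia.srq     = srq;                // nullptr when SRQ is not used
    qpia.qp_type = IBV_QPT_RC;
    qpia.cap.max_send_wr  = max_send_wr;
    qpia.cap.max_recv_wr  = max_recv_wr;
    qpia.cap.max_send_sge = 1;
    qpia.cap.max_recv_sge = 1;

    ibv_qp *qp = ibv_create_qp(pd, &qpia);
    if (!qp)
        return nullptr;

    ibv_qp_attr qpa;
    memset(&qpa, 0, sizeof(qpa));
    qpa.qp_state   = IBV_QPS_INIT;
    qpa.pkey_index = 0;
    qpa.port_num   = port;             // ib_physical_port in the trace
    qpa.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
    ibv_modify_qp(qp, &qpa,
        IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS);
    return qp;
}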




infiniband->send_msg_size : the largest message (data plus header) that can go out via an RDMA send; i.e. messages smaller than this size can be sent with send.





RDMA: exchanging memory addresses

Client: on the send side

RDMAConnectedSocketImpl::_submit_send_and_write(bool more,bool is_worker)
--len = pending_bl.length();
--if(len > max_size_of_send_op)
	{
		if(resv_mem_mod)  // use memory pool for RDMA op
			msg->status = WRITE_REQUEST_AND_RESV;
		else
			msg->status = WRITE_REQUEST;
	}else
	{
		if(resv_mem_mod)  // use memory pool for RDMA op
			msg->status = SEND_DATA_AND_RESV;
		else
			msg->status = SEND_DATA;
	}
--post_send_msg(msg)



Server: on receiving data, notify the worker (handle_receive_rdma_msg is asynchronous, so cur_recv_seq and recv_seq are used to preserve ordering)

RDMADispatcher::polling()
--RDMADispatcher::handle_rx_event
----RDMADispatcher::handle_receive_rdma_msg(rdma_msg *msg,RDMAConnectedSocketImpl *conn,bool succ)
------switch(msg->status)
	{
		……
		case SEND_DATA_AND_RESV:
			conn->handle_send_data(msg);
			break;
		case WRITE_REQUEST_AND_RESV:
			conn->handle_write_request(msg);
			break;
		……
	}



 



conn->handle_send_data(msg);
--seq = recv_seq++;
--recv_bl[seq].mr=NULL;
--recv_bl[seq].data.push_back(buffer::create(msg->send.len))
--recv_bl[seq].data.copy_in(0,msg->send.len,msg->data)
--recv_bl[seq].get_data = true;
--notify_read()
--if(SEND_DATA_AND_RESV == msg->status)
----push_mem_infos_to_peer()


conn->handle_write_request(msg);
--handle the RDMA write request
--notify_read();
--push_mem_infos_to_peer()

RDMADispatcher::notify_read()
--for(auto it = recv_bl.begin();it != recv_bl.end();)
----cur_recv_seq = it->first + 1;
----RDMADispatcher::bufs.push_back(it->second.data)
----r = write(notify_fd,&i,sizeof(i));
       |
     notify
       |
worker
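Because completions arrive out of order, recv_seq/cur_recv_seq implement the usual hold-and-release reordering; an illustrative sketch (structure and names are hypothetical, not the exact Ceph ones):

#include <cstdint>
#include <map>

struct RecvEntry { bool ready = false; /* bufferlist data; ... */ };

std::map<uint64_t, RecvEntry> recv_bl;   // keyed by arrival seq
uint64_t cur_recv_seq = 0;               // next seq owed to the worker

void release_in_order()
{
    auto it = recv_bl.begin();
    while (it != recv_bl.end() && it->first == cur_recv_seq && it->second.ready) {
        // bufs.push_back(it->second.data); then write(notify_fd, ...) to
        // wake the worker, as in notify_read() above
        ++cur_recv_seq;
        it = recv_bl.erase(it);
    }
}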










bufferlist bldata;
bldata.copy_in(0,msg->datalen,msg->data) # copy msg->datalen bytes starting at msg->data into bldata at address _raw->data + _off + 0;



  void buffer::ptr::copy_in(unsigned o, unsigned l, const char *src)
  {
    copy_in(o, l, src, true);
  }

…… // inside maybe_inline_memcpy, for copies past the inline threshold:
  return memcpy(dest, src, l);
……




  void buffer::ptr::copy_in(unsigned o, unsigned l, const char *src, bool crc_reset)
  {
    assert(_raw);
    assert(o <= _len);
    assert(o+l <= _len);
    char* dest = _raw->data + _off + o;
    if (crc_reset)
        _raw->invalidate_crc();
    maybe_inline_memcpy(dest, src, l, 64);
  }



  buffer::raw* buffer::create(unsigned len) {
    return buffer::create_aligned(len, sizeof(size_t));
  }
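Putting those pieces together, roughly what handle_send_data does with the buffer API (assuming the older interface where buffer::create returns a raw*; msg_len/msg_data are illustrative stand-ins for msg->send.len/msg->data):

bufferptr bp(buffer::create(msg_len));   // wrap the raw buffer in a ptr
bp.copy_in(0, msg_len, msg_data);        // memcpy into _raw->data + _off
bufferlist bl;
bl.push_back(bp);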




cct->_conf->ms_async_rdma_submit_time_trigger  # the switch is enabled
and
tx_wr_inflight = qp->get_tx_wr() > cct->_conf->ms_async_rdma_count_trigger, i.e. the number of posted msgs still queued exceeds the threshold cct->_conf->ms_async_rdma_count_trigger,
then this branch is entered.

On the first entry, conn->merge_send_mode = true turns on merged sending: one post for every conn->target_count_trigger messages merged.

If conn->merge_send_mode is already set, this is not the first entry: merging conn->target_count_trigger messages per post still has not reduced the queue depth
(tx_wr_inflight is still above cct->_conf->ms_async_rdma_count_trigger), so conn->target_count_trigger++ raises the merge count.
At most 10 messages are merged into one post.



if(qp->get_tx_wr() > cct->_conf->ms_async_rdma_count_trigger && cct->_conf->ms_async_rdma_submit_time_trigger)
{
	if(conn->merge_send_mode)
	{
		if(conn->target_count_trigger < 10)
		{
			conn->target_count_trigger++;
		}
	}
	else
	{
		conn->merge_send_mode = true;
		conn->target_count_trigger = 3;
	}
}

ms_async_rdma_poll_timeout_ms



RDMADispatcher::polling()
{

	RDMADispatcher::handle_rx_event()
	{

		RDMADispatcher::handle_receive_rdma_msg(rdma_msg *msg,RDMAConnectedSocketImpl *conn,bool succ)
		 {


/*
		 	switch(msg->status)
			{
				……
				case SEND_DATA_AND_RESV
			    conn->handle_send_data(msg);

				break;
				case WRITE_REQUEST_AND_RESV
				conn->handle_write_request(msg);
				break;
				……
			}
*/
		}

	}

}






Choosing between the posix and RDMA transports

ceph_osd.cc

Messenger *ms_cluster = Messenger::create(g_ceph_context, cluster_msgr_type,
					    entity_name_t::OSD(whoami), "cluster",
					    getpid(),
					    Messenger::HAS_HEAVY_TRAFFIC |
					    Messenger::HAS_MANY_CONNECTIONS);
#type==cluster_msgr_type
……
if (r == 1 || type.find("async") != std::string::npos)
    return new AsyncMessenger(cct, name, type, std::move(lname), nonce);
……


type
AsyncMessenger::AsyncMessenger(CephContext *cct, entity_name_t name,
                               const std::string &type, string mname, uint64_t _nonce)
……
{
  
  std::string transport_type = "posix";
  if (type.find("rdma") != std::string::npos)
    transport_type = "rdma";
……

  StackSingleton *single;
  cct->lookup_or_create_singleton_object<StackSingleton>(single, "AsyncMessenger::NetworkStack::"+transport_type); # create or look up the stack (the stack creates n workers)
  local_worker = stack->get_worker();                                              # get one worker
  local_connection = new AsyncConnection(cct, this, &dispatch_queue, local_worker);# create a local connection
}

If lookup_or_create_singleton_object finds nothing, then p = new T(this), i.e. single = new StackSingleton(this)  -----------------------so all messengers share one RDMAStack, and one RDMAStack has only one dispatcher, which cannot keep up with dispatching (one dispatcher per NIC?); each messenger has its own DispatchQueue.
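A simplified sketch of that lookup-or-create behavior (a CephContext member; the lock and map names here are assumptions, the real implementation differs in detail):

template <typename T>
void CephContext::lookup_or_create_singleton_object(T *&p, const std::string &name)
{
    std::lock_guard<std::mutex> l(associated_objs_lock);  // assumed lock name
    auto it = associated_objs.find(name);
    if (it == associated_objs.end()) {
        p = new T(this);               // here: single = new StackSingleton(this)
        associated_objs[name] = p;
    } else {
        p = static_cast<T *>(it->second);
    }
}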


Inside StackSingleton:
StackSingleton::ready(type,name,dev_info)
{
	……
	stack = NetworkStack::create(cct,type,name,dev_info)
	……
}

 NetworkStack::create(cct,type,name,dev_info)
 {
	if (t == rdma)
	{
		return std::make_shared<RDMAStack>(c,t,n,dev_info)
	}
 }


class RDMAStack : public NetworkStack 
{
  vector<std::thread> threads;
  RDMADispatcher dispatcher[num];
  ……
}


RDMAStack::RDMAStack(CephContext *cct, const string &t): NetworkStack(cct, t)
{
    unsigned num = get_num_worker();
    for (unsigned i = 0; i < num; ++i)
        dispatcher[i](cct, global_infiniband, this);

    for (unsigned i = 0; i < num; ++i)
    {
        RDMAWorker *w = dynamic_cast<RDMAWorker *>(get_worker(i));
        w->set_ib(global_infiniband);
        w->set_stack(this);
    }

    # inside NetworkStack(cct, t):

	    NetworkStack::NetworkStack(CephContext *c, const string &t): type(t), started(false), cct(c)
		{
		 ……
		  num_workers = cct->_conf->ms_async_op_threads;
		 ……
		  for (unsigned i = 0; i < num_workers; ++i) {
		    Worker *w = create_worker(cct, type, i);
		    w->center.init(InitEventNumber, i, type);
		    workers.push_back(w);
		  }
		  cct->register_fork_watcher(this);
		}


}





RDMAStack::RDMAStack constructs (initializes) NetworkStack
NetworkStack creates the workers
RDMAStack::RDMAStack then hands the stack to each worker


Observed: the producer (dispatcher) runs at 100% CPU while the consumer (worker) sits at 30%.



await = svctm + time spent waiting in the queue.







Messenger::create
--AsyncMessenger(cct,type,lname,nonce,keepalive)
---|dispatch_queue = new DispatchQueue(cct,this,mname)
   |stack = NetworkStack::create(cct,type,name,dev_info)
   |--------RDMAStack(c,t,n,dev_info)
   |----------|dispatcher(cct,this)
   |----------|ib(cct,dev_info)
   |----------|create_worker(cct,type,i,name)
   |------------|center(c)
   |stack.start()
   --|for(i=0;i < num_workers;i++)
   --|threads[i] = std::thread(running w->center.process_events(EventMaxWaitUs))   # see the sketch below
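The last line above is shorthand; what stack.start() spawns per worker is closer to this sketch (the 'done' stop flag name is an assumption):

for (unsigned i = 0; i < num_workers; ++i) {
    Worker *w = workers[i];
    threads[i] = std::thread([w]() {
        while (!w->done)                      // assumed stop flag
            w->center.process_events(EventMaxWaitUs);
    });
}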



messenger
|--dispatch_queue
|--stack
   |--dispatcher
   |--ib
   |--workers--RDMAworker


 num_workers = cct->_conf->ms_async_op_threads;




 _submit_by_write()
{
	……
	// the data to be sent has already been placed in pending_bl
}
