send message
AsyncMessenger::send_message(Message *m, ……);
AsyncMessenger::_send_message(Message *m, ……);
AsyncMessenger::submit_message(Message *m, ……);
AsyncConnection::send_message(Message *m)
out_q[m->get_priority()].emplace_back(std::move(bl), m);
center->dispatch_event_external(write_handler);
write_handler = new C_handle_write(this);//this=AsyncConn
// EventCenter callback: forwards the "data queued to send" notification to
// the owning AsyncConnection. Dispatched via dispatch_event_external, which
// invokes do_request(0) (see EventCenter::process_events above), so `fd` is
// not a real descriptor here and is ignored.
void C_handle_write::do_request(int fd)
{
conn->handle_write(); // AsyncConnection::handle_write(): drain out_q and try to send
}
……
int EventCenter::process_events(int timeout_microseconds)
{
……
if (external_num_events.load())
{
external_lock.lock();
deque<EventCallbackRef> cur_process;
cur_process.swap(external_events);
external_num_events.store(0);
external_lock.unlock();
while (!cur_process.empty())
{
EventCallbackRef e = cur_process.front();
ldout(cct, 20) << __func__ << " do " << e << dendl;
e->do_request(0);
cur_process.pop_front();
numevents++;
}
}
e->do_request ==> handle_write
void AsyncConnection::handle_write()
{
……
while (1)
{
bufferlist data;
Message *m = _get_next_outgoing(&data);
……
r = write_message(m, data, _has_next_outgoing());
……
}
uint64_t left = ack_left.read();
if (left)
{
……
ack_left.sub(left);
left = ack_left.read();
r = _try_send(left);
}
else if (is_queued())
{
r = _try_send();
}
}
ssize_t AsyncConnection::write_message(Message * m, bufferlist & bl, bool more)
{
……//准备消息
ssize_t rc = _try_send(more);
return rc;
}
ConnectedSocket cs
ssize_t AsyncConnection::_try_send(bool more)
{
……
ssize_t r = cs.send(outcoming_bl, more);
……
}
ssize_t send(bufferlist & bl, bool more)
{
return _csi->send(bl, more);
}
RDMAConnectedSocketImpl _csi
ssize_t RDMAConnectedSocketImpl::send(bufferlist & bl, bool more)
{
……
ssize_t r = submit(more);
……
}
ssize_t RDMAConnectedSocketImpl::submit(bool more)
{
……
int r = post_work_request(tx_buffers);
……
}
int RDMAConnectedSocketImpl::post_work_request(std::vector<Chunk *> &tx_buffers)
{
if (ibv_post_send(qp->get_qp(), iswr, &bad_tx_work_request))
{
……
}
……
}
Messenger::send_message(Message *m, dest)
AsyncMessenger::send_message(Message *m, dest)
--|AsyncMessenger::_send_message(Message *m, dest)
----|conn=_lookup_conn(dest.addr) ===============================AsyncMessenger::conns <--accept_conn() <--bind<--accepting_conns[]<--add_accept() or create_connect
----|AsyncMessenger::submit_message(Message *m,conn,dest,...)
------|conn->send_message(m) # AsyncConnection::send_message(Message *m)
--------|out_q[priority].emplace_back(std::move(bl),m) #放入队列
--------|EventCenter::dispatch_event_external(write_handler) #回调操作(write_handler= new C_handle_write(this))放入event中心,wakeup线程执行
----------|external_event.push_back(write_handler)
----------|wakeup()
|
|
w->center.process_events
|
cb = event->read_cb;
cb->do_request()
|C_handle_write
|
|
--|write_handler = new C_handle_write(this)
C_handle_write::do_request(int fd)
--|conn->handle_write() # AsyncConnection::handle_write()
----|bufferlist data;m=_get_next_outgoing(&data); #out_q
----|AsyncConnection::write_message(m,data,more)
------|AsyncConnection::outcoming_bl <--bl
------|AsyncConnection::_try_send(bool more)
--------|AsyncConnection::connectedSocket cs->send(outcoming_bl,more)
----------|connectedSocket::_csi->send(outcoming_bl,more) #std::unique_ptr<ConnectedSocketImpl> _csi;
ConnectedSocketImpl:: virtual ssizet_t send(bl,more) <=== RDMAConnectedSocketImpl::send
----------|RDMAConnectedSocketImpl::send(outcoming_bl,more)#----------------------------------------------------RDMA send 入口
------------|RDMAConnectedSocketImpl::pending_bl <-bl
------------|RDMAConnectedSocketImpl::_submit_by_write(more)
--------------|RDMAConnectedSocketImpl::_submit_send_and_write(more,is_worker)
or
|RDMAConnectedSocketImpl::write_data_to_raddrs(more,is_worker)
or
|RDMAConnectedSocketImpl::_submit_send_data(is_worker)
----------------|msg = get_send_msg_worker()/get_send_msg_polling()
----------------|RDMAConnectedSocketImpl::post_send_msg(msg)
------------------|pending_bl-->msg.data-->wr.id=&msg or pending_bl-->write_res->bl-->send_bl[i]
------------------|ibv_post_send(qp,&wr,&bad_wr)
|
|
qp
|
|
AsyncConnection::AsyncConnection()
AsyncMessenger::accept_conn(Worker *w,ConnectedSocket cli_socket, addr,flag)
messager-->create-->new AsyncMessenger()
--lookup_or_create_singleton_object<StackSingleton>(single,...)
----new StackSingleton
--single.ready()
-----stack=NetworkStack::create(cct,type,name,dev_info)
-------return std::make_shared<RDMAStack>(c,t,n,dev_info) #RDMAStack.cc:783
---------:NetworkStack(cct,t,n)
------------w=create_worker(cct,type,name)
---------:ib(cct,dev_info)
------------get_rdma_device_info("public")
---------:dispatcher(cct,this) #RDMAStack 构造函数时,创建dispatcher并赋值给rdma_dispatcher
messager-->bind
--for i:processor
processor->bind()
----f = worker->listen(listen_addr,opts,listen_socket)
worker->center.submit_to(f,...)
RDMAWorker::listen
-----get_stack()->get_infiniband().init()
-----dispatcher->polling_start()
-----p= new RDMAConnectedSocketImpl(cct,&get_stack()->get_infiniband(),&get_infiniband()->get_dispatcher(),this) #connect和accept都会new一个
--------qp=infiniband->create_queue_pair(cct,s->get_tx_cq(),s->get_rx_cq(),...)
RDMADispatcher::polling_start()
|--|tx_cc=get_stack()->get_infiniband().create_comp_channel(cct);
|--|rx_cc=...
|--|tx_cq=...
|--|--|cq=new Infiniband::CompletionQueue(cct,*this,CQ_DEPTH,cc);
|--|--|cq->init();
|--|--|--|cq=ibv_create_cq(...)
|--|rx_cq=...
|--|ceph_pthread_setname(&RDMADispatcher::polling,this);
AsyncConnection::process()
--AsyncConnection::process_connection()
AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQueue *q,
Worker *w, bool m2, bool local)
AsyncMessenger::create_connect(entity_addr_t &addr,int type,bool is_separate_wk)
--w=AsyncMessenger::stack->get_worker();
--conn = new AsyncConnection(cct,this,dispatch_queue,w,is_separate_wk,logArray)
----:dispatch_queue(q),async_msgr(m)、
----|read_handler = new C_handle_read(this);
----|write_handler = new C_handle_write(this);
----|write_callback_handler = new C_handle_write_callback(this);
----|wakeup_handler = new C_time_wakeup(this);
----|tick_handler = new C_tick_wakeup(this);
--conn->connect(addr,type);
----set_peer_type(type);
----set_peer_addrs(addrs);
----_connect();
------center->dispatch_event_external(read_handler);---------------->EventCenter-->worker 调用read_handler->switch()状态机->_process_connection
--conn[addr] = conn;
--return conn
// EventCallback that drives an AsyncConnection's read-side state machine.
// Registered as the connection's read_handler and dispatched by the
// EventCenter (dispatch_event_external on connect, create_file_event on the
// socket fd once connected — see the notes above).
class C_handle_read : public EventCallback {
AsyncConnectionRef conn; // target connection; AsyncConnectionRef presumably keeps it alive — TODO confirm ref semantics
public:
explicit C_handle_read(AsyncConnectionRef c): conn(c) {}
// fd_or_id: descriptor or synthetic event id from the EventCenter; unused.
void do_request(uint64_t fd_or_id) override {
conn->process(); // run the connection state machine (-> process_connection)
}
};
_process_connection
--switch(state)状态机
----worker->connect(get_peer_addr(),opts,&cs)----------------------->RDMAWorker->connect(entity_addr_t &addr,SocketOptions &opts,ConnectedSocket *socket);
// 连接成功后,将socket fd加入epoll进行管理
----center->create_file_event(cs.fd, EVENT_READABLE, read_handler);
----state = STATE_CONNECTING_RE;
RDMAWorker::connect(&addr,SocketOptions &opts,ConnectedSocket *socket)
--get_stack()->get_infiniband().init();
--dispatcher->polling_start()
--p = new RDMAConnectedSocketImpl(cct,&get_stack()->get_infiniband(),&get_stack()->get_dispatcher(),this)
--r=p->try_connect(entity_addr_t &addr,SocketOptions &opts);
--std::unique_ptr<RDMAConnectedSocketImpl> csi(p);
--*socket = ConnectedSocket(std::move(csi));
--get_stack()->get_infiniband().init();
--device = device_list->get_device(device_name.c_str());
--device->binding_port(cct, ib_port,gid_index);
--pd = new ProtectionDomain(cct, device);
----ProtectionDomain::pd=ibv_alloc_pd(device->ctxt)
---->infiniband::pd =ProtectionDomain::pd
--infiniband::sr_info.create_tx_queue(tx_queue_len);
--if(_support_srq){srq=create_shared_receive_queue(rx_queue_len,MAX_SHARED_RX_SGE_COUNT);}
--dispatcher->polling_start()
RDMADispatcher::polling_start()
|--|tx_cc=get_stack()->get_infiniband().create_comp_channel(cct);
|--|rx_cc=...
|--|tx_cq=...
|--|--|cq=new Infiniband::CompletionQueue(cct,*this,CQ_DEPTH,cc);
|--|--|cq->init();
|--|--|--|cq=ibv_create_cq(...)
|--|rx_cq=...
|--|ceph_pthread_setname(&RDMADispatcher::polling,this);
|--|t = std::thread(&RDMADispatcher::polling,this);
|--|--polling()
|--|----while(true)
|--|------|r = poll(channel_poll,2,cct->_conf->ms_async_rdma_poll_timeout_ms)
|--|------|while(r >0 && tx_cq->get_cq_event())
|--|------|--{handle_tx_event()}
|--|------|while(r >0 && rx_cq->get_cq_event())
|--|------|--{handle_rx_event()}
|--|------|if(!r)
|--|------|--{handle_async_event()}
--r=p->try_connect(entity_addr_t &addr,SocketOptions &opts);
----tcp_fd = net.connect(peer_addr, opts.connect_bind_addr);
----worker->center.create_file_event(tcp_fd, EVENT_READABLE, con_handler);<---RDMAConnectedSocketImpl():con_handler(new C_handle_connection(this))
class C_handle_connection : public EventCallback {
RDMAConnectedSocketImpl *csi;
bool active;
public:
C_handle_connection(RDMAConnectedSocketImpl *w): csi(w), active(true) {}
void do_request(int fd) {
if (active)
csi->handle_connection();
}
void close() {
active = false;
}
};
};
RDMAConnectedSocketImpl::RDMAConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
RDMAWorker *w):
cct(cct),
connected(0),
error(0),
infiniband(ib),
dispatcher(s),
worker(w),
lock("RDMAConnectedSocketImpl::lock"),
is_server(false),
con_handler(new C_handle_connection(this)),
active(false), detached(false)
--qp=infiniband->create_queue_pair(cct,s->get_tx_cq(),s->get_rx_cq(),...)
----Infiniband::QueuePair *qp = new QueuePair(
cct, *this, type, ib_physical_port, srq, tx, rx, max_send_wr, max_recv_wr);
----qp->init()
------qp = ibv_create_cq(pd,ibv_qp_init_attr qpia);
------ret = ibv_modify_qp(qp,&qpa,mask);
------if(!srq){infiniband.post_receives_to_rq(max_recv_wr,this)}
--infiniband->SetNewItem(my_msg.qpn,bitmap_index,stamp_item_index);
infiniband->send_msg_size : 用send发送的最大消息尺寸(数据+含头部),即小于这个尺寸的消息,可以用send发送。
RDMA 交换内存地址
客户端:发送的时候
RDMAConnectedSocketImpl::_submit_send_and_write(bool more,bool is_worker)
--len = pending_bl.length();
--if(len > max_size_of_send_op)
{
if(resv_mem_mod) //use memory pool for RDMA op
msg->status = WRITE_REQUEST_AND_RESV;
else
msg->status = WRITE_REQUEST;
}else
{
if(resv_mem_mod) //use memory pool for RDMA op
msg->status = SEND_DATA_AND_RESV;
else
msg->status = SEND_DATA;
}
--post_send_msg(msg)
服务端:接收到数据,通知worker (由于handle_receive_rdma_msg 是异步的,所以cur_recv_seq和recv_seq用于保序)
RDMADispatcher::polling()
--RDMADispatcher::handle_rx_event
----RDMADispatcher::handle_receive_rdma_msg(rdma_msg *msg,RDMAConnectedSocketImpl *conn,bool succ)
------switch(msg->status)
{
……
case SEND_DATA_AND_RESV
conn->handle_send_data(msg);
break;
case WRITE_REQUEST_AND_RESV
conn->handle_write_request(msg);
break;
……
}
;
conn->handle_send_data(msg);
--seq = recv_seq++;
--recv_bl[seq].mr=NULL;
--recv_bl[seq].data.push_back(buffer::create(msg->send.len))
--recv_bl[seq].data.copy_in(0,msg->send.len,msg->data)
--recv_bl[seq].get_data = true;
--notify_read()
--if(SEND_DATA_AND_RESV == msg->status)
----push_mem_infos_to_peer()
conn->handle_write_request(msg);
--处理write
--notify_read();
--push_mem_infos_to_peer()
RDMADispatcher::notify_read()
--for(auto it = recv_bl.begin();it != recv_bl.end();)
----cur_recv_seq = it->first + 1;
----RDMADispatcher::bufs.push_back(it->second.data)
----r = write(notify_fd,&i,sizeof(i));
|
notify
|
worker
RDMADispatcher::handle_receive_rdma_msg(rdma_msg *msg,RDMAConnectedSocketImpl *conn,bool succ)
--handle_send_data(rdma_msg *msg)
---
bufferlist bldata;
bldata.copy_in(0,msg->datalen,msg->data) #将msg->data起始的,msg->datalen长度的数据拷贝到bldata的_raw->data + _off + 0 地址处;
// Convenience overload: copy `l` bytes from `src` into this ptr at offset `o`,
// always invalidating any cached CRC (forwards crc_reset = true).
void buffer::ptr::copy_in(unsigned o, unsigned l, const char *src)
{
copy_in(o, l, src, true);
}
……
return memcpy(dest, src, l);
……
// Copy `l` bytes from `src` into this ptr's backing raw buffer at offset `o`
// (relative to this ptr's view, i.e. at _raw->data + _off + o). The target
// range must lie entirely within the ptr's length. When `crc_reset` is set,
// any cached CRC on the raw buffer is invalidated before the data changes.
void buffer::ptr::copy_in(unsigned o, unsigned l, const char *src, bool crc_reset)
{
  assert(_raw);
  assert(o <= _len);
  assert(o + l <= _len);
  if (crc_reset)
    _raw->invalidate_crc();
  char *const dst = _raw->data + _off + o;
  maybe_inline_memcpy(dst, src, l, 64);
}
// Allocate a new raw buffer of `len` bytes, aligned to sizeof(size_t).
buffer::raw* buffer::create(unsigned len) {
return buffer::create_aligned(len, sizeof(size_t));
}
cct->_conf->ms_async_rdma_submit_time_trigger #开关打开
且
tx_wr_inflight = qp->get_tx_wr() > cct->_conf->ms_async_rdma_count_trigger,即发出的msg 排队的长度已经超过阈值cct->_conf->ms_async_rdma_count_trigger
就进入。
第一次进入conn->merge_send_mode = true 开启合并发送,每合并conn->target_count_trigger 就post一次
if(conn->merge_send_mode)不是第一次进入,说明当前每合并conn->target_count_trigger个再post一次,还是无法降低排队数量,
tx_wr_inflight还是大于cct->_conf->ms_async_rdma_count_trigger,所以再conn->target_count_trigger++,提高合并的数量。
最大10个合并到一个post
// Adaptive send merging (logic described in the notes above): when the
// ms_async_rdma_submit_time_trigger option is enabled AND the number of
// in-flight tx work requests exceeds ms_async_rdma_count_trigger, start
// merging multiple messages per ibv_post_send, or ramp the merge factor up.
if(qp->get_tx_wr() > cct->_conf->ms_async_rdma_count_trigger &&cct->_conf->ms_async_rdma_submit_time_trigger)
{
if(conn->merge_send_mode)
{
// Already merging but the queue is still over threshold: merge one more
// message per post, capped at 10 per post.
if(conn->target_count_trigger < 10)
{
conn->target_count_trigger++;
}
}
else
{
// First time over threshold: enable merge mode, starting at 3 per post.
conn->merge_send_mode = true;
conn->target_count_trigger = 3;
}
}
ms_async_rdma_poll_timeout_ms
RDMADispatcher::polling()
{
RDMADispatcher::handle_rx_event()
{
RDMADispatcher::handle_receive_rdma_msg(rdma_msg *msg,RDMAConnectedSocketImpl *conn,bool succ)
{
/*
switch(msg->status)
{
……
case SEND_DATA_AND_RESV
conn->handle_send_data(msg);
break;
case WRITE_REQUEST_AND_RESV
conn->handle_write_request(msg);
break;
……
}
*/
}
}
}
通信模块posix和RDMA选择
ceph_osd.cc
Messenger *ms_cluster = Messenger::create(g_ceph_context, cluster_msgr_type,
entity_name_t::OSD(whoami), "cluster",
getpid(),
Messenger::HAS_HEAVY_TRAFFIC |
Messenger::HAS_MANY_CONNECTIONS);
#type==cluster_msgr_type
……
if (r == 1 || type.find("async") != std::string::npos)
return new AsyncMessenger(cct, name, type, std::move(lname), nonce);
……
type
AsyncMessenger::AsyncMessenger(CephContext *cct, entity_name_t name,
const std::string &type, string mname, uint64_t _nonce)
……
{
std::string transport_type = "posix";
if (type.find("rdma") != std::string::npos)
transport_type = "rdma";
……
StackSingleton *single;
cct->lookup_or_create_singleton_object<StackSingleton>(single, "AsyncMessenger::NetworkStack::"+transport_type); #创建或获取一个stack(stack里面创建了n个worker)
local_worker = stack->get_worker(); #获取一个worker
local_connection = new AsyncConnection(cct, this, &dispatch_queue, local_worker);#创建一条本地连接
}
lookup_or_create_singleton_object里面找不到,则:p = new T(this) 即single = new StackSingleton(this) -----------------------那么多messager共享一个RDMAstack,一个RDMAstack只有一个dispatcher 分发不过来(一个网卡一个dispatcher?),一个messager有一个dispatchqueue。
StackSingleton里面
StackSingleton::ready(type,name,dev_info)
{
……
stack = NetworkStack::create(cct,type,name,dev_info)
……
}
NetworkStack::create(cct,type,name,dev_info)
{
if (t == rdma)
{
return std::make_shared<RDMAStack>(c,t,n,dev_info)
}
}
class RDMAStack : public NetworkStack
{
vector<std::thread> threads;
RDMADispatcher dispatcher[num];
……
}
RDMAStack::RDMAStack(CephContext *cct, const string &t): NetworkStack(cct, t)
{
for(i= 0;i <num;i++)
dispatcher[i](cct, global_infiniband, this);
unsigned num = get_num_worker();
for (unsigned i = 0; i < num; ++i)
{
RDMAWorker *w = dynamic_cast<RDMAWorker *>(get_worker(i));
w->set_ib(global_infiniband);
w->set_stack(this);
}
# NetworkStack(cct, t)里面
NetworkStack::NetworkStack(CephContext *c, const string &t): type(t), started(false), cct(c)
{
……
num_workers = cct->_conf->ms_async_op_threads;
……
for (unsigned i = 0; i < num_workers; ++i) {
Worker *w = create_worker(cct, type, i);
w->center.init(InitEventNumber, i, type);
workers.push_back(w);
}
cct->register_fork_watcher(this);
}
}
RDMAStack::RDMAStack创建(初始化)NetworkStack
NetworkStack创建worker
RDMAStack::RDMAStack把stack给每个worker
查看生产者dispatcher 100% 的cpu,而消费者worker 30%,
await = svctm + 在队列里面等待的时间。
messager::create
--AsyncMessenger(cct,type,lname,nonce,keepalive)
---|dispatch_queue = new DispatchQueue(cct,this,mname)
|stack = NetworkStack::create(cct,type,name,dev_info)
|--------RDMAStack(c,t,n,dev_info)
|----------|dispatcher(cct,this)
|----------|ib(cct,dev_info)
|----------|create_worker(cct,type,i,name)
|------------|center(c)
|stack.start()
--|for(i=0;i < num_workers;i++)
--|threads[i] = std::move(std::thread( w->center.process_events(EventMaxWaitUs)));
messager
|--dispatch_queue
|--stack
|--dispatcher
|--ib
|--workers--RDMAworker
num_workers = cct->_conf->ms_async_op_threads;
_submit_by_write()
{
……
用将要发送的数据已经放在pending_bl
}