Repost: the OSD IO-receiving flow
Reposted from:
https://www.cnblogs.com/yi-mu-xi/p/10282678.html
After a message has been read by pipe->reader() (Pipe.cc), it is fast-dispatched with fast_dispatch() (DispatchQueue.cc) if ms_can_fast_dispatch() says it can be; otherwise it enters the dispatch queue via in_q->enqueue().
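In other words, the reader thread chooses between two delivery paths. A minimal sketch of that decision follows; the Dispatcher hooks and DispatchQueue mirror real Ceph names, but the scaffolding around them is illustrative only:

// Illustrative sketch -- not the verbatim messenger code.
struct Message;

struct Dispatcher {
  virtual bool ms_can_fast_dispatch(const Message *m) const = 0;
  virtual void ms_fast_dispatch(Message *m) = 0;
  virtual ~Dispatcher() = default;
};

struct DispatchQueue {
  void enqueue(Message *m /*, priority, ... */) {
    // queue the message for the dispatch thread (slow path)
  }
};

// The shape of the reader-side decision:
void deliver_message(Dispatcher *d, DispatchQueue *in_q, Message *m) {
  if (d->ms_can_fast_dispatch(m)) {
    d->ms_fast_dispatch(m); // handled inline on the reader thread
  } else {
    in_q->enqueue(m);       // ordinary dispatch via the queue
  }
}

Fast dispatch trades queueing latency for running the handler directly on the messenger thread, which is why ms_fast_dispatch() below must never block for long.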
fast_dispatch()
---ms_fast_dispatch() [OSD.cc]
Here the Message is converted into an OpRequestRef op; all subsequent handling works on this op.
// zym: handle the various messages sent by clients
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE(cct);
  // if the OSD service is shutting down, drop one reference to the
  // message; its memory is freed once the refcount reaches 0.
  if (service.is_stopping()) {
    m->put();
    return;
  }

  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_MON_COMMAND:
    handle_command(static_cast<MMonCommand*>(m));
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;

  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_QUERY:
    return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));

  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      if (require_osd_peer(pm)) {
        enqueue_peering_evt(
          pm->get_spg(),
          PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // convert the Message into an OpRequest, referenced by the smart pointer op;
  // op's type is: typedef boost::intrusive_ptr<OpRequest> Ref
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m); // OpTracker op_tracker;
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
               reqid.name._num, reqid.tid, reqid.inc);
  } // what is this tracepoint for?

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  // (all messages that reach this point derive from MOSDFastDispatchOp)
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!

  // possibly delay dispatch (debug injection)
  service.maybe_inject_dispatch_delay();

  // if this is not a CEPH_MSG_OSD_OP message, or the connection
  // has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT), enqueue it directly
  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      op->get(); // bump the refcount
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap); // release the previously reserved osdmap epoch
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
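One detail worth noting: OpRequestRef is a boost::intrusive_ptr, so the reference count lives inside the OpRequest object itself. The manual op->get() before pushing onto session->waiting_on_map is needed because that list is an intrusive list of hooks and does not bump the count by itself. A minimal sketch of the same pattern (the OpRequest stand-in here is hypothetical, not the real TrackedOp implementation):

#include <atomic>
#include <boost/intrusive_ptr.hpp>

// Minimal stand-in for the TrackedOp/OpRequest refcounting
// (hypothetical type; the real OpRequest lives in OpRequest.h).
struct OpRequest {
  std::atomic<int> nref{0};
};

// boost::intrusive_ptr finds these two functions by ADL.
inline void intrusive_ptr_add_ref(OpRequest *op) { ++op->nref; }
inline void intrusive_ptr_release(OpRequest *op) {
  if (--op->nref == 0)
    delete op; // freed when the last reference drops
}

using OpRequestRef = boost::intrusive_ptr<OpRequest>;

int main() {
  OpRequestRef op(new OpRequest); // nref == 1
  // Storing the raw object in an intrusive container does NOT touch
  // nref, so a manual increment (op->get() in the real code) is needed
  // to keep the object alive while it sits on waiting_on_map:
  intrusive_ptr_add_ref(op.get());
  // ...and whoever removes it from the container must release again:
  intrusive_ptr_release(op.get());
} // op goes out of scope, nref hits 0, the object is deleted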
In enqueue_op() the op is added to OSDService's op_wq. That queue is backed by OSD::ShardedOpWQ: the item is stored in the ShardData of one shard, where it waits to be processed; a worker thread serving that shard is then woken up, and its processing function is OSD::ShardedOpWQ::_process().
void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
{
  const utime_t stamp = op->get_req()->get_recv_stamp();
  const utime_t latency = ceph_clock_now() - stamp;
  const unsigned priority = op->get_req()->get_priority();
  const int cost = op->get_req()->get_cost();
  const uint64_t owner = op->get_req()->get_source().num();

  dout(15) << "enqueue_op " << op << " prio " << priority
           << " cost " << cost
           << " latency " << latency
           << " epoch " << epoch
           << " " << *(op->get_req()) << dendl;
  op->osd_trace.event("enqueue op");
  op->osd_trace.keyval("priority", priority);
  op->osd_trace.keyval("cost", cost);
  op->mark_queued_for_pg();
  logger->tinc(l_osd_op_before_queue_op_lat, latency);
  op_shardedwq.queue( // sdata->scheduler->enqueue(std::move(item))
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
      cost, priority, stamp, owner, epoch));
}
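The "sharded" part means the work queue is split into several independent shards, each with its own lock and priority queue; an op is routed to a shard by hashing its spg_t, so ops for the same PG always land on the same shard and keep their order. Roughly, as a sketch (the ShardedQueueSketch class below is hypothetical, not the real ShardedOpWQ):

#include <condition_variable>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <mutex>
#include <vector>

// Sketch of the sharding idea behind OSD::ShardedOpWQ. Each shard
// owns its own lock, queue and condition variable, so operations on
// different PGs rarely contend, while ops of one PG always hash to
// the same shard and therefore keep their order.
template <typename Item>
class ShardedQueueSketch {
  struct Shard {
    std::mutex lock;
    std::condition_variable cond;
    std::deque<Item> items;
  };
  std::vector<Shard> shards;

public:
  explicit ShardedQueueSketch(std::size_t nshards) : shards(nshards) {}

  void queue(uint64_t pg_hash, Item item) {
    Shard &s = shards[pg_hash % shards.size()];
    {
      std::lock_guard<std::mutex> l(s.lock);
      s.items.push_back(std::move(item));
    }
    s.cond.notify_one(); // wake a worker thread of this shard
  }

  Item dequeue(std::size_t shard_index) { // called by that shard's workers
    Shard &s = shards[shard_index];
    std::unique_lock<std::mutex> l(s.lock);
    s.cond.wait(l, [&] { return !s.items.empty(); });
    Item item = std::move(s.items.front());
    s.items.pop_front();
    return item;
  }
};

Because the shard is chosen by hashing the PG id, two ops for the same PG can never be picked up out of order by different shards.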
The processing function is void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb):
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  ......
  OpQueueItem item = sdata->pqueue->dequeue(); // pop from the shard's priority queue

  ......

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
                                 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front()); // deque<OpQueueItem> to_process; class OpQueueItem is defined in OpQueueItem.h

  ......
  qi.run(osd, sdata, pg, tp_handle); // run the work item, e.g. PGOpItem::run

  ......
}
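Each shard is served by a fixed set of worker threads; thread_index selects the shard, and the loop is essentially dequeue, take the PG lock, run, release (the TPHandle lets long-running work prove liveness to the heartbeat checker). A simplified shape of one worker, with hypothetical *Sketch stand-ins (the real _process also handles PG splits, empty slots and requeueing):

#include <cstddef>

// Simplified shape of one shard worker -- illustrative only.
struct PGSketch {
  void lock() {}   // stands in for the per-PG mutex
  void unlock() {}
};

struct WorkItemSketch {
  PGSketch *pg = nullptr;
  void run() {}    // e.g. PGOpItem::run -> osd->dequeue_op(pg, op, handle)
};

template <typename Queue>
void shard_worker(Queue &q, std::size_t shard_index) {
  for (;;) {
    WorkItemSketch qi = q.dequeue(shard_index); // blocks until work arrives
    qi.pg->lock();   // ops on a single PG execute strictly one at a time
    qi.run();        // dispatches by the concrete item type
    qi.pg->unlock();
  }
}

qi.run() dispatches to one of the concrete item types: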
void PGOpItem::run(
  OSD *osd,
  OSDShard *sdata,
  PGRef& pg,
  ThreadPool::TPHandle &handle)
{
  osd->dequeue_op(pg, op, handle);
  pg->unlock();
}

void PGPeeringItem::run(
  OSD *osd,
  OSDShard *sdata,
  PGRef& pg,
  ThreadPool::TPHandle &handle)
{
  osd->dequeue_peering_evt(sdata, pg.get(), evt, handle);
}

void PGSnapTrim::run()

void PGScrub::run()

void PGRecovery::run()

void PGRecoveryContext::run()

void PGDelete::run()
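All of these are concrete subclasses of OpQueueItem::OpQueueable, so the shard worker never needs to know what kind of work it dequeued; qi.run() virtual-dispatches to the right handler. The shape of that interface, as a sketch (the *Sketch types are hypothetical; see OpQueueItem.h for the real definitions):

// Sketch of the polymorphic work-item idea from OpQueueItem.h.
struct OSD;
struct OSDShard;
struct PG;

struct OpQueueableSketch {
  virtual void run(OSD *osd, OSDShard *sdata, PG *pg) = 0;
  virtual ~OpQueueableSketch() = default;
};

struct PGOpItemSketch : OpQueueableSketch {
  void run(OSD *osd, OSDShard *sdata, PG *pg) override {
    // osd->dequeue_op(pg, op, handle); pg->unlock();
  }
};

struct PGPeeringItemSketch : OpQueueableSketch {
  void run(OSD *osd, OSDShard *sdata, PG *pg) override {
    // osd->dequeue_peering_evt(sdata, pg, evt, handle);
  }
};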
osd->dequeue_op() then calls pg->do_request(), i.e. PrimaryLogPG::do_request() [in PrimaryLogPG.cc], which handles the op according to its message type.
void PrimaryLogPG::do_request(
  OpRequestRef& op,
  ThreadPool::TPHandle &handle)
{
  if (op->osd_trace) {
    op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
    op->pg_trace.event("do request");
  }
  // make sure we have a new enough map
  auto p = waiting_for_map.find(op->get_source());
  if (p != waiting_for_map.end()) {
    // preserve ordering
    dout(20) << __func__ << " waiting_for_map "
             << p->first << " not empty, queueing" << dendl;
    p->second.push_back(op);
    op->mark_delayed("waiting_for_map not empty");
    return;
  }
  if (!have_same_or_newer_map(op->min_epoch)) {
    dout(20) << __func__ << " min " << op->min_epoch
             << ", queue on waiting_for_map " << op->get_source() << dendl;
    waiting_for_map[op->get_source()].push_back(op);
    op->mark_delayed("op must wait for map");
    osd->request_osdmap_update(op->min_epoch); // request an osdmap update
    return;
  }

  if (can_discard_request(op)) { // if the request can be discarded, return
    return;
  }

  // pg-wide backoffs
  const Message *m = op->get_req();
  int msg_type = m->get_type();
  if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
    SessionRef session{static_cast<Session*>(m->get_connection()->get_priv().get())};
    if (!session)
      return; // drop it.

    if (msg_type == CEPH_MSG_OSD_OP) {
      if (session->check_backoff(cct, info.pgid,
                                 info.pgid.pgid.get_hobj_start(), m)) {
        return;
      }

      bool backoff =
        is_down() ||
        is_incomplete() ||
        (!is_active() && is_peered());
      if (g_conf()->osd_backoff_on_peering && !backoff) {
        if (is_peering()) {
          backoff = true;
        }
      }
      if (backoff) {
        add_pg_backoff(session); // back off; the client will retry later
        return;
      }
    }
    // pg backoff acks at pg-level
    if (msg_type == CEPH_MSG_OSD_BACKOFF) {
      const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
      if (ba->begin != ba->end) {
        handle_backoff(op); // handle the PG-level backoff ack
        return;
      }
    }
  }

  if (!is_peered()) {
    // Delay unless PGBackend says it's ok
    if (pgbackend->can_handle_while_inactive(op)) {
      bool handled = pgbackend->handle_message(op);
      ceph_assert(handled);
      return;
    } else {
      waiting_for_peered.push_back(op);
      op->mark_delayed("waiting for peered");
      return;
    }
  }

  if (flushes_in_progress > 0) {
    dout(20) << flushes_in_progress
             << " flushes_in_progress pending "
             << "waiting for flush on " << op << dendl;
    waiting_for_flush.push_back(op); // the PG is flushing: park the op until it becomes usable again
    op->mark_delayed("waiting for flush");
    return;
  }

  ceph_assert(is_peered() && flushes_in_progress == 0);
  if (pgbackend->handle_message(op))
    return;

  switch (msg_type) {
  case CEPH_MSG_OSD_OP:
  case CEPH_MSG_OSD_BACKOFF:
    if (!is_active()) {
      dout(20) << " peered, not active, waiting for active on " << op << dendl;
      waiting_for_active.push_back(op);
      op->mark_delayed("waiting for active");
      return;
    }
    switch (msg_type) {
    case CEPH_MSG_OSD_OP:
      // verify client features
      if ((pool.info.has_tiers() || pool.info.is_tier()) &&
          !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
        osd->reply_op_error(op, -EOPNOTSUPP);
        return;
      }
      do_op(op); // handle the op: PrimaryLogPG::do_op(OpRequestRef& op) in PrimaryLogPG.cc
      break;
    case CEPH_MSG_OSD_BACKOFF:
      // object-level backoff acks handled in osdop context
      handle_backoff(op);
      break;
    }
    break;

  case MSG_OSD_PG_SCAN:
    do_scan(op, handle);
    break;

  case MSG_OSD_PG_BACKFILL:
    do_backfill(op);
    break;

  case MSG_OSD_PG_BACKFILL_REMOVE:
    do_backfill_remove(op);
    break;

  case MSG_OSD_SCRUB_RESERVE:
    {
      const MOSDScrubReserve *m =
        static_cast<const MOSDScrubReserve*>(op->get_req());
      switch (m->type) {
      case MOSDScrubReserve::REQUEST:
        handle_scrub_reserve_request(op);
        break;
      case MOSDScrubReserve::GRANT:
        handle_scrub_reserve_grant(op, m->from);
        break;
      case MOSDScrubReserve::REJECT:
        handle_scrub_reserve_reject(op, m->from);
        break;
      case MOSDScrubReserve::RELEASE:
        handle_scrub_reserve_release(op);
        break;
      }
    }
    break;

  case MSG_OSD_REP_SCRUB:
    replica_scrub(op, handle);
    break;

  case MSG_OSD_REP_SCRUBMAP:
    do_replica_scrub_map(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING:
    do_update_log_missing(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
    do_update_log_missing_reply(op);
    break;

  default:
    ceph_abort_msg("bad message type in do_request");
  }
}
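The first gate above deserves a closer look: an op carries the minimum map epoch it needs, and the PG refuses to run it until it has a map at least that new, queueing the op per source to preserve ordering. The same pattern in isolation (EpochGateSketch is a hypothetical stand-in; only waiting_for_map and request_osdmap_update mirror real names):

#include <cstdint>
#include <list>
#include <map>

using epoch_t = uint32_t;

// Sketch of the epoch gate at the top of do_request(). Ops from one
// source must execute in order, so once one op of a source is parked
// waiting for a newer OSDMap, every later op from that source queues
// up behind it.
struct EpochGateSketch {
  epoch_t my_epoch = 0;
  std::map<int /*source*/, std::list<uint64_t /*op*/>> waiting_for_map;

  bool admit(int source, epoch_t op_min_epoch, uint64_t op) {
    auto p = waiting_for_map.find(source);
    if (p != waiting_for_map.end()) { // preserve per-source ordering
      p->second.push_back(op);
      return false;
    }
    if (my_epoch < op_min_epoch) {    // our map is too old for this op
      waiting_for_map[source].push_back(op);
      // the real code also calls osd->request_osdmap_update(op->min_epoch)
      return false;
    }
    return true;                      // safe to process now
  }
};

For CEPH_MSG_OSD_OP, the interesting branch is do_op():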
/** do_op - do an op
 * pg lock will be held (if multithreaded)
 * osd_lock NOT held.
 */
void PrimaryLogPG::do_op(OpRequestRef& op)
{
  FUNCTRACE(cct);
  // NOTE: take a non-const pointer here; we must be careful not to
  // change anything that will break other reads on m (operator<<).
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  if (m->finish_decode()) { // finish lazy decoding of the message payload
    op->reset_desc(); // for TrackedOp
    m->clear_payload();
  }

  dout(20) << __func__ << ": op " << *m << dendl;

  hobject_t head = m->get_hobj();
  head.snap = CEPH_NOSNAP;

  if (!info.pgid.pgid.contains(
        info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) { // ?? is the PG mid-split?
    derr << __func__ << " " << info.pgid.pgid << " does not contain "
         << head << " pg_num " << pool.info.get_pg_num() << " hash "
         << std::hex << head.get_hash() << std::dec << dendl;
    osd->clog->warn() << info.pgid.pgid << " does not contain " << head
                      << " op " << *m;
    ceph_assert(!cct->_conf->osd_debug_misdirected_ops);
    return;
  }

  bool can_backoff =
    m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
  SessionRef session;
  if (can_backoff) {
    session = static_cast<Session*>(m->get_connection()->get_priv().get());
    if (!session.get()) {
      dout(10) << __func__ << " no session" << dendl;
      return;
    }

    if (session->check_backoff(cct, info.pgid, head, m)) {
      return;
    }
  }

  if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
    // not implemented.
    dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
    osd->reply_op_error(op, -EINVAL); // parallel execution is unsupported: fail immediately
    return;
  }

  if (op->rmw_flags == 0) {
    int r = osd->osd->init_op_flags(op);
    if (r) {
      osd->reply_op_error(op, r);
      return;
    }
  }

  if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
                         CEPH_OSD_FLAG_LOCALIZE_READS)) &&
      op->may_read() &&
      !(op->may_write() || op->may_cache())) { // a replica is allowed to serve this read
    // balanced reads; any replica will do
    if (!(is_primary() || is_replica())) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  } else {
    // normal case; must be primary
    if (!is_primary()) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  }

  if (!op_has_sufficient_caps(op)) { // insufficient caps: fail with EPERM
    osd->reply_op_error(op, -EPERM);
    return;
  }

  if (op->includes_pg_op()) { // the request carries PG-level ops (CEPH_OSD_RMW_FLAG_PGOP)
    return do_pg_op(op); // void PrimaryLogPG::do_pg_op(OpRequestRef op)
  }

  // object name too long?
  if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) { // name exceeds osd_max_object_name_len: return
    dout(4) << "do_op name is longer than "
            << cct->_conf->osd_max_object_name_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) { // locator key exceeds osd_max_object_name_len: return
    dout(4) << "do_op locator is longer than "
            << cct->_conf->osd_max_object_name_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) { // namespace exceeds osd_max_object_namespace_len: return
    dout(4) << "do_op namespace is longer than "
            << cct->_conf->osd_max_object_namespace_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }

  if (int r = osd->store->validate_hobject_key(head)) { // is the object's head valid for the backing store?
    dout(4) << "do_op object " << head << " invalid for backing store: "
            << r << dendl;
    osd->reply_op_error(op, r);
    return;
  }

  // blacklisted?
  if (get_osdmap()->is_blacklisted(m->get_source_addr())) { // is the requester's address in the OSDMap blacklist?
    dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
    osd->reply_op_error(op, -EBLACKLISTED);
    return;
  }

  // order this op as a write?
  bool write_ordered = op->rwordered();

  // discard due to cluster full transition? (we discard any op that
  // originates before the cluster or pool is marked full; the client
  // will resend after the full flag is removed or if they expect the
  // op to succeed despite being full). The except is FULL_FORCE and
  // FULL_TRY ops, which there is no reason to discard because they
  // bypass all full checks anyway. If this op isn't write or
  // read-ordered, we skip.
  // FIXME: we exclude mds writes for now.
  if (write_ordered && !(m->get_source().is_mds() ||
                         m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
                         m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
      info.history.last_epoch_marked_full > m->get_map_epoch()) {
    dout(10) << __func__ << " discarding op sent before full " << m << " "
             << *m << dendl;
    return;
  }
  // mds should have stopped writing before this point.
  // We can't allow OSD to become non-startable even if mds
  // could be writing as part of file removals.
  if (write_ordered && osd->check_failsafe_full(get_dpp()) &&
      !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
    dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl;
    return;
  }
  int64_t poolid = get_pgid().pool();
  if (op->may_write()) { // write path

    const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid); // look up the pool; if it is gone, just return -- without any reply to the client?
    if (!pi) {
      return;
    }

    // invalid?
    if (m->get_snapid() != CEPH_NOSNAP) {
      dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
      osd->reply_op_error(op, -EINVAL);
      return;
    }

    // too big?
    if (cct->_conf->osd_max_write_size &&
        m->get_data_len() > cct->_conf->osd_max_write_size << 20) { // payload larger than osd_max_write_size (MB)
      // journal can't hold commit!
      derr << "do_op msg data len " << m->get_data_len()
           << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
           << " on " << *m << dendl;
      osd->reply_op_error(op, -OSD_WRITETOOBIG);
      return;
    }
  }

  dout(10) << "do_op " << *m
           << (op->may_write() ? " may_write" : "")
           << (op->may_read() ? " may_read" : "")
           << (op->may_cache() ? " may_cache" : "")
           << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
           << " flags " << ceph_osd_flag_string(m->get_flags())
           << dendl;

  // missing object?
  if (is_unreadable_object(head)) { // the head object is currently unreadable (e.g. missing)
    if (!is_primary()) { // not the primary: reply with an error
      osd->reply_op_error(op, -EAGAIN);
      return;
    }
    if (can_backoff &&
        (g_conf()->osd_backoff_on_degraded ||
         (g_conf()->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
      add_backoff(session, head, head);
      maybe_kick_recovery(head); // backoff added above; try to kick off recovery of the object
    } else {
      wait_for_unreadable_object(head, op);
    }
    return;
  }

  if (write_ordered) {
    // degraded object?
    if (is_degraded_or_backfilling_object(head)) {
      if (can_backoff && g_conf()->osd_backoff_on_degraded) {
        add_backoff(session, head, head);
        maybe_kick_recovery(head);
      } else {
        wait_for_degraded_object(head, op);
      }
      return;
    }

    if (scrubber.is_chunky_scrub_active() && write_blocked_by_scrub(head)) {
      dout(20) << __func__ << ": waiting for scrub" << dendl;
      waiting_for_scrub.push_back(op);
      op->mark_delayed("waiting for scrub");
      return;
    }

    // blocked on snap?
    // if head is in objects_blocked_on_degraded_snap, park the op via wait_for_degraded_object
    if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
        blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
      hobject_t to_wait_on(head);
      to_wait_on.snap = blocked_iter->second;
      wait_for_degraded_object(to_wait_on, op);
      return;
    }

    // if head is in objects_blocked_on_snap_promotion, park the op via wait_for_blocked_object
    if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head);
        blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) {
      wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op);
      return;
    }

    // if head is in objects_blocked_on_cache_full, block the write until the cache is no longer full
    if (objects_blocked_on_cache_full.count(head)) {
      block_write_on_full_cache(head, op);
      return;
    }
  }

  // dup/resent?
  if (op->may_write() || op->may_cache()) {
    // warning: we will get back *a* request for this reqid, but not
    // necessarily the most recent. this happens with flush and
    // promote ops, but we can't possible have both in our log where
    // the original request is still not stable on disk, so for our
    // purposes here it doesn't matter which one we get.
    eversion_t version;
    version_t user_version;
    int return_code = 0;
    bool got = check_in_progress_op(
      m->get_reqid(), &version, &user_version, &return_code);
    if (got) {
      dout(3) << __func__ << " dup " << m->get_reqid()
              << " version " << version << dendl;
      if (already_complete(version)) {
        osd->reply_op_error(op, return_code, version, user_version);
      } else {
        dout(10) << " waiting for " << version << " to commit" << dendl;
        // always queue ondisk waiters, so that we can requeue if needed
        waiting_for_ondisk[version].emplace_back(op, user_version, return_code);
        op->mark_delayed("waiting for ondisk");
      }
      return;
    }
  }

  ObjectContextRef obc;
  bool can_create = op->may_write();
  hobject_t missing_oid;

  // kludge around the fact that LIST_SNAPS sets CEPH_SNAPDIR for LIST_SNAPS
  hobject_t _oid_head;
  if (m->get_snapid() == CEPH_SNAPDIR) {
    _oid_head = m->get_hobj().get_head();
  }
  const hobject_t& oid =
    m->get_snapid() == CEPH_SNAPDIR ? _oid_head : m->get_hobj();

  // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
  for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
    OSDOp& osd_op = *p;

    if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) {
      if (m->get_snapid() != CEPH_SNAPDIR) {
        dout(10) << "LIST_SNAPS with incorrect context" << dendl;
        osd->reply_op_error(op, -EINVAL);
        return;
      }
    } else {
      if (m->get_snapid() == CEPH_SNAPDIR) {
        dout(10) << "non-LIST_SNAPS on snapdir" << dendl;
        osd->reply_op_error(op, -EINVAL);
        return;
      }
    }
  }

  // io blocked on obc?
  if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
      maybe_await_blocked_head(oid, op)) {
    return;
  }

  int r = find_object_context(
    oid, &obc, can_create,
    m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
    &missing_oid);

  // LIST_SNAPS needs the ssc too
  if (obc &&
      m->get_snapid() == CEPH_SNAPDIR &&
      !obc->ssc) {
    obc->ssc = get_snapset_context(oid, true);
  }

  if (r == -EAGAIN) {
    // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
    // we have to wait for the object.
    if (is_primary()) {
      // missing the specific snap we need; requeue and wait.
      ceph_assert(!op->may_write()); // only happens on a read/cache
      wait_for_unreadable_object(missing_oid, op);
      return;
    }
  } else if (r == 0) {
    if (is_unreadable_object(obc->obs.oi.soid)) {
      dout(10) << __func__ << ": clone " << obc->obs.oi.soid
               << " is unreadable, waiting" << dendl;
      wait_for_unreadable_object(obc->obs.oi.soid, op);
      return;
    }

    // degraded object? (the check above was for head; this could be a clone)
    if (write_ordered &&
        obc->obs.oi.soid.snap != CEPH_NOSNAP &&
        is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
      dout(10) << __func__ << ": clone " << obc->obs.oi.soid
               << " is degraded, waiting" << dendl;
      wait_for_degraded_object(obc->obs.oi.soid, op);
      return;
    }
  }

  bool in_hit_set = false;
  if (hit_set) {
    if (obc.get()) {
      if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
        in_hit_set = true;
    } else {
      if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
        in_hit_set = true;
    }
    if (!op->hitset_inserted) {
      hit_set->insert(oid);
      op->hitset_inserted = true;
      if (hit_set->is_full() ||
          hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
        hit_set_persist();
      }
    }
  }

  if (agent_state) {
    if (agent_choose_mode(false, op))
      return;
  }

  if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
    if (maybe_handle_manifest(op,
                              write_ordered,
                              obc))
      return;
  }

  if (maybe_handle_cache(op,
                         write_ordered,
                         obc,
                         r,
                         missing_oid,
                         false,
                         in_hit_set))
    return;

  if (r && (r != -ENOENT || !obc)) {
    // copy the reqids for copy get on ENOENT
    if (r == -ENOENT &&
        (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
      fill_in_copy_get_noent(op, oid, m->ops[0]);
      return;
    }
    dout(20) << __func__ << ": find_object_context got error " << r << dendl;
    if (op->may_write() &&
        get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
      record_write_error(op, oid, nullptr, r);
    } else {
      osd->reply_op_error(op, r);
    }
    return;
  }

  // make sure locator is consistent
  object_locator_t oloc(obc->obs.oi.soid);
  if (m->get_object_locator() != oloc) {
    dout(10) << " provided locator " << m->get_object_locator()
             << " != object's " << obc->obs.oi.soid << dendl;
    osd->clog->warn() << "bad locator " << m->get_object_locator()
                      << " on object " << oloc
                      << " op " << *m;
  }

  // io blocked on obc?
  if (obc->is_blocked() &&
      !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
    wait_for_blocked_object(obc->obs.oi.soid, op);
    return;
  }

  dout(25) << __func__ << " oi " << obc->obs.oi << dendl;

  // create an OpContext: it takes over all the ops carried by the message
  // (the per-object operations the client split its request, e.g. an rbd
  // request, into)
  OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);

  if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
    dout(20) << __func__ << ": skipping rw locks" << dendl;
  } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
    dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;

    // verify there is in fact a flush in progress
    // FIXME: we could make this a stronger test.
    map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
    if (p == flush_ops.end()) {
      dout(10) << __func__ << " no flush in progress, aborting" << dendl;
      reply_ctx(ctx, -EINVAL);
      return;
    }
  } else if (!get_rw_locks(write_ordered, ctx)) {
    dout(20) << __func__ << " waiting for rw locks " << dendl;
    op->mark_delayed("waiting for rw locks");
    close_op_ctx(ctx);
    return;
  }
  dout(20) << __func__ << " obc " << *obc << dendl;

  if (r) {
    dout(20) << __func__ << " returned an error: " << r << dendl;
    close_op_ctx(ctx);
    if (op->may_write() &&
        get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
      record_write_error(op, oid, nullptr, r);
    } else {
      osd->reply_op_error(op, r);
    }
    return;
  }

  if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
    ctx->ignore_cache = true;
  }

  if ((op->may_read()) && (obc->obs.oi.is_lost())) {
    // This object is lost. Reading from it returns an error.
    dout(20) << __func__ << ": object " << obc->obs.oi.soid
             << " is lost" << dendl;
    reply_ctx(ctx, -ENFILE);
    return;
  }
  if (!op->may_write() &&
      !op->may_cache() &&
      (!obc->obs.exists ||
       ((m->get_snapid() != CEPH_SNAPDIR) &&
        obc->obs.oi.is_whiteout()))) {
    // copy the reqids for copy get on ENOENT
    if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
      fill_in_copy_get_noent(op, oid, m->ops[0]);
      close_op_ctx(ctx);
      return;
    }
    reply_ctx(ctx, -ENOENT);
    return;
  }

  op->mark_started();

  execute_ctx(ctx);
  utime_t prepare_latency = ceph_clock_now();
  prepare_latency -= op->get_dequeued_time();
  osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
  if (op->may_read() && op->may_write()) {
    osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
  } else if (op->may_read()) {
    osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
  } else if (op->may_write() || op->may_cache()) {
    osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
  }

  // force recovery of the oldest missing object if too many logs
  maybe_force_recovery();
}
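Stripped of details, do_op() is a long series of gates, each of which either drops, rejects, or parks the op; only an op that passes them all gets an OpContext and reaches execute_ctx(). As a compressed outline of the code above (comments paraphrase it; this skeleton is not compilable against the real Ceph headers):

// Compressed control flow of PrimaryLogPG::do_op().
void do_op_outline()
{
  // 1. sanity gates: PG contains the object; session backoff; caps;
  //    name/locator/namespace length limits; blacklist; cluster-full
  //    and failsafe-full checks -> drop or reply with an error
  // 2. availability gates: object unreadable/degraded, blocked by
  //    scrub, degraded snap, snap promotion, or a full cache tier
  //    -> park the op on the matching waiting list
  // 3. dup detection: reqid already known -> re-reply if committed,
  //    else wait on waiting_for_ondisk
  // 4. find_object_context() -> obc; hit_set/agent bookkeeping;
  //    manifest and cache-tier redirection (maybe_handle_cache)
  // 5. build the OpContext and take rw locks
  //    (or park on "waiting for rw locks")
  // 6. execute_ctx(ctx): prepare the transaction and submit it
}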