转载:OSD接收IO流程

转载链接:

https://www.cnblogs.com/yi-mu-xi/p/10282678.html

 

消息经(pipe.cc)pipe->reader() 处理后,若 ms_can_fast_dispatch() 成立,就调用 fast_dispatch()(DispatchQueue.cc);

否则 in_q->enqueue()进入队列。

 

fast_dispatch()

---ms_fast_dispatch()【OSD.cc】

    将message转化为OpRequestRef op,后续直接对这个op进行处理

复制代码
  1 //zym Fast-dispatch entry point for messages sent to this OSD: a few message types are handled directly in the switch below, everything else is wrapped in an OpRequest and queued.
  2 void OSD::ms_fast_dispatch(Message *m)
  3 {
  4   FUNCTRACE(cct);
  5   //If the OSD service is stopping, drop one reference on the message and bail out (original note: the message is freed once its refcount reaches 0).
  6   if (service.is_stopping()) {
  7     m->put();
  8     return;
  9   }
 10 
 11   // peering event?
 12   switch (m->get_type()) {
 13   case CEPH_MSG_PING:
 14     dout(10) << "ping from " << m->get_source() << dendl;
 15     m->put();
 16     return;
 17   case MSG_MON_COMMAND:
 18     handle_command(static_cast<MMonCommand*>(m));
 19     return;
 20   case MSG_OSD_FORCE_RECOVERY:
 21     handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
 22     return;
 23   case MSG_OSD_SCRUB2:
 24     handle_fast_scrub(static_cast<MOSDScrub2*>(m));
 25     return;
 26 
 27   case MSG_OSD_PG_CREATE2:
 28     return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
 29   case MSG_OSD_PG_QUERY:
 30     return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
 31   case MSG_OSD_PG_NOTIFY:
 32     return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
 33   case MSG_OSD_PG_INFO:
 34     return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
 35   case MSG_OSD_PG_REMOVE:
 36     return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
 37 
 38     // these are single-pg messages that handle themselves
 39   case MSG_OSD_PG_LOG:
 40   case MSG_OSD_PG_TRIM:
 41   case MSG_OSD_BACKFILL_RESERVE:
 42   case MSG_OSD_RECOVERY_RESERVE:
 43     {
 44       MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
 45       if (require_osd_peer(pm)) {
 46     enqueue_peering_evt(
 47       pm->get_spg(),
 48       PGPeeringEventRef(pm->get_event()));
 49       }
 50       pm->put();
 51       return;
 52     }
 53   }
 54   
 55   //Wrap the Message in an OpRequest, referenced through the smart pointer op.
 56   //(original note: op's type is  typedef boost::intrusive_ptr<OpRequest> Ref)
 57   OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);//original note: declared as  OpTracker op_tracker;
 58   {
 59 #ifdef WITH_LTTNG
 60     osd_reqid_t reqid = op->get_reqid();
 61 #endif
 62     tracepoint(osd, ms_fast_dispatch, reqid.name._type,
 63         reqid.name._num, reqid.tid, reqid.inc);
 64   }//open question from the original notes: what is this tracepoint for?
 65 
 66   if (m->trace)
 67     op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
 68 
 69   // note sender epoch, min req's epoch
 70   //Read the epochs from the message. (Open questions from the original notes: what is an epoch? which classes derive from m's type?)
 71   op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
 72   op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
 73   ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
 74 
 75 //original note: delayed execution (the helper's name suggests it may inject a dispatch delay; its body is not shown here)
 76   service.maybe_inject_dispatch_delay();
 77 
 78 //If the connection has CEPH_FEATUREMASK_RESEND_ON_SPLIT, or the message is not a CEPH_MSG_OSD_OP, queue the op directly.
 79   if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
 80       m->get_type() != CEPH_MSG_OSD_OP) {
 81     // queue it directly
 82     enqueue_op(
 83       static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
 84       std::move(op),
 85       static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
 86   } else {
 87     // legacy client, and this is an MOSDOp (the *only* fast dispatch
 88     // message that didn't have an explicit spg_t); we need to map
 89     // them to an spg_t while preserving delivery order.
 90     auto priv = m->get_connection()->get_priv();
 91     if (auto session = static_cast<Session*>(priv.get()); session) {
 92       std::lock_guard l{session->session_dispatch_lock};
 93       op->get();//take an extra reference before linking *op into waiting_on_map
 94       session->waiting_on_map.push_back(*op);
 95       OSDMapRef nextmap = service.get_nextmap_reserved();
 96       dispatch_session_waiting(session, nextmap);
 97       service.release_map(nextmap);//release the osdmap reserved by get_nextmap_reserved() above
 98     }
 99   }
100   OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false); 
101 }
复制代码

在enqueue_op()中将op加入OSDService->op_wq队列。该队列是由OSD->ShardedOpWQ队列初始化的,实际上是保存到了OSD->ShardedOpWQ,然后保存到ShardData中等待被处理,然后唤醒处理这个队列的线程,线程处理函数为OSD::ShardedOpWQ::_process()。

复制代码
 1 void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)  // Wraps op in a PGOpItem for pg and queues it on op_shardedwq.
 2 {
 3   const utime_t stamp = op->get_req()->get_recv_stamp();  // receive stamp of the underlying message
 4   const utime_t latency = ceph_clock_now() - stamp;  // time elapsed since that receive stamp
 5   const unsigned priority = op->get_req()->get_priority();
 6   const int cost = op->get_req()->get_cost();
 7   const uint64_t owner = op->get_req()->get_source().num();
 8 
 9   dout(15) << "enqueue_op " << op << " prio " << priority
10        << " cost " << cost
11        << " latency " << latency
12        << " epoch " << epoch
13        << " " << *(op->get_req()) << dendl;
14   op->osd_trace.event("enqueue op");
15   op->osd_trace.keyval("priority", priority);
16   op->osd_trace.keyval("cost", cost);
17   op->mark_queued_for_pg();
18   logger->tinc(l_osd_op_before_queue_op_lat, latency);
19   op_shardedwq.queue(   // original note: ends up in sdata->scheduler->enqueue(std::move(item)) -- not shown here, verify
20     OpQueueItem(
21       unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
22       cost, priority, stamp, owner, epoch));
23 }
复制代码

 处理函数void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)

复制代码
 1 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
 2 {
 3 ······
 4 OpQueueItem item = sdata->pqueue->dequeue();//dequeue the next item
 5 
 6 ·······
 7 
 8 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
 9                  suicide_interval);
10 
11   // take next item
12   auto qi = std::move(slot->to_process.front());       //original note: deque<OpQueueItem> to_process;  class OpQueueItem{} lives in OpQueueItem.h under OSD
13 ············ 14 qi.run(osd, sdata, pg, tp_handle);//run the item, e.g. PGOpItem::run 15 16 ········· 17 18 }
复制代码
复制代码
 1 void PGOpItem::run(  // Runs a queued op: hands it to OSD::dequeue_op, then unlocks the pg.
 2   OSD *osd,
 3   OSDShard *sdata,
 4   PGRef& pg,
 5   ThreadPool::TPHandle &handle)
 6 {
 7   osd->dequeue_op(pg, op, handle);
 8   pg->unlock();  // NOTE(review): presumably the caller passed pg in locked -- confirm against _process
 9 }
10 
11 void PGPeeringItem::run(  // Runs a queued peering event by forwarding it to OSD::dequeue_peering_evt.
12   OSD *osd,
13   OSDShard *sdata,
14   PGRef& pg,
15   ThreadPool::TPHandle &handle)
16 {
17   osd->dequeue_peering_evt(sdata, pg.get(), evt, handle);
18 }
19 
20 void PGSnapTrim::run()
21 
22 void PGScrub::run()
23 
24 void PGRecovery::run()
25 
26 void PGRecoveryContext::run()
27 
28 void PGDelete::run()
复制代码

 osd->dequeue_op() 中调用pg->do_request()     // 处理请求  PrimaryLogPG::do_request 【在PrimaryLogPG.cc中】

根据不同的消息类型对op进行处理

复制代码
  1 void PrimaryLogPG::do_request(  // Per-PG entry point for a dequeued op: may delay or drop it, otherwise dispatches on the message type.
  2   OpRequestRef& op,
  3   ThreadPool::TPHandle &handle)
  4 {
  5   if (op->osd_trace) {
  6     op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
  7     op->pg_trace.event("do request");
  8   }
  9   // make sure we have a new enough map
 10   auto p = waiting_for_map.find(op->get_source());
 11   if (p != waiting_for_map.end()) {
 12     // preserve ordering
 13     dout(20) << __func__ << " waiting_for_map "
 14          << p->first << " not empty, queueing" << dendl;
 15     p->second.push_back(op);
 16     op->mark_delayed("waiting_for_map not empty");
 17     return;
 18   }
 19   if (!have_same_or_newer_map(op->min_epoch)) {
 20     dout(20) << __func__ << " min " << op->min_epoch
 21          << ", queue on waiting_for_map " << op->get_source() << dendl;
 22     waiting_for_map[op->get_source()].push_back(op);
 23     op->mark_delayed("op must wait for map");
 24     osd->request_osdmap_update(op->min_epoch);   //ask for an osdmap update up to min_epoch
 25     return;
 26   }
 27 
 28   if (can_discard_request(op)) {//request can be discarded: drop it
 29     return;
 30   }
 31 
 32   // pg-wide backoffs
 33   const Message *m = op->get_req();
 34   int msg_type = m->get_type();
 35   if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
 36     SessionRef session{static_cast<Session*>(m->get_connection()->get_priv().get())};
 37     if (!session)
 38       return;  // drop it.
 39 
 40     if (msg_type == CEPH_MSG_OSD_OP) {
 41       if (session->check_backoff(cct, info.pgid,
 42                  info.pgid.pgid.get_hobj_start(), m)) {
 43     return;
 44       }
 45 
 46       bool backoff =
 47     is_down() ||
 48     is_incomplete() ||
 49     (!is_active() && is_peered());
 50       if (g_conf()->osd_backoff_on_peering && !backoff) {
 51     if (is_peering()) {
 52       backoff = true;
 53     }
 54       }
 55       if (backoff) {
 56     add_pg_backoff(session);//add a pg backoff for this session (original note: handle it later)
 57     return;
 58       }
 59     }
 60     // pg backoff acks at pg-level
 61     if (msg_type == CEPH_MSG_OSD_BACKOFF) {
 62       const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
 63       if (ba->begin != ba->end) {
 64     handle_backoff(op);    //original note: handle the pg-level backoff
 65     return;
 66       }
 67     }
 68   }
 69 
 70   if (!is_peered()) {
 71     // Delay unless PGBackend says it's ok
 72     if (pgbackend->can_handle_while_inactive(op)) {
 73       bool handled = pgbackend->handle_message(op);
 74       ceph_assert(handled);
 75       return;
 76     } else {
 77       waiting_for_peered.push_back(op);
 78       op->mark_delayed("waiting for peered");
 79       return;
 80     }
 81   }
 82 
 83   if (flushes_in_progress > 0) {
 84     dout(20) << flushes_in_progress
 85          << " flushes_in_progress pending "
 86          << "waiting for flush on " << op << dendl;
 87     waiting_for_flush.push_back(op);//flushes are in progress: park the op on waiting_for_flush (original note: until the pg becomes usable again)
 88     op->mark_delayed("waiting for flush");
 89     return;
 90   }
 91 
 92   ceph_assert(is_peered() && flushes_in_progress == 0);
 93   if (pgbackend->handle_message(op))
 94     return;
 95 
 96   switch (msg_type) {
 97   case CEPH_MSG_OSD_OP:
 98   case CEPH_MSG_OSD_BACKOFF:
 99     if (!is_active()) {
100       dout(20) << " peered, not active, waiting for active on " << op << dendl;
101       waiting_for_active.push_back(op);
102       op->mark_delayed("waiting for active");
103       return;
104     }
105     switch (msg_type) {
106     case CEPH_MSG_OSD_OP:
107       // verify client features
108       if ((pool.info.has_tiers() || pool.info.is_tier()) &&
109       !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
110     osd->reply_op_error(op, -EOPNOTSUPP);
111     return;
112       }
113       do_op(op);  //handle the op: PrimaryLogPG::do_op(OpRequestRef& op) [in PrimaryLogPG.cc]
114       break;
115     case CEPH_MSG_OSD_BACKOFF:
116       // object-level backoff acks handled in osdop context
117       handle_backoff(op);
118       break;
119     }
120     break;
121 
122   case MSG_OSD_PG_SCAN:
123     do_scan(op, handle);
124     break;
125 
126   case MSG_OSD_PG_BACKFILL:
127     do_backfill(op);
128     break;
129 
130   case MSG_OSD_PG_BACKFILL_REMOVE:
131     do_backfill_remove(op);
132     break;
133 
134   case MSG_OSD_SCRUB_RESERVE:
135     {
136       const MOSDScrubReserve *m =
137     static_cast<const MOSDScrubReserve*>(op->get_req());
138       switch (m->type) {
139       case MOSDScrubReserve::REQUEST:
140     handle_scrub_reserve_request(op);
141     break;
142       case MOSDScrubReserve::GRANT:
143     handle_scrub_reserve_grant(op, m->from);
144     break;
145       case MOSDScrubReserve::REJECT:
146     handle_scrub_reserve_reject(op, m->from);
147     break;
148       case MOSDScrubReserve::RELEASE:
149     handle_scrub_reserve_release(op);
150     break;
151       }
152     }
153     break;
154 
155   case MSG_OSD_REP_SCRUB:
156     replica_scrub(op, handle);
157     break;
158 
159   case MSG_OSD_REP_SCRUBMAP:
160     do_replica_scrub_map(op);
161     break;
162 
163   case MSG_OSD_PG_UPDATE_LOG_MISSING:
164     do_update_log_missing(op);
165     break;
166 
167   case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
168     do_update_log_missing_reply(op);
169     break;
170 
171   default:
172     ceph_abort_msg("bad message type in do_request");
173   }
174 }
复制代码

 

复制代码
  1 /** do_op - do an op: validate a CEPH_MSG_OSD_OP against this pg, then build an OpContext and execute it
  2  * pg lock will be held (if multithreaded)
  3  * osd_lock NOT held.
  4  */
  5 void PrimaryLogPG::do_op(OpRequestRef& op)
  6 {
  7   FUNCTRACE(cct);
  8   // NOTE: take a non-const pointer here; we must be careful not to
  9   // change anything that will break other reads on m (operator<<).
 10   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
 11   ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
 12   if (m->finish_decode()) {  //open question from the original notes: what exactly gets decoded here?
 13     op->reset_desc();   // for TrackedOp
 14     m->clear_payload();
 15   }
 16 
 17   dout(20) << __func__ << ": op " << *m << dendl;
 18 
 19   hobject_t head = m->get_hobj();
 20   head.snap = CEPH_NOSNAP;
 21 
 22   if (!info.pgid.pgid.contains(
 23     info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {//head does not map to this pg (original note guesses the PG is splitting -- unconfirmed)
 24     derr << __func__ << " " << info.pgid.pgid << " does not contain "
 25      << head << " pg_num " << pool.info.get_pg_num() << " hash "
 26      << std::hex << head.get_hash() << std::dec << dendl;
 27     osd->clog->warn() << info.pgid.pgid << " does not contain " << head
 28               << " op " << *m;
 29     ceph_assert(!cct->_conf->osd_debug_misdirected_ops);
 30     return;
 31   }
 32 
 33   bool can_backoff =
 34     m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
 35   SessionRef session;
 36   if (can_backoff) {
 37     session = static_cast<Session*>(m->get_connection()->get_priv().get());
 38     if (!session.get()) {
 39       dout(10) << __func__ << " no session" << dendl;
 40       return;
 41     }
 42 
 43     if (session->check_backoff(cct, info.pgid, head, m)) {
 44       return;
 45     }
 46   }
 47 
 48   if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
 49     // not implemented.
 50     dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
 51     osd->reply_op_error(op, -EINVAL);  //ops carrying the PARALLELEXEC flag are rejected with -EINVAL
 52     return;
 53   }
 54 
 55   if (op->rmw_flags == 0) {
 56     int r = osd->osd->init_op_flags(op);
 57     if (r) {
 58       osd->reply_op_error(op, r);
 59       return;
 60     }
 61   }
 62 
 63   if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
 64              CEPH_OSD_FLAG_LOCALIZE_READS)) &&
 65       op->may_read() &&
 66       !(op->may_write() || op->may_cache())) {//read-only op with BALANCE_READS/LOCALIZE_READS: the primary or a replica may serve it
 67     // balanced reads; any replica will do
 68     if (!(is_primary() || is_replica())) {
 69       osd->handle_misdirected_op(this, op);
 70       return;
 71     }
 72   } else {
 73     // normal case; must be primary
 74     if (!is_primary()) {
 75       osd->handle_misdirected_op(this, op);
 76       return;
 77     }
 78   }
 79 
 80   if (!op_has_sufficient_caps(op)) {  //insufficient caps: reply -EPERM
 81     osd->reply_op_error(op, -EPERM);
 82     return;
 83   }
 84 
 85   if (op->includes_pg_op()) {//op is an OpRequest; request includes a PG-level operation (original note cites a flag spelled CEPH_OSD_RMW_FLAG_PGOD -- presumably ..._PGOP, verify)
 86     return do_pg_op(op);//original note: void PrimaryLogPG::do_pg_op(OpRequestRef op)
 87   }
 88 
 89   // object name too long?
 90   if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {//object name longer than osd_max_object_name_len: reply -ENAMETOOLONG
 91     dout(4) << "do_op name is longer than "
 92         << cct->_conf->osd_max_object_name_len
 93         << " bytes" << dendl;
 94     osd->reply_op_error(op, -ENAMETOOLONG);
 95     return;
 96   }
 97   if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {//locator key longer than osd_max_object_name_len: reply -ENAMETOOLONG
 98     dout(4) << "do_op locator is longer than "
 99         << cct->_conf->osd_max_object_name_len
100         << " bytes" << dendl;
101     osd->reply_op_error(op, -ENAMETOOLONG);
102     return;
103   }
104   if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {//namespace longer than osd_max_object_namespace_len: reply -ENAMETOOLONG
105     dout(4) << "do_op namespace is longer than "
106         << cct->_conf->osd_max_object_namespace_len
107         << " bytes" << dendl;
108     osd->reply_op_error(op, -ENAMETOOLONG);
109     return;
110   }
111 
112   if (int r = osd->store->validate_hobject_key(head)) {//non-zero r: head is not valid for the backing store
113     dout(4) << "do_op object " << head << " invalid for backing store: "
114         << r << dendl;
115     osd->reply_op_error(op, r);
116     return;
117   }
118 
119   // blacklisted?
120   if (get_osdmap()->is_blacklisted(m->get_source_addr())) {//is the op's source address blacklisted in the osdmap?
121     dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
122     osd->reply_op_error(op, -EBLACKLISTED);
123     return;
124   }
125 
126   // order this op as a write?
127   bool write_ordered = op->rwordered(); //whether this op is to be ordered as a write
128 
129   // discard due to cluster full transition?  (we discard any op that
130   // originates before the cluster or pool is marked full; the client
131   // will resend after the full flag is removed or if they expect the
132   // op to succeed despite being full).  The except is FULL_FORCE and
133   // FULL_TRY ops, which there is no reason to discard because they
134   // bypass all full checks anyway.  If this op isn't write or
135   // read-ordered, we skip.
136   // FIXME: we exclude mds writes for now.
137   if (write_ordered && !(m->get_source().is_mds() ||
138              m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
139              m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
140       info.history.last_epoch_marked_full > m->get_map_epoch()) {
141     dout(10) << __func__ << " discarding op sent before full " << m << " "
142          << *m << dendl;
143     return;
144   }
145   // mds should have stopped writing before this point.
146   // We can't allow OSD to become non-startable even if mds
147   // could be writing as part of file removals.
148   if (write_ordered && osd->check_failsafe_full(get_dpp()) && 
149       !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
150     dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl;
151     return;
152   }
153   int64_t poolid = get_pgid().pool();
154   if (op->may_write()) {      //the op may write
155 
156     const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);//look up the pool; if it is missing we return silently (original note asks: is no reply sent at all?)
157     if (!pi) {
158       return;
159     }
160 
161     // invalid?
162     if (m->get_snapid() != CEPH_NOSNAP) {
163       dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
164       osd->reply_op_error(op, -EINVAL);
165       return;
166     }
167 
168     // too big?
169     if (cct->_conf->osd_max_write_size &&
170         m->get_data_len() > cct->_conf->osd_max_write_size << 20) {//write payload is larger than osd_max_write_size << 20
171       // journal can't hold commit!
172       derr << "do_op msg data len " << m->get_data_len()
173            << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
174            << " on " << *m << dendl;
175       osd->reply_op_error(op, -OSD_WRITETOOBIG);
176       return;
177     }
178   }
179 
180   dout(10) << "do_op " << *m
181        << (op->may_write() ? " may_write" : "")
182        << (op->may_read() ? " may_read" : "")
183        << (op->may_cache() ? " may_cache" : "")
184        << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
185        << " flags " << ceph_osd_flag_string(m->get_flags())
186        << dendl;
187 
188   // missing object?
189   if (is_unreadable_object(head)) {//head is currently unreadable
190     if (!is_primary()) {//not the primary: reply -EAGAIN
191       osd->reply_op_error(op, -EAGAIN);
192       return;
193     }
194     if (can_backoff &&
195     (g_conf()->osd_backoff_on_degraded ||
196      (g_conf()->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
197       add_backoff(session, head, head);
198       maybe_kick_recovery(head);//backoff path: add_backoff() was issued above, now try to kick recovery of head
199     } else {
200       wait_for_unreadable_object(head, op);
201     }
202     return;
203   }
204 
205   if (write_ordered) {
206     // degraded object?
207     if (is_degraded_or_backfilling_object(head)) {
208       if (can_backoff && g_conf()->osd_backoff_on_degraded) {
209         add_backoff(session, head, head);
210         maybe_kick_recovery(head);
211       } else {
212         wait_for_degraded_object(head, op);
213       }
214       return;
215     }
216 
217     if (scrubber.is_chunky_scrub_active() && write_blocked_by_scrub(head)) {
218       dout(20) << __func__ << ": waiting for scrub" << dendl;
219       waiting_for_scrub.push_back(op);
220       op->mark_delayed("waiting for scrub");
221       return;
222     }
223 
224     // blocked on snap?
225     //if head is in objects_blocked_on_degraded_snap, wait on that snap via wait_for_degraded_object()
226     if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
227     blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
228       hobject_t to_wait_on(head);
229       to_wait_on.snap = blocked_iter->second;
230       wait_for_degraded_object(to_wait_on, op);
231       return;
232     }
233     
234     //if head is in objects_blocked_on_snap_promotion, wait via wait_for_blocked_object()
235     if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head);
236     blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) {
237       wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op);
238       return;
239     }
240     
241     //if head is in objects_blocked_on_cache_full, hand the op to block_write_on_full_cache()
242     if (objects_blocked_on_cache_full.count(head)) {
243       block_write_on_full_cache(head, op);
244       return;
245     }
246   }
247 
248   // dup/resent?
249   if (op->may_write() || op->may_cache()) {
250     // warning: we will get back *a* request for this reqid, but not
251     // necessarily the most recent.  this happens with flush and
252     // promote ops, but we can't possible have both in our log where
253     // the original request is still not stable on disk, so for our
254     // purposes here it doesn't matter which one we get.
255     eversion_t version;
256     version_t user_version;
257     int return_code = 0;
258     bool got = check_in_progress_op(
259       m->get_reqid(), &version, &user_version, &return_code);
260     if (got) {
261       dout(3) << __func__ << " dup " << m->get_reqid()
262           << " version " << version << dendl;
263       if (already_complete(version)) {
264     osd->reply_op_error(op, return_code, version, user_version);
265       } else {
266     dout(10) << " waiting for " << version << " to commit" << dendl;
267         // always queue ondisk waiters, so that we can requeue if needed
268     waiting_for_ondisk[version].emplace_back(op, user_version, return_code);
269     op->mark_delayed("waiting for ondisk");
270       }
271       return;
272     }
273   }
274 
275   ObjectContextRef obc;
276   bool can_create = op->may_write();
277   hobject_t missing_oid;
278 
279   // kludge around the fact that LIST_SNAPS sets CEPH_SNAPDIR for LIST_SNAPS
280   hobject_t _oid_head;
281   if (m->get_snapid() == CEPH_SNAPDIR) {
282     _oid_head = m->get_hobj().get_head();
283   }
284   const hobject_t& oid =
285     m->get_snapid() == CEPH_SNAPDIR ? _oid_head : m->get_hobj();
286 
287   // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
288   for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
289     OSDOp& osd_op = *p;
290 
291     if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) {
292       if (m->get_snapid() != CEPH_SNAPDIR) {
293     dout(10) << "LIST_SNAPS with incorrect context" << dendl;
294     osd->reply_op_error(op, -EINVAL);
295     return;
296       }
297     } else {
298       if (m->get_snapid() == CEPH_SNAPDIR) {
299     dout(10) << "non-LIST_SNAPS on snapdir" << dendl;
300     osd->reply_op_error(op, -EINVAL);
301     return;
302       }
303     }
304   }
305 
306   // io blocked on obc?
307   if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
308       maybe_await_blocked_head(oid, op)) {
309     return;
310   }
311 
312   int r = find_object_context(
313     oid, &obc, can_create,
314     m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
315     &missing_oid);
316 
317   // LIST_SNAPS needs the ssc too
318   if (obc &&
319       m->get_snapid() == CEPH_SNAPDIR &&
320       !obc->ssc) {
321     obc->ssc = get_snapset_context(oid, true);
322   }
323 
324   if (r == -EAGAIN) {
325     // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
326     // we have to wait for the object.
327     if (is_primary()) {
328       // missing the specific snap we need; requeue and wait.
329       ceph_assert(!op->may_write()); // only happens on a read/cache
330       wait_for_unreadable_object(missing_oid, op);
331       return;
332     }
333   } else if (r == 0) {
334     if (is_unreadable_object(obc->obs.oi.soid)) {
335       dout(10) << __func__ << ": clone " << obc->obs.oi.soid
336            << " is unreadable, waiting" << dendl;
337       wait_for_unreadable_object(obc->obs.oi.soid, op);
338       return;
339     }
340 
341     // degraded object?  (the check above was for head; this could be a clone)
342     if (write_ordered &&
343     obc->obs.oi.soid.snap != CEPH_NOSNAP &&
344     is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
345       dout(10) << __func__ << ": clone " << obc->obs.oi.soid
346            << " is degraded, waiting" << dendl;
347       wait_for_degraded_object(obc->obs.oi.soid, op);
348       return;
349     }
350   }
351 
352   bool in_hit_set = false;
353   if (hit_set) {
354     if (obc.get()) {
355       if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
356     in_hit_set = true;
357     } else {
358       if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
359         in_hit_set = true;
360     }
361     if (!op->hitset_inserted) {
362       hit_set->insert(oid);
363       op->hitset_inserted = true;
364       if (hit_set->is_full() ||
365           hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
366         hit_set_persist();
367       }
368     }
369   }
370 
371   if (agent_state) {
372     if (agent_choose_mode(false, op))
373       return;
374   }
375 
376   if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
377     if (maybe_handle_manifest(op,
378                    write_ordered,
379                    obc))
380     return;
381   }
382 
383   if (maybe_handle_cache(op,
384              write_ordered,
385              obc,
386              r,
387              missing_oid,
388              false,
389              in_hit_set))
390     return;
391 
392   if (r && (r != -ENOENT || !obc)) {
393     // copy the reqids for copy get on ENOENT
394     if (r == -ENOENT &&
395     (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
396       fill_in_copy_get_noent(op, oid, m->ops[0]);
397       return;
398     }
399     dout(20) << __func__ << ": find_object_context got error " << r << dendl;
400     if (op->may_write() &&
401     get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
402       record_write_error(op, oid, nullptr, r);
403     } else {
404       osd->reply_op_error(op, r);
405     }
406     return;
407   }
408 
409   // make sure locator is consistent
410   object_locator_t oloc(obc->obs.oi.soid);
411   if (m->get_object_locator() != oloc) {
412     dout(10) << " provided locator " << m->get_object_locator() 
413          << " != object's " << obc->obs.oi.soid << dendl;
414     osd->clog->warn() << "bad locator " << m->get_object_locator() 
415              << " on object " << oloc
416               << " op " << *m;
417   }
418 
419   // io blocked on obc?
420   if (obc->is_blocked() &&
421       !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
422     wait_for_blocked_object(obc->obs.oi.soid, op);
423     return;
424   }
425 
426   dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
427 
428   OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);//create the OpContext, which takes over all ops carried by the message (original note: those ops are the per-object requests a client split an rbd request into -- not shown here)
429 
430   if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
431     dout(20) << __func__ << ": skipping rw locks" << dendl;
432   } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
433     dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
434 
435     // verify there is in fact a flush in progress
436     // FIXME: we could make this a stronger test.
437     map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
438     if (p == flush_ops.end()) {
439       dout(10) << __func__ << " no flush in progress, aborting" << dendl;
440       reply_ctx(ctx, -EINVAL);
441       return;
442     }
443   } else if (!get_rw_locks(write_ordered, ctx)) {
444     dout(20) << __func__ << " waiting for rw locks " << dendl;
445     op->mark_delayed("waiting for rw locks");
446     close_op_ctx(ctx);
447     return;
448   }
449   dout(20) << __func__ << " obc " << *obc << dendl;
450 
451   if (r) {
452     dout(20) << __func__ << " returned an error: " << r << dendl;
453     close_op_ctx(ctx);
454     if (op->may_write() &&
455     get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
456       record_write_error(op, oid, nullptr, r);
457     } else {
458       osd->reply_op_error(op, r);
459     }
460     return;
461   }
462 
463   if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
464     ctx->ignore_cache = true;
465   }
466 
467   if ((op->may_read()) && (obc->obs.oi.is_lost())) {
468     // This object is lost. Reading from it returns an error.
469     dout(20) << __func__ << ": object " << obc->obs.oi.soid
470          << " is lost" << dendl;
471     reply_ctx(ctx, -ENFILE);
472     return;
473   }
474   if (!op->may_write() &&
475       !op->may_cache() &&
476       (!obc->obs.exists ||
477        ((m->get_snapid() != CEPH_SNAPDIR) &&
478     obc->obs.oi.is_whiteout()))) {
479     // copy the reqids for copy get on ENOENT
480     if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
481       fill_in_copy_get_noent(op, oid, m->ops[0]);
482       close_op_ctx(ctx);
483       return;
484     }
485     reply_ctx(ctx, -ENOENT);
486     return;
487   }
488 
489   op->mark_started();
490 
491   execute_ctx(ctx);
492   utime_t prepare_latency = ceph_clock_now();
493   prepare_latency -= op->get_dequeued_time();
494   osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
495   if (op->may_read() && op->may_write()) {
496     osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
497   } else if (op->may_read()) {
498     osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
499   } else if (op->may_write() || op->may_cache()) {
500     osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
501   }
502 
503   // force recovery of the oldest missing object if too many logs
504   maybe_force_recovery();
505 }
复制代码
posted @ 2020-01-19 14:51  yunlion  阅读(903)  评论(0编辑  收藏  举报