[Android] 你的 nSyncAndDrawFrame 到底卡在了哪里?

你的 nSyncAndDrawFrame 到底卡在了哪里?

tldr: 1. 等待渲染结果, 2. 被 suspend (挂起)

以下我们以Android最新源码和Android 12的实际ANR trace为例, 来分析下这个问题

在 JAVA 层的堆栈如下

android.graphics.HardwareRenderer.nSyncAndDrawFrame(Native method)
android.graphics.HardwareRenderer.syncAndDrawFrame(HardwareRenderer.java:465)
android.view.ThreadedRenderer.draw(ThreadedRenderer.java:645)
android.view.ViewRootImpl.draw(ViewRootImpl.java:4831)
android.view.ViewRootImpl.performDraw(ViewRootImpl.java:4537)
android.view.ViewRootImpl.performTraversals(ViewRootImpl.java:3630)
android.view.ViewRootImpl.doTraversal(ViewRootImpl.java:2357)
android.view.ViewRootImpl$TraversalRunnable.run(ViewRootImpl.java:9304)
android.view.Choreographer$CallbackRecord.run(Choreographer.java:1129)
android.view.Choreographer.doCallbacks(Choreographer.java:926)
android.view.Choreographer.doFrame(Choreographer.java:840)
android.view.Choreographer$FrameDisplayEventReceiver.run(Choreographer.java:1114)
android.os.Handler.handleCallback(Handler.java:938)
android.os.Handler.dispatchMessage(Handler.java:99)
android.os.Looper.loopOnce(Looper.java:210)
android.os.Looper.loop(Looper.java:299)
android.app.ActivityThread.main(ActivityThread.java:8306)

在ViewRootImpl traversal 完成后, 触发了一次渲染, 这个任务交给 HardwareRenderer, 并等待在 native 方法 nSyncAndDrawFrame 的调用中. (这里其实有个问题, Dump堆栈是在接受到系统发出的ANR signal后发生的, 在Dump的这段时间内 nSyncAndDrawFrame 方法还是可以得到执行的)

而解析出的 native 层堆栈有两种, 分别对应了结论中的两种情况

  1. 等待 RenderThread 渲染完成
#00 pc 000000000008581c /apex/com.android.runtime/lib64/bionic/libc.so (syscall+28) (BuildId: 7b57955631cbd73efd22bba3337081f9)
#01 pc 0000000000089da8 /apex/com.android.runtime/lib64/bionic/libc.so (__futex_wait_ex(void volatile*, bool, int, bool, timespec const*)+144) (BuildId: 7b57955631cbd73efd22bba3337081f9)
#02 pc 00000000000ea1a8 /apex/com.android.runtime/lib64/bionic/libc.so (pthread_cond_wait+72) (BuildId: 7b57955631cbd73efd22bba3337081f9)
#03 pc 0000000000369854 /system/lib64/libhwui.so (android::uirenderer::renderthread::DrawFrameTask::postAndWait()+252) (BuildId: 9d81235cc4a1facd192b5dcb7853e97c)
#04 pc 0000000000369728 /system/lib64/libhwui.so (android::uirenderer::renderthread::RenderProxy::syncAndDrawFrame()+48) (BuildId: 9d81235cc4a1facd192b5dcb7853e97c)

卡在了 DrawFrameTask::postAndWait, 这里属于 Android 的渲染流程, 我们来看一下源码

int DrawFrameTask::drawFrame() {
    LOG_ALWAYS_FATAL_IF(!mContext, "Cannot drawFrame with no CanvasContext!");

    mSyncResult = SyncResult::OK;
    mSyncQueued = systemTime(SYSTEM_TIME_MONOTONIC);
    postAndWait();

    return mSyncResult;
}

void DrawFrameTask::postAndWait() {
    ATRACE_CALL();
    AutoMutex _lock(mLock);
    mRenderThread->queue().post([this]() { run(); });
    mSignal.wait(mLock);
}

void DrawFrameTask::run() {
    const int64_t vsyncId = mFrameInfo[static_cast<int>(FrameInfoIndex::FrameTimelineVsyncId)];
    ATRACE_FORMAT("DrawFrames %" PRId64, vsyncId);
    nsecs_t syncDelayDuration = systemTime(SYSTEM_TIME_MONOTONIC) - mSyncQueued;

    bool canUnblockUiThread;
    bool canDrawThisFrame;
    {
        TreeInfo info(TreeInfo::MODE_FULL, *mContext);
        info.forceDrawFrame = mForceDrawFrame;
        mForceDrawFrame = false;
        canUnblockUiThread = syncFrameState(info);
        canDrawThisFrame = info.out.canDrawThisFrame;

        if (mFrameCommitCallback) {
            mContext->addFrameCommitListener(std::move(mFrameCommitCallback));
            mFrameCommitCallback = nullptr;
        }
    }

    // Grab a copy of everything we need
    CanvasContext* context = mContext;
    std::function<std::function<void(bool)>(int32_t, int64_t)> frameCallback =
            std::move(mFrameCallback);
    std::function<void()> frameCompleteCallback = std::move(mFrameCompleteCallback);
    mFrameCallback = nullptr;
    mFrameCompleteCallback = nullptr;
    int64_t intendedVsync = mFrameInfo[static_cast<int>(FrameInfoIndex::IntendedVsync)];
    int64_t frameDeadline = mFrameInfo[static_cast<int>(FrameInfoIndex::FrameDeadline)];
    int64_t frameStartTime = mFrameInfo[static_cast<int>(FrameInfoIndex::FrameStartTime)];

    // From this point on anything in "this" is *UNSAFE TO ACCESS*
    if (canUnblockUiThread) {
        unblockUiThread();
    }

    // Even if we aren't drawing this vsync pulse the next frame number will still be accurate
    if (CC_UNLIKELY(frameCallback)) {
        context->enqueueFrameWork([frameCallback, context, syncResult = mSyncResult,
                                   frameNr = context->getFrameNumber()]() {
            auto frameCommitCallback = std::move(frameCallback(syncResult, frameNr));
            if (frameCommitCallback) {
                context->addFrameCommitListener(std::move(frameCommitCallback));
            }
        });
    }

    nsecs_t dequeueBufferDuration = 0;
    if (CC_LIKELY(canDrawThisFrame)) {
        dequeueBufferDuration = context->draw();
    } else {
        // Do a flush in case syncFrameState performed any texture uploads. Since we skipped
        // the draw() call, those uploads (or deletes) will end up sitting in the queue.
        // Do them now
        if (GrDirectContext* grContext = mRenderThread->getGrContext()) {
            grContext->flushAndSubmit();
        }
        // wait on fences so tasks don't overlap next frame
        context->waitOnFences();
    }

    if (CC_UNLIKELY(frameCompleteCallback)) {
        std::invoke(frameCompleteCallback);
    }

    if (!canUnblockUiThread) {
        unblockUiThread();
    }

    if (!mHintSessionWrapper) mHintSessionWrapper.emplace(mUiThreadId, mRenderThreadId);
    constexpr int64_t kSanityCheckLowerBound = 100000;       // 0.1ms
    constexpr int64_t kSanityCheckUpperBound = 10000000000;  // 10s
    int64_t targetWorkDuration = frameDeadline - intendedVsync;
    targetWorkDuration = targetWorkDuration * Properties::targetCpuTimePercentage / 100;
    if (targetWorkDuration > kSanityCheckLowerBound &&
        targetWorkDuration < kSanityCheckUpperBound &&
        targetWorkDuration != mLastTargetWorkDuration) {
        mLastTargetWorkDuration = targetWorkDuration;
        mHintSessionWrapper->updateTargetWorkDuration(targetWorkDuration);
    }
    int64_t frameDuration = systemTime(SYSTEM_TIME_MONOTONIC) - frameStartTime;
    int64_t actualDuration = frameDuration -
                             (std::min(syncDelayDuration, mLastDequeueBufferDuration)) -
                             dequeueBufferDuration;
    if (actualDuration > kSanityCheckLowerBound && actualDuration < kSanityCheckUpperBound) {
        mHintSessionWrapper->reportActualWorkDuration(actualDuration);
    }

    mLastDequeueBufferDuration = dequeueBufferDuration;
}

void DrawFrameTask::unblockUiThread() {
    AutoMutex _lock(mLock);
    mSignal.signal();
}

主线程在 postAndWait 中向 RenderThread Post 执行 run 方法, 然后等待条件变量 mSignal, 会被 run 方法调用 unblockUiThread 触发.

run 方法里会根据 canUnblockUiThread 判断是否要阻塞主线程, 具体判断的逻辑我们不去深究, 可以确定的是该方法再等待渲染完成

  1. 被挂起
native: #00 pc 000000000004c4ec /apex/com.android.runtime/lib64/bionic/libc.so (syscall+28) (BuildId: 3cf600896578484f9ee24df4f1e36b7d)
native: #01 pc 000000000047cc80 /apex/com.android.art/lib64/libart.so (art::ConditionVariable::WaitHoldingLocks(art::Thread*)+140) (BuildId: 56e704c544e6c624201be2ab4933e853)
native: #02 pc 000000000055d7f4 /apex/com.android.art/lib64/libart.so (artQuickGenericJniEndTrampoline+1156) (BuildId: 56e704c544e6c624201be2ab4933e853)
native: #03 pc 0000000000440564 /apex/com.android.art/lib64/libart.so (art_quick_generic_jni_trampoline+164) (BuildId: 56e704c544e6c624201be2ab4933e853)

native: #00 pc 000000000004c4ac /apex/com.android.runtime/lib64/bionic/libc.so (syscall+28) (BuildId: 91887fc17756b96752d59d99fca49b04)
native: #01 pc 000000000047cc80 /apex/com.android.art/lib64/libart.so (art::ConditionVariable::WaitHoldingLocks(art::Thread*)+140) (BuildId: 56e704c544e6c624201be2ab4933e853)
native: #02 pc 000000000075b88c /apex/com.android.art/lib64/libart.so (artJniMethodEnd+204) (BuildId: 56e704c544e6c624201be2ab4933e853)
native: #03 pc 000000000020facc /apex/com.android.art/lib64/libart.so (art_jni_method_end+12) (BuildId: 56e704c544e6c624201be2ab4933e853)


native: #00 pc 000000000004c4ec /apex/com.android.runtime/lib64/bionic/libc.so (syscall+28)
native: #01 pc 000000000028cd74 /apex/com.android.art/lib64/libart.so (art::ConditionVariable::WaitHoldingLocks(art::Thread*)+148)
native: #02 pc 0000000000746a48 /apex/com.android.art/lib64/libart.so (art::GoToRunnable(art::Thread*)+432)
native: #03 pc 0000000000746860 /apex/com.android.art/lib64/libart.so (art::JniMethodEnd(unsigned int, art::Thread*)+24)

堆栈大同小异, 都是在JNi方法(nSyncAndDrawFrame)退出时, 等待了ConditionVariable

根据GoToRunnable查看源码

static void GoToRunnable(Thread* self) NO_THREAD_SAFETY_ANALYSIS {
  if (kIsDebugBuild) {
    ArtMethod* native_method = *self->GetManagedStack()->GetTopQuickFrame();
    CHECK(!native_method->IsFastNative()) << native_method->PrettyMethod();
  }

  self->TransitionFromSuspendedToRunnable();
}


inline ThreadState Thread::TransitionFromSuspendedToRunnable() {
  union StateAndFlags old_state_and_flags;
  old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
  int16_t old_state = old_state_and_flags.as_struct.state;
  DCHECK_NE(static_cast<ThreadState>(old_state), kRunnable);
  do {
    Locks::mutator_lock_->AssertNotHeld(this);  // Otherwise we starve GC..
    old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
    DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
    if (LIKELY(old_state_and_flags.as_struct.flags == 0)) {
      // Optimize for the return from native code case - this is the fast path.
      // Atomically change from suspended to runnable if no suspend request pending.
      union StateAndFlags new_state_and_flags;
      new_state_and_flags.as_int = old_state_and_flags.as_int;
      new_state_and_flags.as_struct.state = kRunnable;

      // CAS the value with a memory barrier.
      if (LIKELY(tls32_.state_and_flags.as_atomic_int.CompareAndSetWeakAcquire(
                                                 old_state_and_flags.as_int,
                                                 new_state_and_flags.as_int))) {
        // Mark the acquisition of a share of the mutator_lock_.
        Locks::mutator_lock_->TransitionFromSuspendedToRunnable(this);
        break;
      }
    } else if ((old_state_and_flags.as_struct.flags & kActiveSuspendBarrier) != 0) {
      PassActiveSuspendBarriers(this);
    } else if ((old_state_and_flags.as_struct.flags &
                (kCheckpointRequest | kEmptyCheckpointRequest)) != 0) {
      // Impossible
      LOG(FATAL) << "Transitioning to runnable with checkpoint flag, "
                 << " flags=" << old_state_and_flags.as_struct.flags
                 << " state=" << old_state_and_flags.as_struct.state;
    } else if ((old_state_and_flags.as_struct.flags & kSuspendRequest) != 0) {
      // Wait while our suspend count is non-zero.

      // We pass null to the MutexLock as we may be in a situation where the
      // runtime is shutting down. Guarding ourselves from that situation
      // requires to take the shutdown lock, which is undesirable here.
      Thread* thread_to_pass = nullptr;
      if (kIsDebugBuild && !IsDaemon()) {
        // We know we can make our debug locking checks on non-daemon threads,
        // so re-enable them on debug builds.
        thread_to_pass = this;
      }
      MutexLock mu(thread_to_pass, *Locks::thread_suspend_count_lock_);
      ScopedTransitioningToRunnable scoped_transitioning_to_runnable(this);
      old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
      DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
      while ((old_state_and_flags.as_struct.flags & kSuspendRequest) != 0) {
        // Re-check when Thread::resume_cond_ is notified.
        Thread::resume_cond_->Wait(thread_to_pass);
        old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
        DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
      }
      DCHECK_EQ(GetSuspendCount(), 0);
    }
  } while (true);
  // Run the flip function, if set.
  Closure* flip_func = GetFlipFunction();
  if (flip_func != nullptr) {
    flip_func->Run(this);
  }
  return static_cast<ThreadState>(old_state);
}

TransitionFromSuspendedToRunnable 的代码很复杂, 但Wait的case只有 kSuspendRequest 一个.

// If set, implies that suspend_count_ > 0 and the Thread should enter the safepoint handler.
kSuspendRequest = 1u << 0

// 该方法的调用只有 ModifySuspendCount
bool Thread::ModifySuspendCountInternal(Thread* self,
                                        int delta,
                                        AtomicInteger* suspend_barrier,
                                        SuspendReason reason) {
    
  ...
  uint32_t flags = enum_cast<uint32_t>(ThreadFlag::kSuspendRequest);
  ...
  tls32_.suspend_count += delta;
  ...
  if (tls32_.suspend_count == 0) {
    AtomicClearFlag(ThreadFlag::kSuspendRequest);
  } else {
    // Two bits might be set simultaneously.
    tls32_.state_and_flags.fetch_or(flags, std::memory_order_seq_cst);
    TriggerSuspend();
  }
  return true;
}

继续确认主线程是何时被调用 ModifySuspendCount, 该方法的调用大部分是从 thread_list 发起的, thread 和 ti_thread 的调用基本上是做为jvm内部机制使用

用 threadlist 到 anr message 中搜索, 基本只有 bugly_trace 的堆栈在调用 RunCheckpoint.

bugly在接收的anr signal时, 会启动线程做dump, 此时会调用该方法来暂停所有线程执行.

main_thread
    syncAndDrawFrame ------- anr ---------- JniMethodEnd ------  Wait::kSuspendRequest
bugly                          \                                  /
                                接收到anr信号, 开始Dump ----------- 

image.png

总结

通过以上对源码及两种堆栈的分析, nSyncAndDrawFrame 并没有什么异常操作, 因此还是老老实实优化cpu使用率和布局吧

ref

Android ANR问题总结(非原创) - Vane的博客 | Vane's Blog (vanelst.site)

ANR  |  Android 开发者  |  Android Developers

调试 ART 垃圾回收  |  Android 开源项目  |  Android Open Source Project (google.cn)

posted @ 2024-01-15 10:51  新新人類  阅读(670)  评论(0编辑  收藏  举报