[Android] 你的 nSyncAndDrawFrame 到底卡在了哪里?
你的 nSyncAndDrawFrame 到底卡在了哪里?
tldr: 1. 等待渲染结果, 2. 被 suspend (挂起)
以下我们以Android最新源码和Android 12的实际ANR trace为例, 来分析下这个问题
在 JAVA 层的堆栈如下
android.graphics.HardwareRenderer.nSyncAndDrawFrame(Native method)
android.graphics.HardwareRenderer.syncAndDrawFrame(HardwareRenderer.java:465)
android.view.ThreadedRenderer.draw(ThreadedRenderer.java:645)
android.view.ViewRootImpl.draw(ViewRootImpl.java:4831)
android.view.ViewRootImpl.performDraw(ViewRootImpl.java:4537)
android.view.ViewRootImpl.performTraversals(ViewRootImpl.java:3630)
android.view.ViewRootImpl.doTraversal(ViewRootImpl.java:2357)
android.view.ViewRootImpl$TraversalRunnable.run(ViewRootImpl.java:9304)
android.view.Choreographer$CallbackRecord.run(Choreographer.java:1129)
android.view.Choreographer.doCallbacks(Choreographer.java:926)
android.view.Choreographer.doFrame(Choreographer.java:840)
android.view.Choreographer$FrameDisplayEventReceiver.run(Choreographer.java:1114)
android.os.Handler.handleCallback(Handler.java:938)
android.os.Handler.dispatchMessage(Handler.java:99)
android.os.Looper.loopOnce(Looper.java:210)
android.os.Looper.loop(Looper.java:299)
android.app.ActivityThread.main(ActivityThread.java:8306)
在ViewRootImpl traversal 完成后, 触发了一次渲染, 这个任务交给 HardwareRenderer, 并等待在 native 方法 nSyncAndDrawFrame 的调用中. (这里其实有个问题, Dump堆栈是在接受到系统发出的ANR signal后发生的, 在Dump的这段时间内 nSyncAndDrawFrame 方法还是可以得到执行的)
而解析出的 native 层堆栈有两种, 分别对应了结论中的两种情况
- 等待 RenderThread 渲染完成
#00 pc 000000000008581c /apex/com.android.runtime/lib64/bionic/libc.so (syscall+28) (BuildId: 7b57955631cbd73efd22bba3337081f9)
#01 pc 0000000000089da8 /apex/com.android.runtime/lib64/bionic/libc.so (__futex_wait_ex(void volatile*, bool, int, bool, timespec const*)+144) (BuildId: 7b57955631cbd73efd22bba3337081f9)
#02 pc 00000000000ea1a8 /apex/com.android.runtime/lib64/bionic/libc.so (pthread_cond_wait+72) (BuildId: 7b57955631cbd73efd22bba3337081f9)
#03 pc 0000000000369854 /system/lib64/libhwui.so (android::uirenderer::renderthread::DrawFrameTask::postAndWait()+252) (BuildId: 9d81235cc4a1facd192b5dcb7853e97c)
#04 pc 0000000000369728 /system/lib64/libhwui.so (android::uirenderer::renderthread::RenderProxy::syncAndDrawFrame()+48) (BuildId: 9d81235cc4a1facd192b5dcb7853e97c)
卡在了 DrawFrameTask::postAndWait, 这里属于 Android 的渲染流程, 我们来看一下源码
int DrawFrameTask::drawFrame() {
LOG_ALWAYS_FATAL_IF(!mContext, "Cannot drawFrame with no CanvasContext!");
mSyncResult = SyncResult::OK;
mSyncQueued = systemTime(SYSTEM_TIME_MONOTONIC);
postAndWait();
return mSyncResult;
}
void DrawFrameTask::postAndWait() {
ATRACE_CALL();
AutoMutex _lock(mLock);
mRenderThread->queue().post([this]() { run(); });
mSignal.wait(mLock);
}
void DrawFrameTask::run() {
const int64_t vsyncId = mFrameInfo[static_cast<int>(FrameInfoIndex::FrameTimelineVsyncId)];
ATRACE_FORMAT("DrawFrames %" PRId64, vsyncId);
nsecs_t syncDelayDuration = systemTime(SYSTEM_TIME_MONOTONIC) - mSyncQueued;
bool canUnblockUiThread;
bool canDrawThisFrame;
{
TreeInfo info(TreeInfo::MODE_FULL, *mContext);
info.forceDrawFrame = mForceDrawFrame;
mForceDrawFrame = false;
canUnblockUiThread = syncFrameState(info);
canDrawThisFrame = info.out.canDrawThisFrame;
if (mFrameCommitCallback) {
mContext->addFrameCommitListener(std::move(mFrameCommitCallback));
mFrameCommitCallback = nullptr;
}
}
// Grab a copy of everything we need
CanvasContext* context = mContext;
std::function<std::function<void(bool)>(int32_t, int64_t)> frameCallback =
std::move(mFrameCallback);
std::function<void()> frameCompleteCallback = std::move(mFrameCompleteCallback);
mFrameCallback = nullptr;
mFrameCompleteCallback = nullptr;
int64_t intendedVsync = mFrameInfo[static_cast<int>(FrameInfoIndex::IntendedVsync)];
int64_t frameDeadline = mFrameInfo[static_cast<int>(FrameInfoIndex::FrameDeadline)];
int64_t frameStartTime = mFrameInfo[static_cast<int>(FrameInfoIndex::FrameStartTime)];
// From this point on anything in "this" is *UNSAFE TO ACCESS*
if (canUnblockUiThread) {
unblockUiThread();
}
// Even if we aren't drawing this vsync pulse the next frame number will still be accurate
if (CC_UNLIKELY(frameCallback)) {
context->enqueueFrameWork([frameCallback, context, syncResult = mSyncResult,
frameNr = context->getFrameNumber()]() {
auto frameCommitCallback = std::move(frameCallback(syncResult, frameNr));
if (frameCommitCallback) {
context->addFrameCommitListener(std::move(frameCommitCallback));
}
});
}
nsecs_t dequeueBufferDuration = 0;
if (CC_LIKELY(canDrawThisFrame)) {
dequeueBufferDuration = context->draw();
} else {
// Do a flush in case syncFrameState performed any texture uploads. Since we skipped
// the draw() call, those uploads (or deletes) will end up sitting in the queue.
// Do them now
if (GrDirectContext* grContext = mRenderThread->getGrContext()) {
grContext->flushAndSubmit();
}
// wait on fences so tasks don't overlap next frame
context->waitOnFences();
}
if (CC_UNLIKELY(frameCompleteCallback)) {
std::invoke(frameCompleteCallback);
}
if (!canUnblockUiThread) {
unblockUiThread();
}
if (!mHintSessionWrapper) mHintSessionWrapper.emplace(mUiThreadId, mRenderThreadId);
constexpr int64_t kSanityCheckLowerBound = 100000; // 0.1ms
constexpr int64_t kSanityCheckUpperBound = 10000000000; // 10s
int64_t targetWorkDuration = frameDeadline - intendedVsync;
targetWorkDuration = targetWorkDuration * Properties::targetCpuTimePercentage / 100;
if (targetWorkDuration > kSanityCheckLowerBound &&
targetWorkDuration < kSanityCheckUpperBound &&
targetWorkDuration != mLastTargetWorkDuration) {
mLastTargetWorkDuration = targetWorkDuration;
mHintSessionWrapper->updateTargetWorkDuration(targetWorkDuration);
}
int64_t frameDuration = systemTime(SYSTEM_TIME_MONOTONIC) - frameStartTime;
int64_t actualDuration = frameDuration -
(std::min(syncDelayDuration, mLastDequeueBufferDuration)) -
dequeueBufferDuration;
if (actualDuration > kSanityCheckLowerBound && actualDuration < kSanityCheckUpperBound) {
mHintSessionWrapper->reportActualWorkDuration(actualDuration);
}
mLastDequeueBufferDuration = dequeueBufferDuration;
}
void DrawFrameTask::unblockUiThread() {
AutoMutex _lock(mLock);
mSignal.signal();
}
主线程在 postAndWait 中向 RenderThread Post 执行 run 方法, 然后等待条件变量 mSignal, 会被 run 方法调用 unblockUiThread 触发.
run 方法里会根据 canUnblockUiThread 判断是否要阻塞主线程, 具体判断的逻辑我们不去深究, 可以确定的是该方法再等待渲染完成
- 被挂起
native: #00 pc 000000000004c4ec /apex/com.android.runtime/lib64/bionic/libc.so (syscall+28) (BuildId: 3cf600896578484f9ee24df4f1e36b7d)
native: #01 pc 000000000047cc80 /apex/com.android.art/lib64/libart.so (art::ConditionVariable::WaitHoldingLocks(art::Thread*)+140) (BuildId: 56e704c544e6c624201be2ab4933e853)
native: #02 pc 000000000055d7f4 /apex/com.android.art/lib64/libart.so (artQuickGenericJniEndTrampoline+1156) (BuildId: 56e704c544e6c624201be2ab4933e853)
native: #03 pc 0000000000440564 /apex/com.android.art/lib64/libart.so (art_quick_generic_jni_trampoline+164) (BuildId: 56e704c544e6c624201be2ab4933e853)
native: #00 pc 000000000004c4ac /apex/com.android.runtime/lib64/bionic/libc.so (syscall+28) (BuildId: 91887fc17756b96752d59d99fca49b04)
native: #01 pc 000000000047cc80 /apex/com.android.art/lib64/libart.so (art::ConditionVariable::WaitHoldingLocks(art::Thread*)+140) (BuildId: 56e704c544e6c624201be2ab4933e853)
native: #02 pc 000000000075b88c /apex/com.android.art/lib64/libart.so (artJniMethodEnd+204) (BuildId: 56e704c544e6c624201be2ab4933e853)
native: #03 pc 000000000020facc /apex/com.android.art/lib64/libart.so (art_jni_method_end+12) (BuildId: 56e704c544e6c624201be2ab4933e853)
native: #00 pc 000000000004c4ec /apex/com.android.runtime/lib64/bionic/libc.so (syscall+28)
native: #01 pc 000000000028cd74 /apex/com.android.art/lib64/libart.so (art::ConditionVariable::WaitHoldingLocks(art::Thread*)+148)
native: #02 pc 0000000000746a48 /apex/com.android.art/lib64/libart.so (art::GoToRunnable(art::Thread*)+432)
native: #03 pc 0000000000746860 /apex/com.android.art/lib64/libart.so (art::JniMethodEnd(unsigned int, art::Thread*)+24)
堆栈大同小异, 都是在JNi方法(nSyncAndDrawFrame)退出时, 等待了ConditionVariable
根据GoToRunnable查看源码
static void GoToRunnable(Thread* self) NO_THREAD_SAFETY_ANALYSIS {
if (kIsDebugBuild) {
ArtMethod* native_method = *self->GetManagedStack()->GetTopQuickFrame();
CHECK(!native_method->IsFastNative()) << native_method->PrettyMethod();
}
self->TransitionFromSuspendedToRunnable();
}
inline ThreadState Thread::TransitionFromSuspendedToRunnable() {
union StateAndFlags old_state_and_flags;
old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
int16_t old_state = old_state_and_flags.as_struct.state;
DCHECK_NE(static_cast<ThreadState>(old_state), kRunnable);
do {
Locks::mutator_lock_->AssertNotHeld(this); // Otherwise we starve GC..
old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
if (LIKELY(old_state_and_flags.as_struct.flags == 0)) {
// Optimize for the return from native code case - this is the fast path.
// Atomically change from suspended to runnable if no suspend request pending.
union StateAndFlags new_state_and_flags;
new_state_and_flags.as_int = old_state_and_flags.as_int;
new_state_and_flags.as_struct.state = kRunnable;
// CAS the value with a memory barrier.
if (LIKELY(tls32_.state_and_flags.as_atomic_int.CompareAndSetWeakAcquire(
old_state_and_flags.as_int,
new_state_and_flags.as_int))) {
// Mark the acquisition of a share of the mutator_lock_.
Locks::mutator_lock_->TransitionFromSuspendedToRunnable(this);
break;
}
} else if ((old_state_and_flags.as_struct.flags & kActiveSuspendBarrier) != 0) {
PassActiveSuspendBarriers(this);
} else if ((old_state_and_flags.as_struct.flags &
(kCheckpointRequest | kEmptyCheckpointRequest)) != 0) {
// Impossible
LOG(FATAL) << "Transitioning to runnable with checkpoint flag, "
<< " flags=" << old_state_and_flags.as_struct.flags
<< " state=" << old_state_and_flags.as_struct.state;
} else if ((old_state_and_flags.as_struct.flags & kSuspendRequest) != 0) {
// Wait while our suspend count is non-zero.
// We pass null to the MutexLock as we may be in a situation where the
// runtime is shutting down. Guarding ourselves from that situation
// requires to take the shutdown lock, which is undesirable here.
Thread* thread_to_pass = nullptr;
if (kIsDebugBuild && !IsDaemon()) {
// We know we can make our debug locking checks on non-daemon threads,
// so re-enable them on debug builds.
thread_to_pass = this;
}
MutexLock mu(thread_to_pass, *Locks::thread_suspend_count_lock_);
ScopedTransitioningToRunnable scoped_transitioning_to_runnable(this);
old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
while ((old_state_and_flags.as_struct.flags & kSuspendRequest) != 0) {
// Re-check when Thread::resume_cond_ is notified.
Thread::resume_cond_->Wait(thread_to_pass);
old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
}
DCHECK_EQ(GetSuspendCount(), 0);
}
} while (true);
// Run the flip function, if set.
Closure* flip_func = GetFlipFunction();
if (flip_func != nullptr) {
flip_func->Run(this);
}
return static_cast<ThreadState>(old_state);
}
TransitionFromSuspendedToRunnable 的代码很复杂, 但Wait的case只有 kSuspendRequest 一个.
// If set, implies that suspend_count_ > 0 and the Thread should enter the safepoint handler.
kSuspendRequest = 1u << 0
// 该方法的调用只有 ModifySuspendCount
bool Thread::ModifySuspendCountInternal(Thread* self,
int delta,
AtomicInteger* suspend_barrier,
SuspendReason reason) {
...
uint32_t flags = enum_cast<uint32_t>(ThreadFlag::kSuspendRequest);
...
tls32_.suspend_count += delta;
...
if (tls32_.suspend_count == 0) {
AtomicClearFlag(ThreadFlag::kSuspendRequest);
} else {
// Two bits might be set simultaneously.
tls32_.state_and_flags.fetch_or(flags, std::memory_order_seq_cst);
TriggerSuspend();
}
return true;
}
继续确认主线程是何时被调用 ModifySuspendCount, 该方法的调用大部分是从 thread_list 发起的, thread 和 ti_thread 的调用基本上是做为jvm内部机制使用
用 threadlist 到 anr message 中搜索, 基本只有 bugly_trace 的堆栈在调用 RunCheckpoint.
bugly在接收的anr signal时, 会启动线程做dump, 此时会调用该方法来暂停所有线程执行.
main_thread
syncAndDrawFrame ------- anr ---------- JniMethodEnd ------ Wait::kSuspendRequest
bugly \ /
接收到anr信号, 开始Dump -----------
总结
通过以上对源码及两种堆栈的分析, nSyncAndDrawFrame
并没有什么异常操作, 因此还是老老实实优化cpu使用率和布局吧
ref
Android ANR问题总结(非原创) - Vane的博客 | Vane's Blog (vanelst.site)
ANR | Android 开发者 | Android Developers
调试 ART 垃圾回收 | Android 开源项目 | Android Open Source Project (google.cn)