记录一个Mono Runtime与Mono Debugger-Agent的兼容性问题
将 Mono 嵌入到 C++ 应用程序中时,可以通过一些参数让 Mono 在启动时暂停,等调试器连接上来之后再继续运行。这对于调试一些执行时机非常早的代码会非常有用。具体做法是通过 mono_jit_parse_options 传入 --debugger-agent 选项,并在其中指定 embedding 和 suspend 等参数。调试代理的用法说明如下:
/*
 * print_usage:
 *
 *   Print the help text for the --debugger-agent command line option,
 * one PRINT_ERROR_MSG call per output line.
 */
static void
print_usage (void)
{
	PRINT_ERROR_MSG ("Usage: mono --debugger-agent=[<option>=<value>,...] ...\n");
	PRINT_ERROR_MSG ("Available options:\n");
	PRINT_ERROR_MSG (" transport=<transport>\t\tTransport to use for connecting to the debugger (mandatory, possible values: 'dt_socket')\n");
	PRINT_ERROR_MSG (" address=<hostname>:<port>\tAddress to connect to (mandatory)\n");
	PRINT_ERROR_MSG (" loglevel=<n>\t\t\tLog level (defaults to 0)\n");
	PRINT_ERROR_MSG (" logfile=<file>\t\tFile to log to (defaults to stdout)\n");
	PRINT_ERROR_MSG (" suspend=y/n\t\t\tWhether to suspend after startup.\n");
	PRINT_ERROR_MSG (" timeout=<n>\t\t\tTimeout for connecting in milliseconds.\n");
	PRINT_ERROR_MSG (" server=y/n\t\t\tWhether to listen for a client connection.\n");
	PRINT_ERROR_MSG (" keepalive=<n>\t\t\tSend keepalive events every n milliseconds.\n");
	/* The option is called setpgid; "setpid" is not a POSIX function, so the text names setpgid. */
	PRINT_ERROR_MSG (" setpgid=y/n\t\t\tWhether to call setpgid(0, 0) after startup.\n");
	PRINT_ERROR_MSG (" help\t\t\t\tPrint this help.\n");
}
实测发现,timeout 的行为可能与大家想象中的不一致:如果在超时时间内调试器没有连接上来,那么之后将不会再继续接受调试器的连接请求。所以这是一个需要注意的问题。
延迟启动后,在通过mono_jit_init_version初始化Domain完成后,继续执行其它代码,此时会遇到类似下面这个错误:
mono_coop_mutex_lock Cannot transition thread 000000??0000???? from STATE_BLOCKING with DO_BLOCKING
经过一些分析后发现,这个问题可能与协作式 GC 同步机制的兼容性有关。如果延迟了 Mono 启动,那么调试器连接上来后,会立刻执行一系列信息获取操作,其中就包括 CMD_VM_GET_TYPES_FOR_SOURCE_FILE:
/*
 * Short CMD_VM_* aliases for the MDBGPROT_CMD_VM_* command ids.
 * Each directive must sit on its own line: a #define's replacement list
 * runs to the end of the physical line.
 */
#define CMD_VM_VERSION MDBGPROT_CMD_VM_VERSION
#define CMD_VM_SET_PROTOCOL_VERSION MDBGPROT_CMD_VM_SET_PROTOCOL_VERSION
#define CMD_VM_ALL_THREADS MDBGPROT_CMD_VM_ALL_THREADS
#define CMD_VM_SUSPEND MDBGPROT_CMD_VM_SUSPEND
#define CMD_VM_RESUME MDBGPROT_CMD_VM_RESUME
#define CMD_VM_DISPOSE MDBGPROT_CMD_VM_DISPOSE
#define CMD_VM_EXIT MDBGPROT_CMD_VM_EXIT
#define CMD_VM_INVOKE_METHOD MDBGPROT_CMD_VM_INVOKE_METHOD
#define CMD_VM_INVOKE_METHODS MDBGPROT_CMD_VM_INVOKE_METHODS
#define CMD_VM_ABORT_INVOKE MDBGPROT_CMD_VM_ABORT_INVOKE
#define CMD_VM_SET_KEEPALIVE MDBGPROT_CMD_VM_SET_KEEPALIVE
#define CMD_VM_GET_TYPES_FOR_SOURCE_FILE MDBGPROT_CMD_VM_GET_TYPES_FOR_SOURCE_FILE
#define CMD_VM_GET_TYPES MDBGPROT_CMD_VM_GET_TYPES
#define CMD_VM_START_BUFFERING MDBGPROT_CMD_VM_START_BUFFERING
#define CMD_VM_STOP_BUFFERING MDBGPROT_CMD_VM_STOP_BUFFERING
/*
 * Handler for CMD_VM_GET_TYPES_FOR_SOURCE_FILE: decodes a file name and an
 * ignore-case flag from the request, runs get_types_for_source_file over
 * the domains while holding the loader lock, then writes the number of
 * collected classes followed by one type id per (domain, class) pair.
 */
case CMD_VM_GET_TYPES_FOR_SOURCE_FILE: {
	char *fname, *basename;
	gboolean ignore_case;
	GPtrArray *res_classes, *res_domains;

	/* Request payload: a string followed by one byte. */
	fname = decode_string (p, &p, end);
	ignore_case = decode_byte (p, &p, end);
	basename = dbg_path_get_basename (fname);

	res_classes = g_ptr_array_new ();
	res_domains = g_ptr_array_new ();

	/* The loader lock is held for the whole domain walk below. */
	mono_loader_lock ();
	/* t_start, t_end and time_spent are declared outside this snippet; they time the locked region. */
	t_start = clock();
	GetTypesForSourceFileArgs args;
	memset (&args, 0, sizeof (args));
	args.ignore_case = ignore_case;
	args.basename = basename;
	args.res_classes = res_classes;
	args.res_domains = res_domains;
	/* NOTE(review): the callback presumably appends matches to res_classes/res_domains in parallel - confirm. */
	mono_de_foreach_domain (get_types_for_source_file, &args);
	t_end = clock();
	mono_loader_unlock ();

	/* Report how long the loader lock was held, in seconds. */
	time_spent = (double)(t_end - t_start) / CLOCKS_PER_SEC;
	g_print("CMD_VM_GET_TYPES_FOR_SOURCE_FILE:%f\n", (float)time_spent);

	g_free (fname);
	g_free (basename);

	/* Reply: element count, then the type id for each index of the two arrays. */
	buffer_add_int (buf, res_classes->len);
	for (guint i = 0; i < res_classes->len; ++i)
		buffer_add_typeid (buf, (MonoDomain *)g_ptr_array_index (res_domains, i), (MonoClass *)g_ptr_array_index (res_classes, i));
	g_ptr_array_free (res_classes, TRUE);
	g_ptr_array_free (res_domains, TRUE);
	break;
}
这是对应的代码,其中会对loader进行加锁。这个操作的时间可长可短。而这个操作可能会对MonoRuntime的其它代码执行产生影响。在mono_loader_lock中:
/**
 * mono_loader_lock:
 *
 * Acquires the loader mutex through mono_locks_coop_acquire. When
 * loader_lock_track_ownership is set, also increments the nesting counter
 * kept in the loader_lock_nest_id TLS slot of the calling thread.
 *
 * See \c docs/thread-safety.txt for the locking strategy.
 */
void
mono_loader_lock (void)
{
	mono_locks_coop_acquire (&loader_mutex, LoaderLock);
	if (G_UNLIKELY (loader_lock_track_ownership)) {
		/* The TLS slot stores the nesting depth as an integer packed into a pointer. */
		mono_native_tls_set_value (loader_lock_nest_id, GUINT_TO_POINTER (GPOINTER_TO_UINT (mono_native_tls_get_value (loader_lock_nest_id)) + 1));
	}
}

/*
 * mono_coop_mutex_lock:
 *
 * Locks MUTEX. The uncontended case is handled by a plain trylock; only
 * when that fails does the thread wrap the blocking mono_os_mutex_lock
 * call between MONO_ENTER_GC_SAFE and MONO_EXIT_GC_SAFE.
 */
static inline void
mono_coop_mutex_lock (MonoCoopMutex *mutex)
{
	/* Avoid thread state switch if lock is not contended */
	if (mono_os_mutex_trylock (&mutex->m) == 0)
		return;

	/* NOTE(review): the macro definitions are not shown here; they are used as a matched pair around the blocking call. */
	MONO_ENTER_GC_SAFE;

	mono_os_mutex_lock (&mutex->m);

	MONO_EXIT_GC_SAFE;
}
在尝试trylock失败后,会直接进入资源竞争状态,进入之前会尝试将当前线程状态调整到GC_SAFE状态。但是这个看起来没有什么问题的操作就可能引起前面说的崩溃。在我的应用场景中,存在对mono_assembly_get_image的使用,它本身会调整线程的状态,日志如下:
[ABORT_BLOCKING][000000000000B844] STATE_BLOCKING . -> RUNNING . (0 -> 0) mono_assembly_get_image
[DO_BLOCKING][000000000000B844] RUNNING . -> STATE_BLOCKING . (0 -> 0) mono_assembly_get_image
可以看到,此时主线程已经进入 STATE_BLOCKING 状态。如果这时再执行其它 mono api,而调试器支持线程又恰好因为正在执行 CMD_VM_GET_TYPES_FOR_SOURCE_FILE 而持有 loader 锁,那么主线程在 mono_loader_lock 中 trylock 失败后,会尝试进入 GC_SAFE 状态,也就是再做一次 DO_BLOCKING 状态切换。然而线程此时已经处于 STATE_BLOCKING 状态,而 blocking 状态是不允许嵌套的,于是前面的报错就出现了:
/*
This transitions the thread into a cooperative state where it's assumed to be suspended but can continue.

Native runtime code might want to put itself into a state where the thread is considered suspended but can keep running.
That state only works as long as the only managed state touched is blittable and was pinned before the transition.

It returns the action the caller must perform:

- Continue (DoBlockingContinue): Entered blocking state successfully;
- PollAndRetry (DoBlockingPollAndRetry): Async suspend raced and won, try to suspend and then retry;

Any current state other than STATE_RUNNING or STATE_ASYNC_SUSPEND_REQUESTED ends in mono_fatal_with_history.
*/
MonoDoBlockingResult
mono_threads_transition_do_blocking (MonoThreadInfo* info, const char *func)
{
	int raw_state, cur_state, suspend_count;
	gboolean no_safepoints;

retry_state_change:
	/* Re-read and unpack the thread state word on every attempt. */
	UNWRAP_THREAD_STATE (raw_state, cur_state, suspend_count, no_safepoints, info);
	switch (cur_state) {

	case STATE_RUNNING:
		//transition to blocked
		if (!(suspend_count == 0))
			mono_fatal_with_history ("suspend_count = %d, but should be == 0", suspend_count);
		if (no_safepoints)
			mono_fatal_with_history ("no_safepoints = TRUE, but should be FALSE in state RUNNING with DO_BLOCKING");
		/* If the CAS result differs from the raw_state read above, the state changed underneath us: start over. */
		if (thread_state_cas (&info->thread_state, build_thread_state (STATE_BLOCKING, suspend_count, no_safepoints), raw_state) != raw_state)
			goto retry_state_change;
		trace_state_change_sigsafe ("DO_BLOCKING", info, raw_state, STATE_BLOCKING, no_safepoints, 0, func);
		return DoBlockingContinue;

	case STATE_ASYNC_SUSPEND_REQUESTED:
		/* The state word is left untouched here; the caller is told to poll and retry. */
		if (!(suspend_count > 0))
			mono_fatal_with_history ("suspend_count = %d, but should be > 0", suspend_count);
		if (no_safepoints)
			mono_fatal_with_history ("no_safepoints = TRUE, but should be FALSE in state ASYNC_SUSPEND_REQUESTED with DO_BLOCKING");
		trace_state_change_sigsafe ("DO_BLOCKING", info, raw_state, cur_state, no_safepoints, 0, func);
		return DoBlockingPollAndRetry;

/*
STATE_ASYNC_SUSPENDED
STATE_SELF_SUSPENDED: Code should not be running while suspended.
STATE_BLOCKING:
STATE_BLOCKING_SUSPEND_REQUESTED:
STATE_BLOCKING_SELF_SUSPENDED: Blocking is not nestable
STATE_BLOCKING_ASYNC_SUSPENDED: Blocking is not nestable _and_ code should not be running while suspended
*/
	default:
		mono_fatal_with_history ("%s Cannot transition thread %p from %s with DO_BLOCKING", func, mono_thread_info_get_tid (info), state_name (cur_state));
	}
}
我目前也没有什么好的办法来解决这个问题,毕竟mono不是我写的,协作gc还挺复杂的,不敢乱改。目前我采用的策略是延迟Domain初始化完成后的其它代码的调用。比如推迟1-2秒,确保不会跟调试器支持线程的资源竞争即可。当然这不能根治这个问题,具体细节我提交到github上让微软看去。