环境变量引起的系统bin FC问题分析报告

【NE现场】

Build fingerprint: 'Xiaomi/gemini/gemini:7.0/NRD90M/7.3.30:user/release-keys'
ABI: 'arm64'
pid: 6226, tid: 6226, name: ls >>> ls <<<
signal 6 (SIGABRT), code -6 (SI_TKILL), fault addr --------
x0 0000000000000000 x1 0000000000001852 x2 0000000000000006 x3 0000000000000008
x4 ffffffffffffffff x5 0000000000000000 x6 0000008000808080 x7 2c33351f656e1f63
x8 0000000000000083 x9 ffffffffffffffdf x10 0000000000000000 x11 0000000000000001
x12 ffffffffffffffff x13 0000000000000000 x14 0000000000000000 x15 000477224f38d06c
x16 0000007fefbd0be0 x17 0000007fefbd0aaf x18 00000000ffffffff x19 0000007f9e471b40
x20 0000000000000006 x21 0000007f9e471a98 x22 0000000000000002 x23 0000000000000004
x24 0000000000000000 x25 0000007fefbd1840 x26 0000007fefbd1860 x27 0000007f9e46d348
x28 0000007f9e46d258 x29 0000007fefbd16b0 x30 0000007f9e433408
sp 0000007fefbd1690 pc 0000007f9e433f50 pstate 0000000060000000
fpsr 00000000 fpcr 00000000
backtrace:
#00 pc 0000000000073f50 /system/bin/linker64 (__dl_tgkill+8)
#01 pc 0000000000073404 /system/bin/linker64 (__dl_pthread_kill+64)
#02 pc 0000000000066ef4 /system/bin/linker64 (__dl_raise+24)
#03 pc 00000000000648c8 /system/bin/linker64 (__dl_abort+52)
#04 pc 0000000000066a7c /system/bin/linker64 (_dl__libc_fatal+104)
#05 pc 000000000000fc98 /system/bin/linker64 (_dlZL29_linker_init_post_relocationR19KernelArgumentBlocky+3668)
#06 pc 000000000000eda4 /system/bin/linker64 (_dl__linker_init+528)
#07 pc 0000000000006c78 /system/bin/linker64 (_start+4)

主要表现为ls、sh、chmod、cat、getprop、app_process等系统bin高概率FC。

【问题分析】

coredump调用栈如下:

(gdb) bt
#0  __dl_tgkill () at bionic/libc/arch-arm64/syscalls/tgkill.S:9
#1  0x0000007f8dcc6408 in pthread_kill (t=<optimized out>, sig=6) at bionic/libc/bionic/pthread_kill.cpp:45
#2  0x0000007f8dcb9ef8 in raise (sig=8315) at bionic/libc/bionic/raise.cpp:34
#3  0x0000007f8dcb78cc in abort () at bionic/libc/bionic/abort.cpp:47
#4  0x0000007f8dcb9a80 in __libc_fatal (format=0x0) at bionic/libc/bionic/libc_logging.cpp:678
#5  0x0000007f8dc62c9c in __linker_init_post_relocation (args=..., linker_base=<optimized out>) at bionic/linker/linker.cpp:4270
#6  0x0000007f8dc61da8 in __linker_init (raw_args=<optimized out>) at bionic/linker/linker.cpp:4481
#7  0x0000007f8dc59c7c in __dl__start () at bionic/linker/arch/arm64/begin.S:33

关键点就是#5处:

@bionic/linker/linker.cpp
static ElfW(Addr)
__linker_init_post_relocation(KernelArgumentBlock& args, ElfW(Addr) linker_base) {
    ...
  if (!si->prelink_image()) {
    __libc_fatal("CANNOT LINK EXECUTABLE \"%s\": %s", args.argv[0], linker_get_error_buffer());
  }

 

看起来是prelink_image()的时候出错了,具体错误得看__libc_fatal的第三个参数linker_get_error_buffer():

@bionic/linker/linker.cpp
char* linker_get_error_buffer() {
  return &__linker_dl_err_buf[0];
}

错误值放在__linker_dl_err_buf这个buffer中,用gdb查看这个值:

(gdb) p __linker_dl_err_buf
$10 = "\"/system/lib/libc.so\" is 32-bit instead of 64-bit\000-bit", '\000' <repeats 713 times>

错误信息的意思是当前进程是64位的,但正在加载的so是32位的。大概是动态库的搜索路径错了。

代码中动态库的搜索路径默认值如下:

@bionic/linker/linker.cpp
static const char* const kDefaultLdPaths[] = {
#if defined(__LP64__)
  "/system/lib64",
  "/vendor/lib64",
#else
  "/system/lib",
  "/vendor/lib",
#endif
  nullptr
 };

 

显然不可能是默认值的问题,除了默认值,动态库的搜索路径还可以通过环境变量指定,

linker启动时先从环境变量中找LD_LIBRARY_PATH对应的值:

如果是空,则取默认值,否则取环境变量中的值:

@bionic/linker/linker.cpp
static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args, ElfW(Addr) linker_base) {
  ...
  const char* ldpath_env = nullptr;
  const char* ldpreload_env = nullptr;
  if (!getauxval(AT_SECURE)) {
    ldpath_env = getenv("LD_LIBRARY_PATH");
    if (ldpath_env != nullptr) {
      INFO("[ LD_LIBRARY_PATH set to \"%s\" ]", ldpath_env);
    }
    ldpreload_env = getenv("LD_PRELOAD");
    if (ldpreload_env != nullptr) {
      INFO("[ LD_PRELOAD set to \"%s\" ]", ldpreload_env);
    }
  }
  ...
  parse_LD_LIBRARY_PATH(ldpath_env);
  parse_LD_PRELOAD(ldpreload_env);

是不是app自定义的环境变量值有问题呢?读取全局变量environ:

(gdb) p environ
$11 = (char **) 0x7ffeb67898
 
(gdb) x /32gx 0x7ffeb67898
0x7ffeb67898:   0x0000007ffeb67b5d  0x0000007ffeb67b6e
0x7ffeb678a8:   0x0000007ffeb67b81  0x0000007ffeb67ba3
0x7ffeb678b8:   0x0000007ffeb67bb8  0x0000007ffeb67bcb
0x7ffeb678c8:   0x0000007ffeb67be6  0x0000007ffeb67c00
0x7ffeb678d8:   0x0000007ffeb67ed3  0x0000007ffeb67eec
0x7ffeb678e8:   0x0000007ffeb67f05  0x0000007ffeb67f2d
0x7ffeb678f8:   0x0000007ffeb67f6a  0x0000000000000000
0x7ffeb67908:   0x0000000000000021  0x0000007f8dc52000
0x7ffeb67918:   0x0000000000000010  0x00000000000000ff
0x7ffeb67928:   0x0000000000000006  0x0000000000001000
0x7ffeb67938:   0x0000000000000011  0x0000000000000064
0x7ffeb67948:   0x0000000000000003  0x0000005555555040
0x7ffeb67958:   0x0000000000000004  0x0000000000000038
0x7ffeb67968:   0x0000000000000005  0x0000000000000009
0x7ffeb67978:   0x0000000000000007  0x0000007f8dc53000
0x7ffeb67988:   0x0000000000000008  0x0000000000000000
 
(gdb) x /s 0x0000007ffeb67b5d
0x7ffeb67b5d:   "_=/system/bin/ls"
 
(gdb) x /s 0x0000007ffeb67b6e
0x7ffeb67b6e:   "ANDROID_DATA=/data"
 
(gdb) x /s 0x0000007ffeb67b81
0x7ffeb67b81:   "ANDROID_SOCKET_zygote_secondary=8"
 
(gdb) x /s 0x0000007ffeb67ba3
0x7ffeb67ba3:   "ANDROID_ROOT=/system"
 
(gdb) x /s 0x0000007ffeb67bb8
0x7ffeb67bb8:   "ANDROID_BOOTLOGO=1"
 
(gdb) x /s 0x0000007ffeb67bcb
0x7ffeb67bcb:   "ANDROID_ASSETS=/system/app"
 
(gdb) x /s 0x0000007ffeb67be6
0x7ffeb67be6:   "ASEC_MOUNTPOINT=/mnt/asec"
 
(gdb) x /s 0x0000007ffeb67c00
0x7ffeb67c00:   "BOOTCLASSPATH=/system/framework/core-oj.jar:/system/framework/core-libart.jar:/system/framework/conscrypt.jar:/system/framework/okhttp.jar:/system/framework/core-junit.jar:/system/framework/bouncycast"...
 
(gdb) x /s 0x0000007ffeb67ed3
0x7ffeb67ed3:   "EXTERNAL_STORAGE=/sdcard"
 
(gdb) x /s 0x0000007ffeb67eec
0x7ffeb67eec:   "ANDROID_STORAGE=/storage"
 
(gdb) x /s 0x0000007ffeb67f05
0x7ffeb67f05:   "LD_LIBRARY_PATH=/vendor/lib:/system/lib"
 
(gdb) x /s 0x0000007ffeb67f2d
0x7ffeb67f2d:   "PATH=/sbin:/vendor/bin:/system/sbin:/system/bin:/system/xbin"
 
(gdb) x /s 0x0000007ffeb67f6a
0x7ffeb67f6a:   "SYSTEMSERVERCLASSPATH=/system/framework/services.jar:/system/framework/ethernet-service.jar:/system/framework/wifi-service.jar"

果然环境变量LD_LIBRARY_PATH已经被设置过,为“/vendor/lib:/system/lib”,这个是32位动态库的路径,难怪程序会找32位的libc.so。

这个ls程序是在启动的时候挂的,显然不是ls程序自己设的环境变量,那只可能是父进程设置了这个环境变量。

为此专门抓了出现问题时的apk包:

u0_a125   20153 770   1731864 57016 SyS_epoll_ 00e85ae208 S com.tencent.android.qqdownloader:tools
u0_a125   20653 20153 7704   1424  sigsuspend 7f86155830 S sh
u0_a125   20657 20653 1704   428   do_signal_ 7f9a670f50 T ls

解压apk包,找到so目录grep一下:

lib/armeabi$ grep -rn LD_LIBRARY_PATH .
匹配到二进制文件 ./libaurora.so

libaurora.so这个库用到了LD_LIBRARY_PATH,那很可能就是这个库设置了环境变量:

lib/armeabi$ arm-linux-androideabi-readelf -s libaurora.so |grep setenv
    33: 00000000     0 FUNC    GLOBAL DEFAULT  UND setenv
 
lib/armeabi$ strings libaurora.so |grep vendor
/vendor/lib:/system/lib:%s
/vendor/lib:/system/lib

看来确实是app自己设置了环境变量,报类似问题的app很多,所以可能这个问题就是系统的缺陷,

只要在32位程序里面设置了环境变量LD_LIBRARY_PATH为/vendor/lib:/system/lib后再加载64位的程序,必然会FC。

为此,自己写了个demo验证问题:

diff --git a/samples/SimpleJNI/Android.mk b/samples/SimpleJNI/Android.mk
index a9600ef..1c15764 100644
--- a/samples/SimpleJNI/Android.mk
+++ b/samples/SimpleJNI/Android.mk
@@ -36,6 +36,8 @@ LOCAL_PROGUARD_ENABLED := disabled
  
 LOCAL_SDK_VERSION := current
  
+LOCAL_32_BIT_ONLY := true
+
 include $(BUILD_PACKAGE)
  
 # ============================================================
diff --git a/samples/SimpleJNI/jni/native.cpp b/samples/SimpleJNI/jni/native.cpp
index 853c3d9..5fb901e 100644
--- a/samples/SimpleJNI/jni/native.cpp
+++ b/samples/SimpleJNI/jni/native.cpp
@@ -18,6 +18,7 @@
 #include <utils/Log.h>
  
 #include <stdio.h>
+#include <stdlib.h>
  
 #include "jni.h"
  
@@ -25,6 +26,17 @@ static jint
 add(JNIEnv *env, jobject thiz, jint a, jint b) {
 int result = a + b;
     ALOGI("%d + %d = %d", a, b, result);
+
+    setenv("LD_LIBRARY_PATH","/vendor/lib:/system/lib",1);
+
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        execlp("/system/bin/ls", "ls", NULL);
+    }
     return result;
 }

push到手机中后运行apk,必现FC,调用栈一模一样,error code也一样。

 

【解决方案】

在linker中如果当前程序是64位的,且环境变量LD_LIBRARY_PATH里包含/vendor/lib或/system/lib,就转换成/vendor/lib64或/system/lib64。

修改后,运行demo不再FC。

 

posted @ 2017-05-15 21:06  YYPapa  阅读(1152)  评论(0)  编辑  收藏  举报