linux 内核启动流程
本文以Linux3.14版本源码为例分析其启动流程。各版本启动代码略有不同,但核心流程与思想万变不离其宗。
内核映像被加载到内存并获得控制权之后,内核启动流程开始。通常,内核映像以压缩形式存储,并不是一个可以执行的内核。因此,内核阶段的首要工作是自解压内核映像。
内核编译生成 vmlinux 后,通常会对其进行压缩,得到 zImage(小内核,小于512KB)或 bzImage(大内核,大于512KB)。在它们的头部嵌有解压缩程序。
通过 linux/arch/arm/boot/compressed 目录下的 Makefile 寻找到 vmlinux文件的链接脚本(vmlinux.lds),从中查找系统启动入口函数。
$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/$(HEAD) $(obj)/piggy.$(suffix_y).o \ $(addprefix $(obj)/, $(OBJS)) $(lib1funcs) $(ashldi3) \ $(bswapsdi2) FORCE @$(check_for_multiple_zreladdr) $(call if_changed,ld) @$(check_for_bad_syms)
vmlinux.lds(linux/arch/arm/kernel/vmlinux.lds)链接脚本开头内容
OUTPUT_ARCH(arm) ENTRY(stext) jiffies = jiffies_64; SECTIONS { 。 。 。
得到内核入口函数为 stext(linux/arch/arm/kernel/head.S)
内核引导阶段
ENTRY(stext) 。 。 。 bl __lookup_processor_type @ r5=procinfo r9=cpuid //处理器是否支持 movs r10, r5 @ invalid processor (r5=0)? THUMB( it eq ) @ force fixup-able long branch encoding beq __error_p @ yes, error 'p' //不支持则打印错误信息 。 。 。 bl __create_page_tables //创建页表 /* * The following calls CPU specific code in a position independent * manner. See arch/arm/mm/proc-*.S for details. r10 = base of * xxx_proc_info structure selected by __lookup_processor_type * above. On return, the CPU will be ready for the MMU to be * turned on, and r0 will hold the CPU control register value. */ ldr r13, =__mmap_switched @ address to jump to after //保存MMU使能后跳转地址 @ mmu has been enabled adr lr, BSYM(1f) @ return (PIC) address mov r8, r4 @ set TTBR1 to swapper_pg_dir ARM( add pc, r10, #PROCINFO_INITFUNC ) THUMB( add r12, r10, #PROCINFO_INITFUNC ) THUMB( mov pc, r12 ) 1: b __enable_mmu //使能MMU后跳转到__mmap_switched
查找标签__mmap_switched所在位置:/linux/arch/arm/kernel/head-common.S
__mmap_switched: /* * The following fragment of code is executed with the MMU on in MMU mode, * and uses absolute addresses; this is not position independent. * * r0 = cp#15 control register * r1 = machine ID * r2 = atags/dtb pointer * r9 = processor ID */ //保存设备信息、设备树及启动参数存储地址 。 。 。 b start_kernel
内核初始化阶段
从start_kernel函数开始,内核进入C语言部分,完成内核的大部分初始化工作。
函数所在位置:/linux/init/Main.c
start_kernel涉及大量初始化工作,只列举重要的初始化工作。
asmlinkage void __init start_kernel(void) { …… //类型判断 smp_setup_processor_id(); //smp相关,返回启动CPU号 …… local_irq_disable(); //关闭当前CPU中断 early_boot_irqs_disabled = true; /* * Interrupts are still disabled. Do necessary setups, then * enable them */ boot_cpu_init(); page_address_init(); //初始化页地址 pr_notice("%s", linux_banner); //显示内核版本信息 setup_arch(&command_line); mm_init_owner(&init_mm, &init_task); mm_init_cpumask(&init_mm); setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ build_all_zonelists(NULL, NULL); page_alloc_init(); //页内存申请初始化 pr_notice("Kernel command line: %s\n", boot_command_line); //打印内核启动命令行参数 parse_early_param(); parse_args("Booting kernel", static_command_line, __start___param, __stop___param - __start___param, -1, -1, &unknown_bootoption); …… /* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() * time - but meanwhile we still have a functioning scheduler. */ sched_init(); //进程调度器初始化 /* * Disable preemption - early bootup scheduling is extremely * fragile until we cpu_idle() for the first time. */ preempt_disable(); //禁止内核抢占 if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n")) local_irq_disable(); //检查关闭CPU中断 /*大量初始化内容 见名知意*/ idr_init_cache(); rcu_init(); tick_nohz_init(); context_tracking_init(); radix_tree_init(); /* init some links before init_ISA_irqs() */ early_irq_init(); init_IRQ(); tick_init(); init_timers(); hrtimers_init(); softirq_init(); timekeeping_init(); time_init(); sched_clock_postinit(); perf_event_init(); profile_init(); call_function_init(); WARN(!irqs_disabled(), "Interrupts were enabled early\n"); early_boot_irqs_disabled = false; local_irq_enable(); //本地中断可以使用了 kmem_cache_init_late(); /* * HACK ALERT! This is early. We're enabling the console before * we've done PCI setups etc, and console_init() must be aware of * this. But we do want output early, in case something goes wrong. */ console_init(); //初始化控制台,可以使用printk了 if (panic_later) panic("Too many boot %s vars at `%s'", panic_later, panic_param); lockdep_info(); /* * Need to run this when irqs are enabled, because it wants * to self-test [hard/soft]-irqs on/off lock inversion bugs * too: */ locking_selftest(); #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) { pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n", page_to_pfn(virt_to_page((void *)initrd_start)), min_low_pfn); initrd_start = 0; } #endif page_cgroup_init(); debug_objects_mem_init(); kmemleak_init(); setup_per_cpu_pageset(); numa_policy_init(); if (late_time_init) late_time_init(); sched_clock_init(); calibrate_delay(); pidmap_init(); anon_vma_init(); acpi_early_init(); #ifdef CONFIG_X86 if (efi_enabled(EFI_RUNTIME_SERVICES)) efi_enter_virtual_mode(); #endif #ifdef CONFIG_X86_ESPFIX64 /* Should be run before the first non-init thread is created */ init_espfix_bsp(); #endif thread_info_cache_init(); cred_init(); fork_init(totalram_pages); //初始化fork proc_caches_init(); buffer_init(); key_init(); security_init(); dbg_late_init(); vfs_caches_init(totalram_pages); //虚拟文件系统初始化 signals_init(); /* rootfs populating might need page-writeback */ page_writeback_init(); #ifdef CONFIG_PROC_FS proc_root_init(); #endif cgroup_init(); cpuset_init(); taskstats_init_early(); delayacct_init(); check_bugs(); sfi_init_late(); if (efi_enabled(EFI_RUNTIME_SERVICES)) { efi_late_init(); efi_free_boot_services(); } ftrace_init(); /* Do the rest non-__init'ed, we're now alive */ rest_init(); }
函数最后调用rest_init()函数
/*最重要使命:创建kernel_init进程,并进行后续初始化*/ static noinline void __init_refok rest_init(void) { int pid; rcu_scheduler_starting(); /* * We need to spawn init first so that it obtains pid 1, however * the init task will end up wanting to create kthreads, which, if * we schedule it before we create kthreadd, will OOPS. */ kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); //创建kernel_init进程 numa_default_policy(); pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); rcu_read_lock(); kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); rcu_read_unlock(); complete(&kthreadd_done); /* * The boot idle thread must execute schedule() * at least once to get things moving: */ init_idle_bootup_task(current); schedule_preempt_disabled(); /* Call into cpu_idle with preempt disabled */ //cpu_idle就是在系统闲置时用来降低电力的使用和减少热的产生的空转函数,函数至此不再返回,其余工作从kernel_init进程处发起 cpu_startup_entry(CPUHP_ONLINE); }
kernel_init函数将完成设备驱动程序的初始化,并调用init_post函数启动用户进程
部分书籍介绍的内核启动流程基于经典的2.6版本,kernel_init函数还会调用init_post函数专门负责_init进程的启动,现版本已经被整合到了一起。
static int __ref kernel_init(void *unused) { int ret; kernel_init_freeable(); //该函数中完成smp开启 驱动初始化 共享内存初始化等工作 /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); free_initmem(); //初始化尾声,清除内存无用数据 mark_rodata_ro(); system_state = SYSTEM_RUNNING; numa_default_policy(); flush_delayed_fput(); if (ramdisk_execute_command) { ret = run_init_process(ramdisk_execute_command); if (!ret) return 0; pr_err("Failed to execute %s (error %d)\n", ramdisk_execute_command, ret); } /* * We try each of these until one succeeds. * * The Bourne shell can be used instead of init if we are * trying to recover a really broken machine. *寻找init函数,创建一号进程_init (第一个用户空间进程)*/ if (execute_command) { ret = run_init_process(execute_command); if (!ret) return 0; pr_err("Failed to execute %s (error %d). Attempting defaults...\n", execute_command, ret); } if (!try_to_run_init_process("/sbin/init") || !try_to_run_init_process("/etc/init") || !try_to_run_init_process("/bin/init") || !try_to_run_init_process("/bin/sh")) return 0; panic("No working init found. Try passing init= option to kernel. " "See Linux Documentation/init.txt for guidance."); } static int __ref kernel_init(void *unused) { int ret; kernel_init_freeable(); //该函数中完成smp开启 驱动初始化 共享内存初始化等工作 /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); free_initmem(); //初始化尾声,清除内存无用数据 mark_rodata_ro(); system_state = SYSTEM_RUNNING; numa_default_policy(); flush_delayed_fput(); if (ramdisk_execute_command) { ret = run_init_process(ramdisk_execute_command); if (!ret) return 0; pr_err("Failed to execute %s (error %d)\n", ramdisk_execute_command, ret); } /* * We try each of these until one succeeds. * * The Bourne shell can be used instead of init if we are * trying to recover a really broken machine. *寻找init函数,创建一号进程_init (第一个用户空间进程)*/ if (execute_command) { ret = run_init_process(execute_command); if (!ret) return 0; pr_err("Failed to execute %s (error %d). Attempting defaults...\n", execute_command, ret); } if (!try_to_run_init_process("/sbin/init") || !try_to_run_init_process("/etc/init") || !try_to_run_init_process("/bin/init") || !try_to_run_init_process("/bin/sh")) return 0; panic("No working init found. Try passing init= option to kernel. " "See Linux Documentation/init.txt for guidance."); }
到此,内核初始化已经接近尾声,所有的初始化函数都已经调用,因此free_initmem函数可以舍弃内存的__init_begin至__init_end之间的数据。
当内核被引导并进行初始化后,内核启动了自己的第一个用户空间应用程序_init,这是调用的第一个使用标准C库编译的程序,其进程编号始终为1.
_init负责触发其他必须的进程,以使系统进入整体可用的状态。
以下为内核启动流程图: