linux kernel v6.6.31(LTS)
start_kernel()的实现在 /init/main.c

asmlinkage __visible __init __no_sanitize_address __noreturn __no_stack_protector
void start_kernel(void)

先解释一手上面一大串宏的作用:

  • asmlinkage: 这是一个汇编语言链接约定,用于告诉编译器这个函数的参数是从栈上获取的,而不是通过寄存器。在x86架构中,所有内核的入口点(包括中断和系统调用处理程序)都使用asmlinkage,因为当这些函数被调用时,寄存器可能正在被其他用途。
  • __visible: 这个宏确保函数对于链接器是可见的,这样它就可以被正确地放置在内核的二进制映像中,并可以被其他模块引用。
  • __init: 这个标记告诉编译器这个函数只在初始化阶段需要,一旦初始化完成,这个函数就不再需要了。编译器会因此将这个函数的代码放置在内核映像的一个特殊段中,在初始化之后这个段可以被释放,从而节省内存资源。
  • __no_sanitize_address: 这个修饰符用于告诉AddressSanitizer(一种内存错误检测工具)不要对函数进行检测。这通常用于那些不遵循普通内存访问规则的底层代码。
  • __noreturn: 这个修饰符表明函数不会返回给调用者。对于start_kernel来说,这个函数是内核执行的起点,它不会返回到之前的执行环境。
  • __no_stack_protector: 这个修饰符用于关闭堆栈保护,这通常用于那些对性能要求极高或者在特定情况下堆栈保护会导致问题的代码。

由于start_kernel()实在是太长,下面分段解释各个变量和函数的作用,必要时会给出子一级函数的定义和实现。下面看以下几个变量和初始化函数或宏:

char* command_line;
char* after_dashes;

以上两个变量存储引导过程中传递给内核的参数,可以用来配置内核相关功能

set_task_stack_end_magic(&init_task);
smp_setup_processor_id();
debug_objects_early_init();
init_vmlinux_build_id();
cgroup_init_early();
local_irq_disable();
early_boot_irqs_disabled = true;
  • set_task_stack_end_magic(&init_task) impl in /kernel/fork.c
void set_task_stack_end_magic(struct task_struct *tsk)
{
	unsigned long *stackend;

	stackend = end_of_stack(tsk);
	*stackend = STACK_END_MAGIC;	/* for overflow detection */
}

这个函数用于设置init进程(PID 0)的栈结束魔数(magic number)。这个魔数用于检测栈溢出。init_task是内核中的第一个任务,是所有其他任务的祖先。这个函数确保init_task的栈有一个已知的结束标记,以便在栈溢出时可以检测到。

魔数(magic number):特别的没有什么意义数值,此处的魔数是定义在`/include/uapi/linux/magic.h`中的`STACK_END_MAGIC`(0x57AC6E9D)
  • smp_setup_processor_id()
    这个函数用于在多处理器(SMP)系统中设置当前处理器的ID。在SMP系统中,每个CPU都需要知道自己的身份,以便内核可以正确地管理并发执行和处理器间的通信。
  • debug_objects_early_init() impl in /lib/debugobjects.c
void __init debug_objects_early_init(void)
{
	int i;

	for (i = 0; i < ODEBUG_HASH_SIZE; i++)
		raw_spin_lock_init(&obj_hash[i].lock);

	for (i = 0; i < ODEBUG_POOL_SIZE; i++)
		hlist_add_head(&obj_static_pool[i].node, &obj_pool);
}

这个函数用于初始化内核的调试对象子系统。调试对象是一种内核基础设施,用于跟踪内核中的对象(如锁、引用计数等)的生命周期,以帮助开发者发现和修复内存泄漏和死锁等问题。

  • init_vmlinux_build_id() impl in /lib/buildid.c
void __init init_vmlinux_build_id(void)
{
	extern const void __start_notes __weak;
	extern const void __stop_notes __weak;
	unsigned int size = &__stop_notes - &__start_notes;

	build_id_parse_buf(&__start_notes, vmlinux_build_id, size);
}

这个函数用于初始化内核映像的构建ID。构建ID是一个唯一标识内核构建的字符串,通常包含构建的时间戳和Git哈希值。这个ID可以帮助开发者追踪内核的版本和构建信息。

  • cgroup_init_early() impl in /kernel/cgroup/cgroup.c
int __init cgroup_init_early(void)
{
	static struct cgroup_fs_context __initdata ctx;
	struct cgroup_subsys *ss;
	int i;

	ctx.root = &cgrp_dfl_root;
	init_cgroup_root(&ctx);
	cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;

	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

	for_each_subsys(ss, i) {
		WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
		     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
		     ss->id, ss->name);
		WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
		     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);

		ss->id = i;
		ss->name = cgroup_subsys_name[i];
		if (!ss->legacy_name)
			ss->legacy_name = cgroup_subsys_name[i];

		if (ss->early_init)
			cgroup_init_subsys(ss, true);
	}
	return 0;
}

这个函数用于初始化控制组(cgroups)的早期阶段。cgroups是Linux内核的一个功能,用于限制、计算和隔离进程组使用的资源(如CPU、内存、网络等)。

  • local_irq_disable() impl in /include/linux/irqflags.h
    定义了CONFIG_TRACE_IRQFLAGS的话 local_irq_disable()的实现是
#define local_irq_disable()				        \
	do {						                \
		bool was_disabled = raw_irqs_disabled();\
		raw_local_irq_disable();		        \
		if (!was_disabled)			            \
			trace_hardirqs_off();		        \
	} while (0)

否则为

#define local_irq_disable()	do { raw_local_irq_disable(); } while (0)

这个宏用于在当前处理器上禁用本地中断。在内核初始化的早期阶段,中断可能会干扰内核的初始化过程,因此有时需要禁用它们。

  • early_boot_irqs_disabled = true
    这行代码将early_boot_irqs_disabled变量设置为true,表示在内核初始化的早期阶段,中断被禁用了。这个变量用于跟踪中断的状态,以确保在适当的时候重新启用中断。
boot_cpu_init();
page_address_init();
pr_notice("%s", linux_banner);
early_security_init();
setup_arch(&command_line); // important!
setup_boot_config();
setup_command_line(command_line);
setup_nr_cpu_ids();
setup_per_cpu_areas();
smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
boot_cpu_hotplug_init();
  • boot_cpu_init() impl in /kernel/cpu.c
void __init boot_cpu_init(void)
{
	int cpu = smp_processor_id();

	/* Mark the boot cpu "present", "online" etc for SMP and UP case */
	set_cpu_online(cpu, true);
	set_cpu_active(cpu, true);
	set_cpu_present(cpu, true);
	set_cpu_possible(cpu, true);

#ifdef CONFIG_SMP
	__boot_cpu_id = cpu;
#endif
}

初始化引导CPU(bootstrapping CPU)的数据结构。这通常包括设置CPU的标识信息,如CPU ID,以及可能的特定于CPU的初始化操作。

  • page_address_init()
    初始化页到物理地址的映射。这允许内核通过页结构访问实际的物理内存地址。
  • pr_notice("%s", linux_banner)
    打印Linux内核的版本信息,这通常是在内核启动日志中看到的第一个信息,用于显示内核版本、编译器版本等信息。
  • early_security_init() impl in /security/security.c
int __init early_security_init(void)
{
	struct lsm_info *lsm;

#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
	INIT_HLIST_HEAD(&security_hook_heads.NAME);
#include "linux/lsm_hook_defs.h"
#undef LSM_HOOK

	for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
		if (!lsm->enabled)
			lsm->enabled = &lsm_enabled_true;
		prepare_lsm(lsm);
		initialize_lsm(lsm);
	}

	return 0;
}

初始化内核的早期安全特性,如安全增强Linux(SELinux)或其他安全模块的早期设置。


  • setup_arch(&command_line) impl in /arch/<arch>/kernel/setup.c
    执行与体系结构相关的初始化。这个函数非常依赖于具体的硬件架构,它负责设置内核的中断处理、内存管理、CPU调度等。
    此函数非常重要,具体的实现太多,列出过于冗余,下面主要列出 x86 和 arm中的实现
  • x86下的实现
void __init setup_arch(char **cmdline_p)
{
#ifdef CONFIG_X86_32
	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));

	/*
	 * copy kernel address range established so far and switch
	 * to the proper swapper page table
	 */
	clone_pgd_range(swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
			initial_page_table + KERNEL_PGD_BOUNDARY,
			KERNEL_PGD_PTRS);

	load_cr3(swapper_pg_dir);
	/*
	 * Note: Quark X1000 CPUs advertise PGE incorrectly and require
	 * a cr3 based tlb flush, so the following __flush_tlb_all()
	 * will not flush anything because the CPU quirk which clears
	 * X86_FEATURE_PGE has not been invoked yet. Though due to the
	 * load_cr3() above the TLB has been flushed already. The
	 * quirk is invoked before subsequent calls to __flush_tlb_all()
	 * so proper operation is guaranteed.
	 */
	__flush_tlb_all();
#else
	printk(KERN_INFO "Command line: %s\n", boot_command_line);
	boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
#endif

	/*
	 * If we have OLPC OFW, we might end up relocating the fixmap due to
	 * reserve_top(), so do this before touching the ioremap area.
	 */
	olpc_ofw_detect();

	idt_setup_early_traps();
	early_cpu_init();
	jump_label_init();
	static_call_init();
	early_ioremap_init();

	setup_olpc_ofw_pgd();

	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
	screen_info = boot_params.screen_info;
	edid_info = boot_params.edid_info;
#ifdef CONFIG_X86_32
	apm_info.bios = boot_params.apm_bios_info;
	ist_info = boot_params.ist_info;
#endif
	saved_video_mode = boot_params.hdr.vid_mode;
	bootloader_type = boot_params.hdr.type_of_loader;
	if ((bootloader_type >> 4) == 0xe) {
		bootloader_type &= 0xf;
		bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
	}
	bootloader_version  = bootloader_type & 0xf;
	bootloader_version |= boot_params.hdr.ext_loader_ver << 4;

#ifdef CONFIG_BLK_DEV_RAM
	rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
#endif
#ifdef CONFIG_EFI
	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
		     EFI32_LOADER_SIGNATURE, 4)) {
		set_bit(EFI_BOOT, &efi.flags);
	} else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
		     EFI64_LOADER_SIGNATURE, 4)) {
		set_bit(EFI_BOOT, &efi.flags);
		set_bit(EFI_64BIT, &efi.flags);
	}
#endif

	x86_init.oem.arch_setup();

	/*
	 * Do some memory reservations *before* memory is added to memblock, so
	 * memblock allocations won't overwrite it.
	 *
	 * After this point, everything still needed from the boot loader or
	 * firmware or kernel text should be early reserved or marked not RAM in
	 * e820. All other memory is free game.
	 *
	 * This call needs to happen before e820__memory_setup() which calls the
	 * xen_memory_setup() on Xen dom0 which relies on the fact that those
	 * early reservations have happened already.
	 */
	early_reserve_memory();

	iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
	e820__memory_setup();
	parse_setup_data();

	copy_edd();

	if (!boot_params.hdr.root_flags)
		root_mountflags &= ~MS_RDONLY;
	setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end);

	code_resource.start = __pa_symbol(_text);
	code_resource.end = __pa_symbol(_etext)-1;
	rodata_resource.start = __pa_symbol(__start_rodata);
	rodata_resource.end = __pa_symbol(__end_rodata)-1;
	data_resource.start = __pa_symbol(_sdata);
	data_resource.end = __pa_symbol(_edata)-1;
	bss_resource.start = __pa_symbol(__bss_start);
	bss_resource.end = __pa_symbol(__bss_stop)-1;

#ifdef CONFIG_CMDLINE_BOOL
#ifdef CONFIG_CMDLINE_OVERRIDE
	strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
#else
	if (builtin_cmdline[0]) {
		/* append boot loader cmdline to builtin */
		strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
		strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
		strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
	}
#endif
#endif

	strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
	*cmdline_p = command_line;

	/*
	 * x86_configure_nx() is called before parse_early_param() to detect
	 * whether hardware doesn't support NX (so that the early EHCI debug
	 * console setup can safely call set_fixmap()).
	 */
	x86_configure_nx();

	parse_early_param();

	if (efi_enabled(EFI_BOOT))
		efi_memblock_x86_reserve_range();

#ifdef CONFIG_MEMORY_HOTPLUG
	/*
	 * Memory used by the kernel cannot be hot-removed because Linux
	 * cannot migrate the kernel pages. When memory hotplug is
	 * enabled, we should prevent memblock from allocating memory
	 * for the kernel.
	 *
	 * ACPI SRAT records all hotpluggable memory ranges. But before
	 * SRAT is parsed, we don't know about it.
	 *
	 * The kernel image is loaded into memory at very early time. We
	 * cannot prevent this anyway. So on NUMA system, we set any
	 * node the kernel resides in as un-hotpluggable.
	 *
	 * Since on modern servers, one node could have double-digit
	 * gigabytes memory, we can assume the memory around the kernel
	 * image is also un-hotpluggable. So before SRAT is parsed, just
	 * allocate memory near the kernel image to try the best to keep
	 * the kernel away from hotpluggable memory.
	 */
	if (movable_node_is_enabled())
		memblock_set_bottom_up(true);
#endif

	x86_report_nx();

	apic_setup_apic_calls();

	if (acpi_mps_check()) {
#ifdef CONFIG_X86_LOCAL_APIC
		apic_is_disabled = true;
#endif
		setup_clear_cpu_cap(X86_FEATURE_APIC);
	}

	e820__reserve_setup_data();
	e820__finish_early_params();

	if (efi_enabled(EFI_BOOT))
		efi_init();

	reserve_ibft_region();
	x86_init.resources.dmi_setup();

	/*
	 * VMware detection requires dmi to be available, so this
	 * needs to be done after dmi_setup(), for the boot CPU.
	 * For some guest types (Xen PV, SEV-SNP, TDX) it is required to be
	 * called before cache_bp_init() for setting up MTRR state.
	 */
	init_hypervisor_platform();

	tsc_early_init();
	x86_init.resources.probe_roms();

	/* after parse_early_param, so could debug it */
	insert_resource(&iomem_resource, &code_resource);
	insert_resource(&iomem_resource, &rodata_resource);
	insert_resource(&iomem_resource, &data_resource);
	insert_resource(&iomem_resource, &bss_resource);

	e820_add_kernel_range();
	trim_bios_range();
#ifdef CONFIG_X86_32
	if (ppro_with_ram_bug()) {
		e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM,
				  E820_TYPE_RESERVED);
		e820__update_table(e820_table);
		printk(KERN_INFO "fixed physical RAM map:\n");
		e820__print_table("bad_ppro");
	}
#else
	early_gart_iommu_check();
#endif

	/*
	 * partially used pages are not usable - thus
	 * we are rounding upwards:
	 */
	max_pfn = e820__end_of_ram_pfn();

	/* update e820 for memory not covered by WB MTRRs */
	cache_bp_init();
	if (mtrr_trim_uncached_memory(max_pfn))
		max_pfn = e820__end_of_ram_pfn();

	max_possible_pfn = max_pfn;

	/*
	 * Define random base addresses for memory sections after max_pfn is
	 * defined and before each memory section base is used.
	 */
	kernel_randomize_memory();

#ifdef CONFIG_X86_32
	/* max_low_pfn get updated here */
	find_low_pfn_range();
#else
	check_x2apic();

	/* How many end-of-memory variables you have, grandma! */
	/* need this before calling reserve_initrd */
	if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
		max_low_pfn = e820__end_of_low_ram_pfn();
	else
		max_low_pfn = max_pfn;

	high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
#endif

	/*
	 * Find and reserve possible boot-time SMP configuration:
	 */
	find_smp_config();

	early_alloc_pgt_buf();

	/*
	 * Need to conclude brk, before e820__memblock_setup()
	 * it could use memblock_find_in_range, could overlap with
	 * brk area.
	 */
	reserve_brk();

	cleanup_highmap();

	memblock_set_current_limit(ISA_END_ADDRESS);
	e820__memblock_setup();

	/*
	 * Needs to run after memblock setup because it needs the physical
	 * memory size.
	 */
	sev_setup_arch();
	cc_random_init();

	efi_fake_memmap();
	efi_find_mirror();
	efi_esrt_init();
	efi_mokvar_table_init();

	/*
	 * The EFI specification says that boot service code won't be
	 * called after ExitBootServices(). This is, in fact, a lie.
	 */
	efi_reserve_boot_services();

	/* preallocate 4k for mptable mpc */
	e820__memblock_alloc_reserved_mpc_new();

#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
	setup_bios_corruption_check();
#endif

#ifdef CONFIG_X86_32
	printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
			(max_pfn_mapped<<PAGE_SHIFT) - 1);
#endif

	/*
	 * Find free memory for the real mode trampoline and place it there. If
	 * there is not enough free memory under 1M, on EFI-enabled systems
	 * there will be additional attempt to reclaim the memory for the real
	 * mode trampoline at efi_free_boot_services().
	 *
	 * Unconditionally reserve the entire first 1M of RAM because BIOSes
	 * are known to corrupt low memory and several hundred kilobytes are not
	 * worth complex detection what memory gets clobbered. Windows does the
	 * same thing for very similar reasons.
	 *
	 * Moreover, on machines with SandyBridge graphics or in setups that use
	 * crashkernel the entire 1M is reserved anyway.
	 */
	x86_platform.realmode_reserve();

	init_mem_mapping();

	idt_setup_early_pf();

	/*
	 * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
	 * with the current CR4 value.  This may not be necessary, but
	 * auditing all the early-boot CR4 manipulation would be needed to
	 * rule it out.
	 *
	 * Mask off features that don't work outside long mode (just
	 * PCIDE for now).
	 */
	mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE;

	memblock_set_current_limit(get_max_mapped());

	/*
	 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
	 */

#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
	if (init_ohci1394_dma_early)
		init_ohci1394_dma_on_all_controllers();
#endif
	/* Allocate bigger log buffer */
	setup_log_buf(1);

	if (efi_enabled(EFI_BOOT)) {
		switch (boot_params.secure_boot) {
		case efi_secureboot_mode_disabled:
			pr_info("Secure boot disabled\n");
			break;
		case efi_secureboot_mode_enabled:
			pr_info("Secure boot enabled\n");
			break;
		default:
			pr_info("Secure boot could not be determined\n");
			break;
		}
	}

	reserve_initrd();

	acpi_table_upgrade();
	/* Look for ACPI tables and reserve memory occupied by them. */
	acpi_boot_table_init();

	vsmp_init();

	io_delay_init();

	early_platform_quirks();

	early_acpi_boot_init();

	initmem_init();
	dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);

	if (boot_cpu_has(X86_FEATURE_GBPAGES))
		hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);

	/*
	 * Reserve memory for crash kernel after SRAT is parsed so that it
	 * won't consume hotpluggable memory.
	 */
	reserve_crashkernel();

	memblock_find_dma_reserve();

	if (!early_xdbc_setup_hardware())
		early_xdbc_register_console();

	x86_init.paging.pagetable_init();

	kasan_init();

	/*
	 * Sync back kernel address range.
	 *
	 * FIXME: Can the later sync in setup_cpu_entry_areas() replace
	 * this call?
	 */
	sync_initial_page_table();

	tboot_probe();

	map_vsyscall();

	x86_32_probe_apic();

	early_quirks();

	/*
	 * Read APIC and some other early information from ACPI tables.
	 */
	acpi_boot_init();
	x86_dtb_init();

	/*
	 * get boot-time SMP configuration:
	 */
	get_smp_config();

	/*
	 * Systems w/o ACPI and mptables might not have it mapped the local
	 * APIC yet, but prefill_possible_map() might need to access it.
	 */
	init_apic_mappings();

	prefill_possible_map();

	init_cpu_to_node();
	init_gi_nodes();

	io_apic_init_mappings();

	x86_init.hyper.guest_late_init();

	e820__reserve_resources();
	e820__register_nosave_regions(max_pfn);

	x86_init.resources.reserve_resources();

	e820__setup_pci_gap();

#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
	if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
		conswitchp = &vga_con;
#endif
#endif
	x86_init.oem.banner();

	x86_init.timers.wallclock_init();

	/*
	 * This needs to run before setup_local_APIC() which soft-disables the
	 * local APIC temporarily and that masks the thermal LVT interrupt,
	 * leading to softlockups on machines which have configured SMI
	 * interrupt delivery.
	 */
	therm_lvt_init();

	mcheck_init();

	register_refined_jiffies(CLOCK_TICK_RATE);

#ifdef CONFIG_EFI
	if (efi_enabled(EFI_BOOT))
		efi_apply_memmap_quirks();
#endif

	unwind_init();
}
  • arm中的实现
void __init setup_arch(char **cmdline_p)
{
	const struct machine_desc *mdesc = NULL;
	void *atags_vaddr = NULL;

	if (__atags_pointer)
		atags_vaddr = FDT_VIRT_BASE(__atags_pointer);

	setup_processor();
	if (atags_vaddr) {
		mdesc = setup_machine_fdt(atags_vaddr);
		if (mdesc)
			memblock_reserve(__atags_pointer,
					 fdt_totalsize(atags_vaddr));
	}
	if (!mdesc)
		mdesc = setup_machine_tags(atags_vaddr, __machine_arch_type);
	if (!mdesc) {
		early_print("\nError: invalid dtb and unrecognized/unsupported machine ID\n");
		early_print("  r1=0x%08x, r2=0x%08x\n", __machine_arch_type,
			    __atags_pointer);
		if (__atags_pointer)
			early_print("  r2[]=%*ph\n", 16, atags_vaddr);
		dump_machine_table();
	}

	machine_desc = mdesc;
	machine_name = mdesc->name;
	dump_stack_set_arch_desc("%s", mdesc->name);

	if (mdesc->reboot_mode != REBOOT_HARD)
		reboot_mode = mdesc->reboot_mode;

	setup_initial_init_mm(_text, _etext, _edata, _end);

	/* populate cmd_line too for later use, preserving boot_command_line */
	strscpy(cmd_line, boot_command_line, COMMAND_LINE_SIZE);
	*cmdline_p = cmd_line;

	early_fixmap_init();
	early_ioremap_init();

	parse_early_param();

#ifdef CONFIG_MMU
	early_mm_init(mdesc);
#endif
	setup_dma_zone(mdesc);
	xen_early_init();
	arm_efi_init();
	/*
	 * Make sure the calculation for lowmem/highmem is set appropriately
	 * before reserving/allocating any memory
	 */
	adjust_lowmem_bounds();
	arm_memblock_init(mdesc);
	/* Memory may have been removed so recalculate the bounds. */
	adjust_lowmem_bounds();

	early_ioremap_reset();

	paging_init(mdesc);
	kasan_init();
	request_standard_resources(mdesc);

	if (mdesc->restart) {
		__arm_pm_restart = mdesc->restart;
		register_restart_handler(&arm_restart_nb);
	}

	unflatten_device_tree();

	arm_dt_init_cpu_maps();
	psci_dt_init();
#ifdef CONFIG_SMP
	if (is_smp()) {
		if (!mdesc->smp_init || !mdesc->smp_init()) {
			if (psci_smp_available())
				smp_set_ops(&psci_smp_ops);
			else if (mdesc->smp)
				smp_set_ops(mdesc->smp);
		}
		smp_init_cpus();
		smp_build_mpidr_hash();
	}
#endif

	if (!is_smp())
		hyp_mode_check();

	reserve_crashkernel();

#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
	conswitchp = &vga_con;
#endif
#endif

	if (mdesc->init_early)
		mdesc->init_early();
}

  • setup_boot_config() impl in /init/main.c
static void __init setup_boot_config(void)
{
	static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;
	const char *msg, *data;
	int pos, ret;
	size_t size;
	char *err;

	/* Cut out the bootconfig data even if we have no bootconfig option */
	data = get_boot_config_from_initrd(&size);
	/* If there is no bootconfig in initrd, try embedded one. */
	if (!data)
		data = xbc_get_embedded_bootconfig(&size);

	strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
	err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL,
			 bootconfig_params);

	if (IS_ERR(err) || !(bootconfig_found || IS_ENABLED(CONFIG_BOOT_CONFIG_FORCE)))
		return;

	/* parse_args() stops at the next param of '--' and returns an address */
	if (err)
		initargs_offs = err - tmp_cmdline;

	if (!data) {
		/* If user intended to use bootconfig, show an error level message */
		if (bootconfig_found)
			pr_err("'bootconfig' found on command line, but no bootconfig found\n");
		else
			pr_info("No bootconfig data provided, so skipping bootconfig");
		return;
	}

	if (size >= XBC_DATA_MAX) {
		pr_err("bootconfig size %ld greater than max size %d\n",
			(long)size, XBC_DATA_MAX);
		return;
	}

	ret = xbc_init(data, size, &msg, &pos);
	if (ret < 0) {
		if (pos < 0)
			pr_err("Failed to init bootconfig: %s.\n", msg);
		else
			pr_err("Failed to parse bootconfig: %s at %d.\n",
				msg, pos);
	} else {
		xbc_get_info(&ret, NULL);
		pr_info("Load bootconfig: %ld bytes %d nodes\n", (long)size, ret);
		/* keys starting with "kernel." are passed via cmdline */
		extra_command_line = xbc_make_cmdline("kernel");
		/* Also, "init." keys are init arguments */
		extra_init_args = xbc_make_cmdline("init");
	}
	return;
}

static void __init setup_boot_config(void)
{
	/* Remove bootconfig data from initrd */
	get_boot_config_from_initrd(NULL);
}

设置引导配置, __init标识下说明setup_boot_config()函数是内核函数,链接器有特殊的处理保证不会出现链接错误(具体是什么机制,我也没有查到准确说法)


补充

在Linux操作系统中,initrd(Initial RAM Disk)是一个临时的根文件系统,它在内核启动的早期阶段被加载到内存中,用于提供必要的文件和驱动程序,以便内核能够访问真正的根文件系统。initrd通常被用于当内核还没有加载所有必要的驱动程序来访问硬盘上的文件系统时,或者当根文件系统需要通过网络启动时。

initrd的主要作用如下:

  • 提供驱动: initrd可以包含必要的驱动程序,特别是那些用于磁盘和文件系统的驱动程序,这些驱动程序可能不在内核映像中。这样,即使内核本身不支持某种存储设备,也可以通过initrd中的驱动程序来访问。
  • 准备根文件系统: initrd可以包含工具和脚本,用于挂载真正的根文件系统。例如,它可以包含逻辑卷管理(LVM)或软件RAID的工具,这些工具可以帮助挂载复杂的文件系统配置。
    转换根文件系统: 在某些情况下,initrd可以用于转换根文件系统的状态,例如,从只读状态切换到读写状态,或者从网络文件系统切换到本地文件系统。
  • 引导加载: initrd可以用于网络引导(PXE)的情况,其中整个操作系统是通过网络下载的,包括内核和根文件系统。
    initrd通常是一个压缩的文件系统映像,它在引导过程中由引导加载程序(如GRUB uBoot LILO)加载到内存中。一旦内核启动并运行,它会解压initrd并将其作为临时根文件系统使用。然后,内核会执行initrd中的/init脚本,该脚本负责初始化硬件、加载必要的驱动程序,并最终挂载真正的根文件系统。

在现代Linux系统中,initrd已经被initramfs(Initial RAM File System)所取代。initramfs是一个基于cpio的文件系统映像,它可以在引导时动态地解压到根文件系统中。与initrd相比,initramfs更加灵活,因为它可以动态地调整大小以适应不同的引导情况。


  • setup_command_line(command_line) impl in /init/main.c
/*
 * We need to store the untouched command line for future reference.
 * We also need to store the touched command line since the parameter
 * parsing is performed in place, and we should allow a component to
 * store reference of name/value for future reference.
 */
static void __init setup_command_line(char *command_line)
{
	size_t len, xlen = 0, ilen = 0;

	if (extra_command_line)
		xlen = strlen(extra_command_line);
	if (extra_init_args)
		ilen = strlen(extra_init_args) + 4; /* for " -- " */

	len = xlen + strlen(boot_command_line) + 1;

	saved_command_line = memblock_alloc(len + ilen, SMP_CACHE_BYTES);
	if (!saved_command_line)
		panic("%s: Failed to allocate %zu bytes\n", __func__, len + ilen);

	len = xlen + strlen(command_line) + 1;

	static_command_line = memblock_alloc(len, SMP_CACHE_BYTES);
	if (!static_command_line)
		panic("%s: Failed to allocate %zu bytes\n", __func__, len);

	if (xlen) {
		/*
		 * We have to put extra_command_line before boot command
		 * lines because there could be dashes (separator of init
		 * command line) in the command lines.
		 */
		strcpy(saved_command_line, extra_command_line);
		strcpy(static_command_line, extra_command_line);
	}
	strcpy(saved_command_line + xlen, boot_command_line);
	strcpy(static_command_line + xlen, command_line);

	if (ilen) {
		/*
		 * Append supplemental init boot args to saved_command_line
		 * so that user can check what command line options passed
		 * to init.
		 * The order should always be
		 * " -- "[bootconfig init-param][cmdline init-param]
		 */
		if (initargs_offs) {
			len = xlen + initargs_offs;
			strcpy(saved_command_line + len, extra_init_args);
			len += ilen - 4;	/* strlen(extra_init_args) */
			strcpy(saved_command_line + len,
				boot_command_line + initargs_offs - 1);
		} else {
			len = strlen(saved_command_line);
			strcpy(saved_command_line + len, " -- ");
			len += 4;
			strcpy(saved_command_line + len, extra_init_args);
		}
	}

	saved_command_line_len = strlen(saved_command_line);
}

处理内核命令行参数,这些参数通常由引导加载程序(如GRUB, LILO)传递给内核。

  • setup_nr_cpu_ids()
    设置系统中CPU的总数。这告诉内核有多少个CPU需要初始化和调度。
  • setup_per_cpu_areas()
    为每个CPU分配和初始化 per-cpu 数据区域,用于存储每个CPU的私有数据,如寄存器状态等。
  • smp_prepare_boot_cpu()
    为引导CPU准备 SMP(对称多处理)环境设置CPU间通信的必要数据结构和同步机制。
  • boot_cpu_hotplug_init()
    初始化CPU热插拔支持。允许在系统运行时动态地添加或移除CPU。

TO BE CONTINUE...