copyright_author: 科学边界
copyright_author_href: https://www.daodaodao123.com/
copyright_info: This article is copyrighted by 科学边界. If you repost it, please credit the original author.
copyright_url: https://www.daodaodao123.com/?p=715
First comes the boot program, i.e. the bootloader. In short, the bootloader does the following:
(1) initialise physical memory;
(2) set up the device tree;
(3) decompress the kernel image and load it to the kernel's run address (optional);
(4) jump to the kernel entry address.
From here on we enter Linux territory.
1. The kernel's start address
The first file to look at is "arch/arm64/kernel/vmlinux.lds.S", the Linux kernel's linker script.
OUTPUT_ARCH(aarch64) /// output object format is aarch64
ENTRY(_text) /// kernel entry address
The Linux kernel's memory layout definition:
/***************************************************************************
 * Kernel memory layout:
 * includes the code section (.text), read-only data (.rodata), the init
 * sections (.init.*), the .bss section, and so on.
 * Prints for several common addresses were added in arch/arm64/mm/init.c
 *
 **************************************************************************/
SECTIONS
{
. = KIMAGE_VADDR; /// the kernel's starting link address, ...
.head.text : {
_text = .; /// kernel entry address
HEAD_TEXT
}
.text : ALIGN(SEGMENT_ALIGN) { /* Real text segment */
_stext = .; /* Text and read-only data */
IRQENTRY_TEXT
SOFTIRQENTRY_TEXT
ENTRY_TEXT
TEXT_TEXT
SCHED_TEXT
CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
HYPERVISOR_TEXT
IDMAP_TEXT
HIBERNATE_TEXT
TRAMP_TEXT
*(.fixup)
*(.gnu.warning)
. = ALIGN(16);
*(.got) /* Global offset table */
}
}
Here let's trace this on one of our company's Qualcomm projects, the SM4450 platform, by reading vmlinux with readelf.
From the SECTIONS in vmlinux.lds.S we know that KIMAGE_VADDR corresponds to [ 1] .head.text in the readelf output, so KIMAGE_VADDR is 0xffffffc008000000.
llvm-readelf.exe -S D:\dump\vmlinux\vmlinux
There are 53 section headers, starting at offset 0x1c8612f0:
Section Headers:
[Nr] Name Type Address Off Size ES Flg Lk Inf Al
[ 0] NULL 0000000000000000 000000 000000 00 0 0 0
[ 1] .head.text PROGBITS ffffffc008000000 010000 010000 00 AX 0 0 65536
[ 2] .text PROGBITS ffffffc008010000 020000 14131d8 00 WAX 0 0 4096
[ 3] .rodata PROGBITS ffffffc009430000 1440000 aa45b3 00 WAMS 0 0 4096
[ 4] .rodata1 PROGBITS ffffffc009ed45b3 1ee45b3 000000 00 WA 0 0 1
[ 5] .pci_fixup PROGBITS ffffffc009ed45c0 1ee45c0 0024c0 00 A 0 0 16
[ 6] .builtin_fw PROGBITS ffffffc009ed6a80 1ee6a80 000000 00 A 0 0 8
[ 7] __ksymtab PROGBITS ffffffc009ed6a80 1ee6a80 0084fc 00 A 0 0 4
[ 8] __ksymtab_gpl PROGBITS ffffffc009edef7c 1eeef7c 00c660 00 A 0 0 4
[ 9] __ksymtab_unused PROGBITS ffffffc009eeb5dc 1efb5dc 000000 00 A 0 0 1
[10] __ksymtab_unused_gpl PROGBITS ffffffc009eeb5dc 1efb5dc 000000 00 A 0 0 1
[11] __ksymtab_gpl_future PROGBITS ffffffc009eeb5dc 1efb5dc 000000 00 A 0 0 1
[12] __kcrctab PROGBITS ffffffc009eeb5dc 1efb5dc 002c54 00 A 0 0 4
[13] __kcrctab_gpl PROGBITS ffffffc009eee230 1efe230 004220 00 A 0 0 1
[14] __kcrctab_unused PROGBITS ffffffc009ef2450 1f02450 000000 00 A 0 0 1
[15] __kcrctab_unused_gpl PROGBITS ffffffc009ef2450 1f02450 000000 00 A 0 0 1
[16] __kcrctab_gpl_future PROGBITS ffffffc009ef2450 1f02450 000000 00 A 0 0 1
[17] __ksymtab_strings PROGBITS ffffffc009ef2450 1f02450 02816c 01 AMS 0 0 1
[18] __init_rodata PROGBITS ffffffc009f1a5bc 1f2a5bc 000000 00 A 0 0 1
[19] __param PROGBITS ffffffc009f1a5c0 1f2a5c0 003778 00 A 0 0 8
[20] __modver PROGBITS ffffffc009f1dd38 1f2dd38 0000d8 00 A 0 0 8
[21] __ex_table PROGBITS ffffffc009f1de10 1f2de10 002d68 00 A 0 0 8
[22] .notes NOTE ffffffc009f20b78 1f30b78 00003c 00 A 0 0 4
[23] .hyp.rodata PROGBITS ffffffc009f21000 1f31000 001000 00 WAMS 0 0 8
[24] .init.text PROGBITS ffffffc009f30000 1f40000 05ec8c 00 AX 0 0 4
[25] .exit.text PROGBITS ffffffc009f8ec8c 1f9ec8c 0082dc 00 AX 0 0 4
[26] .altinstructions PROGBITS ffffffc009f96f68 1fa6f68 0cb5bc 00 A 0 0 1
[27] .init.data PROGBITS ffffffc00a070000 2080000 015f35 00 WAMS 0 0 256
[28] .data..percpu PROGBITS ffffffc00a086000 2096000 017898 00 WA 0 0 64
[29] .hyp.data..percpu PROGBITS ffffffc00a09e000 20ae000 000e88 00 WA 0 0 16
[30] .hyp.reloc PROGBITS ffffffc00a09ee88 20aee88 000064 00 A 0 0 4
[31] .rela.dyn RELA ffffffc00a09eef0 20aeef0 000078 18 A 0 0 8
[32] .relr.dyn ANDROID_RELR ffffffc00a09ef68 20aef68 008f20 08 A 0 0 8
[33] .data PROGBITS ffffffc00a0b0000 20c0000 1c0aa0 00 WA 0 0 4096
[34] __bug_table PROGBITS ffffffc00a270aa0 2280aa0 0205b0 00 WA 0 0 4
[35] .mmuoff.data.write PROGBITS ffffffc00a291800 22a1800 000018 00 WA 0 0 2048
[36] .mmuoff.data.read PROGBITS ffffffc00a292000 22a2000 000008 00 WA 0 0 8
[37] .pecoff_edata_padding PROGBITS ffffffc00a292008 22a2008 0001f8 00 WA 0 0 1
[38] .sbss PROGBITS ffffffc00a293000 22a2200 000000 00 WA 0 0 1
[39] .bss NOBITS ffffffc00a293000 22a3000 0f7c94 00 WA 0 0 4096
[40] .eh_frame PROGBITS ffffffc00a390000 22b0000 02ba94 00 A 0 0 8
[41] .debug_aranges PROGBITS 0000000000000000 22dba94 000a20 00 0 0 1
[42] .debug_info PROGBITS 0000000000000000 22dc4b4 1261748f 00 0 0 1
[43] .debug_abbrev PROGBITS 0000000000000000 148f3943 3ab68a 00 0 0 1
[44] .debug_line PROGBITS 0000000000000000 14c9efcd 15eff48 00 0 0 1
[45] .debug_frame PROGBITS 0000000000000000 1628ef18 25c2d0 00 0 0 8
[46] .debug_str PROGBITS 0000000000000000 164eb1e8 498de6 01 MS 0 0 1
[47] .debug_loc PROGBITS 0000000000000000 16983fce 4187c98 00 0 0 1
[48] .debug_ranges PROGBITS 0000000000000000 1ab0bc66 b21e80 00 0 0 1
[49] .comment PROGBITS 0000000000000000 1b62dae6 000116 01 MS 0 0 1
[50] .symtab SYMTAB 0000000000000000 1b62dc00 cb2320 18 51 510804 8
[51] .strtab STRTAB 0000000000000000 1c2dff20 581142 00 0 0 1
[52] .shstrtab STRTAB 0000000000000000 1c861062 00028b 00 0 0 1
Key to Flags:
W (write), A (alloc), X (execute), M (merge), S (strings), I (info),
L (link order), O (extra OS processing required), G (group), T (TLS),
C (compressed), x (unknown), o (OS specific), E (exclude),
R (retain), p (processor specific)
Now let's compute this value from the code:
///arch/arm64/include/asm/memory.h
#define KIMAGE_VADDR (MODULES_END)
#define MODULES_END (MODULES_VADDR + MODULES_VSIZE)
// the size of MODULES_VSIZE
#define MODULES_VSIZE (SZ_128M)
#define SZ_128M 0x08000000
// the value of MODULES_VADDR
#define MODULES_VADDR (KASAN_SHADOW_END)
// Which branch is taken depends on whether these two configs are set. I exported /proc/config.gz from the phone and confirmed that both of them are not set.
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
#define KASAN_SHADOW_END ((UL(1) << (64 - KASAN_SHADOW_SCALE_SHIFT)) \
+ KASAN_SHADOW_OFFSET)
#define KASAN_THREAD_SHIFT 1
#else
#define KASAN_THREAD_SHIFT 0
#define KASAN_SHADOW_END (_PAGE_END(VA_BITS_MIN))
#endif /* CONFIG_KASAN */
// determining VA_BITS_MIN
#if VA_BITS > 48
#define VA_BITS_MIN (48)
#else
#define VA_BITS_MIN (VA_BITS) // we take this branch: CONFIG_ARM64_VA_BITS_39=y is set, so VA_BITS_MIN = 39
#endif
// the _PAGE_END macro
// It builds a mask from the given va: bits 63 down to bit (va-1) are 1, and all lower bits are 0.
#define _PAGE_END(va) (-(UL(1) << ((va) - 1)))
Following the analysis above, the kernel's start address is:
-(1 << (39 - 1)) + 0x08000000
= -(0x4000000000) + 0x08000000 // the negation is in two's complement; on a 64-bit system it equals 0xFFFFFFC000000000
= 0xFFFFFFC000000000 + 0x08000000
= 0xFFFFFFC008000000
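As a quick sanity check, the same arithmetic can be reproduced in a tiny standalone C program (a sketch; the constants are simply copied from the configuration discussed above: VA_BITS = 39, KASAN disabled):
#include <stdio.h>

/* Constants copied from arch/arm64/include/asm/memory.h for the non-KASAN, VA_BITS=39 case */
#define SZ_128M        0x08000000UL
#define VA_BITS_MIN    39
#define _PAGE_END(va)  (-(1UL << ((va) - 1)))        /* KASAN_SHADOW_END when KASAN is off */
#define MODULES_VADDR  _PAGE_END(VA_BITS_MIN)
#define MODULES_VSIZE  SZ_128M
#define MODULES_END    (MODULES_VADDR + MODULES_VSIZE)
#define KIMAGE_VADDR   MODULES_END

int main(void)
{
	printf("MODULES_VADDR = 0x%lx\n", MODULES_VADDR); /* 0xffffffc000000000 */
	printf("KIMAGE_VADDR  = 0x%lx\n", KIMAGE_VADDR);  /* 0xffffffc008000000 */
	return 0;
}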
This matches what we read out of vmlinux:
llvm-readelf.exe -h D:\dump\vmlinux\vmlinux
ELF Header:
Magic: 7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00
Class: ELF64
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: DYN (Shared object file)
Machine: AArch64
Version: 0x1
Entry point address: 0xFFFFFFC008000000 // the entry point
Start of program headers: 64 (bytes into file)
Start of section headers: 478548720 (bytes into file)
Flags: 0x0
Size of this header: 64 (bytes)
Size of program headers: 56 (bytes)
Number of program headers: 25
Size of section headers: 64 (bytes)
Number of section headers: 53
Section header string table index: 52
2. head.S
/*
* Kernel startup entry point.
* ---------------------------
*
* The requirements are:
* MMU = off, D-cache = off, I-cache = on or off,
* x0 = physical address to the FDT blob.
*
* This code is mostly position independent so you call this at
* __pa(PAGE_OFFSET).
*
* Note that the callee-saved registers are used for storing variables
* that are useful before the MMU is enabled. The allocations are described
* in the entry routines.
*/
/*********************************************************************************
 *
 * ARMv8 provides EL2 and EL3, and either of those exception levels can boot the
 * Linux kernel; the kernel itself runs at EL1.
 * Requirements for starting the kernel:
 * CPU:
 *   all interrupts masked on the CPU, i.e. via the PSTATE.DAIF bits;
 *   the CPU must be in EL2 or in non-secure EL1.
 *
 * MMU and caches:
 *   MMU off;
 *   data cache off;   // the cache lines covering the loaded kernel image range must be
 *                     // dealt with; the simplest way is to keep the D-cache off
 *   instruction cache may be on or off;  // u-boot and kernel code do not overlap,
 *                                        // so the I-cache cannot return stale instructions
 *
 * Other:
 *   register x0 holds the physical address of the device tree;
 *   the timers set up: the CNTFRQ and CNTVOFF registers;
 *   memory coherency;
 *
 * U-boot's job is to load the kernel image into memory and jump to the kernel
 * entry address, i.e. right here!
 ********************************************************************************/
__HEAD
/*
* DO NOT MODIFY. Image header expected by Linux boot-loaders.
*/
efi_signature_nop // special NOP to identity as PE/COFF executable
b primary_entry // branch to kernel start, magic ///jump to the kernel's assembly startup entry point
.quad 0 // Image load offset from start of RAM, little-endian
le64sym _kernel_size_le // Effective size of kernel image, little-endian
le64sym _kernel_flags_le // Informative flags, little-endian
.quad 0 // reserved
.quad 0 // reserved
.quad 0 // reserved
.ascii ARM64_IMAGE_MAGIC // Magic number
.long .Lpe_header_offset // Offset to the PE header.
__EFI_PE_HEADER
__INIT ///the code below lives in the ".init.text" section
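For reference, the 64-byte image header laid out above can be viewed as the following C structure (a host-side sketch using fixed-width types; compare struct arm64_image_header in arch/arm64/include/asm/image.h; all multi-byte fields are little-endian):
#include <stdint.h>

/* Sketch of the arm64 kernel Image header emitted by the __HEAD block above. */
struct arm64_image_header {
	uint32_t code0;        /* efi_signature_nop                       */
	uint32_t code1;        /* b primary_entry                         */
	uint64_t text_offset;  /* image load offset from start of RAM, LE */
	uint64_t image_size;   /* effective kernel image size, LE         */
	uint64_t flags;        /* informative flags, LE                   */
	uint64_t res2;         /* reserved                                */
	uint64_t res3;         /* reserved                                */
	uint64_t res4;         /* reserved                                */
	uint32_t magic;        /* ARM64_IMAGE_MAGIC: "ARM\x64"            */
	uint32_t res5;         /* offset to the PE header                 */
};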
2.1 primary_entry
After entering the Linux kernel, the assembly part mainly does the following:
SYM_CODE_START(primary_entry)
bl preserve_boot_args ///save the boot arguments into the boot_args[] array
bl init_kernel_el // w0=cpu_boot_mode ///drop to EL1 to run the kernel
adrp x23, __PHYS_OFFSET
and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0
bl set_cpu_boot_mode_flag ///record the CPU boot mode in the __boot_cpu_mode global variable
bl __create_page_tables ///create the identity-mapping page tables and the kernel image mapping page tables
/*
* The following calls CPU setup code, see arch/arm64/mm/proc.S for
* details.
* On return, the CPU will be ready for the MMU to be turned on and
* the TCR will have been set.
*/
bl __cpu_setup // initialise processor ///processor-specific initialisation in preparation for enabling the MMU
b __primary_switch ///enable the MMU and jump to start_kernel() (into the kernel's C code)
SYM_CODE_END(primary_entry)
Now let's look at each of these functions in detail.
2.1.1 preserve_boot_args
///save the arguments x0~x3 passed in by the bootloader into the boot_args[] array
SYM_CODE_START_LOCAL(preserve_boot_args)
mov x21, x0 // x21=FDT; x0 holds the device tree address, stash it in x21
adr_l x0, boot_args // record the contents of
stp x21, x1, [x0] // x0 .. x3 at kernel entry
stp x2, x3, [x0, #16] ///all four arguments are stored into boot_args
dmb sy // needed before dc ivac with
// MMU off
///ensures the stp stores have completed before the cache is invalidated below by dcache_inval_poc, keeping the saved arguments intact
add x1, x0, #0x20 // 4 x 8 bytes
///x0 is the start of boot_args[], x1 = x0 + 32 is the end; dcache_inval_poc invalidates the cache lines covering the boot_args[] array
b dcache_inval_poc // tail call
SYM_CODE_END(preserve_boot_args)
2.1.2 init_kernel_el
SYM_FUNC_START(init_kernel_el)
mrs x0, CurrentEL
cmp x0, #CurrentEL_EL2
b.eq init_el2
SYM_INNER_LABEL(init_el1, SYM_L_LOCAL)
mov_q x0, INIT_SCTLR_EL1_MMU_OFF /// SCTLR_EL1 value for the MMU-off state (sets endianness, etc.)
msr sctlr_el1, x0
isb /// flush the pipeline
mov_q x0, INIT_PSTATE_EL1 /// initial PSTATE for EL1: interrupts masked
msr spsr_el1, x0
msr elr_el1, lr /// set the EL1 return address (eret returns to lr)
mov w0, #BOOT_CPU_MODE_EL1 /// return value: the CPU booted at EL1
eret
SYM_INNER_LABEL(init_el2, SYM_L_LOCAL)
2.1.3 set_cpu_boot_mode_flag
/*
* Sets the __boot_cpu_mode flag depending on the CPU boot mode passed
* in w0. See arch/arm64/include/asm/virt.h for more info.
*/
SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag)
adr_l x1, __boot_cpu_mode /// global variable recording the exception level this CPU booted at
cmp w0, #BOOT_CPU_MODE_EL2
b.ne 1f
add x1, x1, #4 ///EL2 is recorded in __boot_cpu_mode[1]
1: str w0, [x1] // This CPU has booted in EL1 ///w0 is the boot mode returned by init_kernel_el
dmb sy /// ensure the __boot_cpu_mode update is fully pushed out to memory
dc ivac, x1 // Invalidate potentially stale cache line
ret
SYM_FUNC_END(set_cpu_boot_mode_flag)
2.1.4 __create_page_tables
Creates the identity-mapping page tables and the kernel image mapping page tables; this is covered in detail in the next major section.
3. Memory mapping in the early stage of kernel boot
- When the CPU starts, the MMU is off and the CPU accesses physical addresses; once the MMU is enabled, it accesses virtual addresses.
- Modern processors are pipelined and prefetch several instructions. At the moment the MMU is turned on, the instructions already in the pipeline were fetched by physical address, while subsequent accesses use virtual addresses, so continuing to execute those physically-fetched instructions can easily go wrong. {% u To solve this, an "identity mapping" is introduced: virtual addresses are mapped to the identical physical addresses %}, which neatly avoids the problem. The identity mapping built here covers only a small range; the kernel image itself occupies just a few megabytes. Once the identity mapping is in place, the MMU is enabled and the CPU moves on to virtual addressing.
3.1 Identity mapping of memory
3.1.1 __create_page_tables
SYM_FUNC_START_LOCAL(__create_page_tables)
mov x28, lr ///this assembly routine makes further calls of its own, so save the return address
/*
* Invalidate the init page tables to avoid potential dirty cache lines
* being evicted. Other page tables are allocated in rodata as part of
* the kernel image, and thus are clean to the PoC per the boot
* protocol.
*/
/*
 * The page table area is defined in vmlinux.lds.S with size INIT_DIR_SIZE:
 * init_pg_dir = .;
 * . += INIT_DIR_SIZE;
 * init_pg_end = .;
 */
adrp x0, init_pg_dir
adrp x1, init_pg_end
bl dcache_inval_poc ///invalidate the cache lines covering the init_pg_dir page tables
/*
* Clear the init page tables.
*/
adrp x0, init_pg_dir
adrp x1, init_pg_end
sub x1, x1, x0
1: stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
subs x1, x1, #64
b.ne 1b
mov x7, SWAPPER_MM_MMUFLAGS ///attributes for the block mappings: normal memory, block descriptor, access permissions, shareability
/*
* Create the identity mapping.
*/
/*
 * Defined in vmlinux.lds.S with size IDMAP_DIR_SIZE, normally three consecutive
 * 4KB pages, one each for the PGD, PUD and PMD tables.
 * A 2MB block mapping will be built here.
 * idmap_pg_dir = .;
 * . += IDMAP_DIR_SIZE;
 * idmap_pg_end = .;
 */
// On our flame project, IDMAP_DIR_SIZE works out to ((48-4)/(12-3) - 1) * 4KB = 3 * 4KB
adrp x0, idmap_pg_dir
///start of the .idmap.text section; besides enabling the MMU at boot, the kernel needs the identity map in several other scenarios too, e.g. cpu_do_resume when waking up a CPU
adrp x3, __idmap_text_start // __pa(__idmap_text_start)
#ifdef CONFIG_ARM64_VA_BITS_52
mrs_s x6, SYS_ID_AA64MMFR2_EL1
and x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT)
mov x5, #52
cbnz x6, 1f
#endif
mov x5, #VA_BITS_MIN ///on our flame project VA_BITS_MIN=39
1:
adr_l x6, vabits_actual
str x5, [x6] ///store VA_BITS_MIN into the global variable vabits_actual
dmb sy //make sure the str data has reached memory
dc ivac, x6 // Invalidate potentially stale cache line
/*
* VA_BITS may be too small to allow for an ID mapping to be created
* that covers system RAM if that is located sufficiently high in the
* physical address space. So for the ID map, use an extended virtual
* range in that case, and configure an additional translation level
* if needed.
*
* Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
* entire ID map region can be mapped. As T0SZ == (64 - #bits used),
* this number conveniently equals the number of leading zeroes in
* the physical address of __idmap_text_end.
*/
adrp x5, __idmap_text_end
clz x5, x5 ///count how many leading zeros precede the first 1 bit in x5
cmp x5, TCR_T0SZ(VA_BITS_MIN) // default T0SZ small enough?
b.ge 1f // .. then skip VA range extension ///__idmap_text_end fits inside the range VA_BITS_MIN can express, so jump to 1f
adr_l x6, idmap_t0sz
str x5, [x6]
dmb sy
dc ivac, x6 // Invalidate potentially stale cache line
#if (VA_BITS < 48)
#define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3)
#define EXTRA_PTRS (1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT))
/*
* If VA_BITS < 48, we have to configure an additional table level.
* First, we have to verify our assumption that the current value of
* VA_BITS was chosen such that all translation levels are fully
* utilised, and that lowering T0SZ will always result in an additional
* translation level to be configured.
*/
#if VA_BITS != EXTRA_SHIFT
#error "Mismatch between VA_BITS and page size/number of translation levels"
#endif
mov x4, EXTRA_PTRS
create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6
#else
/*
* If VA_BITS == 48, we don't have to configure an additional
* translation level, but the top-level table has more entries.
*/
mov x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT)
str_l x4, idmap_ptrs_per_pgd, x5
#endif
1:
ldr_l x4, idmap_ptrs_per_pgd //idmap_ptrs_per_pgd equals PTRS_PER_PGD, the number of entries in the PGD table
adr_l x6, __idmap_text_end // __pa(__idmap_text_end)
///call the map_memory macro to build the page tables mapping the .idmap.text section;
/*
 * x0: idmap_pg_dir (tbl)
 * x1: rtbl, scratch pointer to the next-level table
 * x3: __idmap_text_start (vstart)
 * x6: __idmap_text_end (vend)
 * x7: SWAPPER_MM_MMUFLAGS (flags)
 * x3: __idmap_text_start again, as phys (identity map, so VA == PA)
 * x4: PTRS_PER_PGD (pgds)
 */
map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14
SYM_FUNC_END(__create_page_tables)
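The VA-range-extension check above can be modelled in C: as the quoted comment says, the number of leading zeros in the physical address of __idmap_text_end equals the T0SZ needed to cover it, and it is compared against the default T0SZ of 64 - VA_BITS_MIN. A sketch (the sample physical address is hypothetical):
#include <stdio.h>

int main(void)
{
	unsigned long idmap_text_end_pa = 0x80200000UL;         /* hypothetical __pa(__idmap_text_end) */
	int t0sz_default = 64 - 39;                             /* TCR_T0SZ(VA_BITS_MIN), VA_BITS_MIN = 39 */
	int t0sz_needed  = __builtin_clzll(idmap_text_end_pa);  /* clz x5, x5 */

	if (t0sz_needed >= t0sz_default)
		printf("idmap fits in the default VA range, no extension needed\n");
	else
		printf("extend the idmap VA range: idmap_t0sz = %d, extra table level may be needed\n",
		       t0sz_needed);
	return 0;
}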
3.2 Coarse-grained mapping of the kernel image
3.2.1 map_memory
/* tbl: base address of the page tables (the table filled at the top level)
 * rtbl: address of the next-level table, normally tbl + PAGE_SIZE
 * vstart: start of the virtual address range to map
 * vend: end of the virtual address range to map
 * flags: attributes for the last-level entries
 * phys: physical address to map to
 * pgds: number of PGD entries
 */
.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv
sub \vend, \vend, #1
add \rtbl, \tbl, #PAGE_SIZE
mov \sv, \rtbl
mov \count, #0
///the compute_indices macro computes the table indices of vstart and vend at this level
compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count
///fill in the table entries, level by level: first the PGD, then the intermediate tables, and finally the last level
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
mov \tbl, \sv
mov \sv, \rtbl
#if SWAPPER_PGTABLE_LEVELS > 3
compute_indices \vstart, \vend, #PUD_SHIFT, #PTRS_PER_PUD, \istart, \iend, \count
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
mov \tbl, \sv
mov \sv, \rtbl
#endif
#if SWAPPER_PGTABLE_LEVELS > 2
compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
mov \tbl, \sv
#endif
compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count
bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1
populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp
.endm
3.2.2 compute_indices
/**************************************************************
 * func: compute the indices of vstart and vend within the table; results go into istart, iend
 *
 * vstart: start virtual address
 * vend: end virtual address
 * shift: shift of this table level within the virtual address
 * ptrs: number of entries per table
 * istart: index of vstart
 * iend: index of vend
 * count: number of entries (carried across levels)
 **************************************************************/
.macro compute_indices, vstart, vend, shift, ptrs, istart, iend, count
lsr \iend, \vend, \shift
mov \istart, \ptrs
sub \istart, \istart, #1
and \iend, \iend, \istart // iend = (vend >> shift) & (ptrs - 1), the index of vend
mov \istart, \ptrs
mul \istart, \istart, \count
add \iend, \iend, \istart // iend += count * ptrs
// our entries span multiple tables
lsr \istart, \vstart, \shift
mov \count, \ptrs
sub \count, \count, #1
and \istart, \istart, \count ///istart = (vstart >> shift) & (ptrs - 1), the index of vstart
sub \count, \iend, \istart ///number of entries to populate
.endm
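The same index arithmetic, written out as a small C model (a sketch; the parameter names mirror the macro operands):
/* Compute the first and last index touched at one table level.
 * 'count' is carried in from the previous (higher) level so that a range
 * spanning several tables gets consecutive slots in the consecutively
 * allocated next-level tables, and is updated for the next level down. */
static void compute_indices(unsigned long vstart, unsigned long vend,
                            unsigned int shift, unsigned long ptrs,
                            unsigned long *istart, unsigned long *iend,
                            unsigned long *count)
{
	*iend   = ((vend >> shift) & (ptrs - 1)) + *count * ptrs; /* iend += count * ptrs */
	*istart = (vstart >> shift) & (ptrs - 1);
	*count  = *iend - *istart;                                /* entries to populate  */
}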
3.2.3 populate_entries
/*******************************************************************
 * Fill in page table entries
 *
 * tbl: base address of the table being filled
 * rtbl: base address of the next-level table (or of the physical block at the last level)
 * index: first table index to write
 * eindex: last table index to write
 * flags: entry attributes
 * inc: amount rtbl is advanced by per entry (PAGE_SIZE, or SWAPPER_BLOCK_SIZE at the last level)
 * tmp1: temporary variable
 *********************************************************************/
.macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1
.Lpe\@: phys_to_pte \tmp1, \rtbl
orr \tmp1, \tmp1, \flags // tmp1 = table entry
str \tmp1, [\tbl, \index, lsl #3] ///write the descriptor into tbl[index]
add \rtbl, \rtbl, \inc // rtbl = pa next level ///I read this as rtbl advancing to the next (adjacent) physical page rather than literally the "next level", which differs slightly from the comment?
add \index, \index, #1
cmp \index, \eindex ///keep going until every entry from index to eindex is filled
b.ls .Lpe\@
.endm
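A C model of the loop (a sketch) makes the rtbl question above clearer: rtbl really does advance by one increment per entry written, because the next-level tables are laid out back to back right after the current one; "pa next level" means "the physical address to put into this entry", not a change of level:
/* Fill tbl[index..eindex] with descriptors pointing at successive targets.
 * At intermediate levels the target is the next-level table (inc = PAGE_SIZE);
 * at the last level it is the physical block itself (inc = SWAPPER_BLOCK_SIZE). */
static void populate_entries(unsigned long *tbl, unsigned long *rtbl_pa,
                             unsigned long index, unsigned long eindex,
                             unsigned long flags, unsigned long inc)
{
	for (; index <= eindex; index++) {
		tbl[index] = *rtbl_pa | flags; /* phys_to_pte + orr + str */
		*rtbl_pa  += inc;              /* next target: adjacent table or next block */
	}
}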
In summary, the virtual addresses of the .idmap.text section are mapped to identical physical addresses, and this mapping lives in idmap_pg_dir.
{% tip success %}
Question: which functions live inside this mapped 2MB region?
From the definition in head.S:
.section .idmap.text,awx
assembly routines such as __enable_mmu, __primary_switch and __cpu_setup are all placed in the .idmap.text section, which can be verified against the System.map file. These routines are used while Linux "bootstraps" itself.
{% endtip %}
{% tip success %}
Question: why is map_memory used to create a second set of page tables?
When the CPU has just started, physical memory generally sits at low addresses (well under 256TB), so the identity-mapped addresses actually fall in the user half of the address space; once the MMU is on, idmap_pg_dir goes into TTBR0. The kernel's link addresses, on the other hand, are all high addresses (kernel space lives high) and are translated via TTBR1. So another table is built here that maps the whole kernel image at high virtual addresses, 0xffffxxxxxxxxxxxx.
{% endtip %}
/*
 * Map the kernel image (starting with PHYS_OFFSET).
 */
///Call the map_memory macro to build the page tables mapping the whole kernel image.
/**************************************************************************
 * Why build a second table?
 * When the CPU has just started, physical memory generally sits at low addresses
 * (well under 256TB), so the identity mapping actually lives in the user half of
 * the address space: once the MMU is enabled, idmap_pg_dir goes into TTBR0.
 * The kernel's link addresses are high addresses (kernel space is high) and go
 * through TTBR1, so another table is built here mapping the whole kernel image
 * at high virtual addresses, 0xffffxxxxxxxxxxxx.
 * Differences between the init_pg_dir and idmap_pg_dir mappings:
 * (1) init_pg_dir maps high virtual addresses, 0xffffxxxxxxxxxxxx;
 *     idmap_pg_dir maps low virtual addresses, 0x0000xxxxxxxxxxxx;
 *     after the MMU is on, init_pg_dir goes into TTBR1 and idmap_pg_dir into TTBR0;
 * (2) init_pg_dir maps the whole kernel image, while idmap_pg_dir maps only 2MB,
 *     just enough of a bridge to get the MMU turned on.
 **************************************************************************/
adrp x0, init_pg_dir
mov_q x5, KIMAGE_VADDR // compile time __va(_text)
add x5, x5, x23 // add KASLR displacement
mov x4, PTRS_PER_PGD
adrp x6, _end // runtime __pa(_end)
adrp x3, _text // runtime __pa(_text)
sub x6, x6, x3 // _end - _text
add x6, x6, x5 // runtime __va(_end)
map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14
4. The __cpu_setup function
Performs processor-specific initialisation in preparation for turning on the MMU.
/*
* __cpu_setup
*
* Initialise the processor for turning the MMU on.
*
* Output:
* Return in x0 the value of the SCTLR_EL1 register.
*/
.pushsection .idmap.text, awx /// place __cpu_setup in the .idmap.text section
SYM_FUNC_START(__cpu_setup)
tlbi vmalle1 // Invalidate local TLB /// invalidate the local TLB
dsb nsh /// ensure the tlbi has completed
mov x1, #3 << 20
msr cpacr_el1, x1 // Enable FP/ASIMD /// allow EL0 and EL1 to access the FP and SIMD units
mov x1, #1 << 12 // Reset mdscr_el1 and disable
msr mdscr_el1, x1 // access to the DCC from EL0
isb // Unmask debug exceptions now,
enable_dbg // since this is per-cpu /// unmask PSTATE debug exceptions
reset_pmuserenr_el0 x1 // Disable PMU access from EL0
reset_amuserenr_el0 x1 // Disable AMU access from EL0
/*
* Default values for VMSA control registers. These will be adjusted
* below depending on detected CPU features.
*/
mair .req x17
tcr .req x16
mov_q mair, MAIR_EL1_SET
/// set up the TCR register, which controls the page-table translation regime
mov_q tcr, TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS
...
tcr_clear_errata_bits tcr, x9, x5
#ifdef CONFIG_ARM64_VA_BITS_52
ldr_l x9, vabits_actual
sub x9, xzr, x9
add x9, x9, #64
tcr_set_t1sz tcr, x9
#else
ldr_l x9, idmap_t0sz
#endif
tcr_set_t0sz tcr, x9
/*
* Set the IPS bits in TCR_EL1.
*/
tcr_compute_pa_size tcr, #TCR_IPS_SHIFT, x5, x6 ///set the IPS field (physical address size)
#ifdef CONFIG_ARM64_HW_AFDBM
/*
* Enable hardware update of the Access Flags bit.
* Hardware dirty bit management is enabled later,
* via capabilities.
*/
mrs x9, ID_AA64MMFR1_EL1
and x9, x9, #0xf
cbz x9, 1f
orr tcr, tcr, #TCR_HA // hardware Access flag update
1:
#endif /* CONFIG_ARM64_HW_AFDBM */
msr mair_el1, mair
msr tcr_el1, tcr
/*
* Prepare SCTLR
*/
mov_q x0, INIT_SCTLR_EL1_MMU_ON /// return value: the argument for __enable_mmu, which runs next
ret // return to head.S
.unreq mair
.unreq tcr
SYM_FUNC_END(__cpu_setup)
5. The __primary_switch function
Enables the MMU and jumps to start_kernel() (entering the kernel's C code).
SYM_FUNC_START_LOCAL(__primary_switch)
#ifdef CONFIG_RANDOMIZE_BASE /// KASLR: remap the kernel image to a randomized virtual address at boot to make attacks harder
mov x19, x0 // preserve new SCTLR_EL1 value
mrs x20, sctlr_el1 // preserve old SCTLR_EL1 value
#endif
adrp x1, init_pg_dir
bl __enable_mmu /// arguments: x0 = SCTLR_EL1 value, x1 = init_pg_dir page table base; turn on the MMU
#ifdef CONFIG_RELOCATABLE /// the kernel image may be relocated/remapped
#ifdef CONFIG_RELR
mov x24, #0 // no RELR displacement yet
#endif
bl __relocate_kernel
#ifdef CONFIG_RANDOMIZE_BASE
ldr x8, =__primary_switched
adrp x0, __PHYS_OFFSET
blr x8
/*
* If we return here, we have a KASLR displacement in x23 which we need
* to take into account by discarding the current kernel mapping and
* creating a new one.
*/
pre_disable_mmu_workaround
msr sctlr_el1, x20 // disable the MMU
isb
bl __create_page_tables // recreate kernel mapping
tlbi vmalle1 // Remove any stale TLB entries
dsb nsh
isb
set_sctlr_el1 x19 // re-enable the MMU
bl __relocate_kernel
#endif
#endif
ldr x8, =__primary_switched
adrp x0, __PHYS_OFFSET
br x8 /// branch to __primary_switched at its virtual (link-time) address
SYM_FUNC_END(__primary_switch)
5.1 The __enable_mmu function
/*
* Enable the MMU.
*
* x0 = SCTLR_EL1 value for turning on the MMU.
* x1 = TTBR1_EL1 value
*
* Returns to the caller via x30/lr. This requires the caller to be covered
* by the .idmap.text section.
*
* Checks if the selected granule size is supported by the CPU.
* If it isn't, park the CPU
*/
SYM_FUNC_START(__enable_mmu)
mrs x2, ID_AA64MMFR0_EL1
ubfx x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4
cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MIN
b.lt __no_granule_support
cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MAX
b.gt __no_granule_support
update_early_cpu_boot_status 0, x2, x3
adrp x2, idmap_pg_dir
phys_to_ttbr x1, x1
phys_to_ttbr x2, x2
msr ttbr0_el1, x2 // load TTBR0
offset_ttbr1 x1, x3
msr ttbr1_el1, x1 // load TTBR1 //load the two page table bases into TTBR0 and TTBR1
isb
set_sctlr_el1 x0 //set the M bit, enabling the MMU
ret
SYM_FUNC_END(__enable_mmu)
5.2 The __primary_switched function
SYM_FUNC_START_LOCAL(__primary_switched)
///the MMU is now on, so everything accessed from here is a virtual address;
///e.g. the statically defined virtual address of init_task is already mapped to its physical memory
adr_l x4, init_task ///init_thread_union points to the thread_union structure holding the kernel stack of the system's first process (the init process)
init_cpu_task x4, x5, x6
adr_l x8, vectors // load VBAR_EL1 with virtual //load the address of the exception vector table into x8
msr vbar_el1, x8 // vector table address //write the vector table address from x8 into VBAR_EL1
isb
stp x29, x30, [sp, #-16]!
mov x29, sp
///save the device tree address
str_l x21, __fdt_pointer, x5 // Save FDT pointer
ldr_l x4, kimage_vaddr // Save the offset between
sub x4, x4, x0 // the kernel virtual and
str_l x4, kimage_voffset, x5 // physical mappings
// Clear BSS
///zero the uninitialised data section (.bss)
adr_l x0, __bss_start
mov x1, xzr
adr_l x2, __bss_stop
sub x2, x2, x0
bl __pi_memset
dsb ishst // Make zero page visible to PTW
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
bl kasan_early_init
#endif
mov x0, x21 // pass FDT address in x0
bl early_fdt_map // Try mapping the FDT early
bl init_feature_override // Parse cpu feature overrides
#ifdef CONFIG_RANDOMIZE_BASE
tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized?
b.ne 0f
bl kaslr_early_init // parse FDT for KASLR options
cbz x0, 0f // KASLR disabled? just proceed
orr x23, x23, x0 // record KASLR offset
ldp x29, x30, [sp], #16 // we must enable KASLR, return
ret // to __primary_switch()
0:
#endif
bl switch_to_vhe // Prefer VHE if possible
ldp x29, x30, [sp], #16 //sp points at the top of the kernel stack?
bl start_kernel //jump to the C entry point
ASM_BUG()
SYM_FUNC_END(__primary_switched)
From this point on, the assembly is over and the kernel enters the world of C...