先贴一份dump的堆栈信息
-000|__cmpwait(inline)
-000|queued_spin_lock_slowpath(lock = 0xFFFFFFDDBFC5B000, val = 0)
-001|do_raw_spin_lock(lock = 0xFFFFFFDDBFC5B000)
-002|raw_spin_lock_irqsave(?)
-003|rq_pin_lock(inline)
-003|rq_lock_irqsave(inline)
-003|walt_try_to_wake_up(inline)
-003|try_to_wake_up(p = 0xFFFFFFDD7BD26340, ?, wake_flags = 0, sibling_count_hint = 1)
-004|wake_up_process(?)
-005|__queue_work()
-006|atomic_long_fetch_or(inline)
-006|test_and_set_bit(inline)
-006|queue_work_on(?, ?, ?)
-007|npu_ipcc_bridge_mbox_send_data()
-008|__msg_submit(inline)
-008|msg_submit(chan = 0xFFFFFFDDAE60D500)
-009|mbox_send_message(chan = 0xFFFFFFDDAE60D500, mssg = 0x0)
-010|qcom_smp2p_kick(inline)
-010|smp2p_update_bits(?, ?, ?)
-011|qcom_smem_state_update_bits(?, ?, ?)
-012|subsys_crash_shutdown(subsys = 0xFFFFFFDDAC9152A8)
-013|subsys_panic(?, ?)
-014|bus_for_each_dev(?, ?, ?, ?)
-015|ssr_panic_handler(?, ?, ?)
-016|atomic_notifier_call_chain(nh = 0x0, val = 0, v = 0xFFFFFFAA400A3B25)
-017|test_taint(inline)
-017|panic()
-018|die(?, ?, ?)
-019|die_kernel_fault(?, ?, ?, ?)
-020|do_page_fault(addr = 16328, esr = 2516582406, regs = 0xFFFFFF80197B3AA0)
-021|test_ti_thread_flag(inline)
-021|do_translation_fault(?, ?, ?)
-022|do_mem_abort(addr = 16328, esr = 2516582406, regs = 0xFFFFFF80197B3AA0)
-023|el1_da(asm)
-->|exception
-024|__switch_to(prev = 0xFFFFFFDCF7FB4240, ?)
-025|__schedule(?)
-026|__preempt_count_sub(inline)
-026|schedule()
-027|hrtimer_cancel(inline)
-027|schedule_hrtimeout_range_clock(?, ?, ?, ?)
-028|schedule_hrtimeout_range(?, ?, ?)
-029|freezer_count(inline)
-029|freezable_schedule_hrtimeout_range(inline)
从上面的stack frame看,是schedule发生上下文切换(__switch_to)的时候触发了el1_da,即EL1(内核态)下的数据中止异常(data abort)。上面的stack frame可以分为两部分来学习。
schedule部分
-024|__switch_to(prev = 0xFFFFFFDCF7FB4240, ?)
-025|__schedule(?)
-026|__preempt_count_sub(inline)
-026|schedule()
-027|hrtimer_cancel(inline)
-027|schedule_hrtimeout_range_clock(?, ?, ?, ?)
-028|schedule_hrtimeout_range(?, ?, ?)
-029|freezer_count(inline)
-029|freezable_schedule_hrtimeout_range(inline)
数据异常(data abort)panic流程
-017|panic()
-018|die(?, ?, ?)
-019|die_kernel_fault(?, ?, ?, ?)
-020|do_page_fault(addr = 16328, esr = 2516582406, regs = 0xFFFFFF80197B3AA0)
-021|test_ti_thread_flag(inline)
-021|do_translation_fault(?, ?, ?)
-022|do_mem_abort(addr = 16328, esr = 2516582406, regs = 0xFFFFFF80197B3AA0)
-023|el1_da(asm)
首先来分析schedule的流程:
/*
 * schedule - entry point used when the current task gives up the CPU.
 *
 * Calls sched_submit_work() for the current task, then runs the core
 * scheduler __schedule(false) with preemption disabled. The loop repeats
 * for as long as need_resched() still reports a pending reschedule.
 */
asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;
sched_submit_work(tsk);
do {
preempt_disable(); // disable preemption around the core scheduler call
__schedule(false);
sched_preempt_enable_no_resched();
} while (need_resched());
}
其中在调度之前需要关闭抢占,然后调用核心的调度函数__schedule函数。
/*
 * preempt_count_ptr - return a pointer to the current thread's preempt count.
 *
 * The counter is the preempt_count member of the structure returned by
 * current_thread_info(). The pointer is volatile-qualified so that every
 * access through it is performed as written.
 */
static __always_inline volatile int *preempt_count_ptr(void)
{
	return &current_thread_info()->preempt_count;
}
/*
 * __preempt_count_add - add @val to the current thread's preempt count.
 * @val: amount to add to the counter.
 *
 * The counter is reached through preempt_count_ptr().
 */
static __always_inline void __preempt_count_add(int val)
{
	volatile int *count = preempt_count_ptr();

	*count += val;
}
/* Add @val to the preempt count; forwards to __preempt_count_add(). */
#define preempt_count_add(val) __preempt_count_add(val)
/* Increment the preempt count by one. */
#define preempt_count_inc() preempt_count_add(1)
/*
 * Disable preemption: increment the preempt count, then issue barrier().
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be followed by ';' in unbraced if/else bodies.
 */
#define preempt_disable() \
do { \
	preempt_count_inc(); \
	barrier(); \
} while (0)
上面代码是preempt_disable的流程,关闭抢占其实是给struct thread_info结构中的preempt_count加1
我们再结合T32看task_struct
结构体中的thread_info结构
-026|__preempt_count_sub(inline)
-026|schedule()
| tsk = 0xFFFFFFDCF7FB4240 -> (
| thread_info = (
| flags = 2080,
| padding = (0, 0, 0, 0, 0, 0, 0),
| addr_limit = 549755813887,
| preempt_count = 8),
| state = 1,
| stack = 0xFFFFFF80197B0000,
| usage = (counter = 3),
| flags = 1077952576,
| ptrace = 0,
| wake_entry = (next = 0x0),
| on_cpu = 1,
| cpu = 2,
preempt=true代表抢占调度,强制剥夺当前进程对处理器的使用权; preempt=false代表主动调度,当前进程主动让出处理器。
从stack frame 来看当前进程应该是wait超时,所以主动让出了处理器。
既然是让出处理器,那肯定有一个进程来接手处理器的使用权,所以__schedule应该做两件事情:
找出一个合适的进程
切换当前进程和下一个进程的上下文
/*
 * __schedule - core scheduling routine (abridged excerpt: the declarations
 * of cpu, rq, prev, next and rf are not shown here).
 * @preempt: true for a preemption-driven switch, false when the current
 *           task gives up the CPU voluntarily.
 *
 * Picks the next task for this CPU's runqueue and switches to it.
 */
static void __sched notrace __schedule(bool preempt)
{
cpu = smp_processor_id();
rq = cpu_rq(cpu);
prev = rq->curr; // task currently running on this rq, about to be switched out
next = pick_next_task(rq, prev, &rf); // choose the next task to run
clear_tsk_need_resched(prev); // clear the need-resched flag on prev
clear_preempt_need_resched();
rq = context_switch(rq, prev, next, &rf); // switch context from prev to next
}
/*
 * Pick up the highest-prio task:
 *
 * @rq:   runqueue to pick from
 * @prev: task that is being switched out
 * @rf:   rq_flags passed through to the per-class pick_next_task() hooks
 *
 * Returns the task chosen by the first scheduling class that offers one.
 * A class may return RETRY_TASK, in which case the full walk over all
 * classes is (re)started. Falling out of the class walk without a task is
 * treated as a bug.
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
/*
 * Optimization: we know that if all tasks are in the fair class we can
 * call that function directly, but only if the @prev task wasn't of a
 * higher scheduling class, because otherwise those lose the
 * opportunity to pull in more work from other CPUs.
 */
if (likely((prev->sched_class == &idle_sched_class ||
prev->sched_class == &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev, rf);
/* The fair class asked for a retry: fall back to the full class walk. */
if (unlikely(p == RETRY_TASK))
goto again;
/* Assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
p = idle_sched_class.pick_next_task(rq, prev, rf);
return p;
}
again:
/* Slow path: ask each scheduling class in turn for a task. */
for_each_class(class) {
p = class->pick_next_task(rq, prev, rf);
if (p) {
/* Restart the walk from the first class on RETRY_TASK. */
if (unlikely(p == RETRY_TASK))
goto again;
return p;
}
}
/* The idle class should always have a runnable task: */
BUG();
}
优化: 如果运行队列上的所有进程都属于完全公平调度类(rq->nr_running == rq->cfs.h_nr_running),并且prev属于idle调度类或完全公平调度类(prev->sched_class == &idle_sched_class || prev->sched_class == &fair_sched_class),就直接调用完全公平调度类中的pick_next_task函数
否则遍历所有的调度类,依次调用其中的pick_next_task函数,选择出最高优先级的进程(highest-prio task)。
/*
 * Scheduling classes defined elsewhere in the kernel.
 * NOTE(review): the accompanying article gives their priority order as
 * stop > dl > rt > fair > idle, matching the order of these declarations.
 */
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;
linux内核目前定义了5种调度类,优先级从高到低是stop→dl→rt→fair→idle。至于各个调度类中的pick_next_task函数是如何实现的,这里暂且不讨论。到目前为止应该是可以找到一个优先级最高的task了。看下t32
-023|el1_da(asm)
-->|exception
-024|__switch_to(
| [X20] prev = 0xFFFFFFDCF7FB4240,
| ?)
-025|__schedule(
| ?)
| [X19] rq = 0xFFFFFFDDBFC5B000
| [X20] prev = 0xFFFFFFDCF7FB4240 //current
| [X28] switch_count = 0xFFFFFFDCF7FB4958
| [X23] next = 0xFFFFFFDDB4792140 //by pick_next_task function
-026|__preempt_count_sub(inline)
-026|schedule()
| [X19] tsk = 0xFFFFFFDCF7FB4240 //current
-027|hrtimer_cancel(inline)
-027|schedule_hrtimeout_range_clock(
到目前为止next已经找到了,它的task_struct地址是:0xFFFFFFDDB4792140
既然是切换两个进程,那进程可分为用户空间的进程和内核空间的,一般内核空间的称为内核线程。用户空间进程和内核线程一般通过task_struct中的mm指针来区分。
/* Excerpt of struct task_struct: only the two mm pointers are shown. */
struct task_struct{
struct mm_struct *mm; /* NULL for a kernel thread (see context_switch()) */
struct mm_struct *active_mm; /* mm in use while running; borrowed when mm is NULL */
.......
}
如果是用户空间的进程mm和active_mm是相等的,都是指向同一个mm_struct结构的
如果是内核线程,mm=NULL,active_mm在内核线程没运行时等于NULL,在运行时借用上一个进程的内存描述符,因为所有进程的内核地址空间是共享的。
切换上下文,其过程其实就是保存上一个进程的上下文,切换下一个进程。
/*
 * context_switch - switch to the new MM and the new thread's register state.
 * @rq:   runqueue on which the switch happens
 * @prev: task being switched out
 * @next: task being switched in
 * @rf:   rq_flags handed to prepare_lock_switch()
 *
 * Returns whatever finish_task_switch(prev) returns.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
mm = next->mm;
oldmm = prev->active_mm;
/*
 * For paravirt, this is coupled with an exit in switch_to to
 * combine the page table reload and the switch backend into
 * one hypercall.
 */
arch_start_context_switch(prev);
/*
 * If mm is non-NULL, we pass through switch_mm(). If mm is
 * NULL, we will pass through mmdrop() in finish_task_switch().
 * Both of these contain the full memory barrier required by
 * membarrier after storing to rq->curr, before returning to
 * user-space.
 */
if (!mm) {
/* next has no mm of its own: borrow prev's active mm and pin it. */
next->active_mm = oldmm;
mmgrab(oldmm);
enter_lazy_tlb(oldmm, next);
} else
switch_mm_irqs_off(oldmm, mm, next);
if (!prev->mm) {
/*
 * prev was itself running on a borrowed mm: give up the reference
 * and record it in rq->prev_mm (the mmdrop() mentioned above takes
 * place in finish_task_switch()).
 */
prev->active_mm = NULL;
rq->prev_mm = oldmm;
}
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
barrier();
return finish_task_switch(prev);
}
如果mm=NULL,说明next是一个内核线程。因为内核线程没有用户虚拟地址空间,需要借用上一个进程的用户虚拟地址空间,所以将oldmm赋值给next→active_mm。
有个问题:借用上一个进程的虚拟地址空间,会不会破坏上一个进程的用户虚拟空间? 答案是不会的,内核不会随意的访问用户空间的
enter_lazy_tlb 通知处理器,不需要切换用户空间虚拟地址,这样可以加速进程上下文的切换
如果mm != NULL,说明下一个进程是用户空间进程。调用switch_mm_irqs_off完成用户虚拟地址空间的切换
如果prev→mm =NULL,说明上一个是内核线程。既然是内核线程它的active_mm肯定是借用它的上一个,则需要将它的active_mm设置为NULL
其实只有当next是一个用户空间的进程的时候,才真正的需要切换用户空间虚拟地址空间。
ARM64中switch_mm_irqs_off函数的定义如下:
/* Fall back to plain switch_mm() when no switch_mm_irqs_off is defined. */
#ifndef switch_mm_irqs_off
#define switch_mm_irqs_off switch_mm
#endif
/*
 * switch_mm - switch from address space @prev to @next on behalf of @tsk.
 * @prev: mm being switched away from
 * @next: mm being switched to
 * @tsk:  task being scheduled in
 *
 * __switch_mm() is only called when the two mms differ; the saved TTBR0
 * update at the end runs unconditionally.
 */
static inline void
switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
if (prev != next)
__switch_mm(next);
/*
 * Update the saved TTBR0_EL1 of the scheduled-in task as the previous
 * value may have not been initialised yet (activate_mm caller) or the
 * ASID has changed since the last run (following the context switch
 * of another thread of the same process).
 */
update_saved_ttbr0(tsk, next);
}
当prev和next不指向同一个mm_struct的时候,调用函数__switch_mm发生上下文切换。__switch_mm函数的实现这里先不展开了,涉及到mm的知识。
/*
 * switch_to - switch from task @prev to task @next.
 * The task_struct pointer returned by __switch_to() is stored in @last.
 */
#define switch_to(prev, next, last) \
do { \
((last) = __switch_to((prev), (next))); \
} while (0)
/*
 * Thread switching.
 *
 * @prev: task being switched out
 * @next: task being switched in
 *
 * Runs the per-feature switch hooks for @next, issues a barrier, then hands
 * over to cpu_switch_to() for the register/stack switch. Returns the value
 * produced by cpu_switch_to().
 */
__notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next)
{
struct task_struct *last;
fpsimd_thread_switch(next); /* floating-point/SIMD register state */
tls_thread_switch(next);
hw_breakpoint_thread_switch(next); /* debug registers */
contextidr_thread_switch(next);
entry_task_switch(next); /* record next's task_struct */
uao_thread_switch(next);
ssbs_thread_switch(next);
/*
 * Complete any pending TLB or cache maintenance on this CPU in case
 * the thread migrates to a different CPU.
 * This full barrier is also required by the membarrier system
 * call.
 */
dsb(ish);
/* the actual thread switch */
last = cpu_switch_to(prev, next);
return last;
}
fpsimd_thread_switch 切换浮点寄存器
hw_breakpoint_thread_switch 切换调试寄存器
entry_task_switch(next); 记录next进程的task_struct
cpu_switch_to 真正的切换
/*
 * Register switch for AArch64. The callee-saved registers need to be saved
 * and restored. On entry:
 * x0 = previous task_struct (must be preserved across the switch)
 * x1 = next task_struct
 * Previous and next are guaranteed not to be the same.
 *
 * The store/load sequence below follows the field order of
 * struct cpu_context: x19..x28, fp, sp, pc.
 */
ENTRY(cpu_switch_to)
mov x10, #THREAD_CPU_CONTEXT // x10 = THREAD_CPU_CONTEXT, added to each task pointer below
add x8, x0, x10 // x8 = x0 (prev) + x10: start of prev's save area
mov x9, sp // sp is staged through x9 for the stp below
stp x19, x20, [x8], #16 // store callee-saved registers
stp x21, x22, [x8], #16
stp x23, x24, [x8], #16
stp x25, x26, [x8], #16
stp x27, x28, [x8], #16
stp x29, x9, [x8], #16 // fp and sp slots
str lr, [x8] // pc slot: where prev resumes
add x8, x1, x10 // x8 = x1 (next) + x10: start of next's save area
ldp x19, x20, [x8], #16 // restore callee-saved registers
ldp x21, x22, [x8], #16
ldp x23, x24, [x8], #16
ldp x25, x26, [x8], #16
ldp x27, x28, [x8], #16
ldp x29, x9, [x8], #16 // fp and sp slots
ldr lr, [x8] // pc slot: where next resumes
mov sp, x9 // switch to next's stack
msr sp_el0, x1 // sp_el0 = next task_struct pointer
ret // return through the lr just loaded
ENDPROC(cpu_switch_to)
NOKPROBE(cpu_switch_to)
调用到ARM64中的汇编实现,其中x0代表上一个进程,x1代表next进程。
THREAD_CPU_CONTEXT代表的是task_struct→thread.cpu_context,cpu_context就是对进程寄存器的描述从x19到pc指针寄存器
/*
 * Register state saved and restored by cpu_switch_to(). The field order
 * must match the stp/ldp sequence there: ten general registers, then
 * fp, sp and pc.
 */
struct cpu_context {
unsigned long x19;
unsigned long x20;
unsigned long x21;
unsigned long x22;
unsigned long x23;
unsigned long x24;
unsigned long x25;
unsigned long x26;
unsigned long x27;
unsigned long x28;
unsigned long fp; /* x29 */
unsigned long sp; /* stack pointer at the time of the switch */
unsigned long pc; /* lr at the time of the switch */
};
/* Excerpt of struct thread_struct: only the saved register context is shown. */
struct thread_struct {
struct cpu_context cpu_context; /* cpu context */
........
};
mov x10, #THREAD_CPU_CONTEXT #x10保存thread.cpu_context在task_struct中的偏移量
add x8, x0, x10 #x8指向上一个进程(prev)的thread.cpu_context
mov x9, sp #用x9保存sp指针
stp x19, x20, [x8], #16 #将上一个进程的x19,x20存储到上一个进程的cpu_context.x19 cpu_context.x20中
........
str lr, [x8] #将lr(返回地址)保存到上一个进程的cpu_context.pc中
add x8, x1, x10 #寄存器x8存储下一个进程(next)的thread.cpu_context
ldp x19, x20, [x8], #16 #恢复之前存储在next进程的cpu_context到x19,x20
......
mov sp, x9 #设置sp寄存器
msr sp_el0, x1 #将下一个进程的task_struct地址(thread_info位于task_struct的起始处)写入sp_el0。sp_el0原本是EL0的栈指针寄存器,内核态运行时被用来保存current(当前进程的task_struct)指针
当切换完毕后,会从它的pc指针所指的地方运行