First, here is the stack trace from the dump:

-000|__cmpwait(inline)
-000|queued_spin_lock_slowpath(lock = 0xFFFFFFDDBFC5B000, val = 0)
-001|do_raw_spin_lock(lock = 0xFFFFFFDDBFC5B000)
-002|raw_spin_lock_irqsave(?)
-003|rq_pin_lock(inline)
-003|rq_lock_irqsave(inline)
-003|walt_try_to_wake_up(inline)
-003|try_to_wake_up(p = 0xFFFFFFDD7BD26340, ?, wake_flags = 0, sibling_count_hint = 1)
-004|wake_up_process(?)
-005|__queue_work()
-006|atomic_long_fetch_or(inline)
-006|test_and_set_bit(inline)
-006|queue_work_on(?, ?, ?)
-007|npu_ipcc_bridge_mbox_send_data()
-008|__msg_submit(inline)
-008|msg_submit(chan = 0xFFFFFFDDAE60D500)
-009|mbox_send_message(chan = 0xFFFFFFDDAE60D500, mssg = 0x0)
-010|qcom_smp2p_kick(inline)
-010|smp2p_update_bits(?, ?, ?)
-011|qcom_smem_state_update_bits(?, ?, ?)
-012|subsys_crash_shutdown(subsys = 0xFFFFFFDDAC9152A8)
-013|subsys_panic(?, ?)
-014|bus_for_each_dev(?, ?, ?, ?)
-015|ssr_panic_handler(?, ?, ?)
-016|atomic_notifier_call_chain(nh = 0x0, val = 0, v = 0xFFFFFFAA400A3B25)
-017|test_taint(inline)
-017|panic()
-018|die(?, ?, ?)
-019|die_kernel_fault(?, ?, ?, ?)
-020|do_page_fault(addr = 16328, esr = 2516582406, regs = 0xFFFFFF80197B3AA0)
-021|test_ti_thread_flag(inline)
-021|do_translation_fault(?, ?, ?)
-022|do_mem_abort(addr = 16328, esr = 2516582406, regs = 0xFFFFFF80197B3AA0)
-023|el1_da(asm)
 -->|exception
-024|__switch_to(prev = 0xFFFFFFDCF7FB4240, ?)
-025|__schedule(?)
-026|__preempt_count_sub(inline)
-026|schedule()
-027|hrtimer_cancel(inline)
-027|schedule_hrtimeout_range_clock(?, ?, ?, ?)
-028|schedule_hrtimeout_range(?, ?, ?)
-029|freezer_count(inline)
-029|freezable_schedule_hrtimeout_range(inline)

From the stack frames above, an el1_da data abort exception occurred while schedule() was performing a context switch. The stack can be studied in two parts.

The schedule part

-024|__switch_to(prev = 0xFFFFFFDCF7FB4240, ?)
-025|__schedule(?)
-026|__preempt_count_sub(inline)
-026|schedule()
-027|hrtimer_cancel(inline)
-027|schedule_hrtimeout_range_clock(?, ?, ?, ?)
-028|schedule_hrtimeout_range(?, ?, ?)
-029|freezer_count(inline)
-029|freezable_schedule_hrtimeout_range(inline)

The data-abort-to-panic part

-017|panic()
-018|die(?, ?, ?)
-019|die_kernel_fault(?, ?, ?, ?)
-020|do_page_fault(addr = 16328, esr = 2516582406, regs = 0xFFFFFF80197B3AA0)
-021|test_ti_thread_flag(inline)
-021|do_translation_fault(?, ?, ?)
-022|do_mem_abort(addr = 16328, esr = 2516582406, regs = 0xFFFFFF80197B3AA0)
-023|el1_da(asm)

First, let's analyze the schedule flow:

asmlinkage __visible void __sched schedule(void)
{
    struct task_struct *tsk = current;
 
    sched_submit_work(tsk);
    do {
        preempt_disable();   // disable preemption
        __schedule(false);
        sched_preempt_enable_no_resched();
    } while (need_resched());
}

Before scheduling, preemption must be disabled; then the core scheduler function __schedule() is called.

static __always_inline volatile int *preempt_count_ptr(void)
{
    return &current_thread_info()->preempt_count;
}
 
static __always_inline void __preempt_count_add(int val)
{
    *preempt_count_ptr() += val;
}
 
#define preempt_count_add(val)    __preempt_count_add(val)
#define preempt_count_inc() preempt_count_add(1)
 
#define preempt_disable() \
do { \
    preempt_count_inc(); \
    barrier(); \
} while (0)

The code above is the preempt_disable path: disabling preemption really just adds 1 to preempt_count in struct thread_info.
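
Since it is just a counter, preempt_disable()/preempt_enable() pairs can nest, and preemption becomes possible again only when the count drops back to 0. Below is a minimal userspace sketch of that behaviour (assumptions: a plain int stands in for thread_info->preempt_count, barrier() is omitted, and the model_* helpers are made up for illustration); it is a model, not kernel code.

#include <assert.h>
#include <stdio.h>

/* Userspace model of the per-task preempt counter; in the kernel this
 * lives in struct thread_info and is reached via current_thread_info(). */
static int preempt_count;

static void model_preempt_disable(void) { preempt_count++; }   /* preempt_count_inc() */
static void model_preempt_enable(void)  { preempt_count--; }   /* preempt_count_dec() */

/* Preemption is only allowed again once every disable has been paired. */
static int preemptible(void) { return preempt_count == 0; }

int main(void)
{
    model_preempt_disable();            /* e.g. an explicit preempt_disable()      */
    model_preempt_disable();            /* e.g. a spinlock taken inside the region */
    printf("count=%d preemptible=%d\n", preempt_count, preemptible());  /* 2, 0 */

    model_preempt_enable();
    model_preempt_enable();
    printf("count=%d preemptible=%d\n", preempt_count, preemptible());  /* 0, 1 */
    assert(preemptible());
    return 0;
}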

Now, combining with T32, let's look at the thread_info structure embedded in the task_struct:

-026|__preempt_count_sub(inline)
-026|schedule()
    |  tsk = 0xFFFFFFDCF7FB4240 -> (
    |    thread_info = (
    |      flags = 2080,
    |      padding = (0, 0, 0, 0, 0, 0, 0),
    |      addr_limit = 549755813887,
    |      preempt_count = 8),
    |    state = 1,
    |    stack = 0xFFFFFF80197B0000,
    |    usage = (counter = 3),
    |    flags = 1077952576,
    |    ptrace = 0,
    |    wake_entry = (next = 0x0),
    |    on_cpu = 1,
    |    cpu = 2,

preempt=true means preemptive scheduling: the current process is forcibly deprived of the CPU. preempt=false means voluntary scheduling: the current process gives up the CPU of its own accord.

Judging from the stack frames, the current process timed out on a wait and therefore voluntarily gave up the CPU.

Since the CPU is being given up, some other process must take it over, so __schedule has to do two things:

  • Pick a suitable next process

  • Switch the context from the current process to that next process

static void __sched notrace __schedule(bool preempt)
{
    cpu = smp_processor_id();
    rq = cpu_rq(cpu);
    prev = rq->curr;            // current task, i.e. the one about to be switched out
 
    next = pick_next_task(rq, prev, &rf);    // pick the next task to run
    clear_tsk_need_resched(prev);            // clear the need-resched flag
    clear_preempt_need_resched();
 
    rq = context_switch(rq, prev, next, &rf);   // switch the context
}
/*
 * Pick up the highest-prio task:
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
    const struct sched_class *class;
    struct task_struct *p;
 
    /*
     * Optimization: we know that if all tasks are in the fair class we can
     * call that function directly, but only if the @prev task wasn't of a
     * higher scheduling class, because otherwise those loose the
     * opportunity to pull in more work from other CPUs.
     */
    if (likely((prev->sched_class == &idle_sched_class ||
            prev->sched_class == &fair_sched_class) &&
           rq->nr_running == rq->cfs.h_nr_running)) {
 
        p = fair_sched_class.pick_next_task(rq, prev, rf);
        if (unlikely(p == RETRY_TASK))
            goto again;
 
        /* Assumes fair_sched_class->next == idle_sched_class */
        if (unlikely(!p))
            p = idle_sched_class.pick_next_task(rq, prev, rf);
 
        return p;
    }
 
again:
    for_each_class(class) {
        p = class->pick_next_task(rq, prev, rf);
        if (p) {
            if (unlikely(p == RETRY_TASK))
                goto again;
            return p;
        }
    }
 
    /* The idle class should always have a runnable task: */
    BUG();
}
  1.  Optimization: if all runnable tasks (rq->nr_running == rq->cfs.h_nr_running) belong to the completely fair scheduling class (prev->sched_class == &fair_sched_class), call the fair class's pick_next_task directly.

  2.  Otherwise, iterate over all scheduling classes and call each class's pick_next_task, selecting the highest-priority (highest-prio) task.

extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;

The Linux kernel currently defines five scheduling classes, in priority order stop -> dl -> rt -> fair -> idle. How each class implements its pick_next_task is not discussed here. By this point a highest-priority task should have been found; let's look at T32:
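
To make the priority order concrete, here is a minimal userspace sketch of the idea behind for_each_class (assumptions: the class names mirror the kernel's, but struct sched_class_model and the pick callbacks below are simplified stand-ins, not the kernel API):

#include <stdio.h>
#include <stddef.h>

struct task { const char *comm; };

/* Simplified model of a scheduling class: just a pick callback. */
struct sched_class_model {
    const char *name;
    struct task *(*pick_next_task)(void);
};

static struct task *pick_none(void) { return NULL; }
static struct task  cfs_task  = { "some_cfs_task" };
static struct task *pick_fair(void) { return &cfs_task; }
static struct task  idle_task = { "swapper" };
static struct task *pick_idle(void) { return &idle_task; }

/* Priority order mirrors stop -> dl -> rt -> fair -> idle. */
static struct sched_class_model classes[] = {
    { "stop", pick_none }, { "dl", pick_none }, { "rt", pick_none },
    { "fair", pick_fair }, { "idle", pick_idle },
};

static struct task *model_pick_next_task(void)
{
    /* Walk the classes highest priority first; the idle class always has a
     * runnable task, so the walk cannot come back empty-handed. */
    for (size_t i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
        struct task *p = classes[i].pick_next_task();
        if (p) {
            printf("picked '%s' from class %s\n", p->comm, classes[i].name);
            return p;
        }
    }
    return NULL;   /* unreachable in this model */
}

int main(void) { model_pick_next_task(); return 0; }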

-023|el1_da(asm)
 -->|exception
-024|__switch_to(
    |    [X20] prev = 0xFFFFFFDCF7FB4240,
    |  ?)
-025|__schedule(
    |  ?)
    |  [X19] rq = 0xFFFFFFDDBFC5B000
    |  [X20] prev = 0xFFFFFFDCF7FB4240   //current
    |  [X28] switch_count = 0xFFFFFFDCF7FB4958
    |  [X23] next = 0xFFFFFFDDB4792140   //by pick_next_task function
-026|__preempt_count_sub(inline)
-026|schedule()
    |  [X19] tsk = 0xFFFFFFDCF7FB4240    //current
-027|hrtimer_cancel(inline)
-027|schedule_hrtimeout_range_clock(

At this point next has been found; its task_struct is at 0xFFFFFFDDB4792140.


Since two processes are being switched, note that processes fall into user-space processes and kernel-space ones; the latter are usually called kernel threads. The two are normally distinguished by the mm pointer in task_struct.

struct task_struct{
    struct mm_struct        *mm;
    struct mm_struct        *active_mm;
    .......
}
  • For a user-space process, mm and active_mm are equal; both point to the same mm_struct.

  • For a kernel thread, mm = NULL; active_mm is NULL while the thread is not running, and while it runs it borrows the memory descriptor of the previous process, because the kernel address space is shared by all processes.

Switching context essentially means saving the previous process's context and switching in the next one.

/*
 * context_switch - switch to the new MM and the new thread's register state.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
           struct task_struct *next, struct rq_flags *rf)
{
    struct mm_struct *mm, *oldmm;
 
    prepare_task_switch(rq, prev, next);
 
    mm = next->mm;
    oldmm = prev->active_mm;
    /*
     * For paravirt, this is coupled with an exit in switch_to to
     * combine the page table reload and the switch backend into
     * one hypercall.
     */
    arch_start_context_switch(prev);
 
    /*
     * If mm is non-NULL, we pass through switch_mm(). If mm is
     * NULL, we will pass through mmdrop() in finish_task_switch().
     * Both of these contain the full memory barrier required by
     * membarrier after storing to rq->curr, before returning to
     * user-space.
     */
    if (!mm) {
        next->active_mm = oldmm;
        mmgrab(oldmm);
        enter_lazy_tlb(oldmm, next);
    } else
        switch_mm_irqs_off(oldmm, mm, next);
 
    if (!prev->mm) {
        prev->active_mm = NULL;
        rq->prev_mm = oldmm;
    }
 
    rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 
    prepare_lock_switch(rq, next, rf);
 
    /* Here we just switch the register state and the stack. */
    switch_to(prev, next, prev);
    barrier();
 
    return finish_task_switch(prev);
}
  • If mm == NULL, next is a kernel thread. A kernel thread has no user virtual address space, so it borrows the previous process's: oldmm is assigned to next->active_mm.

    • One question: does borrowing the previous process's address space corrupt its user virtual address space? No, because the kernel does not access user space arbitrarily.

  • enter_lazy_tlb tells the processor that no user address space switch is needed, which speeds up the context switch.

  • If mm != NULL, next is a user-space process; switch_mm_irqs_off is called to switch the user virtual address space.

  • If prev->mm == NULL, the previous task was a kernel thread. Its active_mm was necessarily borrowed from its predecessor, so it is set back to NULL (the borrowed mm is remembered in rq->prev_mm and dropped later in finish_task_switch).

In fact, only when next is a user-space process does the user virtual address space really need to be switched.
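
The mm handling above can be boiled down to a small userspace model (assumptions: the structs, mmgrab_model and model_switch_mm below are simplified stand-ins for task_struct/mm_struct and the real kernel helpers), showing how a kernel thread borrows the previous task's mm and how the borrow ends:

#include <stdio.h>
#include <stddef.h>

struct mm_model   { const char *owner; int refcount; };
struct task_model { const char *comm; struct mm_model *mm, *active_mm; };

static void mmgrab_model(struct mm_model *mm) { mm->refcount++; }

/* Mirrors the mm decisions in context_switch(): a kernel thread (mm == NULL)
 * borrows prev->active_mm; a user task has its own mm switched in. */
static void model_switch_mm(struct task_model *prev, struct task_model *next)
{
    struct mm_model *oldmm = prev->active_mm;

    if (!next->mm) {                 /* next is a kernel thread: borrow oldmm   */
        next->active_mm = oldmm;
        mmgrab_model(oldmm);
    } else {                         /* next is a user task: real address space */
        next->active_mm = next->mm;  /* switch, cf. switch_mm_irqs_off()        */
    }

    if (!prev->mm)                   /* prev was a kernel thread: stop borrowing */
        prev->active_mm = NULL;

    printf("%s -> %s: next runs on mm owned by %s\n",
           prev->comm, next->comm, next->active_mm->owner);
}

int main(void)
{
    struct mm_model   user_mm = { "user_proc", 1 };
    struct task_model user    = { "user_proc", &user_mm, &user_mm };
    struct task_model kthread = { "kworker",   NULL,     NULL     };

    model_switch_mm(&user, &kthread);   /* kworker borrows user_proc's mm  */
    model_switch_mm(&kthread, &user);   /* back to user_proc, borrow ends  */
    return 0;
}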

On ARM64, the switch_mm_irqs_off function is defined as follows:

#ifndef switch_mm_irqs_off
#define switch_mm_irqs_off switch_mm
#endif
static inline void
switch_mm(struct mm_struct *prev, struct mm_struct *next,
          struct task_struct *tsk)
{
    if (prev != next)
        __switch_mm(next);
 
    /*
     * Update the saved TTBR0_EL1 of the scheduled-in task as the previous
     * value may have not been initialised yet (activate_mm caller) or the
     * ASID has changed since the last run (following the context switch
     * of another thread of the same process).
     */
    update_saved_ttbr0(tsk, next);
}

When prev and next do not point to the same mm_struct, __switch_mm is called to perform the address space switch. The implementation of __switch_mm is not expanded here, since it involves memory-management internals.

#define switch_to(prev, next, last)                    \
    do {                                \
        ((last) = __switch_to((prev), (next)));            \
    } while (0)
/*
 * Thread switching.
 */
__notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
                struct task_struct *next)
{
    struct task_struct *last;
 
    fpsimd_thread_switch(next);
    tls_thread_switch(next);
    hw_breakpoint_thread_switch(next);
    contextidr_thread_switch(next);
    entry_task_switch(next);
    uao_thread_switch(next);
    ssbs_thread_switch(next);
 
    /*
     * Complete any pending TLB or cache maintenance on this CPU in case
     * the thread migrates to a different CPU.
     * This full barrier is also required by the membarrier system
     * call.
     */
    dsb(ish);
 
    /* the actual thread switch */
    last = cpu_switch_to(prev, next);
 
    return last;
}
  • fpsimd_thread_switch: switch the floating-point/SIMD registers

  • hw_breakpoint_thread_switch: switch the debug (hardware breakpoint) registers

  • entry_task_switch(next): record next's task_struct

  • cpu_switch_to: the actual switch


/*
 * Register switch for AArch64. The callee-saved registers need to be saved
 * and restored. On entry:
 *   x0 = previous task_struct (must be preserved across the switch)
 *   x1 = next task_struct
 * Previous and next are guaranteed not to be the same.
 *
 */
ENTRY(cpu_switch_to)
    mov x10, #THREAD_CPU_CONTEXT
    add x8, x0, x10
    mov x9, sp
    stp x19, x20, [x8], #16     // store callee-saved registers
    stp x21, x22, [x8], #16
    stp x23, x24, [x8], #16
    stp x25, x26, [x8], #16
    stp x27, x28, [x8], #16
    stp x29, x9, [x8], #16
    str lr, [x8]
    add x8, x1, x10
    ldp x19, x20, [x8], #16     // restore callee-saved registers
    ldp x21, x22, [x8], #16
    ldp x23, x24, [x8], #16
    ldp x25, x26, [x8], #16
    ldp x27, x28, [x8], #16
    ldp x29, x9, [x8], #16
    ldr lr, [x8]
    mov sp, x9
    msr sp_el0, x1
    ret
ENDPROC(cpu_switch_to)
NOKPROBE(cpu_switch_to)

This lands in the ARM64 assembly implementation, where x0 is the previous task and x1 is the next task.

THREAD_CPU_CONTEXT is the byte offset of task_struct->thread.cpu_context; cpu_context describes the task's saved registers, from x19 through the pc:

struct cpu_context {
    unsigned long x19;
    unsigned long x20;
    unsigned long x21;
    unsigned long x22;
    unsigned long x23;
    unsigned long x24;
    unsigned long x25;
    unsigned long x26;
    unsigned long x27;
    unsigned long x28;
    unsigned long fp;
    unsigned long sp;
    unsigned long pc;
};
 
struct thread_struct {
    struct cpu_context    cpu_context;    /* cpu context */
 
    ........
};
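
THREAD_CPU_CONTEXT itself comes from the asm-offsets machinery, which emits the byte offset of thread.cpu_context inside task_struct at build time. A minimal userspace sketch with simplified stand-in structs (task_struct_model and friends below are hypothetical; the real task_struct has many more fields) shows how such an offset is obtained and how "add x8, x0, x10" effectively forms &task->thread.cpu_context:

#include <stdio.h>
#include <stddef.h>

/* Simplified stand-ins for the real structures. */
struct cpu_context_model {
    unsigned long x19, x20, x21, x22, x23, x24, x25, x26, x27, x28;
    unsigned long fp, sp, pc;
};

struct thread_struct_model { struct cpu_context_model cpu_context; };

struct task_struct_model {
    long state;
    void *stack;
    /* ... many other fields ... */
    struct thread_struct_model thread;
};

int main(void)
{
    /* Analogous to defining THREAD_CPU_CONTEXT as
     * offsetof(struct task_struct, thread.cpu_context) in asm-offsets. */
    size_t thread_cpu_context = offsetof(struct task_struct_model, thread.cpu_context);

    struct task_struct_model task = { 0 };

    /* What "add x8, x0, x10" does: x0 = task pointer, x10 = offset,
     * so x8 ends up pointing at task->thread.cpu_context. */
    struct cpu_context_model *ctx =
        (void *)((char *)&task + thread_cpu_context);

    printf("THREAD_CPU_CONTEXT (model) = %zu, matches: %d\n",
           thread_cpu_context, ctx == &task.thread.cpu_context);
    return 0;
}
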
  • mov    x10, #THREAD_CPU_CONTEXT      # x10 = offset of thread.cpu_context within the task descriptor

  • add    x8, x0, x10                   # x8 points to the previous task's (prev's) thread.cpu_context

  • mov    x9, sp                        # save the current sp in x9

  • stp    x19, x20, [x8], #16           # store prev's x19 and x20 into prev's cpu_context.x19 / cpu_context.x20

  • ........

  • str    lr, [x8]                      # store lr (the return address) into prev's cpu_context.pc

  • add    x8, x1, x10                   # x8 now points to the next task's (next's) thread.cpu_context

  • ldp    x19, x20, [x8], #16           # restore the values previously saved in next's cpu_context into x19 and x20

  • ......

  • mov    sp, x9                        # set sp to next's saved stack pointer

  • msr    sp_el0, x1                    # store next's task_struct (whose first member is thread_info) into sp_el0; while in the kernel, sp_el0 holds the current task pointer rather than the EL0 stack pointer

Once the switch completes, execution continues from wherever next's saved pc points.
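
As a userspace analogy (not how the kernel does it), the same idea of saving the caller's register state and resuming another context at its saved pc can be played with using POSIX ucontext: swapcontext() below plays the role of cpu_switch_to, and each side resumes exactly where it last switched away.

#include <stdio.h>
#include <ucontext.h>

static ucontext_t main_ctx, worker_ctx;

/* Runs on its own stack; each swapcontext() is a voluntary "context switch"
 * back to main, loosely analogous to a task calling schedule(). */
static void worker(void)
{
    printf("worker: first run\n");
    swapcontext(&worker_ctx, &main_ctx);   /* save our state, resume main */
    printf("worker: resumed at its saved pc\n");
}

int main(void)
{
    static char stack[64 * 1024];

    getcontext(&worker_ctx);
    worker_ctx.uc_stack.ss_sp   = stack;
    worker_ctx.uc_stack.ss_size = sizeof(stack);
    worker_ctx.uc_link          = &main_ctx;    /* where to go when worker returns */
    makecontext(&worker_ctx, worker, 0);

    printf("main: switching to worker\n");
    swapcontext(&main_ctx, &worker_ctx);        /* like cpu_switch_to(prev, next) */
    printf("main: back, switching again\n");
    swapcontext(&main_ctx, &worker_ctx);        /* worker resumes after its swap  */
    printf("main: done\n");
    return 0;
}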