【BPF】第001篇使用 eBPF 监控 Linux 内核内存分配

AI智能摘要

深入探索 eBPF 技术，该方案通过非侵入式方式在 Linux 内核中高效运行自定义字节码，实现对 kmalloc 内存分配的实时监控，兼具安全性和灵活性。文章详细展示了内核态插桩与用户态应用的协同原理与代码实践，支持多维过滤和实用输出，适合于系统性能分析及生产环境部署，为内核行为追踪和资源优化提供了极具价值的案例。

此摘要由AI分析文章内容生成，仅供参考。

最近因为一些灵感需要进行内核探测，从而了解到了 eBPF，于是开始了 eBPF 的学习之路。大体了解之后，发现它太强大了！它可以在不修改内核代码的情况下实现内核监控，性能开销极低，完全可以用于生产环境！这一篇是我自己摸索实现的第一个功能：监控内核 kmalloc 的调用情况。

eBPF 技术概述

什么是 eBPF？

eBPF 是 Linux 内核的一个革命性技术，它允许用户空间程序在不修改内核源代码的情况下，安全、高效地在内核中运行自定义的字节码。eBPF 最初设计用于网络包过滤，现已扩展到系统监控、性能分析、安全等领域。

eBPF 的关键特性

安全性：所有 eBPF 程序必须通过内核验证器的严格检查，确保不会导致内核崩溃或数据损坏。
高性能：eBPF 程序在内核空间运行，避免了用户态和内核态之间的上下文切换开销。
灵活性：可以挂载到多种内核事件点（tracepoints、kprobes、uprobes 等）。
可编程性：支持复杂的数据结构和算法，包括循环（有限制）和分支。

kmalloc 监控

目标

创建一个能够实时监控和记录内核 kmalloc 调用的工具，需要捕获：

调用进程的 PID 和名称
请求分配的内存大小
实际分配的内存大小
分配标志（gfp_flags）
NUMA 节点信息
时间戳

实例

内核态插桩

// kmalloc.bpf.c
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "kmalloc.h"

// License string emitted into the "license" section of the BPF object.
char LICENSE[] SEC("license") = "GPL";

// Perf event array map; trace_kmalloc() hands it to bpf_perf_event_output()
// to deliver struct kmalloc_event records to user space.
struct {
    __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
    __uint(key_size, sizeof(u32));
    __uint(value_size, sizeof(u32));
    __uint(max_entries, 1024);  // NOTE(review): presumably an upper bound on per-CPU slots — confirm
} kmalloc_events SEC(".maps");

// Structure describing the tracepoint arguments as named fields.
// NOTE(review): presumably mirrors the kmem/kmalloc tracepoint record layout
// of the target kernel — verify against the running kernel's format file.
struct trace_kmalloc_args {
    unsigned long long unused;  // leading 8 bytes; trace_kmalloc() starts reading after them
    unsigned long call_site;
    const void *ptr;
    size_t bytes_req;
    size_t bytes_alloc;
    unsigned long gfp_flags;
    int node;
};

/*
 * Report whether comm holds exactly the string "test_kmalloc".
 *
 * comm: buffer holding the name to check. Bytes are compared left to right
 *       and evaluation stops at the first mismatch, so at most 13 bytes
 *       (12 characters plus the terminator) are read.
 *
 * Returns true only when the first 12 bytes spell "test_kmalloc" and the
 * byte at index 12 is '\0'; false otherwise.
 *
 * The comparison is written out byte by byte instead of using a loop.
 */
static __always_inline bool comm_is_test_kmalloc(const char *comm)
{
    return comm[0] == 't' &&
           comm[1] == 'e' &&
           comm[2] == 's' &&
           comm[3] == 't' &&
           comm[4] == '_' &&
           comm[5] == 'k' &&
           comm[6] == 'm' &&
           comm[7] == 'a' &&
           comm[8] == 'l' &&
           comm[9] == 'l' &&
           comm[10] == 'o' &&
           comm[11] == 'c' &&
           comm[12] == '\0';  // reject longer names that merely start with the target
}

/*
 * Tracepoint program attached to kmem/kmalloc.
 *
 * For tasks whose command name is exactly "test_kmalloc", fills a
 * struct kmalloc_event with the current pid/tgid/comm, the allocation fields
 * copied out of the tracepoint record, and a bpf_ktime_get_ns() timestamp,
 * then publishes it on the kmalloc_events perf event array. Events from any
 * other task are ignored.
 *
 * ctx: tracepoint context, interpreted through struct trace_kmalloc_args.
 *
 * Always returns 0.
 */
SEC("tracepoint/kmem/kmalloc")
int trace_kmalloc(void *ctx)
{
    struct kmalloc_event event = {};
    struct trace_kmalloc_args *args = ctx;

    // Low 32 bits of the helper result go to pid, high 32 bits to tgid.
    u64 pid_tgid = bpf_get_current_pid_tgid();
    event.pid = pid_tgid & 0xFFFFFFFF;
    event.tgid = pid_tgid >> 32;
    bpf_get_current_comm(&event.comm, sizeof(event.comm));

    // Filter before doing anything else so the debug trace below is emitted
    // only for the task under test, not for every kmalloc in the system.
    if (!comm_is_test_kmalloc(event.comm)) {
        return 0;
    }

    bpf_printk("Tracepoint triggered by: %s (PID: %d)",
               event.comm, event.pid);

    // Copy each field by name. The struct places these members at the same
    // offsets the previous "ctx + 8, index i" arithmetic addressed on a
    // 64-bit target; the copy size follows the destination field.
    bpf_probe_read_kernel(&event.call_site, sizeof(event.call_site), &args->call_site);
    bpf_probe_read_kernel(&event.ptr, sizeof(event.ptr), &args->ptr);
    bpf_probe_read_kernel(&event.bytes_req, sizeof(event.bytes_req), &args->bytes_req);
    bpf_probe_read_kernel(&event.bytes_alloc, sizeof(event.bytes_alloc), &args->bytes_alloc);
    bpf_probe_read_kernel(&event.gfp_flags, sizeof(event.gfp_flags), &args->gfp_flags);
    bpf_probe_read_kernel(&event.node, sizeof(event.node), &args->node);

    event.timestamp_ns = bpf_ktime_get_ns();

    // Hand the event to user space through the perf event array.
    bpf_perf_event_output(ctx, &kmalloc_events, BPF_F_CURRENT_CPU, &event, sizeof(event));

    return 0;
}

应用层监控

// kmalloc.c - 用户态程序
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <signal.h>
#include <string.h>
#include <time.h>
#include <sys/resource.h>
#include <bpf/libbpf.h>
#include <bpf/bpf.h>
#include "kmalloc.skel.h"
#include "kmalloc.h"

// Set to 1 by sig_handler() to ask the main loop to stop. sig_atomic_t is the
// integer type that may safely be written from a signal handler.
static volatile sig_atomic_t exiting = 0;

/*
 * Signal handler: announce the signal on stderr and request shutdown.
 *
 * Only async-signal-safe operations are used: write(2) instead of stdio, and
 * a store to a volatile sig_atomic_t flag. errno is saved and restored so the
 * interrupted code does not observe a value changed by write().
 *
 * sig: signal number (unused).
 */
static void sig_handler(int sig)
{
    static const char msg[] = "\nSignal received, exiting...\n";
    int saved_errno = errno;
    ssize_t written;

    (void)sig;

    // Best-effort notice; nothing useful can be done here if it fails.
    written = write(STDERR_FILENO, msg, sizeof(msg) - 1);
    (void)written;

    exiting = 1;
    errno = saved_errno;
}

/*
 * Perf buffer sample callback: print one kmalloc event as a single line on
 * stdout.
 *
 * ctx:  callback context (unused).
 * cpu:  CPU index delivered with the sample (unused).
 * data: sample payload, interpreted as a struct kmalloc_event.
 * size: payload size in bytes; samples shorter than the struct are dropped
 *       with a message on stderr.
 *
 * The leading [HH:MM:SS] is the local wall-clock time at which the sample is
 * printed, not the timestamp carried inside the event.
 */
static void handle_kmalloc_event(void *ctx, int cpu, void *data, __u32 size)
{
    const struct kmalloc_event *e = data;
    char ts[32] = "--:--:--";  // shown if the local time cannot be formatted
    const struct tm *tm;
    time_t t;

    (void)ctx;
    (void)cpu;

    // Never read past the end of a truncated sample.
    if (size < sizeof(*e)) {
        fprintf(stderr, "Dropping short kmalloc sample (%u bytes)\n", (unsigned int)size);
        return;
    }

    // Current wall-clock time; localtime() may return NULL on failure.
    t = time(NULL);
    tm = localtime(&t);
    if (tm) {
        strftime(ts, sizeof(ts), "%H:%M:%S", tm);
    }

    // Explicit casts keep the arguments in step with the format specifiers
    // regardless of how the fields are typedef'd in kmalloc.h.
    printf("[%s] PID: %-6d | TGID: %-6d | COMM: %-16s | "
           "SIZE_REQ: %-8llu | SIZE_ALLOC: %-8llu | "
           "GFP_FLAGS: 0x%-8llx | NODE: %d | ADDR: 0x%llx\n",
           ts, (int)e->pid, (int)e->tgid, e->comm,
           (unsigned long long)e->bytes_req, (unsigned long long)e->bytes_alloc,
           (unsigned long long)e->gfp_flags, (int)e->node,
           (unsigned long long)e->ptr);
}

/*
 * Write the command-line help text to stdout.
 *
 * prog_name: program name substituted into the usage line and the examples.
 */
static void print_usage(const char *prog_name)
{
    printf("Usage: %s [options]\n", prog_name);

    // The option list contains no substitutions, so emit it in one call.
    fputs("Options:\n"
          "  -h, --help     Show this help message\n"
          "  -f, --filter   Filter by process name\n"
          "  -p, --pid      Filter by process ID\n"
          "  -s, --size     Only show allocations larger than SIZE bytes\n"
          "  -t, --top      Show top N processes by allocation count\n"
          "\nExample:\n",
          stdout);

    printf("  %s -f chrome         # Monitor only Chrome processes\n", prog_name);
    printf("  %s -s 1024          # Show only allocations > 1KB\n", prog_name);
    printf("  %s -p 1234          # Monitor only PID 1234\n", prog_name);
}

/*
 * Raise RLIMIT_MEMLOCK (soft and hard) to RLIM_INFINITY.
 *
 * Prints a confirmation on success. On failure prints the strerror() text
 * for errno to stderr and terminates the process with exit status 1.
 */
static void bump_memlock_rlimit(void)
{
    struct rlimit unlimited;

    unlimited.rlim_cur = RLIM_INFINITY;
    unlimited.rlim_max = RLIM_INFINITY;

    if (setrlimit(RLIMIT_MEMLOCK, &unlimited) == 0) {
        printf("RLIMIT_MEMLOCK set to unlimited\n");
        return;
    }

    fprintf(stderr, "Failed to increase RLIMIT_MEMLOCK limit: %s\n", strerror(errno));
    exit(1);
}

/*
 * Install sig_handler() for SIGINT, SIGTERM and SIGHUP, in that order.
 */
static void init_signals(void)
{
    static const int handled[] = { SIGINT, SIGTERM, SIGHUP };

    for (size_t i = 0; i < sizeof(handled) / sizeof(handled[0]); i++) {
        signal(handled[i], sig_handler);
    }
}

/*
 * Entry point of the user-space monitor.
 *
 * Parses the command line, installs signal handlers, raises RLIMIT_MEMLOCK,
 * loads and attaches the BPF skeleton, creates a perf buffer on the
 * kmalloc_events map and polls it until a signal sets `exiting` or polling
 * fails.
 *
 * Returns 0 on a clean shutdown (including one requested by a signal) and 1
 * when loading, attaching, perf buffer setup or polling fails.
 */
int main(int argc, char **argv)
{
    struct kmalloc_bpf *skel = NULL;
    struct perf_buffer *pb = NULL;
    struct perf_buffer_opts pb_opts = {};
    int err = 0;

    // Command-line options.
    // NOTE(review): the -f/-p/-s/-t values are parsed and echoed below but are
    // never applied to the event stream, and only the short option spellings
    // are recognised by getopt().
    int opt;
    char *filter_comm = NULL;
    pid_t filter_pid = 0;
    size_t min_size = 0;
    int top_n = 0;

    while ((opt = getopt(argc, argv, "hf:p:s:t:")) != -1) {
        switch (opt) {
            case 'h':
                print_usage(argv[0]);
                return 0;
            case 'f':
                filter_comm = optarg;
                printf("Filtering by process name: %s\n", filter_comm);
                break;
            case 'p':
                filter_pid = atoi(optarg);
                printf("Filtering by PID: %d\n", filter_pid);
                break;
            case 's':
                min_size = atoll(optarg);
                printf("Minimum allocation size: %zu bytes\n", min_size);
                break;
            case 't':
                top_n = atoi(optarg);
                printf("Will show top %d processes\n", top_n);
                break;
            default:
                print_usage(argv[0]);
                return 1;
        }
    }

    // Process-wide setup.
    init_signals();
    bump_memlock_rlimit();

    printf("========================================\n");
    printf("KMALLOC Monitor - Tracing Memory Allocations\n");
    printf("========================================\n");
    printf("Kernel Version: ");
    fflush(stdout);  // make sure the label precedes the child's output
    if (system("uname -r") != 0) {
        printf("unknown\n");
    }
    printf("\n");

    // 1. Open and load the BPF program.
    printf("[1/3] Opening and loading BPF skeleton...\n");
    skel = kmalloc_bpf__open_and_load();
    if (!skel) {
        fprintf(stderr, "Failed to open and load BPF skeleton\n");
        return 1;
    }

    // 2. Attach the BPF program to its tracepoint.
    printf("[2/3] Attaching BPF programs...\n");
    err = kmalloc_bpf__attach(skel);
    if (err) {
        fprintf(stderr, "Failed to attach BPF skeleton: %d\n", err);
        goto cleanup;
    }

    // 3. Create the perf buffer that receives events.
    printf("[3/3] Setting up perf buffer...\n");

    pb_opts.sz = sizeof(pb_opts);
    pb = perf_buffer__new(bpf_map__fd(skel->maps.kmalloc_events), 128,
                          handle_kmalloc_event, NULL, NULL, &pb_opts);
    if (!pb) {
        err = -errno;
        fprintf(stderr, "Failed to create perf buffer: %s\n", strerror(-err));
        goto cleanup;
    }

    printf("\nBPF program loaded successfully!\n");
    printf("========================================\n");
    printf("Monitoring kmalloc calls in real-time...\n");
    printf("Press Ctrl+C to stop\n");
    printf("========================================\n\n");

    // Table header.
    printf("%-8s %-6s %-6s %-16s %-10s %-10s %-10s %-4s\n",
           "TIME", "PID", "TGID", "COMMAND", "REQ_B", "ALLOC_B", "GFP_FLAGS", "NODE");
    printf("%-8s %-6s %-6s %-16s %-10s %-10s %-10s %-4s\n",
           "--------", "------", "------", "----------------",
           "----------", "----------", "----------", "----");

    // 4. Main event loop.
    time_t last_status = 0;
    while (!exiting) {
        err = perf_buffer__poll(pb, 100);
        if (err == -EINTR) {
            // Interrupted by a signal: a normal way to leave the loop, so it
            // must not turn into a failing exit status.
            err = 0;
        } else if (err < 0) {
            fprintf(stderr, "Error polling perf buffer: %d\n", err);
            break;
        }

        // Periodic status line (every 5 seconds).
        time_t now = time(NULL);
        if (now - last_status >= 5) {
            char stamp[32] = "unknown time";
            const char *raw = ctime(&now);

            // ctime() ends its result with '\n'; copy only the part before it
            // so the closing bracket stays on the same line.
            if (raw) {
                snprintf(stamp, sizeof(stamp), "%.*s", (int)strcspn(raw, "\n"), raw);
            }
            printf("\n[%s] Monitoring... (Press Ctrl+C to stop)\n", stamp);
            last_status = now;
        }
    }

    printf("\n========================================\n");
    printf("Shutting down...\n");

cleanup:
    // 5. Release resources.
    if (pb) {
        perf_buffer__free(pb);
    }

    if (skel) {
        kmalloc_bpf__destroy(skel);
    }

    printf("Cleanup complete. Goodbye!\n");
    return err < 0 ? 1 : 0;
}

模拟测试

// test_kmalloc.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <signal.h>

// Cleared by handle_signal() to stop the main loop. sig_atomic_t is the
// integer type that may safely be written from a signal handler.
static volatile sig_atomic_t running = 1;

/*
 * Signal handler: clear `running` and announce the signal on stdout.
 *
 * write(2) is used instead of printf() because stdio is not
 * async-signal-safe. The notice bypasses the stdio buffer, so it may appear
 * ahead of output that is still buffered.
 *
 * sig: signal number (unused).
 */
void handle_signal(int sig)
{
    static const char msg[] = "\nSignal received, stopping...\n";
    ssize_t written;

    (void)sig;

    running = 0;

    // Best-effort notice; nothing useful can be done here if it fails.
    written = write(STDOUT_FILENO, msg, sizeof(msg) - 1);
    (void)written;
}

/*
 * Exercise kernel-side work through file operations: create a uniquely named
 * file under /tmp, write `size` bytes of 'X' to it, close it and unlink it.
 *
 * size: number of bytes to write. For size <= 0 no buffer is allocated and
 *       nothing is written, but the file is still created and removed.
 *
 * All failures (open, malloc, write) are tolerated silently; the function is
 * best-effort and returns nothing. The file name embeds the process id and a
 * per-process call counter.
 */
void trigger_kmalloc(int size)
{
    static int counter = 0;
    char filename[256];
    char *buffer = NULL;
    int fd;

    // Build a file name unique to this process and call.
    snprintf(filename, sizeof(filename), "/tmp/kmalloc_test_%d_%d.tmp", getpid(), counter++);

    fd = open(filename, O_CREAT | O_RDWR | O_TRUNC, 0644);
    if (fd < 0) {
        return;
    }

    // A negative size must never reach malloc()/memset() as a huge size_t.
    if (size > 0) {
        buffer = malloc((size_t)size);
    }

    if (buffer) {
        const char *cursor = buffer;
        size_t remaining = (size_t)size;

        memset(buffer, 'X', remaining);

        // write() may transfer fewer bytes than requested; keep going until
        // everything is written or the call fails.
        while (remaining > 0) {
            ssize_t n = write(fd, cursor, remaining);
            if (n <= 0) {
                break;  // best-effort: give up on error
            }
            cursor += n;
            remaining -= (size_t)n;
        }

        free(buffer);
    }

    close(fd);

    // Remove the temporary file.
    unlink(filename);
}

/*
 * Test driver: call trigger_kmalloc() repeatedly with a pause between calls.
 *
 * Optional positional arguments, each converted with atoi():
 *   argv[1]  number of iterations   (default 100)
 *   argv[2]  size passed to trigger_kmalloc(), in bytes (default 4096)
 *   argv[3]  pause passed to usleep(), in microseconds (default 100000)
 *
 * The loop ends early once handle_signal() clears `running`.
 * Always returns 0.
 */
int main(int argc, char *argv[])
{
    int count = (argc > 1) ? atoi(argv[1]) : 100;
    int size = (argc > 2) ? atoi(argv[2]) : 4096;
    int delay = (argc > 3) ? atoi(argv[3]) : 100000;
    int round = 0;

    // Allow SIGINT / SIGTERM to stop the loop.
    signal(SIGINT, handle_signal);
    signal(SIGTERM, handle_signal);

    printf("KMALLOC Test Program - PID: %d\n", getpid());
    printf("Will execute %d times, size=%d bytes, delay=%d us\n",
           count, size, delay);
    printf("Press Ctrl+C to stop\n\n");

    while (round < count && running) {
        round++;
        printf("Triggering kmalloc #%d/%d (size=%d)\n", round, count, size);

        trigger_kmalloc(size);

        // Pause before the next round.
        usleep(delay);
    }

    printf("\nTest completed!\n");
    return 0;
}

测试结果

启动上层监控程序：

此时会一直轮询监控

执行测试程序

总结

在不修改内核的情况下实现深度监控，很有意思，会继续学习这块！

2025 年 12 月
日	一	二	三	四	五	六
	1	2	3	4	5	6
7	8	9	10	11	12	13
14	15	16	17	18	19	20
21	22	23	24	25	26	27
28	29	30	31