本文是基于linux kernel 5.15.41

能量模型(EM)<kernel/power/energy_model.c | 源代码 | v5.15.41>框架是驱动程序与内核子系统之间的一种接口:驱动程序了解设备在不同性能层级下消耗的功率,而内核子系统希望利用这些信息做出能量感知决策。EM框架管理着系统中各个设备注册的“性能域”,即频率与功率的映射表;相关的能量感知算法可以通过接口获取对应设备的“性能域”,进行性能成本估算。

em.drawio.png

EM能量模型debug节点:/sys/kernel/debug/energy_model

在当前内核版本中,仅支持CPU device。CPU设备的em_perf_state中power<active_power | 源代码 | v5.15.41>和cost值的计算公式如下:

  • power = capacitance(电容,dtsi配置:dynamic-power-coefficient) * voltage^2 * frequency
  • cost = max_frequency * power / frequency

1. energy_model数据结构

<include/linux/energy_model.h | 源代码 | v5.15.41>

/*
 * One performance state of a device: a frequency, the power drawn at that
 * frequency, and a precomputed cost used when estimating energy.
 */
struct em_perf_state {
    /* Frequency of this state, as returned by the driver's active_power() callback */
    unsigned long frequency;
    /* Power of this state, as returned by the driver's active_power() callback */
    unsigned long power;
    /* Highest table frequency * scaled power / frequency, computed when the table is built */
    unsigned long cost;
};

/*
 * Performance domain of a device: its table of performance states. For CPU
 * devices the object is allocated with extra trailing space that stores the
 * cpumask of the CPUs covered by the domain.
 */
struct em_perf_domain {
    /* Array of nr_perf_states entries, ordered by strictly increasing frequency */
    struct em_perf_state *table;
    /* Number of entries in 'table' */
    int nr_perf_states;
    /*
     * Copy of the 'milliwatts' flag passed at registration time.
     * NOTE(review): presumably tells whether power values are in milliwatts
     * or on an abstract scale - confirm against the header documentation.
     */
    int milliwatts;
    /* Flexible array holding the cpumask of the domain (filled for CPU devices only) */
    unsigned long cpus[];
};

API函数接口

// Return the em_perf_domain attached to the given device
struct em_perf_domain *em_pd_get(struct device *dev);
// Return the em_perf_domain of the CPU device identified by a CPU id
struct em_perf_domain *em_cpu_get(int cpu);

// Register an Energy Model for a device; this interface is meant to be called by device drivers
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
                struct em_data_callback *cb, cpumask_t *span,
                bool milliwatts);
// Unregister the Energy Model of a device; this interface is meant to be called by device drivers
void em_dev_unregister_perf_domain(struct device *dev);

2. em_dev_register_perf_domain

将设备注册到em能量模型,这个接口是提供给驱动设备使用

/*
 * Validate the CPU span of a CPU device before a performance domain is
 * created for it: the mask must be provided, no CPU of the span may already
 * have a performance domain, and every CPU must report the same capacity
 * because they will all share one table.
 *
 * Returns 0 when the span is acceptable, a negative errno otherwise.
 */
static int em_check_cpu_span(struct device *dev, cpumask_t *cpus)
{
    unsigned long capacity, last_capacity = 0;
    int cpu;

    if (!cpus) {
        dev_err(dev, "EM: invalid CPU mask\n");
        return -EINVAL;
    }

    for_each_cpu(cpu, cpus) {
        /* A CPU that already has a performance domain cannot get a second one */
        if (em_cpu_get(cpu)) {
            dev_err(dev, "EM: exists for CPU%d\n", cpu);
            return -EEXIST;
        }

        /* Compare each CPU's capacity with the one seen just before it */
        capacity = arch_scale_cpu_capacity(cpu);
        if (last_capacity && last_capacity != capacity) {
            dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n",
                cpumask_pr_args(cpus));
            return -EINVAL;
        }
        last_capacity = capacity;
    }

    return 0;
}

/*
 * Register the Energy Model (performance domain) of a device.
 *
 * @dev:        device to register; must not be NULL
 * @nr_states:  number of performance states to create; must be non-zero
 * @cb:         driver callbacks used to query each performance state
 * @cpus:       CPUs covered by the domain; required when @dev is a CPU device
 * @milliwatts: flag stored verbatim in the new domain's 'milliwatts' field
 *
 * Returns 0 on success, -EINVAL on invalid arguments or mismatching CPU
 * capacities, -EEXIST when a domain already exists for @dev or one of the
 * CPUs, or the error returned by em_create_pd().
 */
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
                struct em_data_callback *cb, cpumask_t *cpus,
                bool milliwatts)
{
    int ret;

    if (!dev || !nr_states || !cb)
        return -EINVAL;

    /*
     * Registrations are serialized with a mutex, which also lets the
     * driver-defined callbacks sleep.
     */
    mutex_lock(&em_pd_mutex);

    if (dev->em_pd) {
        ret = -EEXIST;
        goto unlock;
    }

    /* Only CPU devices carry a CPU span that needs validating */
    if (_is_cpu_device(dev)) {
        ret = em_check_cpu_span(dev, cpus);
        if (ret)
            goto unlock;
    }

    /* Allocate the domain and build its performance-state table */
    ret = em_create_pd(dev, nr_states, cb, cpus);
    if (!ret) {
        dev->em_pd->milliwatts = milliwatts;

        /* Expose the new domain under /sys/kernel/debug/energy_model */
        em_debug_create_pd(dev);
        dev_info(dev, "EM: created perf domain\n");
    }

unlock:
    mutex_unlock(&em_pd_mutex);
    return ret;
}
EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
/*
 * Allocate a performance domain for a device, build its performance-state
 * table and attach it to the device (and, for a CPU device, to the device of
 * every CPU in @cpus).
 *
 * Returns 0 on success, -ENOMEM when the domain cannot be allocated, or the
 * error returned by em_create_perf_table().
 */
static int em_create_pd(struct device *dev, int nr_states,
            struct em_data_callback *cb, cpumask_t *cpus)
{
    struct em_perf_domain *pd;
    int cpu, ret;

    if (!_is_cpu_device(dev)) {
        /* Non-CPU device: no trailing cpumask is needed */
        pd = kzalloc(sizeof(*pd), GFP_KERNEL);
    } else {
        /* CPU device: reserve room after the struct for the cpumask */
        pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
        if (pd)
            cpumask_copy(em_span_cpus(pd), cpus);
    }
    if (!pd)
        return -ENOMEM;

    /* Fill the frequency/power table and compute the cost of each state */
    ret = em_create_perf_table(dev, pd, nr_states, cb);
    if (ret)
        goto free_pd;

    /* Every CPU of the span points at the same shared domain */
    if (_is_cpu_device(dev)) {
        for_each_cpu(cpu, cpus)
            get_cpu_device(cpu)->em_pd = pd;
    }

    dev->em_pd = pd;

    return 0;

free_pd:
    kfree(pd);
    return ret;
}
// 创建性能映射表
static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
                int nr_states, struct em_data_callback *cb)
{
    unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
    struct em_perf_state *table;
    int i, ret;
    u64 fmax;

    table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
    if (!table)
        return -ENOMEM;

    // 创建功耗和频率映射表
    /* Build the list of performance states for this performance domain */
    for (i = 0, freq = 0; i < nr_states; i++, freq++) {
        /*
         * active_power() is a driver callback which ceils 'freq' to
         * lowest performance state of 'dev' above 'freq' and updates
         * 'power' and 'freq' accordingly.
         */
        // 回调设备的power计算函数,获取功耗和频率
        ret = cb->active_power(&power, &freq, dev);
        if (ret) {
            dev_err(dev, "EM: invalid perf. state: %d\n",
                ret);
            goto free_ps_table;
        }

        /*
         * We expect the driver callback to increase the frequency for
         * higher performance states.
         */
        // 新增的freq必须比上一个freq大,递增
        if (freq <= prev_freq) {
            dev_err(dev, "EM: non-increasing freq: %lu\n",
                freq);
            goto free_ps_table;
        }

        /*
         * The power returned by active_state() is expected to be
         * positive and to fit into 16 bits.
         */
        if (!power || power > EM_MAX_POWER) {
            dev_err(dev, "EM: invalid power: %lu\n",
                power);
            goto free_ps_table;
        }

        table[i].power = power;
        table[i].frequency = prev_freq = freq;
    }

    //计算的cost,cost = max_freq * power / frequency
    /* Compute the cost of each performance state. */
    fmax = (u64) table[nr_states - 1].frequency;
    for (i = nr_states - 1; i >= 0; i--) {
        unsigned long power_res = em_scale_power(table[i].power);

        table[i].cost = div64_u64(fmax * power_res,
                      table[i].frequency);
        if (table[i].cost >= prev_cost) {
            dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
                table[i].frequency);
        } else {
            prev_cost = table[i].cost;
        }
    }

    pd->table = table;
    pd->nr_perf_states = nr_states;

    return 0;

free_ps_table:
    kfree(table);
    return -EINVAL;
}

3. em_pd_get

获取相应device的em_perf_domain结构,主要是给相关能量感知算法去调用,如:EAS、IPA。

/*
 * Return the performance domain attached to @dev, or NULL when @dev is NULL
 * or an error pointer. The returned value is whatever dev->em_pd holds, so
 * it may itself be NULL.
 */
struct em_perf_domain *em_pd_get(struct device *dev)
{
    return IS_ERR_OR_NULL(dev) ? NULL : dev->em_pd;
}

4. em_cpu_get

通过cpu id获取相应的cpu device的em_perf_domain结构,主要是给相关能量感知算法去调用,如:EAS、IPA。

/*
 * Return the performance domain of the CPU device identified by @cpu, or
 * NULL when no device exists for that CPU id.
 */
struct em_perf_domain *em_cpu_get(int cpu)
{
    struct device *dev_of_cpu = get_cpu_device(cpu);

    return dev_of_cpu ? em_pd_get(dev_of_cpu) : NULL;
}
EXPORT_SYMBOL_GPL(em_cpu_get);

5. em_cpu_energy

估算性能域(perf domain)内所有CPU在预测的性能状态下消耗的能量

/*
 * Estimate the energy consumed by the CPUs of a performance domain.
 *
 * @pd:              performance domain to evaluate
 * @max_util:        utilization of the most utilized CPU of the domain
 * @sum_util:        sum of the utilization of all CPUs of the domain
 * @allowed_cpu_cap: upper bound applied to the mapped @max_util, because the
 *                   real frequency might be set lower (thermal capping)
 *
 * Returns ps->cost * sum_util / scale_cpu for the selected performance
 * state, or 0 when @sum_util is 0.
 */
static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
                unsigned long max_util, unsigned long sum_util,
                unsigned long allowed_cpu_cap)
{
    unsigned long target_freq, scale_cpu;
    struct em_perf_state *highest, *ps;
    int idx, first_cpu;

    if (!sum_util)
        return 0;

    /*
     * To predict the performance state, the utilization of the most
     * utilized CPU of the domain is mapped to a requested frequency, like
     * schedutil does. Since the real frequency might be set lower (due to
     * thermal capping), the utilization is clamped to the allowed CPU
     * capacity before the frequency is computed.
     */
    first_cpu = cpumask_first(to_cpumask(pd->cpus));
    /* Capacity of a CPU of the domain, taken from its first CPU */
    scale_cpu = arch_scale_cpu_capacity(first_cpu);
    highest = &pd->table[pd->nr_perf_states - 1];

    max_util = map_util_perf(max_util);
    if (max_util > allowed_cpu_cap)
        max_util = allowed_cpu_cap;
    /* Frequency requested for that utilization, relative to the highest state */
    target_freq = map_util_freq(max_util, highest->frequency, scale_cpu);

    /*
     * Select the lowest performance state whose frequency is at least the
     * requested one; the highest state is used when none qualifies.
     */
    ps = highest;
    for (idx = 0; idx < pd->nr_perf_states; idx++) {
        if (pd->table[idx].frequency >= target_freq) {
            ps = &pd->table[idx];
            break;
        }
    }

    /*
     * The capacity of a CPU of the domain at performance state 'ps' follows
     * from the frequency ratio:
     *
     *             scale_cpu * ps->freq
     *   ps->cap = --------------------                          (1)
     *                 cpu_max_freq
     *
     * Ignoring the cost of idle states (not available in the EM), the
     * energy consumed by one CPU at that state follows from the
     * utilization ratio:
     *
     *             ps->power * cpu_util
     *   cpu_nrg = --------------------                          (2)
     *                   ps->cap
     *
     * because 'cpu_util / ps->cap' is the CPU's share of busy time.
     *
     *   NOTE: the result is in units of power, but it can be handled as an
     *         energy value over a scheduling period, since it is assumed
     *         to be constant during that interval.
     *
     * Substituting (1) into (2) gives a product of two terms:
     *
     *             ps->power * cpu_max_freq   cpu_util
     *   cpu_nrg = ------------------------ * ---------          (3)
     *                    ps->freq            scale_cpu
     *
     * The first term is static and stored in the em_perf_state struct as
     * 'ps->cost'.
     *
     * All CPUs of the domain have the same micro-architecture, so they
     * share 'ps->cost' and the CPU capacity. The energy of the domain (the
     * sum over its CPUs) therefore factorizes as:
     *
     *            ps->cost * \Sum cpu_util
     *   pd_nrg = ------------------------                       (4)
     *                  scale_cpu
     */
    return ps->cost * sum_util / scale_cpu;
}
文章目录