1. 框架结构

thermal core:thermal主要的程序,驱动初始化程序,维系thermal zone、governor、cooling device三者的关系。

thermal zone device:创建thermal zone节点和连接thermal sensor,在/sys/class/thermal/目录下的thermal_zone*,通过dtsi文件进行配置生成。thermal sensor是温度传感器(即热敏电阻NTC),主要是给thermal提供温度感知。

thermal governor:温度控制算法,解决温控发生时(即throttle),cooling device如何选择cooling state的问题。

  • step_wise
  • power_allocator
  • user_space
  • bang_bang
  • fair_share

thermal cooling device:系统温控的执行者,实施冷却措施的驱动(cpufreq_cooling、cpuidle_cooling、 devfreq_cooling等)。cooling device根据governor计算出来的state,实施冷却操作,一般情况下,state越高表示系统的冷却需求越高。cooling device需要与trip point进行绑定,当 trip point 触发后,由相应的cooling device 去实施冷却措施。

thermal_framework.png

2. 代码结构

thermal_source_framework.png

2.1 thermal core

thermal_core.c </drivers/thermal/thermal_core.c | 源代码 | v5.10.43> 主要是初始化driver,注册governor,解析dtsi文件,创建thermal zone和初始通信。

2.1.1 thermal结构体定义

thermal.h </include/linux/thermal.h | 源代码 | v5.10.43> ,公开接口给其他驱动程序调用

// thermal_zone_device ops配置
struct thermal_zone_device_ops {
    int (*bind) (struct thermal_zone_device *,
             struct thermal_cooling_device *);
    int (*unbind) (struct thermal_zone_device *,
               struct thermal_cooling_device *);
    int (*get_temp) (struct thermal_zone_device *, int *);
    int (*set_trips) (struct thermal_zone_device *, int, int);
    int (*change_mode) (struct thermal_zone_device *,
        enum thermal_device_mode);
    int (*get_trip_type) (struct thermal_zone_device *, int,
        enum thermal_trip_type *);
    int (*get_trip_temp) (struct thermal_zone_device *, int, int *);
    int (*set_trip_temp) (struct thermal_zone_device *, int, int);
    int (*get_trip_hyst) (struct thermal_zone_device *, int, int *);
    int (*set_trip_hyst) (struct thermal_zone_device *, int, int);
    int (*get_crit_temp) (struct thermal_zone_device *, int *);
    int (*set_emul_temp) (struct thermal_zone_device *, int);
    int (*get_trend) (struct thermal_zone_device *, int,
              enum thermal_trend *);
    int (*notify) (struct thermal_zone_device *, int,
               enum thermal_trip_type);
};

/*
 * Callback table supplied by a cooling device driver.
 * None of these callbacks is invoked directly in the code shown in this
 * document; the per-member notes below are inferred from the member names
 * and signatures only — confirm against the implementing drivers.
 */
struct thermal_cooling_device_ops {
    /* Presumably: report the highest cooling state the device supports. */
    int (*get_max_state) (struct thermal_cooling_device *, unsigned long *);
    /* Presumably: report the cooling state currently applied. */
    int (*get_cur_state) (struct thermal_cooling_device *, unsigned long *);
    /* Presumably: apply the requested cooling state. */
    int (*set_cur_state) (struct thermal_cooling_device *, unsigned long);
    /* Presumably power-model helpers (power request, state <-> power conversion). */
    int (*get_requested_power)(struct thermal_cooling_device *, u32 *);
    int (*state2power)(struct thermal_cooling_device *, unsigned long, u32 *);
    int (*power2state)(struct thermal_cooling_device *, u32, unsigned long *);
};

thermal_core.c </drivers/thermal/thermal_core.c | 源代码 | v5.10.43>

/*
 * Init entry point of the thermal core.
 *
 * Brings up, in order: the generic netlink interface, the governors, the
 * thermal device class (/sys/class/thermal) and the thermal zones described
 * in the device tree. A failure in any of these steps undoes the earlier
 * ones and is returned to the caller. Failing to register the PM notifier
 * is only logged.
 *
 * Returns 0 on success, otherwise the error code of the failing step.
 */
static int __init thermal_init(void)
{
    int ret;

    /* Generic netlink setup. */
    ret = thermal_netlink_init();
    if (ret)
        goto destroy_resources;

    /* Register the thermal governors. */
    ret = thermal_register_governors();
    if (ret)
        goto destroy_resources;

    /* Create the thermal device class. */
    ret = class_register(&thermal_class);
    if (ret)
        goto drop_governors;

    /* Parse the "thermal-zones" device-tree node and register its zones. */
    ret = of_parse_thermal_zones();
    if (ret)
        goto drop_class;

    /* Suspend/resume notifier; a failure here is not fatal. */
    ret = register_pm_notifier(&thermal_pm_nb);
    if (ret)
        pr_warn("Thermal: Can not register suspend notifier, return %d\n",
                ret);

    return 0;

drop_class:
    class_unregister(&thermal_class);
drop_governors:
    thermal_unregister_governors();
destroy_resources:
    ida_destroy(&thermal_tz_ida);
    ida_destroy(&thermal_cdev_ida);
    mutex_destroy(&thermal_list_lock);
    mutex_destroy(&thermal_governor_lock);
    mutex_destroy(&poweroff_lock);
    return ret;
}
/*
 * Register every governor found in the governor table with the thermal
 * core (the governors listed in this document are step_wise,
 * power_allocator, user_space, bang_bang and fair_share; which of them
 * are in the table depends on the build).
 *
 * Registration stops at the first failure; the governors registered before
 * the failing one are then unregistered again so nothing is left half done.
 *
 * Returns 0 on success or the error returned by
 * thermal_register_governor() for the failing entry.
 */
static int __init thermal_register_governors(void)
{
    int ret = 0;
    struct thermal_governor **governor;

    /* Walk the table and register each governor. */
    for_each_governor_table(governor) {
        ret = thermal_register_governor(*governor);
        if (ret) {
            pr_err("Failed to register governor: '%s'",
                   (*governor)->name);
            break;
        }

        pr_info("Registered thermal governor '%s'",
                (*governor)->name);
    }

    if (ret) {
        struct thermal_governor **gov;

        /*
         * Roll back: 'governor' still points at the entry that failed,
         * so everything before it was registered successfully.
         */
        for_each_governor_table(gov) {
            if (gov == governor)
                break;
            thermal_unregister_governor(*gov);
        }
    }

    return ret;
}
/*
 * Add a governor to the global governor list and, if its name matches
 * DEFAULT_THERMAL_GOVERNOR (selected through Kconfig) and no default has
 * been chosen yet, make it the default governor.
 *
 * Afterwards every already registered thermal zone that has no governor
 * yet and whose tzp->governor_name matches (case-insensitively) is
 * switched to this governor. That pass runs even when the governor name
 * was already registered.
 *
 * Returns -EINVAL for a NULL governor, -EBUSY when a governor with the
 * same name already exists, 0 otherwise.
 */
int thermal_register_governor(struct thermal_governor *governor)
{
    struct thermal_zone_device *zone;
    int err = -EBUSY;

    if (!governor)
        return -EINVAL;

    mutex_lock(&thermal_governor_lock);

    if (!__find_governor(governor->name)) {
        err = 0;

        /* New name: publish it on the global governor list. */
        list_add(&governor->governor_list, &thermal_governor_list);

        /* Adopt it as the default if it is the Kconfig-selected one. */
        if (!def_governor &&
            !strncmp(governor->name, DEFAULT_THERMAL_GOVERNOR,
                     THERMAL_NAME_LENGTH))
            def_governor = governor;
    }

    mutex_lock(&thermal_list_lock);

    list_for_each_entry(zone, &thermal_tz_list, node) {
        const char *wanted;
        int ret;

        /*
         * only thermal zones with specified tz->tzp->governor_name
         * may run with tz->governor unset
         */
        if (zone->governor)
            continue;

        wanted = zone->tzp->governor_name;
        if (strncasecmp(wanted, governor->name, THERMAL_NAME_LENGTH))
            continue;

        ret = thermal_set_governor(zone, governor);
        if (ret)
            dev_err(&zone->device,
                    "Failed to set governor %s for thermal zone %s: %d\n",
                    governor->name, zone->type, ret);
    }

    mutex_unlock(&thermal_list_lock);
    mutex_unlock(&thermal_governor_lock);

    return err;
}
/*
 * thermal_zone_device_register() - register a new thermal zone device
 * @type:          zone type string; must be non-empty and shorter than
 *                 THERMAL_NAME_LENGTH
 * @trips:         number of trip points, 0..THERMAL_MAX_TRIPS
 * @mask:          bitmask passed to thermal_zone_create_device_groups();
 *                 must not have bits set at or above @trips
 * @devdata:       driver private data, stored in tz->devdata
 * @ops:           zone callbacks; get_trip_type and get_trip_temp are
 *                 required when @trips > 0
 * @tzp:           zone parameters, may be NULL
 * @passive_delay: stored in tz->passive_delay
 * @polling_delay: stored in tz->polling_delay
 *
 * Allocates the zone, creates its sysfs device ("thermal_zone<id>"),
 * disables trips whose type/temperature cannot be read or whose temperature
 * is 0, selects a governor, optionally adds hwmon sysfs, adds the zone to
 * thermal_tz_list, binds cooling devices and triggers a first update.
 *
 * Return: the new zone on success, ERR_PTR(-EINVAL/-ENOMEM/...) on failure.
 */
struct thermal_zone_device *
thermal_zone_device_register(const char *type, int trips, int mask,
                 void *devdata, struct thermal_zone_device_ops *ops,
                 struct thermal_zone_params *tzp, int passive_delay,
                 int polling_delay)
{
    struct thermal_zone_device *tz;
    enum thermal_trip_type trip_type;
    int trip_temp;
    int id;
    int result;
    int count;
    struct thermal_governor *governor;

    if (!type || strlen(type) == 0) {
        pr_err("Error: No thermal zone type defined\n");
        return ERR_PTR(-EINVAL);
    }

    /* type is known to be non-NULL here. */
    if (strlen(type) >= THERMAL_NAME_LENGTH) {
        pr_err("Error: Thermal zone name (%s) too long, should be under %d chars\n",
               type, THERMAL_NAME_LENGTH);
        return ERR_PTR(-EINVAL);
    }

    if (trips > THERMAL_MAX_TRIPS || trips < 0 || mask >> trips) {
        pr_err("Error: Incorrect number of thermal trips\n");
        return ERR_PTR(-EINVAL);
    }

    if (!ops) {
        pr_err("Error: Thermal zone device ops not defined\n");
        return ERR_PTR(-EINVAL);
    }

    if (trips > 0 && (!ops->get_trip_type || !ops->get_trip_temp))
        return ERR_PTR(-EINVAL);

    tz = kzalloc(sizeof(*tz), GFP_KERNEL);
    if (!tz)
        return ERR_PTR(-ENOMEM);

    /* List of thermal instances (zone <-> cooling device bindings). */
    INIT_LIST_HEAD(&tz->thermal_instances);
    ida_init(&tz->ida);
    mutex_init(&tz->lock);

    /* Allocate a zone id. */
    id = ida_simple_get(&thermal_tz_ida, 0, 0, GFP_KERNEL);
    if (id < 0) {
        result = id;
        goto free_tz;
    }

    tz->id = id;
    strlcpy(tz->type, type, sizeof(tz->type));
    tz->ops = ops;
    tz->tzp = tzp;
    tz->device.class = &thermal_class;
    tz->devdata = devdata;
    tz->trips = trips;
    tz->passive_delay = passive_delay;
    tz->polling_delay = polling_delay;

    /* sys I/F */
    /* Add nodes that are always present via .groups */
    result = thermal_zone_create_device_groups(tz, mask);
    if (result)
        goto remove_id;

    /* A new thermal zone needs to be updated anyway. */
    atomic_set(&tz->need_update, 1);

    /* Name the sysfs node thermal_zone<id>. */
    dev_set_name(&tz->device, "thermal_zone%d", tz->id);
    result = device_register(&tz->device);
    if (result)
        goto release_device;

    for (count = 0; count < trips; count++) {
        if (tz->ops->get_trip_type(tz, count, &trip_type))
            set_bit(count, &tz->trips_disabled);
        /*
         * Disable the trip when its temperature cannot be read or is a
         * bogus 0. trip_temp is only inspected after a successful read,
         * so a failing callback never makes us look at an uninitialized
         * value.
         */
        if (tz->ops->get_trip_temp(tz, count, &trip_temp) ||
            trip_temp == 0)
            set_bit(count, &tz->trips_disabled);
    }

    /* Update 'this' zone's governor information */
    mutex_lock(&thermal_governor_lock);

    /* Use the governor named in tzp if there is a tzp, else the default. */
    if (tz->tzp)
        governor = __find_governor(tz->tzp->governor_name);
    else
        governor = def_governor;

    result = thermal_set_governor(tz, governor);
    if (result) {
        mutex_unlock(&thermal_governor_lock);
        goto unregister;
    }

    mutex_unlock(&thermal_governor_lock);

    if (!tz->tzp || !tz->tzp->no_hwmon) {
        result = thermal_add_hwmon_sysfs(tz);
        if (result)
            goto unregister;
    }

    /* Publish the zone on thermal_tz_list. */
    mutex_lock(&thermal_list_lock);
    list_add_tail(&tz->node, &thermal_tz_list);
    mutex_unlock(&thermal_list_lock);

    /* Bind the cooling devices on thermal_cdev_list to this zone. */
    bind_tz(tz);

    /*
     * Delayed work used for polling: when it fires,
     * thermal_zone_device_check() runs thermal_zone_device_update().
     */
    INIT_DELAYED_WORK(&tz->poll_queue, thermal_zone_device_check);

    /* Reset the zone state (temperature etc.). */
    thermal_zone_device_reset(tz);
    /* Update the new thermal zone and mark it as already updated. */
    if (atomic_cmpxchg(&tz->need_update, 1, 0))
        thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);

    thermal_notify_tz_create(tz->id, tz->type);

    return tz;

unregister:
    device_del(&tz->device);
release_device:
    put_device(&tz->device);
    /* tz is not freed directly on this path: kfree(NULL) below is a no-op. */
    tz = NULL;
remove_id:
    ida_simple_remove(&thermal_tz_ida, id);
free_tz:
    kfree(tz);
    return ERR_PTR(result);
}
/*
 * Bind the cooling devices on thermal_cdev_list to a thermal zone.
 *
 * If the zone provides ops->bind, that callback is offered every cooling
 * device and nothing else is tried. Otherwise the zone's binding parameters
 * (tzp->tbp) are consulted: each still-unbound entry with a match()
 * callback is bound to the first cooling device for which match() returns 0.
 *
 * Does nothing when the zone has neither parameters nor a bind callback.
 */
static void bind_tz(struct thermal_zone_device *tz)
{
    const struct thermal_zone_params *params = tz->tzp;
    struct thermal_cooling_device *cdev = NULL;
    int idx, err;

    if (!params && !tz->ops->bind)
        return;

    mutex_lock(&thermal_list_lock);

    if (tz->ops->bind) {
        /* Driver-provided binding takes precedence. */
        list_for_each_entry(cdev, &thermal_cdev_list, node) {
            err = tz->ops->bind(tz, cdev);
            if (err)
                print_bind_err_msg(tz, cdev, err);
        }
    } else if (params && params->tbp) {
        /* Parameter-table driven binding. */
        list_for_each_entry(cdev, &thermal_cdev_list, node) {
            for (idx = 0; idx < params->num_tbps; idx++) {
                /* Skip entries already bound or without a matcher. */
                if (params->tbp[idx].cdev || !params->tbp[idx].match)
                    continue;
                /* Non-zero from match() means "not this device". */
                if (params->tbp[idx].match(tz, cdev))
                    continue;
                params->tbp[idx].cdev = cdev;
                __bind(tz, params->tbp[idx].trip_mask, cdev,
                       params->tbp[idx].binding_limits,
                       params->tbp[idx].weight);
            }
        }
    }

    mutex_unlock(&thermal_list_lock);
}
/*
 * Work handler of tz->poll_queue: recovers the owning thermal zone from
 * the embedded work item and runs a zone update on it.
 */
static void thermal_zone_device_check(struct work_struct *work)
{
    struct thermal_zone_device *zone;

    /* 'work' is poll_queue.work inside a struct thermal_zone_device. */
    zone = container_of(work, struct thermal_zone_device, poll_queue.work);

    thermal_zone_device_update(zone, THERMAL_EVENT_UNSPECIFIED);
}
/*
 * Refresh a thermal zone: read its temperature, recompute the trip window
 * and run the trip handling for every trip point.
 *
 * Returns early, doing nothing, when polling should be stopped for the
 * zone, while the system is suspending, or when the zone has no get_temp
 * callback.
 */
void thermal_zone_device_update(struct thermal_zone_device *tz,
                enum thermal_notify_event event)
{
    int trip;

    if (should_stop_polling(tz) || atomic_read(&in_suspend) ||
        !tz->ops->get_temp)
        return;

    /* Refresh tz->temperature from the sensor. */
    update_temperature(tz);

    /* Recompute the low/high trip window around the new temperature. */
    thermal_zone_set_trips(tz);

    tz->notify_event = event;

    for (trip = 0; trip < tz->trips; trip++)
        handle_thermal_trip(tz, trip);
}
/*
 * Read the zone temperature and store it in tz->temperature, keeping the
 * previous value in tz->last_temperature.
 *
 * On a read failure the stored temperatures are left untouched; the
 * failure is logged unless it is -EAGAIN.
 */
static void update_temperature(struct thermal_zone_device *tz)
{
    int reading;
    int err;

    err = thermal_zone_get_temp(tz, &reading);
    if (err == -EAGAIN)
        return;
    if (err) {
        dev_warn(&tz->device,
                 "failed to read out thermal zone (%d)\n",
                 err);
        return;
    }

    mutex_lock(&tz->lock);
    tz->last_temperature = tz->temperature;
    tz->temperature = reading;
    mutex_unlock(&tz->lock);

    trace_thermal_temperature(tz);

    thermal_genl_sampling_temp(tz->id, reading);
}
/*
 * Process one trip point of a zone after a temperature update.
 *
 * Disabled trips are ignored. For the others, a trip-up / trip-down
 * notification is sent when the temperature crossed the trip (taking the
 * hysteresis into account on the way down), then the trip is passed to the
 * critical/hot handler or to the governor path, and finally polling is
 * re-armed.
 */
static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)
{
    enum thermal_trip_type type;
    int trip_temp;
    int hyst = 0;

    /* Ignore disabled trip points */
    if (test_bit(trip, &tz->trips_disabled))
        return;

    /* Fetch temperature, type and (optional) hysteresis of the trip. */
    tz->ops->get_trip_temp(tz, trip, &trip_temp);
    tz->ops->get_trip_type(tz, trip, &type);
    if (tz->ops->get_trip_hyst)
        tz->ops->get_trip_hyst(tz, trip, &hyst);

    if (tz->last_temperature != THERMAL_TEMP_INVALID) {
        if (tz->last_temperature < trip_temp) {
            /* Was below the trip: notify when it has been reached. */
            if (tz->temperature >= trip_temp)
                thermal_notify_tz_trip_up(tz->id, trip);
        } else {
            /* Was at/above the trip: notify once below trip - hysteresis. */
            if (tz->temperature < (trip_temp - hyst))
                thermal_notify_tz_trip_down(tz->id, trip);
        }
    }

    if (type == THERMAL_TRIP_CRITICAL || type == THERMAL_TRIP_HOT) {
        /* Critical and hot trips have dedicated handling. */
        handle_critical_trips(tz, trip, type);
    } else {
        /* Everything else goes through the governor's throttle(). */
        handle_non_critical_trips(tz, trip);
    }

    /*
     * Alright, we handled this trip successfully.
     * So, start monitoring again.
     */
    monitor_thermal_zone(tz);
}
/*
 * Handle a trip of type critical or hot.
 *
 * Nothing happens unless the trip temperature is positive and the zone has
 * reached it. Otherwise the event is traced, the driver's notify callback
 * (if any) is called, and for a critical trip an orderly power-off is
 * started once, backed by the emergency power-off.
 */
static void handle_critical_trips(struct thermal_zone_device *tz,
                  int trip, enum thermal_trip_type trip_type)
{
    int threshold;

    tz->ops->get_trip_temp(tz, trip, &threshold);

    /* If we have not crossed the trip_temp, we do not care. */
    if (threshold <= 0 || tz->temperature < threshold)
        return;

    trace_thermal_zone_trip(tz, trip, trip_type);

    if (tz->ops->notify)
        tz->ops->notify(tz, trip, trip_type);

    /* Only a critical trip leads to a shutdown. */
    if (trip_type != THERMAL_TRIP_CRITICAL)
        return;

    dev_emerg(&tz->device,
              "critical temperature reached (%d C), shutting down\n",
              tz->temperature / 1000);

    mutex_lock(&poweroff_lock);
    if (!power_off_triggered) {
        /*
         * Queue a backup emergency shutdown in the event of
         * orderly_poweroff failure
         */
        thermal_emergency_poweroff();
        orderly_poweroff(true);
        power_off_triggered = true;
    }
    mutex_unlock(&poweroff_lock);
}
/*
 * Handle every trip that is neither critical nor hot (the common case):
 * call throttle() of the zone's governor, or of the default governor when
 * the zone has none.
 */
static void handle_non_critical_trips(struct thermal_zone_device *tz, int trip)
{
    if (tz->governor)
        tz->governor->throttle(tz, trip);
    else
        def_governor->throttle(tz, trip);
}
/*
 * Re-arm (or cancel) the polling work of a zone.
 *
 * While polling is allowed, passive_delay is used when tz->passive is set,
 * otherwise polling_delay if it is non-zero. In every other case a delay
 * of 0 is passed on, which cancels polling.
 */
static void monitor_thermal_zone(struct thermal_zone_device *tz)
{
    bool stop;
    int delay = 0;

    stop = should_stop_polling(tz);

    mutex_lock(&tz->lock);

    if (!stop) {
        if (tz->passive)
            delay = tz->passive_delay;
        else if (tz->polling_delay)
            delay = tz->polling_delay;
    }

    thermal_zone_device_set_polling(tz, delay);

    mutex_unlock(&tz->lock);
}
/*
 * (Re)schedule the zone's poll work on
 * system_freezable_power_efficient_wq after 'delay' milliseconds.
 *
 * A delay of 0 cancels the pending work instead. Delays above 1000 ms are
 * additionally passed through round_jiffies().
 */
static void thermal_zone_device_set_polling(struct thermal_zone_device *tz,
                        int delay)
{
    unsigned long timeout;

    if (!delay) {
        /* No polling requested: drop any queued work. */
        cancel_delayed_work(&tz->poll_queue);
        return;
    }

    timeout = msecs_to_jiffies(delay);
    if (delay > 1000)
        timeout = round_jiffies(timeout);

    mod_delayed_work(system_freezable_power_efficient_wq,
                     &tz->poll_queue, timeout);
}

thermal_helpers.c </drivers/thermal/thermal_helpers.c | 源代码 | v5.10.43> 中定义了thermal_zone_get_temp函数


/*
 * Read the current temperature of a thermal zone into *temp.
 *
 * The value comes from the zone's get_temp callback (implemented by the
 * sensor driver). With CONFIG_THERMAL_EMULATION enabled and a non-zero
 * tz->emul_temperature, the emulated value replaces the real one as long
 * as the real temperature is below the critical trip temperature.
 *
 * Returns -EINVAL when tz is NULL, an error pointer, or has no get_temp
 * callback; otherwise the value left in 'ret' by the last callback invoked.
 */
int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
{
    int ret = -EINVAL;
    int count;
    /* Stays INT_MAX when the zone has no critical trip. */
    int crit_temp = INT_MAX;
    enum thermal_trip_type type;

    if (!tz || IS_ERR(tz) || !tz->ops->get_temp)
        goto exit;

    mutex_lock(&tz->lock);

    /* Ask the sensor driver for the current temperature. */
    ret = tz->ops->get_temp(tz, temp);

    /*
     * Temperature emulation (debug aid): look up the first critical trip
     * to learn the critical temperature.
     * NOTE(review): 'ret' is reassigned by the trip callbacks below, so an
     * error returned by get_temp() above can be replaced by a later
     * success — confirm whether that is intended.
     */
    if (IS_ENABLED(CONFIG_THERMAL_EMULATION) && tz->emul_temperature) {
        for (count = 0; count < tz->trips; count++) {
            ret = tz->ops->get_trip_type(tz, count, &type);
            if (!ret && type == THERMAL_TRIP_CRITICAL) {
                ret = tz->ops->get_trip_temp(tz, count,
                        &crit_temp);
                break;
            }
        }

        /*
         * Only allow emulating a temperature when the real temperature
         * is below the critical temperature so that the emulation code
         * cannot hide critical conditions.
         */
        if (!ret && *temp < crit_temp)
            *temp = tz->emul_temperature;
    }

    mutex_unlock(&tz->lock);
exit:
    return ret;
}
/*
 * Compute the temperature window around the current zone temperature and
 * hand it to the driver.
 *
 * The lower bound is the highest (trip temperature - hysteresis) below the
 * current temperature, the upper bound the lowest trip temperature above
 * it. The driver's set_trips callback is only called when the window
 * differs from the previous one. Nothing is done when the zone lacks
 * set_trips or get_trip_hyst.
 */
void thermal_zone_set_trips(struct thermal_zone_device *tz)
{
    int window_low = -INT_MAX;
    int window_high = INT_MAX;
    int trip_temp, hysteresis;
    int idx, err;

    mutex_lock(&tz->lock);

    if (!tz->ops->set_trips || !tz->ops->get_trip_hyst)
        goto unlock;

    for (idx = 0; idx < tz->trips; idx++) {
        int release_temp;

        /* Trigger temperature of this trip. */
        tz->ops->get_trip_temp(tz, idx, &trip_temp);

        /* Hysteresis: how far the temperature must drop to release it. */
        tz->ops->get_trip_hyst(tz, idx, &hysteresis);

        release_temp = trip_temp - hysteresis;

        if (release_temp < tz->temperature && release_temp > window_low)
            window_low = release_temp;

        if (trip_temp > tz->temperature && trip_temp < window_high)
            window_high = trip_temp;
    }

    /* No need to change trip points */
    if (tz->prev_low_trip == window_low &&
        tz->prev_high_trip == window_high)
        goto unlock;

    tz->prev_low_trip = window_low;
    tz->prev_high_trip = window_high;

    dev_dbg(&tz->device,
            "new temperature boundaries: %d < x < %d\n",
            window_low, window_high);

    /*
     * Set a temperature window. When this window is left the driver
     * must inform the thermal core via thermal_zone_device_update.
     */
    err = tz->ops->set_trips(tz, window_low, window_high);
    if (err)
        dev_err(&tz->device, "Failed to set trips: %d\n", err);

unlock:
    mutex_unlock(&tz->lock);
}

2.2 dtsi文件解析

thermal_of.c </drivers/thermal/thermal_of.c | 源代码 | v5.10.43> 解析dtsi文件,主要函数是of_parse_thermal_zones,创建解析生成thermal zone节点.

/*
 * Parse the "thermal-zones" device-tree node and register one
 * thermal_zone_device per available child node.
 *
 * A child that fails to build or register is reported and skipped; the
 * remaining children are still processed. Only an allocation failure for
 * the per-zone ops/params aborts the whole walk, in which case the zones
 * built so far are destroyed.
 *
 * Returns 0 on success (including when no "thermal-zones" node exists),
 * -ENOMEM on allocation failure.
 */
int __init of_parse_thermal_zones(void)
{
    struct device_node *np, *child;
    struct __thermal_zone *tz;
    struct thermal_zone_device_ops *ops;

    /* Locate the "thermal-zones" node. */
    np = of_find_node_by_name(NULL, "thermal-zones");
    if (!np) {
        pr_debug("unable to find thermal zones\n");
        return 0; /* Run successfully on systems without thermal DT */
    }

    for_each_available_child_of_node(np, child) {
        struct thermal_zone_device *zone;
        struct thermal_zone_params *tzp;
        int i, mask = 0;
        u32 prop;

        /* Build the internal description of this zone from its DT node. */
        tz = thermal_of_build_thermal_zone(child);
        if (IS_ERR(tz)) {
            pr_err("failed to build thermal zone %pOFn: %ld\n",
                   child,
                   PTR_ERR(tz));
            continue;
        }

        /* Each zone gets its own copy of of_thermal_ops. */
        ops = kmemdup(&of_thermal_ops, sizeof(*ops), GFP_KERNEL);
        if (!ops)
            goto exit_free;

        tzp = kzalloc(sizeof(*tzp), GFP_KERNEL);
        if (!tzp) {
            kfree(ops);
            goto exit_free;
        }

        /* No hwmon because there might be hwmon drivers registering */
        tzp->no_hwmon = true;

        /* Optional "sustainable-power" property. */
        if (!of_property_read_u32(child, "sustainable-power", &prop))
            tzp->sustainable_power = prop;

        /* One mask bit per trip point of the zone. */
        for (i = 0; i < tz->ntrips; i++)
            mask |= 1 << i;

        /* these two are left for temperature drivers to use */
        tzp->slope = tz->slope;
        tzp->offset = tz->offset;

        /* Register the zone with the thermal core. */
        zone = thermal_zone_device_register(child->name, tz->ntrips,
                            mask, tz,
                            ops, tzp,
                            tz->passive_delay,
                            tz->polling_delay);
        if (IS_ERR(zone)) {
            pr_err("Failed to build %pOFn zone %ld\n", child,
                   PTR_ERR(zone));
            kfree(tzp);
            kfree(ops);
            of_thermal_free_zone(tz);
            /* attempting to build remaining zones still */
        }
    }
    of_node_put(np);

    return 0;

exit_free:
    /* Leaving the loop early: drop the reference on the current child too. */
    of_node_put(child);
    of_node_put(np);
    of_thermal_free_zone(tz);

    /* no memory available, so free what we have built */
    of_thermal_destroy_zones();

    return -ENOMEM;
}
/*
 * Build the internal (struct __thermal_zone) description of one thermal
 * zone from its device-tree node.
 *
 * Reads the mandatory "polling-delay-passive" and "polling-delay"
 * properties, the optional "coefficients" (slope/offset, defaulting to
 * 1/0), then the optional "trips" and "cooling-maps" child nodes.
 *
 * Returns the new zone description, or an ERR_PTR() carrying -EINVAL (NULL
 * node), -ENOMEM, or the error of the failing property/child parser.
 */
static struct __thermal_zone
__init *thermal_of_build_thermal_zone(struct device_node *np)
{
    struct device_node *child = NULL, *gchild;
    struct __thermal_zone *tz;
    int ret, i;
    u32 prop, coef[2];

    if (!np) {
        pr_err("no thermal zone np\n");
        return ERR_PTR(-EINVAL);
    }

    tz = kzalloc(sizeof(*tz), GFP_KERNEL);
    if (!tz)
        return ERR_PTR(-ENOMEM);

    /* Mandatory: polling delay used while passive cooling is active. */
    ret = of_property_read_u32(np, "polling-delay-passive", &prop);
    if (ret < 0) {
        pr_err("%pOFn: missing polling-delay-passive property\n", np);
        goto free_tz;
    }
    tz->passive_delay = prop;

    /* Mandatory: regular polling delay. */
    ret = of_property_read_u32(np, "polling-delay", &prop);
    if (ret < 0) {
        pr_err("%pOFn: missing polling-delay property\n", np);
        goto free_tz;
    }
    tz->polling_delay = prop;

    /*
     * REVISIT: for now, the thermal framework supports only
     * one sensor per thermal zone. Thus, we are considering
     * only the first two values as slope and offset.
     */
    ret = of_property_read_u32_array(np, "coefficients", coef, 2);
    if (ret == 0) {
        tz->slope = coef[0];
        tz->offset = coef[1];
    } else {
        tz->slope = 1;
        tz->offset = 0;
    }

    /* trips */
    child = of_get_child_by_name(np, "trips");

    /* No trips provided */
    if (!child)
        goto finish;

    /* Number of trip nodes below "trips". */
    tz->ntrips = of_get_child_count(child);
    if (tz->ntrips == 0) /* must have at least one child */
        goto finish;

    tz->trips = kcalloc(tz->ntrips, sizeof(*tz->trips), GFP_KERNEL);
    if (!tz->trips) {
        ret = -ENOMEM;
        goto free_tz;
    }

    i = 0;
    for_each_child_of_node(child, gchild) {
        /* Parse one trip node into tz->trips[i]. */
        ret = thermal_of_populate_trip(gchild, &tz->trips[i++]);
        if (ret)
            goto free_trips;
    }

    /* Done with the "trips" node: drop its reference. */
    of_node_put(child);

    /* cooling-maps */
    child = of_get_child_by_name(np, "cooling-maps");

    /* cooling-maps not provided */
    if (!child)
        goto finish;

    tz->num_tbps = of_get_child_count(child);
    if (tz->num_tbps == 0)
        goto finish;

    tz->tbps = kcalloc(tz->num_tbps, sizeof(*tz->tbps), GFP_KERNEL);
    if (!tz->tbps) {
        ret = -ENOMEM;
        goto free_trips;
    }

    i = 0;
    for_each_child_of_node(child, gchild) {
        /* Parse one cooling-map node (trip <-> cooling device binding). */
        ret = thermal_of_populate_bind_params(gchild, &tz->tbps[i++],
                              tz->trips, tz->ntrips);
        if (ret)
            goto free_tbps;
    }

finish:
    of_node_put(child);

    return tz;

free_tbps:
    /*
     * Release the cooling-device references of the binding entries
     * before the one that failed (i was already incremented past it).
     */
    for (i = i - 1; i >= 0; i--) {
        struct __thermal_bind_params *tbp = tz->tbps + i;
        int j;

        for (j = 0; j < tbp->count; j++)
            of_node_put(tbp->tcbp[j].cooling_device);

        kfree(tbp->tcbp);
    }

    kfree(tz->tbps);
free_trips:
    /* Drop the trip node references taken by thermal_of_populate_trip(). */
    for (i = 0; i < tz->ntrips; i++)
        of_node_put(tz->trips[i].np);
    kfree(tz->trips);
    /* Reference on the child being iterated when the loop was left early. */
    of_node_put(gchild);
free_tz:
    kfree(tz);
    of_node_put(child);

    return ERR_PTR(ret);
}
/*
 * Fill one struct thermal_trip from a trip node below "trips".
 *
 * The node must provide "temperature" (trigger temperature), "hysteresis"
 * (how far the temperature must fall to release the trip) and a valid trip
 * type. On success a reference on the node is taken and stored in
 * trip->np, which is later used to match cooling maps.
 *
 * Returns 0 on success or the negative error of the failing lookup.
 */
static int thermal_of_populate_trip(struct device_node *np,
                    struct thermal_trip *trip)
{
    int value;
    int err;

    /* Trigger temperature. */
    err = of_property_read_u32(np, "temperature", &value);
    if (err < 0) {
        pr_err("missing temperature property\n");
        return err;
    }
    trip->temperature = value;

    /* Hysteresis. */
    err = of_property_read_u32(np, "hysteresis", &value);
    if (err < 0) {
        pr_err("missing hysteresis property\n");
        return err;
    }
    trip->hysteresis = value;

    /* Trip type (commonly "passive", i.e. handled by the governor). */
    err = thermal_of_get_trip_type(np, &trip->type);
    if (err < 0) {
        pr_err("wrong trip type property\n");
        return err;
    }

    /* Required for cooling map matching */
    of_node_get(np);
    trip->np = np;

    return 0;
}
/*
 * Fill one binding entry from a child node of "cooling-maps".
 *
 * Reads the optional "contribution" (weight), resolves the "trip" phandle
 * to an index in @trips, and collects every "cooling-device" phandle with
 * its min/max state limits.
 *
 * Returns 0 on success; -ENODEV when the trip phandle is missing or does
 * not match any known trip, -ENOENT when no cooling device is listed,
 * -ENOMEM on allocation failure, or the phandle parsing error.
 */
static int thermal_of_populate_bind_params(struct device_node *np,
                       struct __thermal_bind_params *__tbp,
                       struct thermal_trip *trips,
                       int ntrips)
{
    struct of_phandle_args cooling_spec;
    struct __thermal_cooling_bind_param *__tcbp;
    struct device_node *trip;
    int ret, i, count;
    u32 prop;

    /* Optional "contribution" property: weight of this binding. */
    __tbp->usage = THERMAL_WEIGHT_DEFAULT;
    ret = of_property_read_u32(np, "contribution", &prop);
    if (ret == 0)
        __tbp->usage = prop;

    /* Node referenced by the "trip" phandle. */
    trip = of_parse_phandle(np, "trip", 0);
    if (!trip) {
        pr_err("missing trip property\n");
        return -ENODEV;
    }

    /* Find the referenced trip among the zone's trips. */
    for (i = 0; i < ntrips; i++)
        if (trip == trips[i].np) {
            __tbp->trip_id = i;
            break;
        }

    if (i == ntrips) {
        ret = -ENODEV;
        goto end;
    }

    /* Number of phandles in "cooling-device". */
    count = of_count_phandle_with_args(np, "cooling-device",
                       "#cooling-cells");
    if (count <= 0) {
        pr_err("Add a cooling_device property with at least one device\n");
        ret = -ENOENT;
        goto end;
    }

    __tcbp = kcalloc(count, sizeof(*__tcbp), GFP_KERNEL);
    if (!__tcbp) {
        ret = -ENOMEM;
        goto end;
    }

    for (i = 0; i < count; i++) {
        /* Resolve the i-th cooling-device phandle and its arguments. */
        ret = of_parse_phandle_with_args(np, "cooling-device",
                "#cooling-cells", i, &cooling_spec);
        if (ret < 0) {
            pr_err("Invalid cooling-device entry\n");
            goto free_tcbp;
        }

        __tcbp[i].cooling_device = cooling_spec.np;
        /*
         * The first two arguments are the lowest and highest cooling
         * state allowed for this binding. With fewer arguments the
         * error is only logged and min/max stay at their zeroed values.
         */
        if (cooling_spec.args_count >= 2) { /* at least min and max */
            __tcbp[i].min = cooling_spec.args[0];
            __tcbp[i].max = cooling_spec.args[1];
        } else {
            pr_err("wrong reference to cooling device, missing limits\n");
        }
    }

    __tbp->tcbp = __tcbp;
    __tbp->count = count;

    goto end;

free_tcbp:
    /* Drop the references of the entries parsed before the failure. */
    for (i = i - 1; i >= 0; i--)
        of_node_put(__tcbp[i].cooling_device);
    kfree(__tcbp);
end:
    /* The trip node reference is only needed during parsing. */
    of_node_put(trip);

    return ret;
}

thermal_extra.drawio.svg

2.3 thermal governor

目前可配置默认的thermal governor策略

/*
 * Default Thermal Governor
 *
 * Name of the governor used as the default, selected at build time through
 * the CONFIG_THERMAL_DEFAULT_GOV_* Kconfig choice. thermal_register_governor()
 * compares each governor's name against this string to pick def_governor.
 * The macro is left undefined when none of the four options is set.
 */
#if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE)
#define DEFAULT_THERMAL_GOVERNOR       "step_wise"
#elif defined(CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE)
#define DEFAULT_THERMAL_GOVERNOR       "fair_share"
#elif defined(CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE)
#define DEFAULT_THERMAL_GOVERNOR       "user_space"
#elif defined(CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR)
#define DEFAULT_THERMAL_GOVERNOR       "power_allocator"
#endif

Kconfig中配置默认的thermal governor,缺省值为step_wise

# Build-time switch deciding whether userspace may change trip temperatures.
config THERMAL_WRITABLE_TRIPS
    bool "Enable writable trip points"
    help
      This option allows the system integrator to choose whether
      trip temperatures can be changed from userspace. The
      writable trips need to be specified when setting up the
      thermal zone but the choice here takes precedence.

      Say 'Y' here if you would like to allow userspace tools to
      change trip temperatures.

# Exactly one of the entries below is selected; it determines the value of
# DEFAULT_THERMAL_GOVERNOR. step_wise is the default choice.
choice
    prompt "Default Thermal governor"
    default THERMAL_DEFAULT_GOV_STEP_WISE
    help
      This option sets which thermal governor shall be loaded at
      startup. If in doubt, select 'step_wise'.

# Selecting this also enables the step_wise governor itself.
config THERMAL_DEFAULT_GOV_STEP_WISE
    bool "step_wise"
    select THERMAL_GOV_STEP_WISE
    help
      Use the step_wise governor as default. This throttles the
      devices one step at a time.

# Selecting this also enables the fair_share governor itself.
config THERMAL_DEFAULT_GOV_FAIR_SHARE
    bool "fair_share"
    select THERMAL_GOV_FAIR_SHARE
    help
      Use the fair_share governor as default. This throttles the
      devices based on their 'contribution' to a zone. The
      contribution should be provided through platform data.

# Selecting this also enables the user_space governor itself.
config THERMAL_DEFAULT_GOV_USER_SPACE
    bool "user_space"
    select THERMAL_GOV_USER_SPACE
    help
      Select this if you want to let the user space manage the
      platform thermals.

# Unlike the entries above, this one depends on (rather than selects) its
# governor, so it is only offered when the power_allocator governor is enabled.
config THERMAL_DEFAULT_GOV_POWER_ALLOCATOR
    bool "power_allocator"
    depends on THERMAL_GOV_POWER_ALLOCATOR
    help
      Select this if you want to control temperature based on
      system and device power allocation. This governor can only
      operate on cooling devices that implement the power API.

endchoice

2.3.1 step_wise governor

step_wise governor 是每个轮询周期逐级提高冷却状态,是一种相对温和的温控策略。根据cur_state、温升趋势trend、是否throttle去计算cooling_device的target_state,从而达到控制cooling_device来控制温升。

对于cooling state的计算策略:

  1. 当温升趋势为上升且发生throttle,使用更高一级的cooling state
  2. 当温升趋势为下降:
     • 若发生throttle,不改变cooling state
     • 若解除throttle,使用更低一级的cooling state
  3. 当达到最高温线且发生throttle,使用最高级的cooling state
  4. 当达到最低温线且发生throttle,使用最低级的cooling state

注意: cooling state 取值范围在[instance->lower,instance->upper],若cur_state < instance->lower,target_state则取值为THERMAL_NO_TARGET。

代码框架图

step_wise.drawio.png

thermal.h </include/linux/thermal.h | 源代码 | v5.10.43> 定义了温升趋势trend。

/* Temperature trend of a thermal zone, as consumed by the governors. */
enum thermal_trend {
    THERMAL_TREND_STABLE, /* temperature is stable */
    THERMAL_TREND_RAISING, /* temperature is raising */
    THERMAL_TREND_DROPPING, /* temperature is dropping */
    THERMAL_TREND_RAISE_FULL, /* upper limit reached: apply highest cooling action */
    THERMAL_TREND_DROP_FULL, /* lower limit reached: apply lowest cooling action */
};

gov_step_wise.c </drivers/thermal/gov_step_wise.c | 源代码 | v5.10.43>

static int step_wise_throttle(struct thermal_zone_device *tz, int trip)
{
    struct thermal_instance *instance;

    // 更新trip、trend和计算cooling_device的target_state
    thermal_zone_trip_update(tz, trip);

    if (tz->forced_passive)
        thermal_zone_trip_update(tz, THERMAL_TRIPS_NONE);

    mutex_lock(&tz->lock);

    // 遍历更新cooling_device的state
    list_for_each_entry(instance, &tz->thermal_instances, tz_node)
        thermal_cdev_update(instance->cdev);

    mutex_unlock(&tz->lock);

    return 0;
}
// Refresh trip and trend information for @trip and compute the target_state
// of every cooling-device instance of @tz that is bound to that trip.
static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
{
    int trip_temp;
    enum thermal_trip_type trip_type;
    enum thermal_trend trend;
    struct thermal_instance *instance;
    bool throttle = false;
    int old_target;

    // Fetch the trip's type and temperature; the THERMAL_TRIPS_NONE
    // pseudo-trip takes its temperature from tz->forced_passive instead.
    // NOTE(review): the return values of get_trip_temp/get_trip_type are
    // not checked here.
    if (trip == THERMAL_TRIPS_NONE) {
        trip_temp = tz->forced_passive;
        trip_type = THERMAL_TRIPS_NONE;
    } else {
        tz->ops->get_trip_temp(tz, trip, &trip_temp);
        tz->ops->get_trip_type(tz, trip, &trip_type);
    }

    // Temperature trend: e.g. stable (THERMAL_TREND_STABLE), raising (THERMAL_TREND_RAISING), dropping (THERMAL_TREND_DROPPING)
    trend = get_tz_trend(tz, trip);

    // Throttle once the zone temperature is at or above trip_temp
    if (tz->temperature >= trip_temp) {
        throttle = true;
        trace_thermal_zone_trip(tz, trip, trip_type);
    }

    dev_dbg(&tz->device, "Trip%d[type=%d,temp=%d]:trend=%d,throttle=%d\n",
                trip, trip_type, trip_temp, trend, throttle);

    mutex_lock(&tz->lock);

    list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
        // Only instances bound to this trip are affected
        if (instance->trip != trip)
            continue;

        old_target = instance->target;
        
        // Compute the cooling device's target_state
        instance->target = get_target_state(instance, trend, throttle);
        dev_dbg(&instance->cdev->device, "old_target=%d, target=%d\n",
                    old_target, (int)instance->target);

        // An already-initialized instance whose target did not change needs no update
        if (instance->initialized && old_target == instance->target)
            continue;

        /* Activate a passive thermal instance */
        if (old_target == THERMAL_NO_TARGET &&
            instance->target != THERMAL_NO_TARGET)
            update_passive_instance(tz, trip_type, 1);
        /* Deactivate a passive thermal instance */
        else if (old_target != THERMAL_NO_TARGET &&
            instance->target == THERMAL_NO_TARGET)
            update_passive_instance(tz, trip_type, -1);

        instance->initialized = true;
        // Flag the cooling device so that thermal_cdev_update() re-applies its state
        mutex_lock(&instance->cdev->lock);
        instance->cdev->updated = false; /* cdev needs update */
        mutex_unlock(&instance->cdev->lock);
    }

    mutex_unlock(&tz->lock);
}
/*
 * Compute the next target state of one thermal instance.
 *
 * @instance: thermal instance (cooling device bound to a trip)
 * @trend:    current temperature trend of the zone
 * @throttle: true when the zone temperature is at or above the trip
 *
 * Returns the new target: a state clamped into
 * [instance->lower, instance->upper], THERMAL_NO_TARGET, or the unchanged
 * instance->target when the trend/throttle combination asks for no change.
 */
static unsigned long get_target_state(struct thermal_instance *instance,
                enum thermal_trend trend, bool throttle)
{
    struct thermal_cooling_device *cdev = instance->cdev;
    unsigned long cur_state;
    unsigned long target;

    /* The cooling device's current state is the base for every step. */
    cdev->ops->get_cur_state(cdev, &cur_state);
    /* By default the instance keeps the target it already has. */
    target = instance->target;
    dev_dbg(&cdev->device, "cur_state=%ld\n", cur_state);

    /* First evaluation: start one step above cur_state, clamped. */
    if (!instance->initialized) {
        unsigned long stepped = cur_state + 1;

        if (!throttle)
            return THERMAL_NO_TARGET;
        if (stepped >= instance->upper)
            return instance->upper;
        if (stepped < instance->lower)
            return instance->lower;
        return stepped;
    }

    switch (trend) {
    case THERMAL_TREND_RAISING:
        /* Raising and throttled: one step up, clamped to [lower, upper]. */
        if (!throttle)
            break;
        if (cur_state < instance->upper)
            target = cur_state + 1;
        else
            target = instance->upper;
        if (target < instance->lower)
            target = instance->lower;
        break;

    case THERMAL_TREND_RAISE_FULL:
        /* Upper limit reached while throttled: jump to the highest state. */
        if (throttle)
            target = instance->upper;
        break;

    case THERMAL_TREND_DROPPING:
        /* Dropping but still throttled: leave the target untouched. */
        if (throttle)
            break;
        if (cur_state <= instance->lower) {
            /* Already at or below the floor: release the instance. */
            target = THERMAL_NO_TARGET;
        } else {
            /* One step down, never above upper. */
            target = cur_state - 1;
            if (target > instance->upper)
                target = instance->upper;
        }
        break;

    case THERMAL_TREND_DROP_FULL:
        /*
         * Not at the floor yet: go straight to the lowest state.
         * At the floor: release the instance once throttling stopped.
         */
        if (cur_state != instance->lower)
            target = instance->lower;
        else if (!throttle)
            target = THERMAL_NO_TARGET;
        break;

    default:
        break;
    }

    return target;
}

thermal_cdev_update函数 </drivers/thermal/thermal_helpers.c | 源代码 | v5.10.43>

/*
 * Apply the deepest cooling state requested by any thermal instance of
 * @cdev. Returns early, doing nothing, when @cdev is already marked updated.
 */
void thermal_cdev_update(struct thermal_cooling_device *cdev)
{
    struct thermal_instance *pos;
    unsigned long deepest = 0;

    mutex_lock(&cdev->lock);

    /* Nothing pending for this cooling device. */
    if (cdev->updated) {
        mutex_unlock(&cdev->lock);
        return;
    }

    /* Keep the largest target among the instances that have one. */
    list_for_each_entry(pos, &cdev->thermal_instances, cdev_node) {
        dev_dbg(&cdev->device, "zone%d->target=%lu\n",
            pos->tz->id, pos->target);
        if (pos->target != THERMAL_NO_TARGET && pos->target > deepest)
            deepest = pos->target;
    }

    /* Program the cooling device and mark it as up to date. */
    thermal_cdev_set_cur_state(cdev, deepest);
    cdev->updated = true;

    mutex_unlock(&cdev->lock);

    trace_cdev_update(cdev, deepest);
    dev_dbg(&cdev->device, "set to state %lu\n", deepest);
}

2.3.2 power_allocator governor

IPA(Intelligent Power Allocation)<drivers/thermal/gov_power_allocator.c | 源代码 | v5.10.43> 是由ARM开发的符合linux内核thermal framework的governor,代码中的名字为power_allocator,旨在满足温控效果的条件下最大化性能。IPA模型的核心是利用 PID 控制器:以 thermal zone 的温度作为输入,以当前可分配的总功耗值作为输出,再把这部分功耗分摊给各个 cooling device(power actor),由 cooling device 据此调节自身的频率和电压。

代码框架图

power_allocator.png

功耗均衡原理图

power_allocator_divvy.png

gov_power_allocator.c <drivers/thermal/gov_power_allocator.c | 源代码 | v5.10.43>

/*
 * Throttle handler of the power_allocator governor.
 *
 * Only acts for the trip stored in params->trip_max_desired_temperature.
 * Below the switch-on temperature the PID controller is reset and maximum
 * power is allowed; otherwise power is allocated against the control
 * temperature.
 *
 * Returns 0, the error from get_trip_temp(), or the result of
 * allocate_power().
 */
static int power_allocator_throttle(struct thermal_zone_device *tz, int trip)
{
    struct power_allocator_params *params = tz->governor_data;
    int switch_on_temp, control_temp;
    int err;

    /*
     * We get called for every trip point but we only need to do
     * our calculations once
     */
    if (params->trip_max_desired_temperature != trip)
        return 0;

    /* Temperature of the switch-on trip: below it the governor stays idle. */
    err = tz->ops->get_trip_temp(tz, params->trip_switch_on,
                     &switch_on_temp);
    if (err == 0 && tz->temperature < switch_on_temp) {
        tz->passive = 0;
        reset_pid_controller(params);
        allow_maximum_power(tz);
        return 0;
    }

    tz->passive = 1;

    /* Temperature of the control trip: the value the controller aims at. */
    err = tz->ops->get_trip_temp(tz, params->trip_max_desired_temperature,
                &control_temp);
    if (err != 0) {
        dev_warn(&tz->device,
             "Failed to get the maximum desired temperature: %d\n",
             err);
        return err;
    }

    /* Main IPA step. */
    return allocate_power(tz, control_temp);
}
// Core of the IPA algorithm: gather the power requested by every power actor
// bound to the control trip, run the PID controller to obtain the power budget,
// split that budget among the actors and apply it.
// Returns 0 on success, -ENODEV when no power actor is bound to the trip,
// -ENOMEM when the scratch arrays cannot be allocated.
static int allocate_power(struct thermal_zone_device *tz,
              int control_temp)
{
    struct thermal_instance *instance;
    struct power_allocator_params *params = tz->governor_data;
    u32 *req_power, *max_power, *granted_power, *extra_actor_power;
    u32 *weighted_req_power;
    u32 total_req_power, max_allocatable_power, total_weighted_req_power;
    u32 total_granted_power, power_range;
    int i, num_actors, total_weight, ret = 0;
    int trip_max_desired_temperature = params->trip_max_desired_temperature;

    mutex_lock(&tz->lock);

    // First pass: count the power actors on the control trip and sum their weights
    num_actors = 0;
    total_weight = 0;
    list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
        if ((instance->trip == trip_max_desired_temperature) &&
            cdev_is_power_actor(instance->cdev)) {
            num_actors++;
            total_weight += instance->weight;
        }
    }

    if (!num_actors) {
        ret = -ENODEV;
        goto unlock;
    }

    /*
     * We need to allocate five arrays of the same size:
     * req_power, max_power, granted_power, extra_actor_power and
     * weighted_req_power.  They are going to be needed until this
     * function returns.  Allocate them all in one go to simplify
     * the allocation and deallocation logic.
     */
    BUILD_BUG_ON(sizeof(*req_power) != sizeof(*max_power));
    BUILD_BUG_ON(sizeof(*req_power) != sizeof(*granted_power));
    BUILD_BUG_ON(sizeof(*req_power) != sizeof(*extra_actor_power));
    BUILD_BUG_ON(sizeof(*req_power) != sizeof(*weighted_req_power));
    req_power = kcalloc(num_actors * 5, sizeof(*req_power), GFP_KERNEL);
    if (!req_power) {
        ret = -ENOMEM;
        goto unlock;
    }

    // Carve the single allocation into five arrays of num_actors entries each
    max_power = &req_power[num_actors];
    granted_power = &req_power[2 * num_actors];
    extra_actor_power = &req_power[3 * num_actors];
    weighted_req_power = &req_power[4 * num_actors];

    i = 0;
    total_weighted_req_power = 0;
    total_req_power = 0;
    max_allocatable_power = 0;

    // Second pass: collect requested and maximum power of each power actor.
    // NOTE(review): when get_requested_power() or power_actor_get_max_power()
    // fails, `i` is not advanced here, while the grant loop further down
    // advances `i` for every power actor - the indices may then disagree.
    list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
        int weight;
        struct thermal_cooling_device *cdev = instance->cdev;

        if (instance->trip != trip_max_desired_temperature)
            continue;
        
        // Skip cooling devices that are not power actors
        // (presumably those lacking the get_requested_power/state2power/power2state ops - verify in cdev_is_power_actor)
        if (!cdev_is_power_actor(cdev))
            continue;

        // Power currently requested by this cooling device
        if (cdev->ops->get_requested_power(cdev, &req_power[i]))
            continue;

        // Without any configured weight every actor gets the fixed-point value 1
        if (!total_weight)
            weight = 1 << FRAC_BITS;
        else
            weight = instance->weight;

        // Weighted request: weight * requested power, converted back from fixed point
        weighted_req_power[i] = frac_to_int(weight * req_power[i]);

        // Maximum power this cooling device can consume
        if (power_actor_get_max_power(cdev, &max_power[i]))
            continue;
        
        // Total power requested by the cooling devices
        total_req_power += req_power[i];
        // Total power that could be allocated at most
        max_allocatable_power += max_power[i];
        // Total weighted power requested by the cooling devices
        total_weighted_req_power += weighted_req_power[i];

        i++;
    }

    // PID controller: power_range is the power budget for the current temperature
    power_range = pid_controller(tz, control_temp, max_allocatable_power);

    // Split the budget into each cooling device's final granted power:
    // total granted_power = granted_power + extra_granted_power
    divvy_up_power(weighted_req_power, max_power, num_actors,
               total_weighted_req_power, power_range, granted_power,
               extra_actor_power);

    // Third pass: hand each power actor its share
    total_granted_power = 0;
    i = 0;
    list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
        if (instance->trip != trip_max_desired_temperature)
            continue;

        if (!cdev_is_power_actor(instance->cdev))
            continue;
        
        // Apply granted_power to the cooling device
        power_actor_set_power(instance->cdev, instance,
                      granted_power[i]);
        total_granted_power += granted_power[i];

        i++;
    }

    trace_thermal_power_allocator(tz, req_power, total_req_power,
                      granted_power, total_granted_power,
                      num_actors, power_range,
                      max_allocatable_power, tz->temperature,
                      control_temp - tz->temperature);

    // req_power is the base of the single allocation holding all five arrays
    kfree(req_power);
unlock:
    mutex_unlock(&tz->lock);

    return ret;
}
/*
 * Split @power_range among the actors and compute each one's final grant.
 *
 * @req_power:         per-actor (weighted) requested power
 * @max_power:         per-actor maximum power
 * @num_actors:        number of entries in every array
 * @total_req_power:   sum of @req_power (0 is treated as 1)
 * @power_range:       total power budget for the current temperature
 * @granted_power:     output, power granted to each actor
 * @extra_actor_power: output scratch, headroom of each actor below its max
 *
 * Step 1: granted_power[i] = round(power_range * req_power[i] / total_req_power),
 *         capped at max_power[i]; whatever was cut off is collected in
 *         extra_power.
 * Step 2: the collected extra_power is redistributed in proportion to each
 *         actor's remaining headroom (max_power[i] - granted_power[i]).
 */
static void divvy_up_power(u32 *req_power, u32 *max_power, int num_actors,
               u32 total_req_power, u32 power_range,
               u32 *granted_power, u32 *extra_actor_power)
{
    u32 extra_power, capped_extra_power;
    int i;

    /*
     * Prevent division by 0 if none of the actors request power.
     */
    if (!total_req_power)
        total_req_power = 1;

    capped_extra_power = 0;
    extra_power = 0;
    for (i = 0; i < num_actors; i++) {
        /* 64-bit product so that req_power * power_range cannot wrap */
        u64 req_range = (u64)req_power[i] * power_range;

        granted_power[i] = DIV_ROUND_CLOSEST_ULL(req_range,
                             total_req_power);

        /* An actor can never be granted more than its maximum power. */
        if (granted_power[i] > max_power[i]) {
            /* Collect the over-allocation for redistribution. */
            extra_power += granted_power[i] - max_power[i];
            granted_power[i] = max_power[i];
        }

        /* Headroom of this actor; also its weight in the redistribution. */
        extra_actor_power[i] = max_power[i] - granted_power[i];
        capped_extra_power += extra_actor_power[i];
    }

    if (!extra_power)
        return;

    /*
     * Re-divvy the reclaimed extra among actors based on
     * how far they are from the max
     */
    /* No more than the total headroom can be handed out. */
    extra_power = min(extra_power, capped_extra_power);
    if (capped_extra_power > 0) {
        for (i = 0; i < num_actors; i++) {
            /*
             * Widen before multiplying: both factors are u32 and
             * their product can exceed 32 bits, which previously
             * wrapped and produced a wrong share.
             */
            u64 extra_range = (u64)extra_actor_power[i] * extra_power;

            granted_power[i] += DIV_ROUND_CLOSEST_ULL(extra_range,
                             capped_extra_power);
        }
    }
}
// PID controller: derives the power budget (power_range) for the current
// temperature of @tz relative to @control_temp. The result is clamped to
// [0, max_allocatable_power].
static u32 pid_controller(struct thermal_zone_device *tz,
              int control_temp,
              u32 max_allocatable_power)
{
    s64 p, i, d, power_range;
    s32 err, max_power_frac;
    u32 sustainable_power;
    struct power_allocator_params *params = tz->governor_data;

    max_power_frac = int_to_frac(max_allocatable_power);

    // sustainable_power is the baseline the PID output is added to
    if (tz->tzp->sustainable_power) {
        // Use the configured value when there is one
        sustainable_power = tz->tzp->sustainable_power;
    } else {
        // Otherwise estimate it by summing the minimum power of the cooling devices
        sustainable_power = estimate_sustainable_power(tz);
        // ...and force-recompute the default PID constants (k_po, k_pu, k_i) from that estimate
        estimate_pid_constants(tz, sustainable_power,
                       params->trip_switch_on, control_temp,
                       true);
    }

    // Error: control temperature minus current temperature, in fixed point
    err = control_temp - tz->temperature;
    err = int_to_frac(err);

    /*
     * Proportional term
     * Formula: K_p * err (control temperature minus current temperature)
     * err >= 0 (current <= control) uses k_pu, whose default is
     *   int_to_frac(2*sustainable_power) / (control_temp - switch_on_temp)
     * err < 0 (current > control) uses k_po, whose default is
     *   int_to_frac(sustainable_power) / (control_temp - switch_on_temp)
     */
    p = mul_frac(err < 0 ? tz->tzp->k_po : tz->tzp->k_pu, err);

    /*
     * Integral term
     * Formula: K_i * err_integral (accumulated error)
     * Default: K_i = int_to_frac(10) / 1000
     * if the error is less than cut off allow integration (but
     * the integral is limited to max power)
     */
    i = mul_frac(tz->tzp->k_i, params->err_integral);

    // integral_cutoff defaults to 0, so by default only a negative err
    // (temperature above control_temp) is accumulated into err_integral
    if (err < int_to_frac(tz->tzp->integral_cutoff)) {
        s64 i_next = i + mul_frac(tz->tzp->k_i, err);
        // Accumulate only while |K_i * err_integral| stays below max_power_frac
        if (abs(i_next) < max_power_frac) {
            i = i_next;
            params->err_integral += err;
        }
    }

    /*
     * Derivative term
     * Formula: K_d * (err - prev_err) / passive_delay
     * Default: K_d = 0
     * We do err - prev_err, so with a positive k_d, a decreasing
     * error (i.e. driving closer to the line) results in less
     * power being applied, slowing down the controller)
     */
    d = mul_frac(tz->tzp->k_d, err - params->prev_err);
    d = div_frac(d, tz->passive_delay);
    params->prev_err = err;

    power_range = p + i + d;

    // Power budget for the current temperature = sustainable_power + frac_to_int(p + i + d)
    power_range = sustainable_power + frac_to_int(power_range);

    // Clamp power_range into [0, max_allocatable_power]
    power_range = clamp(power_range, (s64)0, (s64)max_allocatable_power);

    trace_thermal_power_allocator_pid(tz, frac_to_int(err),
                      frac_to_int(params->err_integral),
                      frac_to_int(p), frac_to_int(i),
                      frac_to_int(d), power_range);

    return power_range;
}
/*
 * Estimate the sustainable power of @tz by adding up the minimum power of
 * every cooling device bound to the control trip
 * (params->trip_max_desired_temperature). Devices for which
 * power_actor_get_min_power() fails contribute nothing.
 */
static u32 estimate_sustainable_power(struct thermal_zone_device *tz)
{
    struct power_allocator_params *gov = tz->governor_data;
    struct thermal_instance *pos;
    u32 total = 0;

    list_for_each_entry(pos, &tz->thermal_instances, tz_node) {
        u32 floor_power;

        /* Only query instances that sit on the control trip. */
        if (pos->trip == gov->trip_max_desired_temperature &&
            !power_actor_get_min_power(pos->cdev, &floor_power))
            total += floor_power;
    }

    return total;
}
// Fill in default PID constants (k_po, k_pu, k_i) for @tz.
// A constant is written when it is still 0 or when @force is true.
static void estimate_pid_constants(struct thermal_zone_device *tz,
                   u32 sustainable_power, int trip_switch_on,
                   int control_temp, bool force)
{
    int ret;
    int switch_on_temp;
    u32 temperature_threshold;

    // Temperature of the switch-on trip; falls back to 0 when it cannot be read
    ret = tz->ops->get_trip_temp(tz, trip_switch_on, &switch_on_temp);
    if (ret)
        switch_on_temp = 0;

    // Distance between the control temperature and the switch-on temperature
    temperature_threshold = control_temp - switch_on_temp;
    /*
     * estimate_pid_constants() tries to find appropriate default
     * values for thermal zones that don't provide them. If a
     * system integrator has configured a thermal zone with two
     * passive trip points at the same temperature, that person
     * hasn't put any effort to set up the thermal zone properly
     * so just give up.
     */
    if (!temperature_threshold)
        return;

    // K_p comes in two flavours, k_po and k_pu (see pid_controller for which
    // one applies). Values are kept in fixed point via int_to_frac.
    // k_po = int_to_frac(sustainable_power) / (control_temp - switch_on_temp)
    if (!tz->tzp->k_po || force)
        tz->tzp->k_po = int_to_frac(sustainable_power) /
            temperature_threshold;

    // k_pu = int_to_frac(2 * sustainable_power) / (control_temp - switch_on_temp)
    if (!tz->tzp->k_pu || force)
        tz->tzp->k_pu = int_to_frac(2 * sustainable_power) /
            temperature_threshold;

    // k_i = int_to_frac(10) / 1000
    if (!tz->tzp->k_i || force)
        tz->tzp->k_i = int_to_frac(10) / 1000;
    /*
     * The default for k_d and integral_cutoff is 0, so we can
     * leave them as they are.
     */
}

power_actor_get_max_power </drivers/thermal/thermal_core.c | 函数power_actor_get_max_power | v5.10.43> ,获取cooling device最大功耗值

/*
 * Report the maximum power of @cdev in @max_power.
 *
 * The maximum is obtained by converting cooling state 0 to power.
 * Returns -EINVAL when @cdev is not a power actor, otherwise the result of
 * the device's state2power() callback.
 */
int power_actor_get_max_power(struct thermal_cooling_device *cdev,
                  u32 *max_power)
{
    if (cdev_is_power_actor(cdev))
        return cdev->ops->state2power(cdev, 0, max_power);

    return -EINVAL;
}

例如,cooling device是cpu,冷却措施是调节cpu frequency,cpufreq_cooling.c </drivers/thermal/cpufreq_cooling.c | 函数cpufreq_state2power | v5.10.43>

/*
 * Convert a cpufreq cooling state into the power it corresponds to.
 *
 * @cdev:  cooling device whose devdata is a struct cpufreq_cooling_device
 * @state: cooling state to convert; must not exceed max_level
 * @power: output, power of all CPUs in the policy at that state
 *
 * Returns 0 on success or -EINVAL when @state is out of range.
 */
static int cpufreq_state2power(struct thermal_cooling_device *cdev,
                   unsigned long state, u32 *power)
{
    struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
    unsigned int cpu_count, table_idx, freq_at_state;

    /* Request state should be less than max_level */
    if (state > cpufreq_cdev->max_level)
        return -EINVAL;

    /* Number of CPUs covered by this policy. */
    cpu_count = cpumask_weight(cpufreq_cdev->policy->cpus);

    /* State 0 maps to table index max_level; each further state is one index lower. */
    table_idx = cpufreq_cdev->max_level - state;
    freq_at_state = cpufreq_cdev->em->table[table_idx].frequency;

    /* Power at that frequency, scaled by the number of CPUs. */
    *power = cpu_freq_to_power(cpufreq_cdev, freq_at_state) * cpu_count;

    return 0;
}
/*
 * Report the power currently requested by the CPUs of a cpufreq cooling
 * device.
 *
 * Sums the load of every related CPU (offline CPUs count as 0), stores the
 * sum in last_load and writes the dynamic power at the current frequency to
 * @power. When the thermal_power_cpu_get_power tracepoint is enabled the
 * per-CPU loads are recorded and traced as well. Always returns 0.
 */
static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
                       u32 *power)
{
    struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
    struct cpufreq_policy *policy = cpufreq_cdev->policy;
    unsigned long cur_freq;
    u32 *per_cpu_load = NULL;
    u32 load_sum = 0;
    int slot = 0;
    int cpu;

    /* Frequency the policy is currently running at. */
    cur_freq = cpufreq_quick_get(policy->cpu);

    /* The per-CPU array is only needed for tracing; failure is tolerated. */
    if (trace_thermal_power_cpu_get_power_enabled()) {
        u32 related = cpumask_weight(policy->related_cpus);

        per_cpu_load = kcalloc(related, sizeof(*per_cpu_load), GFP_KERNEL);
    }

    /* Accumulate the load of every related CPU. */
    for_each_cpu(cpu, policy->related_cpus) {
        u32 load = cpu_online(cpu) ?
               get_load(cpufreq_cdev, cpu, slot) : 0;

        load_sum += load;
        if (per_cpu_load)
            per_cpu_load[slot] = load;

        slot++;
    }

    cpufreq_cdev->last_load = load_sum;

    /* Dynamic power at the current frequency (see get_dynamic_power). */
    *power = get_dynamic_power(cpufreq_cdev, cur_freq);

    if (per_cpu_load) {
        trace_thermal_power_cpu_get_power(policy->related_cpus, cur_freq,
                          per_cpu_load, slot, *power);

        kfree(per_cpu_load);
    }

    return 0;
}

2.3.3 bang_bang governor

  • 当throttle发生,打开风扇
  • 当throttle解除,关闭风扇。

2.3.4 user_space governor

user_space governor 是通过 uevent 将温区当前温度,温控触发点等信息上报到用户空间,由用户空间软件制定温控的策略。

2.4 绑定sensor

以bcl_soc为例,这里创建了一个platform_driver(platform_driver必须实现probe和remove函数)。bcl_soc不需要通过polling(轮询)的方式去检查是否触发(polling-delay是轮询的周期),而是监听系统电量的变化:电量变化时回调battery_supply_callback函数,由它把bcl_evaluate_soc加入工作队列;bcl_evaluate_soc读取当前电量百分比(在这个thermal zone中充当"温度"值),当电量不高于触发值时处理对应的trips。

bcl_soc:bcl-soc {
        compatible = "qcom,msm-bcl-soc";
        #thermal-sensor-cells = <0>;
};

bcl_soc.c </drivers/thermal/qcom/bcl_soc.c | 源代码 | android-12.1.0_r0.24>

#define pr_fmt(fmt) "%s:%s " fmt, KBUILD_MODNAME, __func__
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/io.h>
#include <linux/err.h>
#include <linux/of.h>
#include <linux/platform_device.h>
#include <linux/mutex.h>
#include <linux/power_supply.h>
#include <linux/thermal.h>
#include "../thermal_core.h"
#define BCL_DRIVER_NAME       "bcl_soc_peripheral"
/* Driver state of the battery state-of-charge (SOC) sensor. */
struct bcl_device {
    /* notifier registered with the power-supply core (battery_supply_callback) */
    struct notifier_block            psy_nb;
    /* deferred work that runs bcl_evaluate_soc() */
    struct work_struct            soc_eval_work;
    /* low threshold written by bcl_set_soc(); compared against the battery percentage */
    long int                trip_temp;
    /* battery percentage recorded by bcl_evaluate_soc() when the threshold is hit */
    int                    trip_val;
    /* held while trip_temp, trip_val and irq_enabled are updated */
    struct mutex                state_trans_lock;
    /* false while the threshold is INT_MIN, i.e. evaluation is disabled */
    bool                    irq_enabled;
    /* thermal zone this sensor is registered with */
    struct thermal_zone_device        *tz_dev;
    /* sensor callbacks handed to thermal_zone_of_sensor_register() */
    struct thermal_zone_of_device_ops    ops;
};
/* Single driver instance, allocated in bcl_soc_probe(). */
static struct bcl_device *bcl_perph;
/*
 * set_trips callback: store the new low threshold in trip_temp.
 *
 * A threshold of INT_MIN disables evaluation; any other value enables it
 * and schedules an immediate evaluation. @data and @high are unused.
 * Always returns 0.
 */
static int bcl_set_soc(void *data, int low, int high)
{
    /* Threshold unchanged: nothing to do. */
    if (bcl_perph->trip_temp == low)
        return 0;

    mutex_lock(&bcl_perph->state_trans_lock);

    pr_debug("low soc threshold:%d\n", low);
    bcl_perph->trip_temp = low;

    /* INT_MIN means "no threshold": keep the evaluation switched off. */
    bcl_perph->irq_enabled = (low != INT_MIN);
    if (bcl_perph->irq_enabled)
        schedule_work(&bcl_perph->soc_eval_work);

    mutex_unlock(&bcl_perph->state_trans_lock);
    return 0;
}

// 绑定get_temp接口,获取电量值
static int bcl_read_soc(void *data, int *val)
{
    static struct power_supply *batt_psy;
    union power_supply_propval ret = {0,};
    int err = 0;
    *val = 100;
    if (!batt_psy)
        batt_psy = power_supply_get_by_name("battery");
    if (batt_psy) {
        // 获取电量
        err = power_supply_get_property(batt_psy,
                POWER_SUPPLY_PROP_CAPACITY, &ret);
        if (err) {
            pr_err("battery percentage read error:%d\n",
                err);
            return err;
        }
        *val = ret.intval;
    }
    pr_debug("soc:%d\n", *val);
    return err;
}

/*
 * Work handler: read the battery percentage and, when evaluation is enabled
 * and the percentage is at or below trip_temp, record it in trip_val and let
 * the thermal core handle the trip.
 */
static void bcl_evaluate_soc(struct work_struct *work)
{
    int soc_now;
    bool tripped;

    /* Give up silently when the percentage cannot be read. */
    if (bcl_read_soc(NULL, &soc_now) != 0)
        return;

    mutex_lock(&bcl_perph->state_trans_lock);
    tripped = bcl_perph->irq_enabled &&
          !(soc_now > bcl_perph->trip_temp);
    if (tripped)
        bcl_perph->trip_val = soc_now;
    mutex_unlock(&bcl_perph->state_trans_lock);

    /* The trip is handled outside the lock. */
    if (tripped)
        of_thermal_handle_trip(bcl_perph->tz_dev);
}

/*
 * Power-supply notifier: when the reporting supply is named "battery",
 * queue soc_eval_work so that bcl_evaluate_soc() runs. Always returns
 * NOTIFY_OK.
 */
static int battery_supply_callback(struct notifier_block *nb,
            unsigned long event, void *data)
{
    struct power_supply *supply = data;

    /* Events from other supplies are ignored. */
    if (strcmp(supply->desc->name, "battery") == 0)
        schedule_work(&bcl_perph->soc_eval_work);

    return NOTIFY_OK;
}
/*
 * Tear the driver down: unregister the power-supply notifier, wait for
 * pending evaluation work, and unregister the sensor from its thermal zone
 * when one was registered. Always returns 0.
 */
static int bcl_soc_remove(struct platform_device *pdev)
{
    struct thermal_zone_device *tzd;

    /* Stop new events first, then drain work that is already queued. */
    power_supply_unreg_notifier(&bcl_perph->psy_nb);
    flush_work(&bcl_perph->soc_eval_work);

    tzd = bcl_perph->tz_dev;
    if (!tzd)
        return 0;

    thermal_zone_of_sensor_unregister(&pdev->dev, tzd);
    return 0;
}

/*
 * Probe: allocate the driver state, hook the sensor callbacks, register the
 * power-supply notifier and register the sensor with its thermal zone.
 * Returns 0 on success, -ENOMEM, -EPROBE_DEFER when the notifier cannot be
 * registered, or the error from thermal_zone_of_sensor_register().
 */
static int bcl_soc_probe(struct platform_device *pdev)
{
    int ret = 0;
    // Device-managed allocation: released together with the device
    bcl_perph = devm_kzalloc(&pdev->dev, sizeof(*bcl_perph), GFP_KERNEL);
    if (!bcl_perph)
        return -ENOMEM;
    mutex_init(&bcl_perph->state_trans_lock);
    // Hook up the get_temp and set_trips callbacks
    bcl_perph->ops.get_temp = bcl_read_soc;
    bcl_perph->ops.set_trips = bcl_set_soc;
    // Initialize the work item that runs bcl_evaluate_soc
    INIT_WORK(&bcl_perph->soc_eval_work, bcl_evaluate_soc);
    // Notifier callback
    bcl_perph->psy_nb.notifier_call = battery_supply_callback;
    // Register with the power-supply core so that battery_supply_callback is
    // invoked on power-supply notifications
    // NOTE(review): any registration error is reported as -EPROBE_DEFER, and the
    // exit path below calls bcl_soc_remove(), which unregisters the notifier
    // even though registration just failed - confirm this is intended.
    ret = power_supply_reg_notifier(&bcl_perph->psy_nb);
    if (ret < 0) {
        pr_err("soc notifier registration error. defer. err:%d\n",
            ret);
        ret = -EPROBE_DEFER;
        goto bcl_soc_probe_exit;
    }
    // Register this sensor (id 0) with its thermal zone
    bcl_perph->tz_dev = thermal_zone_of_sensor_register(&pdev->dev,
                0, bcl_perph, &bcl_perph->ops);
    if (IS_ERR(bcl_perph->tz_dev)) {
        pr_err("soc TZ register failed. err:%ld\n",
                PTR_ERR(bcl_perph->tz_dev));
        ret = PTR_ERR(bcl_perph->tz_dev);
        // Cleared so that bcl_soc_remove() skips the sensor unregistration
        bcl_perph->tz_dev = NULL;
        goto bcl_soc_probe_exit;
    }
    thermal_zone_device_update(bcl_perph->tz_dev, THERMAL_DEVICE_UP);

    // Queue soc_eval_work on the default workqueue for a first evaluation
    schedule_work(&bcl_perph->soc_eval_work);
    // Store bcl_perph as the device's driver data
    dev_set_drvdata(&pdev->dev, bcl_perph);
    return 0;
bcl_soc_probe_exit:
    // Shared cleanup: reuse the remove path
    bcl_soc_remove(pdev);
    return ret;
}

// Devicetree match table: binds to nodes with compatible = "qcom,msm-bcl-soc"
static const struct of_device_id bcl_match[] = {
    {
        .compatible = "qcom,msm-bcl-soc",
    },
    // sentinel entry terminating the table
    {},
};
// Platform driver description: probe/remove callbacks plus the match table
static struct platform_driver bcl_driver = {
    .probe  = bcl_soc_probe,
    .remove = bcl_soc_remove,
    .driver = {
        .name           = BCL_DRIVER_NAME,
        .owner          = THIS_MODULE,
        .of_match_table = bcl_match,
    },
};
// Registers bcl_driver as a built-in platform driver
builtin_platform_driver(bcl_driver);

提供给sensor driver去调用的API接口 <drivers/thermal/thermal_of.c | 源代码 | v5.10.43>

// Register a sensor with the thermal zone that references it in the devicetree.
// @dev:       sensor device; its of_node identifies the sensor
// @sensor_id: sensor index to match against the zone's sensor specifier
// @data:      private sensor data, stored in the zone (sensor_data)
// @ops:       sensor callbacks
// Returns the thermal zone device, or an ERR_PTR (-ENODEV when no
// "thermal-zones" node, no usable @dev or no matching zone is found).
struct thermal_zone_device *
thermal_zone_of_sensor_register(struct device *dev, int sensor_id, void *data,
                const struct thermal_zone_of_device_ops *ops)
{
    struct device_node *np, *child, *sensor_np;
    struct thermal_zone_device *tzd = ERR_PTR(-ENODEV);

    np = of_find_node_by_name(NULL, "thermal-zones");
    if (!np)
        return ERR_PTR(-ENODEV);

    if (!dev || !dev->of_node) {
        of_node_put(np);
        return ERR_PTR(-ENODEV);
    }

    // Hold a reference on the sensor node for the duration of the search
    sensor_np = of_node_get(dev->of_node);

    for_each_available_child_of_node(np, child) {
        int ret, id;

        // Resolve the sensor id this zone uses for sensor_np; zones that do
        // not reference this sensor are skipped
        ret = thermal_zone_of_get_sensor_id(child, sensor_np, &id);
        if (ret)
            continue;

        if (id == sensor_id) {
            // Bind the sensor to this thermal zone
            tzd = thermal_zone_of_add_sensor(child, sensor_np,
                             data, ops);
            if (!IS_ERR(tzd))
                thermal_zone_device_enable(tzd);

            // Leaving the iterator early: drop the reference it holds on child
            of_node_put(child);
            goto exit;
        }
    }
exit:
    of_node_put(sensor_np);
    of_node_put(np);

    return tzd;
}

/***   sensor API   ***/
// Bind a sensor to the thermal zone named after devicetree node @zone.
// Returns the zone device, ERR_PTR(-EPROBE_DEFER) when the zone cannot be
// looked up by name, or ERR_PTR(-EINVAL) when @ops is NULL.
static struct thermal_zone_device *
thermal_zone_of_add_sensor(struct device_node *zone,
               struct device_node *sensor, void *data,
               const struct thermal_zone_of_device_ops *ops)
{
    struct thermal_zone_device *tzd;
    struct __thermal_zone *tz;

    // Look up the thermal zone device by the node's name
    tzd = thermal_zone_get_zone_by_name(zone->name);
    if (IS_ERR(tzd))
        return ERR_PTR(-EPROBE_DEFER);

    tz = tzd->devdata;

    if (!ops)
        return ERR_PTR(-EINVAL);

    mutex_lock(&tzd->lock);
    // Attach the sensor callbacks
    tz->ops = ops;
    // Attach the sensor's private data
    tz->sensor_data = data;
    // Point the zone's get_temp/get_trend at the of_thermal wrappers
    tzd->ops->get_temp = of_thermal_get_temp;
    tzd->ops->get_trend = of_thermal_get_trend;

    /*
     * The thermal zone core will calculate the window if they have set the
     * optional set_trips pointer.
     */
    if (ops->set_trips)
        tzd->ops->set_trips = of_thermal_set_trips;

    // set_emul_temp is likewise exposed only when the sensor provides it
    if (ops->set_emul_temp)
        tzd->ops->set_emul_temp = of_thermal_set_emul_temp;

    mutex_unlock(&tzd->lock);

    return tzd;
}
文章目录