[v3,4/8] cpufreq: amd_pstate: add AMD Pstate EPP support for the MSR based processors

Message ID 20221107175705.2207842-5-Perry.Yuan@amd.com
State New
Headers
Series Implement AMD Pstate EPP Driver |

Commit Message

Yuan, Perry Nov. 7, 2022, 5:57 p.m. UTC
  Add EPP driver support for those AMD CPUs which has full MSR feature
enabled, The EPP is used in the DPM controller to drive the frequency
that a core is going to operate during short periods of activity.

EPP values will be utilized for different OS profiles (balanced, performance,
power savings). cppc performance can be controlled by the user space interface
sys attributes for min and max frequency limits, when pstate driver is
working under power save policy.

EPP scale is 0 - 255, 0 is the max performance and 255 is min level.
balance_performance (0x80) can provide best balance performance and watt for
most of system, meanwhile user can choose performance policy on needs.

$ cat /sys/devices/system/cpu/cpufreq/policy0/energy_performance_available_preferences
default performance balance_performance balance_power power

$ cat /sys/devices/system/cpu/cpufreq/policy0/energy_performance_preference
balance_performance

Signed-off-by: Perry Yuan <Perry.Yuan@amd.com>
---
 drivers/cpufreq/amd-pstate.c | 658 ++++++++++++++++++++++++++++++++++-
 include/linux/amd-pstate.h   |  81 +++++
 2 files changed, 734 insertions(+), 5 deletions(-)
  

Comments

Mario Limonciello Nov. 7, 2022, 8:32 p.m. UTC | #1
On 11/7/2022 11:57, Perry Yuan wrote:
> Add EPP driver support for those AMD CPUs which has full MSR feature
> enabled, The EPP is used in the DPM controller to drive the frequency
> that a core is going to operate during short periods of activity.

To avoid the run on sentence, here is a different wording proposal.

Add EPP driver support for AMD SoCs which support a dedicated MSR for 
CPPC.  EPP is used by the DPM controller to configure the frequency that 
a core operates at during short periods of activity.

> 
> EPP values will be utilized for different OS profiles (balanced, performance,
> power savings). cppc performance can be controlled by the user space interface
> sys attributes for min and max frequency limits, when pstate driver is
> working under power save policy.
> 
> EPP scale is 0 - 255, 0 is the max performance and 255 is min level.
> balance_performance (0x80) can provide best balance performance and watt for
> most of system, meanwhile user can choose performance policy on needs.

As a user reading this message it is confusing that there are values and 
then there are strings, but you don't know the linkage between the two. 
My proposal for rewording this:

The SoC EPP targets are configured on a scale from 0 to 255 where 0 
represents maximum performance and 255 represents maximum efficiency.

The amd-pstate driver exports profile string names to userspace that are 
tied to specific EPP values.

The balance_performance string (0x80) provides the best balance for 
efficiency versus power on most systems, but users can choose other 
strings to meet their needs as well.

> 
> $ cat /sys/devices/system/cpu/cpufreq/policy0/energy_performance_available_preferences
> default performance balance_performance balance_power power
> 
> $ cat /sys/devices/system/cpu/cpufreq/policy0/energy_performance_preference
> balance_performance
> 
> Signed-off-by: Perry Yuan <Perry.Yuan@amd.com>
> ---
>   drivers/cpufreq/amd-pstate.c | 658 ++++++++++++++++++++++++++++++++++-
>   include/linux/amd-pstate.h   |  81 +++++
>   2 files changed, 734 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
> index 14906431dc15..eb82bc6a7f66 100644
> --- a/drivers/cpufreq/amd-pstate.c
> +++ b/drivers/cpufreq/amd-pstate.c
> @@ -60,8 +60,136 @@
>    * module parameter to be able to enable it manually for debugging.
>    */
>   static bool shared_mem __read_mostly;
> +static int cppc_active __read_mostly;
> +static int disable_pstate_load __initdata;
> +static int epp_off __initdata;
>   
> -static struct cpufreq_driver amd_pstate_driver;
> +static struct cpufreq_driver *default_pstate_driver;
> +static struct amd_cpudata **all_cpu_data;
> +
> +static struct amd_pstate_params global_params;
> +
> +static DEFINE_MUTEX(amd_pstate_limits_lock);
> +static DEFINE_MUTEX(amd_pstate_driver_lock);
> +
> +static bool cppc_boost __read_mostly;
> +struct kobject *amd_pstate_kobj;
> +
> +#ifdef CONFIG_ACPI_CPPC_LIB
> +static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached)
> +{
> +	s16 epp;
> +	struct cppc_perf_caps perf_caps;
> +	int ret;
> +
> +	if (boot_cpu_has(X86_FEATURE_CPPC)) {
> +		if (!cppc_req_cached) {
> +			epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
> +					    &cppc_req_cached);
> +			if (epp)
> +				return epp;
> +		}
> +		epp = (cppc_req_cached >> 24) & 0xFF;
> +	} else {
> +		ret = cppc_get_epp_caps(cpudata->cpu, &perf_caps);
> +		if (ret < 0) {
> +			pr_debug("Could not retrieve energy perf value (%d)\n", ret);
> +			return -EIO;
> +		}
> +		epp = (s16) perf_caps.energy_perf;
> +	}
> +
> +	return epp;
> +}
> +#endif
> +
> +static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata, int *raw_epp)
> +{
> +	s16 epp;
> +	int index = -EINVAL;
> +
> +	*raw_epp = 0;
> +	epp = amd_pstate_get_epp(cpudata, 0);
> +	if (epp < 0)
> +		return epp;
> +
> +	switch (epp) {
> +	case AMD_CPPC_EPP_PERFORMANCE:
> +		index = EPP_INDEX_PERFORMANCE;
> +		break;
> +	case AMD_CPPC_EPP_BALANCE_PERFORMANCE:
> +		index = EPP_INDEX_BALANCE_PERFORMANCE;
> +		break;
> +	case AMD_CPPC_EPP_BALANCE_POWERSAVE:
> +		index = EPP_INDEX_BALANCE_POWERSAVE;
> +		break;
> +	case AMD_CPPC_EPP_POWERSAVE:
> +		index = EPP_INDEX_POWERSAVE;
> +		break;
> +	default:
> +		*raw_epp = epp;
> +		index = 0;
> +	}
> +
> +	return index;
> +}
> +
> +#ifdef CONFIG_ACPI_CPPC_LIB
> +static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp)
> +{
> +	int ret;
> +	struct cppc_perf_ctrls perf_ctrls;
> +
> +	if (boot_cpu_has(X86_FEATURE_CPPC)) {
> +		u64 value = READ_ONCE(cpudata->cppc_req_cached);
> +
> +		value &= ~GENMASK_ULL(31, 24);
> +		value |= (u64)epp << 24;
> +		WRITE_ONCE(cpudata->cppc_req_cached, value);
> +
> +		ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
> +		if (!ret)
> +			cpudata->epp_cached = epp;
> +	} else {
> +		perf_ctrls.energy_perf = epp;
> +		ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls);
> +		if (ret) {
> +			pr_debug("failed to set energy perf value (%d)\n", ret);
> +			return ret;
> +		}
> +		cpudata->epp_cached = epp;
> +	}
> +
> +	return ret;
> +}
> +
> +static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata,
> +					      int pref_index, bool use_raw,
> +					      u32 raw_epp)
> +{
> +	int epp = -EINVAL;
> +	int ret;
> +
> +	if (!pref_index) {
> +		pr_debug("EPP pref_index is invalid\n");
> +		return -EINVAL;
> +	}
> +
> +	if (use_raw)
> +		epp = raw_epp;
> +	else if (epp == -EINVAL)
> +		epp = epp_values[pref_index];
> +
> +	if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) {
> +		pr_debug("EPP cannot be set under performance policy\n");
> +		return -EBUSY;
> +	}
> +
> +	ret = amd_pstate_set_epp(cpudata, epp);
> +
> +	return ret;
> +}
> +#endif
>   
>   static inline int pstate_enable(bool enable)
>   {
> @@ -71,11 +199,25 @@ static inline int pstate_enable(bool enable)
>   static int cppc_enable(bool enable)
>   {
>   	int cpu, ret = 0;
> +	struct cppc_perf_ctrls perf_ctrls;
>   
>   	for_each_present_cpu(cpu) {
>   		ret = cppc_set_enable(cpu, enable);
>   		if (ret)
>   			return ret;
> +
> +		/* Enable autonomous mode for EPP */
> +		if (!cppc_active) {
> +			ret = cppc_set_auto_epp(cpu, enable);
> +			if (ret)
> +				return ret;
> +
> +			/* Set desired perf as zero to allow EPP firmware control */
> +			perf_ctrls.desired_perf = 0;
> +			ret = cppc_set_perf(cpu, &perf_ctrls);
> +			if (ret)
> +				return ret;
> +		}
>   	}
>   
>   	return ret;
> @@ -418,7 +560,7 @@ static void amd_pstate_boost_init(struct amd_cpudata *cpudata)
>   		return;
>   
>   	cpudata->boost_supported = true;
> -	amd_pstate_driver.boost_enabled = true;
> +	default_pstate_driver->boost_enabled = true;
>   }
>   
>   static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
> @@ -582,10 +724,74 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy,
>   	return sprintf(&buf[0], "%u\n", perf);
>   }
>   
> +static ssize_t show_energy_performance_available_preferences(
> +				struct cpufreq_policy *policy, char *buf)
> +{
> +	int i = 0;
> +	int ret = 0;
> +
> +	while (energy_perf_strings[i] != NULL)
> +		ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]);
> +
> +	ret += sprintf(&buf[ret], "\n");
> +
> +	return ret;
> +}
> +
> +static ssize_t store_energy_performance_preference(
> +		struct cpufreq_policy *policy, const char *buf, size_t count)
> +{
> +	struct amd_cpudata *cpudata = policy->driver_data;
> +	char str_preference[21];
> +	bool raw = false;
> +	ssize_t ret;
> +	u32 epp = 0;
> +
> +	ret = sscanf(buf, "%20s", str_preference);
> +	if (ret != 1)
> +		return -EINVAL;
> +
> +	ret = match_string(energy_perf_strings, -1, str_preference);
> +	if (ret < 0) {
> +		ret = kstrtouint(buf, 10, &epp);
> +		if (ret)
> +			return ret;
> +
> +		if ((epp > 255) || (epp < 0))
> +			return -EINVAL;
> +
> +		raw = true;
> +	}

What's the reason for supporting putting the raw number in here for 
stuff "in between"?  I think this is going to pretty confusing to 
userspace that you can use string values or integer values.  It also 
means that if userspace writes an integer with a mapping to string and 
tries to read it back they'll get the string rather than the integer!

I can understand using the raw values for internal characterization and 
development to possibly introduce a new mapping string, but I don't 
think that makes sense in the kernel.

> +
> +	mutex_lock(&amd_pstate_limits_lock);
> +	ret = amd_pstate_set_energy_pref_index(cpudata, ret, raw, epp);
> +	mutex_unlock(&amd_pstate_limits_lock);
> +
> +	return ret ?: count;
> +}
> +
> +static ssize_t show_energy_performance_preference(
> +				struct cpufreq_policy *policy, char *buf)
> +{
> +	struct amd_cpudata *cpudata = policy->driver_data;
> +	int preference, raw_epp;
> +
> +	preference = amd_pstate_get_energy_pref_index(cpudata, &raw_epp);
> +	if (preference < 0)
> +		return preference;
> +
> +	if (raw_epp)
> +		return  sprintf(buf, "%d\n", raw_epp);
> +	else
> +		return  sprintf(buf, "%s\n", energy_perf_strings[preference]);
> +}
> +
>   cpufreq_freq_attr_ro(amd_pstate_max_freq);
>   cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq);
>   
>   cpufreq_freq_attr_ro(amd_pstate_highest_perf);
> +cpufreq_freq_attr_rw(energy_performance_preference);
> +cpufreq_freq_attr_ro(energy_performance_available_preferences);
>   
>   static struct freq_attr *amd_pstate_attr[] = {
>   	&amd_pstate_max_freq,
> @@ -594,6 +800,415 @@ static struct freq_attr *amd_pstate_attr[] = {
>   	NULL,
>   };
>   
> +static struct freq_attr *amd_pstate_epp_attr[] = {
> +	&amd_pstate_max_freq,
> +	&amd_pstate_lowest_nonlinear_freq,
> +	&amd_pstate_highest_perf,
> +	&energy_performance_preference,
> +	&energy_performance_available_preferences,
> +	NULL,
> +};
> +
> +static inline void update_boost_state(void)
> +{
> +	u64 misc_en;
> +	struct amd_cpudata *cpudata;
> +
> +	cpudata = all_cpu_data[0];
> +	rdmsrl(MSR_K7_HWCR, misc_en);
> +	global_params.cppc_boost_disabled = misc_en & BIT_ULL(25);
> +}
> +
> +static int amd_pstate_init_cpu(unsigned int cpunum)
> +{
> +	struct amd_cpudata *cpudata;
> +
> +	cpudata = all_cpu_data[cpunum];
> +	if (!cpudata) {
> +		cpudata = kzalloc(sizeof(*cpudata), GFP_KERNEL);
> +		if (!cpudata)
> +			return -ENOMEM;
> +		WRITE_ONCE(all_cpu_data[cpunum], cpudata);
> +
> +		cpudata->cpu = cpunum;
> +	}
> +	cpudata->epp_powersave = -EINVAL;
> +	cpudata->epp_policy = 0;
> +	pr_debug("controlling: cpu %d\n", cpunum);
> +	return 0;
> +}
> +
> +static int __amd_pstate_cpu_init(struct cpufreq_policy *policy)
> +{
> +	int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret;
> +	struct amd_cpudata *cpudata;
> +	struct device *dev;
> +	int rc;
> +	u64 value;
> +
> +	rc = amd_pstate_init_cpu(policy->cpu);
> +	if (rc)
> +		return rc;
> +
> +	cpudata = all_cpu_data[policy->cpu];
> +
> +	dev = get_cpu_device(policy->cpu);
> +	if (!dev)
> +		goto free_cpudata1;
> +
> +	rc = amd_pstate_init_perf(cpudata);
> +	if (rc)
> +		goto free_cpudata1;
> +
> +	min_freq = amd_get_min_freq(cpudata);
> +	max_freq = amd_get_max_freq(cpudata);
> +	nominal_freq = amd_get_nominal_freq(cpudata);
> +	lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata);
> +	if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) {
> +		dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n",
> +				min_freq, max_freq);
> +		ret = -EINVAL;
> +		goto free_cpudata1;
> +	}
> +
> +	policy->min = min_freq;
> +	policy->max = max_freq;
> +
> +	policy->cpuinfo.min_freq = min_freq;
> +	policy->cpuinfo.max_freq = max_freq;
> +	/* It will be updated by governor */
> +	policy->cur = policy->cpuinfo.min_freq;
> +
> +	/* Initial processor data capability frequencies */
> +	cpudata->max_freq = max_freq;
> +	cpudata->min_freq = min_freq;
> +	cpudata->nominal_freq = nominal_freq;
> +	cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq;
> +
> +	policy->driver_data = cpudata;
> +
> +	update_boost_state();
> +	cpudata->epp_cached = amd_pstate_get_epp(cpudata, value);
> +
> +	policy->min = policy->cpuinfo.min_freq;
> +	policy->max = policy->cpuinfo.max_freq;
> +
> +	if (boot_cpu_has(X86_FEATURE_CPPC))
> +		policy->fast_switch_possible = true;
> +
> +	if (!shared_mem && boot_cpu_has(X86_FEATURE_CPPC)) {
> +		ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value);
> +		if (ret)
> +			return ret;
> +		WRITE_ONCE(cpudata->cppc_req_cached, value);
> +
> +		ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &value);
> +		if (ret)
> +			return ret;
> +		WRITE_ONCE(cpudata->cppc_cap1_cached, value);
> +	}
> +	amd_pstate_boost_init(cpudata);
> +
> +	return 0;
> +
> +free_cpudata1:
> +	kfree(cpudata);
> +	return ret;
> +}
> +
> +static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
> +{
> +	int ret;
> +
> +	ret = __amd_pstate_cpu_init(policy);
> +	if (ret)
> +		return ret;
> +	/*
> +	 * Set the policy to powersave to provide a valid fallback value in case
> +	 * the default cpufreq governor is neither powersave nor performance.
> +	 */
> +	policy->policy = CPUFREQ_POLICY_POWERSAVE;
> +
> +	return 0;
> +}
> +
> +static int amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy)
> +{
> +	pr_debug("amd-pstate: CPU %d exiting\n", policy->cpu);

Drop the "amd-pstate:", this file has pr_fmt.

> +	policy->fast_switch_possible = false;
> +	return 0;
> +}
> +
> +static void amd_pstate_update_max_freq(unsigned int cpu)
> +{
> +	struct cpufreq_policy *policy = policy = cpufreq_cpu_get(cpu);
> +
> +	if (!policy)
> +		return;
> +
> +	refresh_frequency_limits(policy);
> +	cpufreq_cpu_put(policy);
> +}
> +
> +static void amd_pstate_epp_update_limits(unsigned int cpu)
> +{
> +	mutex_lock(&amd_pstate_driver_lock);
> +	update_boost_state();
> +	if (global_params.cppc_boost_disabled) {
> +		for_each_possible_cpu(cpu)
> +			amd_pstate_update_max_freq(cpu);
> +	} else {
> +		cpufreq_update_policy(cpu);
> +	}
> +	mutex_unlock(&amd_pstate_driver_lock);
> +}
> +
> +static int cppc_boost_hold_time_ns = 3 * NSEC_PER_MSEC;
> +
> +static inline void amd_pstate_boost_up(struct amd_cpudata *cpudata)
> +{
> +	u64 hwp_req = READ_ONCE(cpudata->cppc_req_cached);
> +	u64 hwp_cap = READ_ONCE(cpudata->cppc_cap1_cached);
> +	u32 max_limit = (hwp_req & 0xff);
> +	u32 min_limit = (hwp_req & 0xff00) >> 8;
> +	u32 boost_level1;
> +
> +	/* If max and min are equal or already at max, nothing to boost */
> +	if (max_limit == min_limit)
> +		return;
> +
> +	/* Set boost max and min to initial value */
> +	if (!cpudata->cppc_boost_min)
> +		cpudata->cppc_boost_min = min_limit;
> +
> +	boost_level1 = ((AMD_CPPC_NOMINAL_PERF(hwp_cap) + min_limit) >> 1);
> +
> +	if (cpudata->cppc_boost_min < boost_level1)
> +		cpudata->cppc_boost_min = boost_level1;
> +	else if (cpudata->cppc_boost_min < AMD_CPPC_NOMINAL_PERF(hwp_cap))
> +		cpudata->cppc_boost_min = AMD_CPPC_NOMINAL_PERF(hwp_cap);
> +	else if (cpudata->cppc_boost_min == AMD_CPPC_NOMINAL_PERF(hwp_cap))
> +		cpudata->cppc_boost_min = max_limit;
> +	else
> +		return;
> +
> +	hwp_req &= ~AMD_CPPC_MIN_PERF(~0L);
> +	hwp_req |= AMD_CPPC_MIN_PERF(cpudata->cppc_boost_min);
> +	wrmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, hwp_req);
> +	cpudata->last_update = cpudata->sample.time;
> +}
> +
> +static inline void amd_pstate_boost_down(struct amd_cpudata *cpudata)
> +{
> +	bool expired;
> +
> +	if (cpudata->cppc_boost_min) {
> +		expired = time_after64(cpudata->sample.time, cpudata->last_update +
> +					cppc_boost_hold_time_ns);
> +
> +		if (expired) {
> +			wrmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
> +						cpudata->cppc_req_cached);
> +			cpudata->cppc_boost_min = 0;
> +		}
> +	}
> +
> +	cpudata->last_update = cpudata->sample.time;
> +}
> +
> +static inline void amd_pstate_boost_update_util(struct amd_cpudata *cpudata,
> +						      u64 time)
> +{
> +	cpudata->sample.time = time;
> +	if (smp_processor_id() != cpudata->cpu)
> +		return;
> +
> +	if (cpudata->sched_flags & SCHED_CPUFREQ_IOWAIT) {
> +		bool do_io = false;
> +
> +		cpudata->sched_flags = 0;
> +		/*
> +		 * Set iowait_boost flag and update time. Since IO WAIT flag
> +		 * is set all the time, we can't just conclude that there is
> +		 * some IO bound activity is scheduled on this CPU with just
> +		 * one occurrence. If we receive at least two in two
> +		 * consecutive ticks, then we treat as boost candidate.
> +		 * This is leveraged from Intel Pstate driver.
> +		 */
> +		if (time_before64(time, cpudata->last_io_update + 2 * TICK_NSEC))
> +			do_io = true;
> +
> +		cpudata->last_io_update = time;
> +
> +		if (do_io)
> +			amd_pstate_boost_up(cpudata);
> +
> +	} else {
> +		amd_pstate_boost_down(cpudata);
> +	}
> +}
> +
> +static inline void amd_pstate_cppc_update_hook(struct update_util_data *data,
> +						u64 time, unsigned int flags)
> +{
> +	struct amd_cpudata *cpudata = container_of(data,
> +				struct amd_cpudata, update_util);
> +
> +	cpudata->sched_flags |= flags;
> +
> +	if (smp_processor_id() == cpudata->cpu)
> +		amd_pstate_boost_update_util(cpudata, time);
> +}
> +
> +static void amd_pstate_clear_update_util_hook(unsigned int cpu)
> +{
> +	struct amd_cpudata *cpudata = all_cpu_data[cpu];
> +
> +	if (!cpudata->update_util_set)
> +		return;
> +
> +	cpufreq_remove_update_util_hook(cpu);
> +	cpudata->update_util_set = false;
> +	synchronize_rcu();
> +}
> +
> +static void amd_pstate_set_update_util_hook(unsigned int cpu_num)
> +{
> +	struct amd_cpudata *cpudata = all_cpu_data[cpu_num];
> +
> +	if (!cppc_boost) {
> +		if (cpudata->update_util_set)
> +			amd_pstate_clear_update_util_hook(cpudata->cpu);
> +		return;
> +	}
> +
> +	if (cpudata->update_util_set)
> +		return;
> +
> +	cpudata->sample.time = 0;
> +	cpufreq_add_update_util_hook(cpu_num, &cpudata->update_util,
> +						amd_pstate_cppc_update_hook);
> +	cpudata->update_util_set = true;
> +}
> +
> +static void amd_pstate_epp_init(unsigned int cpu)
> +{
> +	struct amd_cpudata *cpudata = all_cpu_data[cpu];
> +	u32 max_perf, min_perf;
> +	u64 value;
> +	s16 epp;
> +	int ret;
> +
> +	max_perf = READ_ONCE(cpudata->highest_perf);
> +	min_perf = READ_ONCE(cpudata->lowest_perf);
> +
> +	value = READ_ONCE(cpudata->cppc_req_cached);
> +
> +	if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
> +		min_perf = max_perf;
> +
> +	/* Initial min/max values for CPPC Performance Controls Register */
> +	value &= ~AMD_CPPC_MIN_PERF(~0L);
> +	value |= AMD_CPPC_MIN_PERF(min_perf);
> +
> +	value &= ~AMD_CPPC_MAX_PERF(~0L);
> +	value |= AMD_CPPC_MAX_PERF(max_perf);
> +
> +	/* CPPC EPP feature require to set zero to the desire perf bit */
> +	value &= ~AMD_CPPC_DES_PERF(~0L);
> +	value |= AMD_CPPC_DES_PERF(0);
> +
> +	if (cpudata->epp_policy == cpudata->policy)
> +		goto skip_epp;
> +
> +	cpudata->epp_policy = cpudata->policy;
> +
> +	if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) {
> +		epp = amd_pstate_get_epp(cpudata, value);
> +		cpudata->epp_powersave = epp;
> +		if (epp < 0)
> +			goto skip_epp;
> +		/* force the epp value to be zero for performance policy */
> +		epp = 0;
> +	} else {
> +		if (cpudata->epp_powersave < 0)
> +			goto skip_epp;
> +		/* Get BIOS pre-defined epp value */
> +		epp = amd_pstate_get_epp(cpudata, value);
> +		if (epp)
> +			goto skip_epp;
> +		epp = cpudata->epp_powersave;
> +	}
> +	/* Set initial EPP value */
> +	if (boot_cpu_has(X86_FEATURE_CPPC)) {
> +		value &= ~GENMASK_ULL(31, 24);
> +		value |= (u64)epp << 24;
> +	}
> +
> +skip_epp:
> +	WRITE_ONCE(cpudata->cppc_req_cached, value);
> +	ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
> +	if (!ret)
> +		cpudata->epp_cached = epp;
> +}
> +
> +static void amd_pstate_set_max_limits(struct amd_cpudata *cpudata)
> +{
> +	u64 hwp_cap = READ_ONCE(cpudata->cppc_cap1_cached);
> +	u64 hwp_req = READ_ONCE(cpudata->cppc_req_cached);
> +	u32 max_limit = (hwp_cap >> 24) & 0xff;
> +
> +	hwp_req &= ~AMD_CPPC_MIN_PERF(~0L);
> +	hwp_req |= AMD_CPPC_MIN_PERF(max_limit);
> +	wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, hwp_req);
> +}
> +
> +static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
> +{
> +	struct amd_cpudata *cpudata;
> +
> +	if (!policy->cpuinfo.max_freq)
> +		return -ENODEV;
> +
> +	pr_debug("set_policy: cpuinfo.max %u policy->max %u\n",
> +				policy->cpuinfo.max_freq, policy->max);
> +
> +	cpudata = all_cpu_data[policy->cpu];
> +	cpudata->policy = policy->policy;
> +
> +	if (boot_cpu_has(X86_FEATURE_CPPC)) {
> +		mutex_lock(&amd_pstate_limits_lock);
> +
> +		if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) {
> +			amd_pstate_clear_update_util_hook(policy->cpu);
> +			amd_pstate_set_max_limits(cpudata);
> +		} else {
> +			amd_pstate_set_update_util_hook(policy->cpu);
> +		}
> +
> +		if (boot_cpu_has(X86_FEATURE_CPPC))
> +			amd_pstate_epp_init(policy->cpu);
> +
> +		mutex_unlock(&amd_pstate_limits_lock);
> +	}
> +
> +	return 0;
> +}
> +
> +static void amd_pstate_verify_cpu_policy(struct amd_cpudata *cpudata,
> +					   struct cpufreq_policy_data *policy)
> +{
> +	update_boost_state();
> +	cpufreq_verify_within_cpu_limits(policy);
> +}
> +
> +static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy)
> +{
> +	amd_pstate_verify_cpu_policy(all_cpu_data[policy->cpu], policy);
> +	pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min);
> +	return 0;
> +}
> +
>   static struct cpufreq_driver amd_pstate_driver = {
>   	.flags		= CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS,
>   	.verify		= amd_pstate_verify,
> @@ -607,8 +1222,20 @@ static struct cpufreq_driver amd_pstate_driver = {
>   	.attr		= amd_pstate_attr,
>   };
>   
> +static struct cpufreq_driver amd_pstate_epp_driver = {
> +	.flags		= CPUFREQ_CONST_LOOPS,
> +	.verify		= amd_pstate_epp_verify_policy,
> +	.setpolicy	= amd_pstate_epp_set_policy,
> +	.init		= amd_pstate_epp_cpu_init,
> +	.exit		= amd_pstate_epp_cpu_exit,
> +	.update_limits	= amd_pstate_epp_update_limits,
> +	.name		= "amd_pstate_epp",
> +	.attr		= amd_pstate_epp_attr,
> +};
> +
>   static int __init amd_pstate_init(void)
>   {
> +	static struct amd_cpudata **cpudata;
>   	int ret;
>   
>   	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
> @@ -623,10 +1250,18 @@ static int __init amd_pstate_init(void)
>   	if (cpufreq_get_current_driver())
>   		return -EEXIST;
>   
> +	if (!epp_off) {
> +		WRITE_ONCE(cppc_active, 1);
> +		if (!default_pstate_driver)
> +			default_pstate_driver = &amd_pstate_epp_driver;
> +	}
> +	pr_info("AMD CPPC loading with %s driver instance.\n", default_pstate_driver->name);

This is pretty noisy, do we really need it on every boot if we can 
easily check it from sysfs?

> +
>   	/* capability check */
>   	if (boot_cpu_has(X86_FEATURE_CPPC)) {
> +		if (!cppc_active)
> +			default_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
>   		pr_debug("AMD CPPC MSR based functionality is supported\n");
> -		amd_pstate_driver.adjust_perf = amd_pstate_adjust_perf;
>   	} else if (shared_mem) {
>   		static_call_update(amd_pstate_enable, cppc_enable);
>   		static_call_update(amd_pstate_init_perf, cppc_init_perf);
> @@ -636,6 +1271,10 @@ static int __init amd_pstate_init(void)
>   		return -ENODEV;
>   	}
>   
> +	cpudata = vzalloc(array_size(sizeof(void *), num_possible_cpus()));
> +	if (!cpudata)
> +		return -ENOMEM;
> +	WRITE_ONCE(all_cpu_data, cpudata);
>   	/* enable amd pstate feature */
>   	ret = amd_pstate_enable(true);
>   	if (ret) {
> @@ -643,9 +1282,9 @@ static int __init amd_pstate_init(void)
>   		return ret;
>   	}
>   
> -	ret = cpufreq_register_driver(&amd_pstate_driver);
> +	ret = cpufreq_register_driver(default_pstate_driver);
>   	if (ret)
> -		pr_err("failed to register amd_pstate_driver with return %d\n",
> +		pr_err("failed to register amd pstate driver with return %d\n",
>   		       ret);
>   
>   	return ret;
> @@ -657,6 +1296,15 @@ static int __init amd_pstate_param(char *str)
>   	if (!str)
>   		return -EINVAL;
>   
> +	if (!strcmp(str, "disable"))
> +		disable_pstate_load = 1;
> +	else if (!strcmp(str, "active")) {
> +		default_pstate_driver = &amd_pstate_epp_driver;
> +	} else if (!strcmp(str, "passive")) {
> +		epp_off = 1;
> +		default_pstate_driver = &amd_pstate_driver;
> +	}
> +
>   	/* enable shared memory type CPPC ,if you processor has no MSR, you have to add this
>   	 * to your grub to make cppc driver loaded successfully.
>   	 */
> diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
> index 1c4b8659f171..7e6e8cab97b3 100644
> --- a/include/linux/amd-pstate.h
> +++ b/include/linux/amd-pstate.h
> @@ -25,6 +25,7 @@ struct amd_aperf_mperf {
>   	u64 aperf;
>   	u64 mperf;
>   	u64 tsc;
> +	u64 time;
>   };
>   
>   /**
> @@ -47,6 +48,18 @@ struct amd_aperf_mperf {
>    * @prev: Last Aperf/Mperf/tsc count value read from register
>    * @freq: current cpu frequency value
>    * @boost_supported: check whether the Processor or SBIOS supports boost mode
> + * @epp_powersave: Last saved CPPC energy performance preference
> +				when policy switched to performance
> + * @epp_policy: Last saved policy used to set energy-performance preference
> + * @epp_cached: Cached CPPC energy-performance preference value
> + * @policy: Cpufreq policy value
> + * @sched_flags: Store scheduler flags for possible cross CPU update
> + * @update_util_set: CPUFreq utility callback is set
> + * @last_update: Time stamp of the last performance state update
> + * @cppc_boost_min: Last CPPC boosted min performance state
> + * @cppc_cap1_cached: Cached value of the last CPPC Capabilities MSR
> + * @update_util: Cpufreq utility callback information
> + * @sample: the stored performance sample
>    *
>    * The amd_cpudata is key private data for each CPU thread in AMD P-State, and
>    * represents all the attributes and goals that AMD P-State requests at runtime.
> @@ -72,6 +85,74 @@ struct amd_cpudata {
>   
>   	u64	freq;
>   	bool	boost_supported;
> +
> +	/* EPP feature related attributes*/
> +	s16	epp_powersave;
> +	s16	epp_policy;
> +	s16	epp_cached;
> +	u32	policy;
> +	u32	sched_flags;
> +	bool	update_util_set;
> +	u64	last_update;
> +	u64	last_io_update;
> +	u32	cppc_boost_min;
> +	u64	cppc_cap1_cached;
> +	struct	update_util_data update_util;
> +	struct	amd_aperf_mperf sample;
> +};
> +
> +/**
> + * struct amd_pstate_params - global parameters for the performance control
> + * @ cppc_boost_disabled wheher the core performance boost disabled
> + */
> +struct amd_pstate_params {
> +	bool cppc_boost_disabled;
> +};
> +
> +#define AMD_CPPC_EPP_PERFORMANCE		0x00
> +#define AMD_CPPC_EPP_BALANCE_PERFORMANCE	0x80
> +#define AMD_CPPC_EPP_BALANCE_POWERSAVE		0xBF
> +#define AMD_CPPC_EPP_POWERSAVE			0xFF
> +
> +/*
> + * AMD Energy Preference Performance (EPP)
> + * The EPP is used in the CCLK DPM controller to drive
> + * the frequency that a core is going to operate during
> + * short periods of activity. EPP values will be utilized for
> + * different OS profiles (balanced, performance, power savings)
> + * display strings corresponding to EPP index in the
> + * energy_perf_strings[]
> + *	index		String
> + *-------------------------------------
> + *	0		default
> + *	1		performance
> + *	2		balance_performance
> + *	3		balance_power
> + *	4		power
> + */
> +enum energy_perf_value_index {
> +	EPP_INDEX_DEFAULT = 0,
> +	EPP_INDEX_PERFORMANCE,
> +	EPP_INDEX_BALANCE_PERFORMANCE,
> +	EPP_INDEX_BALANCE_POWERSAVE,
> +	EPP_INDEX_POWERSAVE,
> +};
> +
> +static const char * const energy_perf_strings[] = {
> +	[EPP_INDEX_DEFAULT] = "default",
> +	[EPP_INDEX_PERFORMANCE] = "performance",
> +	[EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance",
> +	[EPP_INDEX_BALANCE_POWERSAVE] = "balance_power",
> +	[EPP_INDEX_POWERSAVE] = "power",
> +	NULL
> +};
> +
> +static unsigned int epp_values[] = {
> +	[EPP_INDEX_DEFAULT] = 0,
> +	[EPP_INDEX_PERFORMANCE] = AMD_CPPC_EPP_PERFORMANCE,
> +	[EPP_INDEX_BALANCE_PERFORMANCE] = AMD_CPPC_EPP_BALANCE_PERFORMANCE,
> +	[EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE,
> +	[EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE,
>   };
>   
>   #endif /* _LINUX_AMD_PSTATE_H */
  
kernel test robot Nov. 8, 2022, 7:21 a.m. UTC | #2
Hi Perry,

I love your patch! Perhaps something to improve:

[auto build test WARNING on rafael-pm/linux-next]
[also build test WARNING on linus/master v6.1-rc4 next-20221107]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Perry-Yuan/Implement-AMD-Pstate-EPP-Driver/20221108-020418
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git linux-next
patch link:    https://lore.kernel.org/r/20221107175705.2207842-5-Perry.Yuan%40amd.com
patch subject: [PATCH v3 4/8] cpufreq: amd_pstate: add AMD Pstate EPP support for the MSR based processors
config: i386-allyesconfig
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/3d16457032d0668bcaa51c537e74f6e1aa0c9a73
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Perry-Yuan/Implement-AMD-Pstate-EPP-Driver/20221108-020418
        git checkout 3d16457032d0668bcaa51c537e74f6e1aa0c9a73
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash drivers/cpufreq/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   drivers/cpufreq/amd-pstate.c: In function 'amd_pstate_get_energy_pref_index':
   drivers/cpufreq/amd-pstate.c:112:15: error: implicit declaration of function 'amd_pstate_get_epp' [-Werror=implicit-function-declaration]
     112 |         epp = amd_pstate_get_epp(cpudata, 0);
         |               ^~~~~~~~~~~~~~~~~~
   drivers/cpufreq/amd-pstate.c: In function 'store_energy_performance_preference':
   drivers/cpufreq/amd-pstate.c:767:15: error: implicit declaration of function 'amd_pstate_set_energy_pref_index'; did you mean 'amd_pstate_get_energy_pref_index'? [-Werror=implicit-function-declaration]
     767 |         ret = amd_pstate_set_energy_pref_index(cpudata, ret, raw, epp);
         |               ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
         |               amd_pstate_get_energy_pref_index
   drivers/cpufreq/amd-pstate.c: In function 'update_boost_state':
>> drivers/cpufreq/amd-pstate.c:815:29: warning: variable 'cpudata' set but not used [-Wunused-but-set-variable]
     815 |         struct amd_cpudata *cpudata;
         |                             ^~~~~~~
   In file included from drivers/cpufreq/amd-pstate.c:39:
   At top level:
   include/linux/amd-pstate.h:150:21: warning: 'epp_values' defined but not used [-Wunused-variable]
     150 | static unsigned int epp_values[] = {
         |                     ^~~~~~~~~~
   cc1: some warnings being treated as errors


vim +/cpudata +815 drivers/cpufreq/amd-pstate.c

   811	
   812	static inline void update_boost_state(void)
   813	{
   814		u64 misc_en;
 > 815		struct amd_cpudata *cpudata;
   816	
   817		cpudata = all_cpu_data[0];
   818		rdmsrl(MSR_K7_HWCR, misc_en);
   819		global_params.cppc_boost_disabled = misc_en & BIT_ULL(25);
   820	}
   821
  
kernel test robot Nov. 9, 2022, 5:55 a.m. UTC | #3
Hi Perry,

I love your patch! Yet something to improve:

[auto build test ERROR on rafael-pm/linux-next]
[also build test ERROR on linus/master v6.1-rc4 next-20221108]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Perry-Yuan/Implement-AMD-Pstate-EPP-Driver/20221108-020418
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git linux-next
patch link:    https://lore.kernel.org/r/20221107175705.2207842-5-Perry.Yuan%40amd.com
patch subject: [PATCH v3 4/8] cpufreq: amd_pstate: add AMD Pstate EPP support for the MSR based processors
config: i386-allyesconfig
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/3d16457032d0668bcaa51c537e74f6e1aa0c9a73
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Perry-Yuan/Implement-AMD-Pstate-EPP-Driver/20221108-020418
        git checkout 3d16457032d0668bcaa51c537e74f6e1aa0c9a73
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash drivers/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   drivers/cpufreq/amd-pstate.c: In function 'amd_pstate_get_energy_pref_index':
>> drivers/cpufreq/amd-pstate.c:112:15: error: implicit declaration of function 'amd_pstate_get_epp' [-Werror=implicit-function-declaration]
     112 |         epp = amd_pstate_get_epp(cpudata, 0);
         |               ^~~~~~~~~~~~~~~~~~
   drivers/cpufreq/amd-pstate.c: In function 'store_energy_performance_preference':
>> drivers/cpufreq/amd-pstate.c:767:15: error: implicit declaration of function 'amd_pstate_set_energy_pref_index'; did you mean 'amd_pstate_get_energy_pref_index'? [-Werror=implicit-function-declaration]
     767 |         ret = amd_pstate_set_energy_pref_index(cpudata, ret, raw, epp);
         |               ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
         |               amd_pstate_get_energy_pref_index
   drivers/cpufreq/amd-pstate.c: In function 'update_boost_state':
   drivers/cpufreq/amd-pstate.c:815:29: warning: variable 'cpudata' set but not used [-Wunused-but-set-variable]
     815 |         struct amd_cpudata *cpudata;
         |                             ^~~~~~~
   In file included from drivers/cpufreq/amd-pstate.c:39:
   At top level:
   include/linux/amd-pstate.h:150:21: warning: 'epp_values' defined but not used [-Wunused-variable]
     150 | static unsigned int epp_values[] = {
         |                     ^~~~~~~~~~
   cc1: some warnings being treated as errors


vim +/amd_pstate_get_epp +112 drivers/cpufreq/amd-pstate.c

   105	
   106	static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata, int *raw_epp)
   107	{
   108		s16 epp;
   109		int index = -EINVAL;
   110	
   111		*raw_epp = 0;
 > 112		epp = amd_pstate_get_epp(cpudata, 0);
   113		if (epp < 0)
   114			return epp;
   115	
   116		switch (epp) {
   117		case AMD_CPPC_EPP_PERFORMANCE:
   118			index = EPP_INDEX_PERFORMANCE;
   119			break;
   120		case AMD_CPPC_EPP_BALANCE_PERFORMANCE:
   121			index = EPP_INDEX_BALANCE_PERFORMANCE;
   122			break;
   123		case AMD_CPPC_EPP_BALANCE_POWERSAVE:
   124			index = EPP_INDEX_BALANCE_POWERSAVE;
   125			break;
   126		case AMD_CPPC_EPP_POWERSAVE:
   127			index = EPP_INDEX_POWERSAVE;
   128			break;
   129		default:
   130			*raw_epp = epp;
   131			index = 0;
   132		}
   133	
   134		return index;
   135	}
   136
  
Nathan Fontenot Nov. 10, 2022, 3:59 p.m. UTC | #4
On 11/7/22 14:32, Limonciello, Mario wrote:
> On 11/7/2022 11:57, Perry Yuan wrote:
>> Add EPP driver support for those AMD CPUs which has full MSR feature
>> enabled, The EPP is used in the DPM controller to drive the frequency
>> that a core is going to operate during short periods of activity.
> 
> To avoid the run on sentence, here is a different wording proposal.
> 
> Add EPP driver support for AMD SoCs which support a dedicated MSR for CPPC.  EPP is used by the DPM controller to configure the frequency that a core operates at during short periods of activity.
> 
>>
>> EPP values will be utilized for different OS profiles (balanced, performance,
>> power savings). cppc performance can be controlled by the user space interface
>> sys attributes for min and max frequency limits, when pstate driver is
>> working under power save policy.
>>
>> EPP scale is 0 - 255, 0 is the max performance and 255 is min level.
>> balance_performance (0x80) can provide best balance performance and watt for
>> most of system, meanwhile user can choose performance policy on needs.
> 
> As a user reading this message it is confusing that there are values and then there are strings, but you don't know the linkage between the two. My proposal for rewording this:
> 
> The SoC EPP targets are configured on a scale from 0 to 255 where 0 represents maximum performance and 255 represents maximum efficiency.
> 
> The amd-pstate driver exports profile string names to userspace that are tied to specific EPP values.
> 
> The balance_performance string (0x80) provides the best balance for efficiency versus power on most systems, but users can choose other strings to meet their needs as well.
> 
>>
>> $ cat /sys/devices/system/cpu/cpufreq/policy0/energy_performance_available_preferences
>> default performance balance_performance balance_power power
>>
>> $ cat /sys/devices/system/cpu/cpufreq/policy0/energy_performance_preference
>> balance_performance
>>
>> Signed-off-by: Perry Yuan <Perry.Yuan@amd.com>
>> ---
>>   drivers/cpufreq/amd-pstate.c | 658 ++++++++++++++++++++++++++++++++++-
>>   include/linux/amd-pstate.h   |  81 +++++
>>   2 files changed, 734 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
>> index 14906431dc15..eb82bc6a7f66 100644
>> --- a/drivers/cpufreq/amd-pstate.c
>> +++ b/drivers/cpufreq/amd-pstate.c
>> @@ -60,8 +60,136 @@
>>    * module parameter to be able to enable it manually for debugging.
>>    */
>>   static bool shared_mem __read_mostly;
>> +static int cppc_active __read_mostly;
>> +static int disable_pstate_load __initdata;
>> +static int epp_off __initdata;
>>   -static struct cpufreq_driver amd_pstate_driver;
>> +static struct cpufreq_driver *default_pstate_driver;
>> +static struct amd_cpudata **all_cpu_data;
>> +
>> +static struct amd_pstate_params global_params;
>> +
>> +static DEFINE_MUTEX(amd_pstate_limits_lock);
>> +static DEFINE_MUTEX(amd_pstate_driver_lock);
>> +
>> +static bool cppc_boost __read_mostly;
>> +struct kobject *amd_pstate_kobj;
>> +
>> +#ifdef CONFIG_ACPI_CPPC_LIB
>> +static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached)
>> +{
>> +    s16 epp;
>> +    struct cppc_perf_caps perf_caps;
>> +    int ret;
>> +
>> +    if (boot_cpu_has(X86_FEATURE_CPPC)) {
>> +        if (!cppc_req_cached) {
>> +            epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
>> +                        &cppc_req_cached);
>> +            if (epp)
>> +                return epp;
>> +        }
>> +        epp = (cppc_req_cached >> 24) & 0xFF;
>> +    } else {
>> +        ret = cppc_get_epp_caps(cpudata->cpu, &perf_caps);
>> +        if (ret < 0) {
>> +            pr_debug("Could not retrieve energy perf value (%d)\n", ret);
>> +            return -EIO;
>> +        }
>> +        epp = (s16) perf_caps.energy_perf;
>> +    }
>> +
>> +    return epp;
>> +}
>> +#endif
>> +
>> +static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata, int *raw_epp)
>> +{
>> +    s16 epp;
>> +    int index = -EINVAL;
>> +
>> +    *raw_epp = 0;
>> +    epp = amd_pstate_get_epp(cpudata, 0);
>> +    if (epp < 0)
>> +        return epp;
>> +
>> +    switch (epp) {
>> +    case AMD_CPPC_EPP_PERFORMANCE:
>> +        index = EPP_INDEX_PERFORMANCE;
>> +        break;
>> +    case AMD_CPPC_EPP_BALANCE_PERFORMANCE:
>> +        index = EPP_INDEX_BALANCE_PERFORMANCE;
>> +        break;
>> +    case AMD_CPPC_EPP_BALANCE_POWERSAVE:
>> +        index = EPP_INDEX_BALANCE_POWERSAVE;
>> +        break;
>> +    case AMD_CPPC_EPP_POWERSAVE:
>> +        index = EPP_INDEX_POWERSAVE;
>> +        break;
>> +    default:
>> +        *raw_epp = epp;
>> +        index = 0;
>> +    }
>> +
>> +    return index;
>> +}
>> +
>> +#ifdef CONFIG_ACPI_CPPC_LIB
>> +static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp)
>> +{
>> +    int ret;
>> +    struct cppc_perf_ctrls perf_ctrls;
>> +
>> +    if (boot_cpu_has(X86_FEATURE_CPPC)) {
>> +        u64 value = READ_ONCE(cpudata->cppc_req_cached);
>> +
>> +        value &= ~GENMASK_ULL(31, 24);
>> +        value |= (u64)epp << 24;
>> +        WRITE_ONCE(cpudata->cppc_req_cached, value);
>> +
>> +        ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
>> +        if (!ret)
>> +            cpudata->epp_cached = epp;
>> +    } else {
>> +        perf_ctrls.energy_perf = epp;
>> +        ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls);
>> +        if (ret) {
>> +            pr_debug("failed to set energy perf value (%d)\n", ret);
>> +            return ret;
>> +        }
>> +        cpudata->epp_cached = epp;
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata,
>> +                          int pref_index, bool use_raw,
>> +                          u32 raw_epp)
>> +{
>> +    int epp = -EINVAL;
>> +    int ret;
>> +
>> +    if (!pref_index) {
>> +        pr_debug("EPP pref_index is invalid\n");
>> +        return -EINVAL;
>> +    }
>> +
>> +    if (use_raw)
>> +        epp = raw_epp;
>> +    else if (epp == -EINVAL)
>> +        epp = epp_values[pref_index];
>> +
>> +    if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) {
>> +        pr_debug("EPP cannot be set under performance policy\n");
>> +        return -EBUSY;
>> +    }
>> +
>> +    ret = amd_pstate_set_epp(cpudata, epp);
>> +
>> +    return ret;
>> +}
>> +#endif
>>     static inline int pstate_enable(bool enable)
>>   {
>> @@ -71,11 +199,25 @@ static inline int pstate_enable(bool enable)
>>   static int cppc_enable(bool enable)
>>   {
>>       int cpu, ret = 0;
>> +    struct cppc_perf_ctrls perf_ctrls;
>>         for_each_present_cpu(cpu) {
>>           ret = cppc_set_enable(cpu, enable);
>>           if (ret)
>>               return ret;
>> +
>> +        /* Enable autonomous mode for EPP */
>> +        if (!cppc_active) {
>> +            ret = cppc_set_auto_epp(cpu, enable);
>> +            if (ret)
>> +                return ret;
>> +
>> +            /* Set desired perf as zero to allow EPP firmware control */
>> +            perf_ctrls.desired_perf = 0;
>> +            ret = cppc_set_perf(cpu, &perf_ctrls);
>> +            if (ret)
>> +                return ret;
>> +        }
>>       }
>>         return ret;
>> @@ -418,7 +560,7 @@ static void amd_pstate_boost_init(struct amd_cpudata *cpudata)
>>           return;
>>         cpudata->boost_supported = true;
>> -    amd_pstate_driver.boost_enabled = true;
>> +    default_pstate_driver->boost_enabled = true;
>>   }
>>     static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
>> @@ -582,10 +724,74 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy,
>>       return sprintf(&buf[0], "%u\n", perf);
>>   }
>>   +static ssize_t show_energy_performance_available_preferences(
>> +                struct cpufreq_policy *policy, char *buf)
>> +{
>> +    int i = 0;
>> +    int ret = 0;
>> +
>> +    while (energy_perf_strings[i] != NULL)
>> +        ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]);
>> +
>> +    ret += sprintf(&buf[ret], "\n");
>> +
>> +    return ret;
>> +}
>> +
>> +static ssize_t store_energy_performance_preference(
>> +        struct cpufreq_policy *policy, const char *buf, size_t count)
>> +{
>> +    struct amd_cpudata *cpudata = policy->driver_data;
>> +    char str_preference[21];
>> +    bool raw = false;
>> +    ssize_t ret;
>> +    u32 epp = 0;
>> +
>> +    ret = sscanf(buf, "%20s", str_preference);
>> +    if (ret != 1)
>> +        return -EINVAL;
>> +
>> +    ret = match_string(energy_perf_strings, -1, str_preference);
>> +    if (ret < 0) {
>> +        ret = kstrtouint(buf, 10, &epp);
>> +        if (ret)
>> +            return ret;
>> +
>> +        if ((epp > 255) || (epp < 0))
>> +            return -EINVAL;
>> +
>> +        raw = true;
>> +    }
> 
> What's the reason for supporting putting the raw number in here for stuff "in between"?  I think this is going to pretty confusing to userspace that you can use string values or integer values.  It also means that if userspace writes an integer with a mapping to string and tries to read it back they'll get the string rather than the integer!
> 
> I can understand using the raw values for internal characterization and development to possibly introduce a new mapping string, but I don't think that makes sense in the kernel.
> 

This is really doing what Intel does for handling EPP settings. Yes, writing a value and getting back a string
could be a bit confusing bit it is already done from the Intel side. I think keeping EPP value setting common
would be a good thing if we can do it.

I don't think we should remove the ability to set raw values, we're allowed a range of 0 - 255 for the EPP
setting. Why we then limit ourselves to only 4 or so values?

-Nathan

>> +
>> +    mutex_lock(&amd_pstate_limits_lock);
>> +    ret = amd_pstate_set_energy_pref_index(cpudata, ret, raw, epp);
>> +    mutex_unlock(&amd_pstate_limits_lock);
>> +
>> +    return ret ?: count;
>> +}
>> +
>> +static ssize_t show_energy_performance_preference(
>> +                struct cpufreq_policy *policy, char *buf)
>> +{
>> +    struct amd_cpudata *cpudata = policy->driver_data;
>> +    int preference, raw_epp;
>> +
>> +    preference = amd_pstate_get_energy_pref_index(cpudata, &raw_epp);
>> +    if (preference < 0)
>> +        return preference;
>> +
>> +    if (raw_epp)
>> +        return  sprintf(buf, "%d\n", raw_epp);
>> +    else
>> +        return  sprintf(buf, "%s\n", energy_perf_strings[preference]);
>> +}
>> +
>>   cpufreq_freq_attr_ro(amd_pstate_max_freq);
>>   cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq);
>>     cpufreq_freq_attr_ro(amd_pstate_highest_perf);
>> +cpufreq_freq_attr_rw(energy_performance_preference);
>> +cpufreq_freq_attr_ro(energy_performance_available_preferences);
>>     static struct freq_attr *amd_pstate_attr[] = {
>>       &amd_pstate_max_freq,
>> @@ -594,6 +800,415 @@ static struct freq_attr *amd_pstate_attr[] = {
>>       NULL,
>>   };
>>   +static struct freq_attr *amd_pstate_epp_attr[] = {
>> +    &amd_pstate_max_freq,
>> +    &amd_pstate_lowest_nonlinear_freq,
>> +    &amd_pstate_highest_perf,
>> +    &energy_performance_preference,
>> +    &energy_performance_available_preferences,
>> +    NULL,
>> +};
>> +
>> +static inline void update_boost_state(void)
>> +{
>> +    u64 misc_en;
>> +    struct amd_cpudata *cpudata;
>> +
>> +    cpudata = all_cpu_data[0];
>> +    rdmsrl(MSR_K7_HWCR, misc_en);
>> +    global_params.cppc_boost_disabled = misc_en & BIT_ULL(25);
>> +}
>> +
>> +static int amd_pstate_init_cpu(unsigned int cpunum)
>> +{
>> +    struct amd_cpudata *cpudata;
>> +
>> +    cpudata = all_cpu_data[cpunum];
>> +    if (!cpudata) {
>> +        cpudata = kzalloc(sizeof(*cpudata), GFP_KERNEL);
>> +        if (!cpudata)
>> +            return -ENOMEM;
>> +        WRITE_ONCE(all_cpu_data[cpunum], cpudata);
>> +
>> +        cpudata->cpu = cpunum;
>> +    }
>> +    cpudata->epp_powersave = -EINVAL;
>> +    cpudata->epp_policy = 0;
>> +    pr_debug("controlling: cpu %d\n", cpunum);
>> +    return 0;
>> +}
>> +
>> +static int __amd_pstate_cpu_init(struct cpufreq_policy *policy)
>> +{
>> +    int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret;
>> +    struct amd_cpudata *cpudata;
>> +    struct device *dev;
>> +    int rc;
>> +    u64 value;
>> +
>> +    rc = amd_pstate_init_cpu(policy->cpu);
>> +    if (rc)
>> +        return rc;
>> +
>> +    cpudata = all_cpu_data[policy->cpu];
>> +
>> +    dev = get_cpu_device(policy->cpu);
>> +    if (!dev)
>> +        goto free_cpudata1;
>> +
>> +    rc = amd_pstate_init_perf(cpudata);
>> +    if (rc)
>> +        goto free_cpudata1;
>> +
>> +    min_freq = amd_get_min_freq(cpudata);
>> +    max_freq = amd_get_max_freq(cpudata);
>> +    nominal_freq = amd_get_nominal_freq(cpudata);
>> +    lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata);
>> +    if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) {
>> +        dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n",
>> +                min_freq, max_freq);
>> +        ret = -EINVAL;
>> +        goto free_cpudata1;
>> +    }
>> +
>> +    policy->min = min_freq;
>> +    policy->max = max_freq;
>> +
>> +    policy->cpuinfo.min_freq = min_freq;
>> +    policy->cpuinfo.max_freq = max_freq;
>> +    /* It will be updated by governor */
>> +    policy->cur = policy->cpuinfo.min_freq;
>> +
>> +    /* Initial processor data capability frequencies */
>> +    cpudata->max_freq = max_freq;
>> +    cpudata->min_freq = min_freq;
>> +    cpudata->nominal_freq = nominal_freq;
>> +    cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq;
>> +
>> +    policy->driver_data = cpudata;
>> +
>> +    update_boost_state();
>> +    cpudata->epp_cached = amd_pstate_get_epp(cpudata, value);
>> +
>> +    policy->min = policy->cpuinfo.min_freq;
>> +    policy->max = policy->cpuinfo.max_freq;
>> +
>> +    if (boot_cpu_has(X86_FEATURE_CPPC))
>> +        policy->fast_switch_possible = true;
>> +
>> +    if (!shared_mem && boot_cpu_has(X86_FEATURE_CPPC)) {
>> +        ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value);
>> +        if (ret)
>> +            return ret;
>> +        WRITE_ONCE(cpudata->cppc_req_cached, value);
>> +
>> +        ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &value);
>> +        if (ret)
>> +            return ret;
>> +        WRITE_ONCE(cpudata->cppc_cap1_cached, value);
>> +    }
>> +    amd_pstate_boost_init(cpudata);
>> +
>> +    return 0;
>> +
>> +free_cpudata1:
>> +    kfree(cpudata);
>> +    return ret;
>> +}
>> +
>> +static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
>> +{
>> +    int ret;
>> +
>> +    ret = __amd_pstate_cpu_init(policy);
>> +    if (ret)
>> +        return ret;
>> +    /*
>> +     * Set the policy to powersave to provide a valid fallback value in case
>> +     * the default cpufreq governor is neither powersave nor performance.
>> +     */
>> +    policy->policy = CPUFREQ_POLICY_POWERSAVE;
>> +
>> +    return 0;
>> +}
>> +
>> +static int amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy)
>> +{
>> +    pr_debug("amd-pstate: CPU %d exiting\n", policy->cpu);
> 
> Drop the "amd-pstate:", this file has pr_fmt.
> 
>> +    policy->fast_switch_possible = false;
>> +    return 0;
>> +}
>> +
>> +static void amd_pstate_update_max_freq(unsigned int cpu)
>> +{
>> +    struct cpufreq_policy *policy = policy = cpufreq_cpu_get(cpu);
>> +
>> +    if (!policy)
>> +        return;
>> +
>> +    refresh_frequency_limits(policy);
>> +    cpufreq_cpu_put(policy);
>> +}
>> +
>> +static void amd_pstate_epp_update_limits(unsigned int cpu)
>> +{
>> +    mutex_lock(&amd_pstate_driver_lock);
>> +    update_boost_state();
>> +    if (global_params.cppc_boost_disabled) {
>> +        for_each_possible_cpu(cpu)
>> +            amd_pstate_update_max_freq(cpu);
>> +    } else {
>> +        cpufreq_update_policy(cpu);
>> +    }
>> +    mutex_unlock(&amd_pstate_driver_lock);
>> +}
>> +
>> +static int cppc_boost_hold_time_ns = 3 * NSEC_PER_MSEC;
>> +
>> +static inline void amd_pstate_boost_up(struct amd_cpudata *cpudata)
>> +{
>> +    u64 hwp_req = READ_ONCE(cpudata->cppc_req_cached);
>> +    u64 hwp_cap = READ_ONCE(cpudata->cppc_cap1_cached);
>> +    u32 max_limit = (hwp_req & 0xff);
>> +    u32 min_limit = (hwp_req & 0xff00) >> 8;
>> +    u32 boost_level1;
>> +
>> +    /* If max and min are equal or already at max, nothing to boost */
>> +    if (max_limit == min_limit)
>> +        return;
>> +
>> +    /* Set boost max and min to initial value */
>> +    if (!cpudata->cppc_boost_min)
>> +        cpudata->cppc_boost_min = min_limit;
>> +
>> +    boost_level1 = ((AMD_CPPC_NOMINAL_PERF(hwp_cap) + min_limit) >> 1);
>> +
>> +    if (cpudata->cppc_boost_min < boost_level1)
>> +        cpudata->cppc_boost_min = boost_level1;
>> +    else if (cpudata->cppc_boost_min < AMD_CPPC_NOMINAL_PERF(hwp_cap))
>> +        cpudata->cppc_boost_min = AMD_CPPC_NOMINAL_PERF(hwp_cap);
>> +    else if (cpudata->cppc_boost_min == AMD_CPPC_NOMINAL_PERF(hwp_cap))
>> +        cpudata->cppc_boost_min = max_limit;
>> +    else
>> +        return;
>> +
>> +    hwp_req &= ~AMD_CPPC_MIN_PERF(~0L);
>> +    hwp_req |= AMD_CPPC_MIN_PERF(cpudata->cppc_boost_min);
>> +    wrmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, hwp_req);
>> +    cpudata->last_update = cpudata->sample.time;
>> +}
>> +
>> +static inline void amd_pstate_boost_down(struct amd_cpudata *cpudata)
>> +{
>> +    bool expired;
>> +
>> +    if (cpudata->cppc_boost_min) {
>> +        expired = time_after64(cpudata->sample.time, cpudata->last_update +
>> +                    cppc_boost_hold_time_ns);
>> +
>> +        if (expired) {
>> +            wrmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
>> +                        cpudata->cppc_req_cached);
>> +            cpudata->cppc_boost_min = 0;
>> +        }
>> +    }
>> +
>> +    cpudata->last_update = cpudata->sample.time;
>> +}
>> +
>> +static inline void amd_pstate_boost_update_util(struct amd_cpudata *cpudata,
>> +                              u64 time)
>> +{
>> +    cpudata->sample.time = time;
>> +    if (smp_processor_id() != cpudata->cpu)
>> +        return;
>> +
>> +    if (cpudata->sched_flags & SCHED_CPUFREQ_IOWAIT) {
>> +        bool do_io = false;
>> +
>> +        cpudata->sched_flags = 0;
>> +        /*
>> +         * Set iowait_boost flag and update time. Since IO WAIT flag
>> +         * is set all the time, we can't just conclude that there is
>> +         * some IO bound activity is scheduled on this CPU with just
>> +         * one occurrence. If we receive at least two in two
>> +         * consecutive ticks, then we treat as boost candidate.
>> +         * This is leveraged from Intel Pstate driver.
>> +         */
>> +        if (time_before64(time, cpudata->last_io_update + 2 * TICK_NSEC))
>> +            do_io = true;
>> +
>> +        cpudata->last_io_update = time;
>> +
>> +        if (do_io)
>> +            amd_pstate_boost_up(cpudata);
>> +
>> +    } else {
>> +        amd_pstate_boost_down(cpudata);
>> +    }
>> +}
>> +
>> +static inline void amd_pstate_cppc_update_hook(struct update_util_data *data,
>> +                        u64 time, unsigned int flags)
>> +{
>> +    struct amd_cpudata *cpudata = container_of(data,
>> +                struct amd_cpudata, update_util);
>> +
>> +    cpudata->sched_flags |= flags;
>> +
>> +    if (smp_processor_id() == cpudata->cpu)
>> +        amd_pstate_boost_update_util(cpudata, time);
>> +}
>> +
>> +static void amd_pstate_clear_update_util_hook(unsigned int cpu)
>> +{
>> +    struct amd_cpudata *cpudata = all_cpu_data[cpu];
>> +
>> +    if (!cpudata->update_util_set)
>> +        return;
>> +
>> +    cpufreq_remove_update_util_hook(cpu);
>> +    cpudata->update_util_set = false;
>> +    synchronize_rcu();
>> +}
>> +
>> +static void amd_pstate_set_update_util_hook(unsigned int cpu_num)
>> +{
>> +    struct amd_cpudata *cpudata = all_cpu_data[cpu_num];
>> +
>> +    if (!cppc_boost) {
>> +        if (cpudata->update_util_set)
>> +            amd_pstate_clear_update_util_hook(cpudata->cpu);
>> +        return;
>> +    }
>> +
>> +    if (cpudata->update_util_set)
>> +        return;
>> +
>> +    cpudata->sample.time = 0;
>> +    cpufreq_add_update_util_hook(cpu_num, &cpudata->update_util,
>> +                        amd_pstate_cppc_update_hook);
>> +    cpudata->update_util_set = true;
>> +}
>> +
>> +static void amd_pstate_epp_init(unsigned int cpu)
>> +{
>> +    struct amd_cpudata *cpudata = all_cpu_data[cpu];
>> +    u32 max_perf, min_perf;
>> +    u64 value;
>> +    s16 epp;
>> +    int ret;
>> +
>> +    max_perf = READ_ONCE(cpudata->highest_perf);
>> +    min_perf = READ_ONCE(cpudata->lowest_perf);
>> +
>> +    value = READ_ONCE(cpudata->cppc_req_cached);
>> +
>> +    if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
>> +        min_perf = max_perf;
>> +
>> +    /* Initial min/max values for CPPC Performance Controls Register */
>> +    value &= ~AMD_CPPC_MIN_PERF(~0L);
>> +    value |= AMD_CPPC_MIN_PERF(min_perf);
>> +
>> +    value &= ~AMD_CPPC_MAX_PERF(~0L);
>> +    value |= AMD_CPPC_MAX_PERF(max_perf);
>> +
>> +    /* CPPC EPP feature require to set zero to the desire perf bit */
>> +    value &= ~AMD_CPPC_DES_PERF(~0L);
>> +    value |= AMD_CPPC_DES_PERF(0);
>> +
>> +    if (cpudata->epp_policy == cpudata->policy)
>> +        goto skip_epp;
>> +
>> +    cpudata->epp_policy = cpudata->policy;
>> +
>> +    if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) {
>> +        epp = amd_pstate_get_epp(cpudata, value);
>> +        cpudata->epp_powersave = epp;
>> +        if (epp < 0)
>> +            goto skip_epp;
>> +        /* force the epp value to be zero for performance policy */
>> +        epp = 0;
>> +    } else {
>> +        if (cpudata->epp_powersave < 0)
>> +            goto skip_epp;
>> +        /* Get BIOS pre-defined epp value */
>> +        epp = amd_pstate_get_epp(cpudata, value);
>> +        if (epp)
>> +            goto skip_epp;
>> +        epp = cpudata->epp_powersave;
>> +    }
>> +    /* Set initial EPP value */
>> +    if (boot_cpu_has(X86_FEATURE_CPPC)) {
>> +        value &= ~GENMASK_ULL(31, 24);
>> +        value |= (u64)epp << 24;
>> +    }
>> +
>> +skip_epp:
>> +    WRITE_ONCE(cpudata->cppc_req_cached, value);
>> +    ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
>> +    if (!ret)
>> +        cpudata->epp_cached = epp;
>> +}
>> +
>> +static void amd_pstate_set_max_limits(struct amd_cpudata *cpudata)
>> +{
>> +    u64 hwp_cap = READ_ONCE(cpudata->cppc_cap1_cached);
>> +    u64 hwp_req = READ_ONCE(cpudata->cppc_req_cached);
>> +    u32 max_limit = (hwp_cap >> 24) & 0xff;
>> +
>> +    hwp_req &= ~AMD_CPPC_MIN_PERF(~0L);
>> +    hwp_req |= AMD_CPPC_MIN_PERF(max_limit);
>> +    wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, hwp_req);
>> +}
>> +
>> +static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
>> +{
>> +    struct amd_cpudata *cpudata;
>> +
>> +    if (!policy->cpuinfo.max_freq)
>> +        return -ENODEV;
>> +
>> +    pr_debug("set_policy: cpuinfo.max %u policy->max %u\n",
>> +                policy->cpuinfo.max_freq, policy->max);
>> +
>> +    cpudata = all_cpu_data[policy->cpu];
>> +    cpudata->policy = policy->policy;
>> +
>> +    if (boot_cpu_has(X86_FEATURE_CPPC)) {
>> +        mutex_lock(&amd_pstate_limits_lock);
>> +
>> +        if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) {
>> +            amd_pstate_clear_update_util_hook(policy->cpu);
>> +            amd_pstate_set_max_limits(cpudata);
>> +        } else {
>> +            amd_pstate_set_update_util_hook(policy->cpu);
>> +        }
>> +
>> +        if (boot_cpu_has(X86_FEATURE_CPPC))
>> +            amd_pstate_epp_init(policy->cpu);
>> +
>> +        mutex_unlock(&amd_pstate_limits_lock);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static void amd_pstate_verify_cpu_policy(struct amd_cpudata *cpudata,
>> +                       struct cpufreq_policy_data *policy)
>> +{
>> +    update_boost_state();
>> +    cpufreq_verify_within_cpu_limits(policy);
>> +}
>> +
>> +static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy)
>> +{
>> +    amd_pstate_verify_cpu_policy(all_cpu_data[policy->cpu], policy);
>> +    pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min);
>> +    return 0;
>> +}
>> +
>>   static struct cpufreq_driver amd_pstate_driver = {
>>       .flags        = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS,
>>       .verify        = amd_pstate_verify,
>> @@ -607,8 +1222,20 @@ static struct cpufreq_driver amd_pstate_driver = {
>>       .attr        = amd_pstate_attr,
>>   };
>>   +static struct cpufreq_driver amd_pstate_epp_driver = {
>> +    .flags        = CPUFREQ_CONST_LOOPS,
>> +    .verify        = amd_pstate_epp_verify_policy,
>> +    .setpolicy    = amd_pstate_epp_set_policy,
>> +    .init        = amd_pstate_epp_cpu_init,
>> +    .exit        = amd_pstate_epp_cpu_exit,
>> +    .update_limits    = amd_pstate_epp_update_limits,
>> +    .name        = "amd_pstate_epp",
>> +    .attr        = amd_pstate_epp_attr,
>> +};
>> +
>>   static int __init amd_pstate_init(void)
>>   {
>> +    static struct amd_cpudata **cpudata;
>>       int ret;
>>         if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
>> @@ -623,10 +1250,18 @@ static int __init amd_pstate_init(void)
>>       if (cpufreq_get_current_driver())
>>           return -EEXIST;
>>   +    if (!epp_off) {
>> +        WRITE_ONCE(cppc_active, 1);
>> +        if (!default_pstate_driver)
>> +            default_pstate_driver = &amd_pstate_epp_driver;
>> +    }
>> +    pr_info("AMD CPPC loading with %s driver instance.\n", default_pstate_driver->name);
> 
> This is pretty noisy, do we really need it on every boot if we can easily check it from sysfs?
> 
>> +
>>       /* capability check */
>>       if (boot_cpu_has(X86_FEATURE_CPPC)) {
>> +        if (!cppc_active)
>> +            default_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
>>           pr_debug("AMD CPPC MSR based functionality is supported\n");
>> -        amd_pstate_driver.adjust_perf = amd_pstate_adjust_perf;
>>       } else if (shared_mem) {
>>           static_call_update(amd_pstate_enable, cppc_enable);
>>           static_call_update(amd_pstate_init_perf, cppc_init_perf);
>> @@ -636,6 +1271,10 @@ static int __init amd_pstate_init(void)
>>           return -ENODEV;
>>       }
>>   +    cpudata = vzalloc(array_size(sizeof(void *), num_possible_cpus()));
>> +    if (!cpudata)
>> +        return -ENOMEM;
>> +    WRITE_ONCE(all_cpu_data, cpudata);
>>       /* enable amd pstate feature */
>>       ret = amd_pstate_enable(true);
>>       if (ret) {
>> @@ -643,9 +1282,9 @@ static int __init amd_pstate_init(void)
>>           return ret;
>>       }
>>   -    ret = cpufreq_register_driver(&amd_pstate_driver);
>> +    ret = cpufreq_register_driver(default_pstate_driver);
>>       if (ret)
>> -        pr_err("failed to register amd_pstate_driver with return %d\n",
>> +        pr_err("failed to register amd pstate driver with return %d\n",
>>                  ret);
>>         return ret;
>> @@ -657,6 +1296,15 @@ static int __init amd_pstate_param(char *str)
>>       if (!str)
>>           return -EINVAL;
>>   +    if (!strcmp(str, "disable"))
>> +        disable_pstate_load = 1;
>> +    else if (!strcmp(str, "active")) {
>> +        default_pstate_driver = &amd_pstate_epp_driver;
>> +    } else if (!strcmp(str, "passive")) {
>> +        epp_off = 1;
>> +        default_pstate_driver = &amd_pstate_driver;
>> +    }
>> +
>>       /* enable shared memory type CPPC ,if you processor has no MSR, you have to add this
>>        * to your grub to make cppc driver loaded successfully.
>>        */
>> diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
>> index 1c4b8659f171..7e6e8cab97b3 100644
>> --- a/include/linux/amd-pstate.h
>> +++ b/include/linux/amd-pstate.h
>> @@ -25,6 +25,7 @@ struct amd_aperf_mperf {
>>       u64 aperf;
>>       u64 mperf;
>>       u64 tsc;
>> +    u64 time;
>>   };
>>     /**
>> @@ -47,6 +48,18 @@ struct amd_aperf_mperf {
>>    * @prev: Last Aperf/Mperf/tsc count value read from register
>>    * @freq: current cpu frequency value
>>    * @boost_supported: check whether the Processor or SBIOS supports boost mode
>> + * @epp_powersave: Last saved CPPC energy performance preference
>> +                when policy switched to performance
>> + * @epp_policy: Last saved policy used to set energy-performance preference
>> + * @epp_cached: Cached CPPC energy-performance preference value
>> + * @policy: Cpufreq policy value
>> + * @sched_flags: Store scheduler flags for possible cross CPU update
>> + * @update_util_set: CPUFreq utility callback is set
>> + * @last_update: Time stamp of the last performance state update
>> + * @cppc_boost_min: Last CPPC boosted min performance state
>> + * @cppc_cap1_cached: Cached value of the last CPPC Capabilities MSR
>> + * @update_util: Cpufreq utility callback information
>> + * @sample: the stored performance sample
>>    *
>>    * The amd_cpudata is key private data for each CPU thread in AMD P-State, and
>>    * represents all the attributes and goals that AMD P-State requests at runtime.
>> @@ -72,6 +85,74 @@ struct amd_cpudata {
>>         u64    freq;
>>       bool    boost_supported;
>> +
>> +    /* EPP feature related attributes*/
>> +    s16    epp_powersave;
>> +    s16    epp_policy;
>> +    s16    epp_cached;
>> +    u32    policy;
>> +    u32    sched_flags;
>> +    bool    update_util_set;
>> +    u64    last_update;
>> +    u64    last_io_update;
>> +    u32    cppc_boost_min;
>> +    u64    cppc_cap1_cached;
>> +    struct    update_util_data update_util;
>> +    struct    amd_aperf_mperf sample;
>> +};
>> +
>> +/**
>> + * struct amd_pstate_params - global parameters for the performance control
>> + * @ cppc_boost_disabled wheher the core performance boost disabled
>> + */
>> +struct amd_pstate_params {
>> +    bool cppc_boost_disabled;
>> +};
>> +
>> +#define AMD_CPPC_EPP_PERFORMANCE        0x00
>> +#define AMD_CPPC_EPP_BALANCE_PERFORMANCE    0x80
>> +#define AMD_CPPC_EPP_BALANCE_POWERSAVE        0xBF
>> +#define AMD_CPPC_EPP_POWERSAVE            0xFF
>> +
>> +/*
>> + * AMD Energy Preference Performance (EPP)
>> + * The EPP is used in the CCLK DPM controller to drive
>> + * the frequency that a core is going to operate during
>> + * short periods of activity. EPP values will be utilized for
>> + * different OS profiles (balanced, performance, power savings)
>> + * display strings corresponding to EPP index in the
>> + * energy_perf_strings[]
>> + *    index        String
>> + *-------------------------------------
>> + *    0        default
>> + *    1        performance
>> + *    2        balance_performance
>> + *    3        balance_power
>> + *    4        power
>> + */
>> +enum energy_perf_value_index {
>> +    EPP_INDEX_DEFAULT = 0,
>> +    EPP_INDEX_PERFORMANCE,
>> +    EPP_INDEX_BALANCE_PERFORMANCE,
>> +    EPP_INDEX_BALANCE_POWERSAVE,
>> +    EPP_INDEX_POWERSAVE,
>> +};
>> +
>> +static const char * const energy_perf_strings[] = {
>> +    [EPP_INDEX_DEFAULT] = "default",
>> +    [EPP_INDEX_PERFORMANCE] = "performance",
>> +    [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance",
>> +    [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power",
>> +    [EPP_INDEX_POWERSAVE] = "power",
>> +    NULL
>> +};
>> +
>> +static unsigned int epp_values[] = {
>> +    [EPP_INDEX_DEFAULT] = 0,
>> +    [EPP_INDEX_PERFORMANCE] = AMD_CPPC_EPP_PERFORMANCE,
>> +    [EPP_INDEX_BALANCE_PERFORMANCE] = AMD_CPPC_EPP_BALANCE_PERFORMANCE,
>> +    [EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE,
>> +    [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE,
>>   };
>>     #endif /* _LINUX_AMD_PSTATE_H */
>
  
Yuan, Perry Nov. 10, 2022, 4:22 p.m. UTC | #5
[AMD Official Use Only - General]

Hi Nathan. 

> -----Original Message-----
> From: Fontenot, Nathan <Nathan.Fontenot@amd.com>
> Sent: Friday, November 11, 2022 12:00 AM
> To: Limonciello, Mario <Mario.Limonciello@amd.com>; Yuan, Perry
> <Perry.Yuan@amd.com>; rafael.j.wysocki@intel.com; Huang, Ray
> <Ray.Huang@amd.com>; viresh.kumar@linaro.org
> Cc: Sharma, Deepak <Deepak.Sharma@amd.com>; Deucher, Alexander
> <Alexander.Deucher@amd.com>; Huang, Shimmer
> <Shimmer.Huang@amd.com>; Du, Xiaojian <Xiaojian.Du@amd.com>; Meng,
> Li (Jassmine) <Li.Meng@amd.com>; linux-pm@vger.kernel.org; linux-
> kernel@vger.kernel.org
> Subject: Re: [PATCH v3 4/8] cpufreq: amd_pstate: add AMD Pstate EPP
> support for the MSR based processors
> 
> 
> 
> On 11/7/22 14:32, Limonciello, Mario wrote:
> > On 11/7/2022 11:57, Perry Yuan wrote:
> >> Add EPP driver support for those AMD CPUs which has full MSR feature
> >> enabled, The EPP is used in the DPM controller to drive the frequency
> >> that a core is going to operate during short periods of activity.
> >
> > To avoid the run on sentence, here is a different wording proposal.
> >
> > Add EPP driver support for AMD SoCs which support a dedicated MSR for
> CPPC.  EPP is used by the DPM controller to configure the frequency that a
> core operates at during short periods of activity.
> >
> >>
> >> EPP values will be utilized for different OS profiles (balanced,
> >> performance, power savings). cppc performance can be controlled by
> >> the user space interface sys attributes for min and max frequency
> >> limits, when pstate driver is working under power save policy.
> >>
> >> EPP scale is 0 - 255, 0 is the max performance and 255 is min level.
> >> balance_performance (0x80) can provide best balance performance and
> >> watt for most of system, meanwhile user can choose performance policy
> on needs.
> >
> > As a user reading this message it is confusing that there are values and then
> there are strings, but you don't know the linkage between the two. My
> proposal for rewording this:
> >
> > The SoC EPP targets are configured on a scale from 0 to 255 where 0
> represents maximum performance and 255 represents maximum efficiency.
> >
> > The amd-pstate driver exports profile string names to userspace that are
> tied to specific EPP values.
> >
> > The balance_performance string (0x80) provides the best balance for
> efficiency versus power on most systems, but users can choose other strings
> to meet their needs as well.
> >
> >>
> >> $ cat
> >>
> /sys/devices/system/cpu/cpufreq/policy0/energy_performance_available_
> >> preferences default performance balance_performance balance_power
> >> power
> >>
> >> $ cat
> >>
> /sys/devices/system/cpu/cpufreq/policy0/energy_performance_preferenc
> e
> >> balance_performance
> >>
> >> Signed-off-by: Perry Yuan <Perry.Yuan@amd.com>
> >> ---
> >>   drivers/cpufreq/amd-pstate.c | 658
> >> ++++++++++++++++++++++++++++++++++-
> >>   include/linux/amd-pstate.h   |  81 +++++
> >>   2 files changed, 734 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/drivers/cpufreq/amd-pstate.c
> >> b/drivers/cpufreq/amd-pstate.c index 14906431dc15..eb82bc6a7f66
> >> 100644
> >> --- a/drivers/cpufreq/amd-pstate.c
> >> +++ b/drivers/cpufreq/amd-pstate.c
> >> @@ -60,8 +60,136 @@
> >>    * module parameter to be able to enable it manually for debugging.
> >>    */
> >>   static bool shared_mem __read_mostly;
> >> +static int cppc_active __read_mostly; static int disable_pstate_load
> >> +__initdata; static int epp_off __initdata;
> >>   -static struct cpufreq_driver amd_pstate_driver;
> >> +static struct cpufreq_driver *default_pstate_driver; static struct
> >> +amd_cpudata **all_cpu_data;
> >> +
> >> +static struct amd_pstate_params global_params;
> >> +
> >> +static DEFINE_MUTEX(amd_pstate_limits_lock);
> >> +static DEFINE_MUTEX(amd_pstate_driver_lock);
> >> +
> >> +static bool cppc_boost __read_mostly; struct kobject
> >> +*amd_pstate_kobj;
> >> +
> >> +#ifdef CONFIG_ACPI_CPPC_LIB
> >> +static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64
> >> +cppc_req_cached) {
> >> +    s16 epp;
> >> +    struct cppc_perf_caps perf_caps;
> >> +    int ret;
> >> +
> >> +    if (boot_cpu_has(X86_FEATURE_CPPC)) {
> >> +        if (!cppc_req_cached) {
> >> +            epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
> >> +                        &cppc_req_cached);
> >> +            if (epp)
> >> +                return epp;
> >> +        }
> >> +        epp = (cppc_req_cached >> 24) & 0xFF;
> >> +    } else {
> >> +        ret = cppc_get_epp_caps(cpudata->cpu, &perf_caps);
> >> +        if (ret < 0) {
> >> +            pr_debug("Could not retrieve energy perf value (%d)\n",
> >> +ret);
> >> +            return -EIO;
> >> +        }
> >> +        epp = (s16) perf_caps.energy_perf;
> >> +    }
> >> +
> >> +    return epp;
> >> +}
> >> +#endif
> >> +
> >> +static int amd_pstate_get_energy_pref_index(struct amd_cpudata
> >> +*cpudata, int *raw_epp) {
> >> +    s16 epp;
> >> +    int index = -EINVAL;
> >> +
> >> +    *raw_epp = 0;
> >> +    epp = amd_pstate_get_epp(cpudata, 0);
> >> +    if (epp < 0)
> >> +        return epp;
> >> +
> >> +    switch (epp) {
> >> +    case AMD_CPPC_EPP_PERFORMANCE:
> >> +        index = EPP_INDEX_PERFORMANCE;
> >> +        break;
> >> +    case AMD_CPPC_EPP_BALANCE_PERFORMANCE:
> >> +        index = EPP_INDEX_BALANCE_PERFORMANCE;
> >> +        break;
> >> +    case AMD_CPPC_EPP_BALANCE_POWERSAVE:
> >> +        index = EPP_INDEX_BALANCE_POWERSAVE;
> >> +        break;
> >> +    case AMD_CPPC_EPP_POWERSAVE:
> >> +        index = EPP_INDEX_POWERSAVE;
> >> +        break;
> >> +    default:
> >> +        *raw_epp = epp;
> >> +        index = 0;
> >> +    }
> >> +
> >> +    return index;
> >> +}
> >> +
> >> +#ifdef CONFIG_ACPI_CPPC_LIB
> >> +static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp)
> >> +{
> >> +    int ret;
> >> +    struct cppc_perf_ctrls perf_ctrls;
> >> +
> >> +    if (boot_cpu_has(X86_FEATURE_CPPC)) {
> >> +        u64 value = READ_ONCE(cpudata->cppc_req_cached);
> >> +
> >> +        value &= ~GENMASK_ULL(31, 24);
> >> +        value |= (u64)epp << 24;
> >> +        WRITE_ONCE(cpudata->cppc_req_cached, value);
> >> +
> >> +        ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
> >> +        if (!ret)
> >> +            cpudata->epp_cached = epp;
> >> +    } else {
> >> +        perf_ctrls.energy_perf = epp;
> >> +        ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls);
> >> +        if (ret) {
> >> +            pr_debug("failed to set energy perf value (%d)\n", ret);
> >> +            return ret;
> >> +        }
> >> +        cpudata->epp_cached = epp;
> >> +    }
> >> +
> >> +    return ret;
> >> +}
> >> +
> >> +static int amd_pstate_set_energy_pref_index(struct amd_cpudata
> >> +*cpudata,
> >> +                          int pref_index, bool use_raw,
> >> +                          u32 raw_epp) {
> >> +    int epp = -EINVAL;
> >> +    int ret;
> >> +
> >> +    if (!pref_index) {
> >> +        pr_debug("EPP pref_index is invalid\n");
> >> +        return -EINVAL;
> >> +    }
> >> +
> >> +    if (use_raw)
> >> +        epp = raw_epp;
> >> +    else if (epp == -EINVAL)
> >> +        epp = epp_values[pref_index];
> >> +
> >> +    if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) {
> >> +        pr_debug("EPP cannot be set under performance policy\n");
> >> +        return -EBUSY;
> >> +    }
> >> +
> >> +    ret = amd_pstate_set_epp(cpudata, epp);
> >> +
> >> +    return ret;
> >> +}
> >> +#endif
> >>     static inline int pstate_enable(bool enable)
> >>   {
> >> @@ -71,11 +199,25 @@ static inline int pstate_enable(bool enable)
> >>   static int cppc_enable(bool enable)
> >>   {
> >>       int cpu, ret = 0;
> >> +    struct cppc_perf_ctrls perf_ctrls;
> >>         for_each_present_cpu(cpu) {
> >>           ret = cppc_set_enable(cpu, enable);
> >>           if (ret)
> >>               return ret;
> >> +
> >> +        /* Enable autonomous mode for EPP */
> >> +        if (!cppc_active) {
> >> +            ret = cppc_set_auto_epp(cpu, enable);
> >> +            if (ret)
> >> +                return ret;
> >> +
> >> +            /* Set desired perf as zero to allow EPP firmware
> >> +control */
> >> +            perf_ctrls.desired_perf = 0;
> >> +            ret = cppc_set_perf(cpu, &perf_ctrls);
> >> +            if (ret)
> >> +                return ret;
> >> +        }
> >>       }
> >>         return ret;
> >> @@ -418,7 +560,7 @@ static void amd_pstate_boost_init(struct
> >> amd_cpudata *cpudata)
> >>           return;
> >>         cpudata->boost_supported = true;
> >> -    amd_pstate_driver.boost_enabled = true;
> >> +    default_pstate_driver->boost_enabled = true;
> >>   }
> >>     static int amd_pstate_cpu_init(struct cpufreq_policy *policy) @@
> >> -582,10 +724,74 @@ static ssize_t show_amd_pstate_highest_perf(struct
> >> cpufreq_policy *policy,
> >>       return sprintf(&buf[0], "%u\n", perf);
> >>   }
> >>   +static ssize_t show_energy_performance_available_preferences(
> >> +                struct cpufreq_policy *policy, char *buf) {
> >> +    int i = 0;
> >> +    int ret = 0;
> >> +
> >> +    while (energy_perf_strings[i] != NULL)
> >> +        ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]);
> >> +
> >> +    ret += sprintf(&buf[ret], "\n");
> >> +
> >> +    return ret;
> >> +}
> >> +
> >> +static ssize_t store_energy_performance_preference(
> >> +        struct cpufreq_policy *policy, const char *buf, size_t
> >> +count) {
> >> +    struct amd_cpudata *cpudata = policy->driver_data;
> >> +    char str_preference[21];
> >> +    bool raw = false;
> >> +    ssize_t ret;
> >> +    u32 epp = 0;
> >> +
> >> +    ret = sscanf(buf, "%20s", str_preference);
> >> +    if (ret != 1)
> >> +        return -EINVAL;
> >> +
> >> +    ret = match_string(energy_perf_strings, -1, str_preference);
> >> +    if (ret < 0) {
> >> +        ret = kstrtouint(buf, 10, &epp);
> >> +        if (ret)
> >> +            return ret;
> >> +
> >> +        if ((epp > 255) || (epp < 0))
> >> +            return -EINVAL;
> >> +
> >> +        raw = true;
> >> +    }
> >
> > What's the reason for supporting putting the raw number in here for stuff
> "in between"?  I think this is going to pretty confusing to userspace that you
> can use string values or integer values.  It also means that if userspace writes
> an integer with a mapping to string and tries to read it back they'll get the
> string rather than the integer!
> >
> > I can understand using the raw values for internal characterization and
> development to possibly introduce a new mapping string, but I don't think
> that makes sense in the kernel.
> >
> 
> This is really doing what Intel does for handling EPP settings. Yes, writing a
> value and getting back a string could be a bit confusing bit it is already done
> from the Intel side. I think keeping EPP value setting common would be a
> good thing if we can do it.
> 
> I don't think we should remove the ability to set raw values, we're allowed a
> range of 0 - 255 for the EPP setting. Why we then limit ourselves to only 4 or
> so values?
> 
> -Nathan

The raw values are userd for tunning , we have another utility to fine tune the performance/power under development.
So from the customer perspective, we do not need to keep the raw epp set any more in my opinion.
And Mario also has strong justification for the change.

Perry. 


> 
> >> +
> >> +    mutex_lock(&amd_pstate_limits_lock);
> >> +    ret = amd_pstate_set_energy_pref_index(cpudata, ret, raw, epp);
> >> +    mutex_unlock(&amd_pstate_limits_lock);
> >> +
> >> +    return ret ?: count;
> >> +}
> >> +
> >> +static ssize_t show_energy_performance_preference(
> >> +                struct cpufreq_policy *policy, char *buf) {
> >> +    struct amd_cpudata *cpudata = policy->driver_data;
> >> +    int preference, raw_epp;
> >> +
> >> +    preference = amd_pstate_get_energy_pref_index(cpudata,
> >> +&raw_epp);
> >> +    if (preference < 0)
> >> +        return preference;
> >> +
> >> +    if (raw_epp)
> >> +        return  sprintf(buf, "%d\n", raw_epp);
> >> +    else
> >> +        return  sprintf(buf, "%s\n",
> >> +energy_perf_strings[preference]); }
> >> +
> >>   cpufreq_freq_attr_ro(amd_pstate_max_freq);
> >>   cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq);
> >>     cpufreq_freq_attr_ro(amd_pstate_highest_perf);
> >> +cpufreq_freq_attr_rw(energy_performance_preference);
> >> +cpufreq_freq_attr_ro(energy_performance_available_preferences);
> >>     static struct freq_attr *amd_pstate_attr[] = {
> >>       &amd_pstate_max_freq,
> >> @@ -594,6 +800,415 @@ static struct freq_attr *amd_pstate_attr[] = {
> >>       NULL,
> >>   };
> >>   +static struct freq_attr *amd_pstate_epp_attr[] = {
> >> +    &amd_pstate_max_freq,
> >> +    &amd_pstate_lowest_nonlinear_freq,
> >> +    &amd_pstate_highest_perf,
> >> +    &energy_performance_preference,
> >> +    &energy_performance_available_preferences,
> >> +    NULL,
> >> +};
> >> +
> >> +static inline void update_boost_state(void) {
> >> +    u64 misc_en;
> >> +    struct amd_cpudata *cpudata;
> >> +
> >> +    cpudata = all_cpu_data[0];
> >> +    rdmsrl(MSR_K7_HWCR, misc_en);
> >> +    global_params.cppc_boost_disabled = misc_en & BIT_ULL(25); }
> >> +
> >> +static int amd_pstate_init_cpu(unsigned int cpunum) {
> >> +    struct amd_cpudata *cpudata;
> >> +
> >> +    cpudata = all_cpu_data[cpunum];
> >> +    if (!cpudata) {
> >> +        cpudata = kzalloc(sizeof(*cpudata), GFP_KERNEL);
> >> +        if (!cpudata)
> >> +            return -ENOMEM;
> >> +        WRITE_ONCE(all_cpu_data[cpunum], cpudata);
> >> +
> >> +        cpudata->cpu = cpunum;
> >> +    }
> >> +    cpudata->epp_powersave = -EINVAL;
> >> +    cpudata->epp_policy = 0;
> >> +    pr_debug("controlling: cpu %d\n", cpunum);
> >> +    return 0;
> >> +}
> >> +
> >> +static int __amd_pstate_cpu_init(struct cpufreq_policy *policy) {
> >> +    int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq,
> >> +ret;
> >> +    struct amd_cpudata *cpudata;
> >> +    struct device *dev;
> >> +    int rc;
> >> +    u64 value;
> >> +
> >> +    rc = amd_pstate_init_cpu(policy->cpu);
> >> +    if (rc)
> >> +        return rc;
> >> +
> >> +    cpudata = all_cpu_data[policy->cpu];
> >> +
> >> +    dev = get_cpu_device(policy->cpu);
> >> +    if (!dev)
> >> +        goto free_cpudata1;
> >> +
> >> +    rc = amd_pstate_init_perf(cpudata);
> >> +    if (rc)
> >> +        goto free_cpudata1;
> >> +
> >> +    min_freq = amd_get_min_freq(cpudata);
> >> +    max_freq = amd_get_max_freq(cpudata);
> >> +    nominal_freq = amd_get_nominal_freq(cpudata);
> >> +    lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata);
> >> +    if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) {
> >> +        dev_err(dev, "min_freq(%d) or max_freq(%d) value is
> >> +incorrect\n",
> >> +                min_freq, max_freq);
> >> +        ret = -EINVAL;
> >> +        goto free_cpudata1;
> >> +    }
> >> +
> >> +    policy->min = min_freq;
> >> +    policy->max = max_freq;
> >> +
> >> +    policy->cpuinfo.min_freq = min_freq;
> >> +    policy->cpuinfo.max_freq = max_freq;
> >> +    /* It will be updated by governor */
> >> +    policy->cur = policy->cpuinfo.min_freq;
> >> +
> >> +    /* Initial processor data capability frequencies */
> >> +    cpudata->max_freq = max_freq;
> >> +    cpudata->min_freq = min_freq;
> >> +    cpudata->nominal_freq = nominal_freq;
> >> +    cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq;
> >> +
> >> +    policy->driver_data = cpudata;
> >> +
> >> +    update_boost_state();
> >> +    cpudata->epp_cached = amd_pstate_get_epp(cpudata, value);
> >> +
> >> +    policy->min = policy->cpuinfo.min_freq;
> >> +    policy->max = policy->cpuinfo.max_freq;
> >> +
> >> +    if (boot_cpu_has(X86_FEATURE_CPPC))
> >> +        policy->fast_switch_possible = true;
> >> +
> >> +    if (!shared_mem && boot_cpu_has(X86_FEATURE_CPPC)) {
> >> +        ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value);
> >> +        if (ret)
> >> +            return ret;
> >> +        WRITE_ONCE(cpudata->cppc_req_cached, value);
> >> +
> >> +        ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1,
> >> +&value);
> >> +        if (ret)
> >> +            return ret;
> >> +        WRITE_ONCE(cpudata->cppc_cap1_cached, value);
> >> +    }
> >> +    amd_pstate_boost_init(cpudata);
> >> +
> >> +    return 0;
> >> +
> >> +free_cpudata1:
> >> +    kfree(cpudata);
> >> +    return ret;
> >> +}
> >> +
> >> +static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) {
> >> +    int ret;
> >> +
> >> +    ret = __amd_pstate_cpu_init(policy);
> >> +    if (ret)
> >> +        return ret;
> >> +    /*
> >> +     * Set the policy to powersave to provide a valid fallback value
> >> +in case
> >> +     * the default cpufreq governor is neither powersave nor performance.
> >> +     */
> >> +    policy->policy = CPUFREQ_POLICY_POWERSAVE;
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static int amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) {
> >> +    pr_debug("amd-pstate: CPU %d exiting\n", policy->cpu);
> >
> > Drop the "amd-pstate:", this file has pr_fmt.
> >
> >> +    policy->fast_switch_possible = false;
> >> +    return 0;
> >> +}
> >> +
> >> +static void amd_pstate_update_max_freq(unsigned int cpu) {
> >> +    struct cpufreq_policy *policy = policy = cpufreq_cpu_get(cpu);
> >> +
> >> +    if (!policy)
> >> +        return;
> >> +
> >> +    refresh_frequency_limits(policy);
> >> +    cpufreq_cpu_put(policy);
> >> +}
> >> +
> >> +static void amd_pstate_epp_update_limits(unsigned int cpu) {
> >> +    mutex_lock(&amd_pstate_driver_lock);
> >> +    update_boost_state();
> >> +    if (global_params.cppc_boost_disabled) {
> >> +        for_each_possible_cpu(cpu)
> >> +            amd_pstate_update_max_freq(cpu);
> >> +    } else {
> >> +        cpufreq_update_policy(cpu);
> >> +    }
> >> +    mutex_unlock(&amd_pstate_driver_lock);
> >> +}
> >> +
> >> +static int cppc_boost_hold_time_ns = 3 * NSEC_PER_MSEC;
> >> +
> >> +static inline void amd_pstate_boost_up(struct amd_cpudata *cpudata)
> >> +{
> >> +    u64 hwp_req = READ_ONCE(cpudata->cppc_req_cached);
> >> +    u64 hwp_cap = READ_ONCE(cpudata->cppc_cap1_cached);
> >> +    u32 max_limit = (hwp_req & 0xff);
> >> +    u32 min_limit = (hwp_req & 0xff00) >> 8;
> >> +    u32 boost_level1;
> >> +
> >> +    /* If max and min are equal or already at max, nothing to boost
> >> +*/
> >> +    if (max_limit == min_limit)
> >> +        return;
> >> +
> >> +    /* Set boost max and min to initial value */
> >> +    if (!cpudata->cppc_boost_min)
> >> +        cpudata->cppc_boost_min = min_limit;
> >> +
> >> +    boost_level1 = ((AMD_CPPC_NOMINAL_PERF(hwp_cap) +
> min_limit) >>
> >> +1);
> >> +
> >> +    if (cpudata->cppc_boost_min < boost_level1)
> >> +        cpudata->cppc_boost_min = boost_level1;
> >> +    else if (cpudata->cppc_boost_min <
> >> +AMD_CPPC_NOMINAL_PERF(hwp_cap))
> >> +        cpudata->cppc_boost_min =
> AMD_CPPC_NOMINAL_PERF(hwp_cap);
> >> +    else if (cpudata->cppc_boost_min ==
> >> +AMD_CPPC_NOMINAL_PERF(hwp_cap))
> >> +        cpudata->cppc_boost_min = max_limit;
> >> +    else
> >> +        return;
> >> +
> >> +    hwp_req &= ~AMD_CPPC_MIN_PERF(~0L);
> >> +    hwp_req |= AMD_CPPC_MIN_PERF(cpudata->cppc_boost_min);
> >> +    wrmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
> hwp_req);
> >> +    cpudata->last_update = cpudata->sample.time; }
> >> +
> >> +static inline void amd_pstate_boost_down(struct amd_cpudata
> >> +*cpudata) {
> >> +    bool expired;
> >> +
> >> +    if (cpudata->cppc_boost_min) {
> >> +        expired = time_after64(cpudata->sample.time,
> >> +cpudata->last_update +
> >> +                    cppc_boost_hold_time_ns);
> >> +
> >> +        if (expired) {
> >> +            wrmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
> >> +                        cpudata->cppc_req_cached);
> >> +            cpudata->cppc_boost_min = 0;
> >> +        }
> >> +    }
> >> +
> >> +    cpudata->last_update = cpudata->sample.time; }
> >> +
> >> +static inline void amd_pstate_boost_update_util(struct amd_cpudata
> >> +*cpudata,
> >> +                              u64 time) {
> >> +    cpudata->sample.time = time;
> >> +    if (smp_processor_id() != cpudata->cpu)
> >> +        return;
> >> +
> >> +    if (cpudata->sched_flags & SCHED_CPUFREQ_IOWAIT) {
> >> +        bool do_io = false;
> >> +
> >> +        cpudata->sched_flags = 0;
> >> +        /*
> >> +         * Set iowait_boost flag and update time. Since IO WAIT flag
> >> +         * is set all the time, we can't just conclude that there is
> >> +         * some IO bound activity is scheduled on this CPU with just
> >> +         * one occurrence. If we receive at least two in two
> >> +         * consecutive ticks, then we treat as boost candidate.
> >> +         * This is leveraged from Intel Pstate driver.
> >> +         */
> >> +        if (time_before64(time, cpudata->last_io_update + 2 *
> >> +TICK_NSEC))
> >> +            do_io = true;
> >> +
> >> +        cpudata->last_io_update = time;
> >> +
> >> +        if (do_io)
> >> +            amd_pstate_boost_up(cpudata);
> >> +
> >> +    } else {
> >> +        amd_pstate_boost_down(cpudata);
> >> +    }
> >> +}
> >> +
> >> +static inline void amd_pstate_cppc_update_hook(struct
> >> +update_util_data *data,
> >> +                        u64 time, unsigned int flags) {
> >> +    struct amd_cpudata *cpudata = container_of(data,
> >> +                struct amd_cpudata, update_util);
> >> +
> >> +    cpudata->sched_flags |= flags;
> >> +
> >> +    if (smp_processor_id() == cpudata->cpu)
> >> +        amd_pstate_boost_update_util(cpudata, time); }
> >> +
> >> +static void amd_pstate_clear_update_util_hook(unsigned int cpu) {
> >> +    struct amd_cpudata *cpudata = all_cpu_data[cpu];
> >> +
> >> +    if (!cpudata->update_util_set)
> >> +        return;
> >> +
> >> +    cpufreq_remove_update_util_hook(cpu);
> >> +    cpudata->update_util_set = false;
> >> +    synchronize_rcu();
> >> +}
> >> +
> >> +static void amd_pstate_set_update_util_hook(unsigned int cpu_num) {
> >> +    struct amd_cpudata *cpudata = all_cpu_data[cpu_num];
> >> +
> >> +    if (!cppc_boost) {
> >> +        if (cpudata->update_util_set)
> >> +            amd_pstate_clear_update_util_hook(cpudata->cpu);
> >> +        return;
> >> +    }
> >> +
> >> +    if (cpudata->update_util_set)
> >> +        return;
> >> +
> >> +    cpudata->sample.time = 0;
> >> +    cpufreq_add_update_util_hook(cpu_num, &cpudata->update_util,
> >> +                        amd_pstate_cppc_update_hook);
> >> +    cpudata->update_util_set = true; }
> >> +
> >> +static void amd_pstate_epp_init(unsigned int cpu) {
> >> +    struct amd_cpudata *cpudata = all_cpu_data[cpu];
> >> +    u32 max_perf, min_perf;
> >> +    u64 value;
> >> +    s16 epp;
> >> +    int ret;
> >> +
> >> +    max_perf = READ_ONCE(cpudata->highest_perf);
> >> +    min_perf = READ_ONCE(cpudata->lowest_perf);
> >> +
> >> +    value = READ_ONCE(cpudata->cppc_req_cached);
> >> +
> >> +    if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
> >> +        min_perf = max_perf;
> >> +
> >> +    /* Initial min/max values for CPPC Performance Controls Register
> >> +*/
> >> +    value &= ~AMD_CPPC_MIN_PERF(~0L);
> >> +    value |= AMD_CPPC_MIN_PERF(min_perf);
> >> +
> >> +    value &= ~AMD_CPPC_MAX_PERF(~0L);
> >> +    value |= AMD_CPPC_MAX_PERF(max_perf);
> >> +
> >> +    /* CPPC EPP feature require to set zero to the desire perf bit
> >> +*/
> >> +    value &= ~AMD_CPPC_DES_PERF(~0L);
> >> +    value |= AMD_CPPC_DES_PERF(0);
> >> +
> >> +    if (cpudata->epp_policy == cpudata->policy)
> >> +        goto skip_epp;
> >> +
> >> +    cpudata->epp_policy = cpudata->policy;
> >> +
> >> +    if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) {
> >> +        epp = amd_pstate_get_epp(cpudata, value);
> >> +        cpudata->epp_powersave = epp;
> >> +        if (epp < 0)
> >> +            goto skip_epp;
> >> +        /* force the epp value to be zero for performance policy */
> >> +        epp = 0;
> >> +    } else {
> >> +        if (cpudata->epp_powersave < 0)
> >> +            goto skip_epp;
> >> +        /* Get BIOS pre-defined epp value */
> >> +        epp = amd_pstate_get_epp(cpudata, value);
> >> +        if (epp)
> >> +            goto skip_epp;
> >> +        epp = cpudata->epp_powersave;
> >> +    }
> >> +    /* Set initial EPP value */
> >> +    if (boot_cpu_has(X86_FEATURE_CPPC)) {
> >> +        value &= ~GENMASK_ULL(31, 24);
> >> +        value |= (u64)epp << 24;
> >> +    }
> >> +
> >> +skip_epp:
> >> +    WRITE_ONCE(cpudata->cppc_req_cached, value);
> >> +    ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
> >> +    if (!ret)
> >> +        cpudata->epp_cached = epp;
> >> +}
> >> +
> >> +static void amd_pstate_set_max_limits(struct amd_cpudata *cpudata) {
> >> +    u64 hwp_cap = READ_ONCE(cpudata->cppc_cap1_cached);
> >> +    u64 hwp_req = READ_ONCE(cpudata->cppc_req_cached);
> >> +    u32 max_limit = (hwp_cap >> 24) & 0xff;
> >> +
> >> +    hwp_req &= ~AMD_CPPC_MIN_PERF(~0L);
> >> +    hwp_req |= AMD_CPPC_MIN_PERF(max_limit);
> >> +    wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, hwp_req); }
> >> +
> >> +static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
> >> +{
> >> +    struct amd_cpudata *cpudata;
> >> +
> >> +    if (!policy->cpuinfo.max_freq)
> >> +        return -ENODEV;
> >> +
> >> +    pr_debug("set_policy: cpuinfo.max %u policy->max %u\n",
> >> +                policy->cpuinfo.max_freq, policy->max);
> >> +
> >> +    cpudata = all_cpu_data[policy->cpu];
> >> +    cpudata->policy = policy->policy;
> >> +
> >> +    if (boot_cpu_has(X86_FEATURE_CPPC)) {
> >> +        mutex_lock(&amd_pstate_limits_lock);
> >> +
> >> +        if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) {
> >> +            amd_pstate_clear_update_util_hook(policy->cpu);
> >> +            amd_pstate_set_max_limits(cpudata);
> >> +        } else {
> >> +            amd_pstate_set_update_util_hook(policy->cpu);
> >> +        }
> >> +
> >> +        if (boot_cpu_has(X86_FEATURE_CPPC))
> >> +            amd_pstate_epp_init(policy->cpu);
> >> +
> >> +        mutex_unlock(&amd_pstate_limits_lock);
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static void amd_pstate_verify_cpu_policy(struct amd_cpudata
> >> +*cpudata,
> >> +                       struct cpufreq_policy_data *policy) {
> >> +    update_boost_state();
> >> +    cpufreq_verify_within_cpu_limits(policy);
> >> +}
> >> +
> >> +static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data
> >> +*policy) {
> >> +    amd_pstate_verify_cpu_policy(all_cpu_data[policy->cpu], policy);
> >> +    pr_debug("policy_max =%d, policy_min=%d\n", policy->max,
> >> +policy->min);
> >> +    return 0;
> >> +}
> >> +
> >>   static struct cpufreq_driver amd_pstate_driver = {
> >>       .flags        = CPUFREQ_CONST_LOOPS |
> >> CPUFREQ_NEED_UPDATE_LIMITS,
> >>       .verify        = amd_pstate_verify, @@ -607,8 +1222,20 @@
> >> static struct cpufreq_driver amd_pstate_driver = {
> >>       .attr        = amd_pstate_attr,
> >>   };
> >>   +static struct cpufreq_driver amd_pstate_epp_driver = {
> >> +    .flags        = CPUFREQ_CONST_LOOPS,
> >> +    .verify        = amd_pstate_epp_verify_policy,
> >> +    .setpolicy    = amd_pstate_epp_set_policy,
> >> +    .init        = amd_pstate_epp_cpu_init,
> >> +    .exit        = amd_pstate_epp_cpu_exit,
> >> +    .update_limits    = amd_pstate_epp_update_limits,
> >> +    .name        = "amd_pstate_epp",
> >> +    .attr        = amd_pstate_epp_attr, };
> >> +
> >>   static int __init amd_pstate_init(void)
> >>   {
> >> +    static struct amd_cpudata **cpudata;
> >>       int ret;
> >>         if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) @@ -623,10
> >> +1250,18 @@ static int __init amd_pstate_init(void)
> >>       if (cpufreq_get_current_driver())
> >>           return -EEXIST;
> >>   +    if (!epp_off) {
> >> +        WRITE_ONCE(cppc_active, 1);
> >> +        if (!default_pstate_driver)
> >> +            default_pstate_driver = &amd_pstate_epp_driver;
> >> +    }
> >> +    pr_info("AMD CPPC loading with %s driver instance.\n",
> >> +default_pstate_driver->name);
> >
> > This is pretty noisy, do we really need it on every boot if we can easily
> check it from sysfs?
> >
> >> +
> >>       /* capability check */
> >>       if (boot_cpu_has(X86_FEATURE_CPPC)) {
> >> +        if (!cppc_active)
> >> +            default_pstate_driver->adjust_perf =
> >> +amd_pstate_adjust_perf;
> >>           pr_debug("AMD CPPC MSR based functionality is
> >> supported\n");
> >> -        amd_pstate_driver.adjust_perf = amd_pstate_adjust_perf;
> >>       } else if (shared_mem) {
> >>           static_call_update(amd_pstate_enable, cppc_enable);
> >>           static_call_update(amd_pstate_init_perf, cppc_init_perf);
> >> @@ -636,6 +1271,10 @@ static int __init amd_pstate_init(void)
> >>           return -ENODEV;
> >>       }
> >>   +    cpudata = vzalloc(array_size(sizeof(void *),
> >> num_possible_cpus()));
> >> +    if (!cpudata)
> >> +        return -ENOMEM;
> >> +    WRITE_ONCE(all_cpu_data, cpudata);
> >>       /* enable amd pstate feature */
> >>       ret = amd_pstate_enable(true);
> >>       if (ret) {
> >> @@ -643,9 +1282,9 @@ static int __init amd_pstate_init(void)
> >>           return ret;
> >>       }
> >>   -    ret = cpufreq_register_driver(&amd_pstate_driver);
> >> +    ret = cpufreq_register_driver(default_pstate_driver);
> >>       if (ret)
> >> -        pr_err("failed to register amd_pstate_driver with return
> >> %d\n",
> >> +        pr_err("failed to register amd pstate driver with return
> >> +%d\n",
> >>                  ret);
> >>         return ret;
> >> @@ -657,6 +1296,15 @@ static int __init amd_pstate_param(char *str)
> >>       if (!str)
> >>           return -EINVAL;
> >>   +    if (!strcmp(str, "disable"))
> >> +        disable_pstate_load = 1;
> >> +    else if (!strcmp(str, "active")) {
> >> +        default_pstate_driver = &amd_pstate_epp_driver;
> >> +    } else if (!strcmp(str, "passive")) {
> >> +        epp_off = 1;
> >> +        default_pstate_driver = &amd_pstate_driver;
> >> +    }
> >> +
> >>       /* enable shared memory type CPPC ,if you processor has no MSR,
> >> you have to add this
> >>        * to your grub to make cppc driver loaded successfully.
> >>        */
> >> diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
> >> index 1c4b8659f171..7e6e8cab97b3 100644
> >> --- a/include/linux/amd-pstate.h
> >> +++ b/include/linux/amd-pstate.h
> >> @@ -25,6 +25,7 @@ struct amd_aperf_mperf {
> >>       u64 aperf;
> >>       u64 mperf;
> >>       u64 tsc;
> >> +    u64 time;
> >>   };
> >>     /**
> >> @@ -47,6 +48,18 @@ struct amd_aperf_mperf {
> >>    * @prev: Last Aperf/Mperf/tsc count value read from register
> >>    * @freq: current cpu frequency value
> >>    * @boost_supported: check whether the Processor or SBIOS supports
> >> boost mode
> >> + * @epp_powersave: Last saved CPPC energy performance preference
> >> +                when policy switched to performance
> >> + * @epp_policy: Last saved policy used to set energy-performance
> >> +preference
> >> + * @epp_cached: Cached CPPC energy-performance preference value
> >> + * @policy: Cpufreq policy value
> >> + * @sched_flags: Store scheduler flags for possible cross CPU update
> >> + * @update_util_set: CPUFreq utility callback is set
> >> + * @last_update: Time stamp of the last performance state update
> >> + * @cppc_boost_min: Last CPPC boosted min performance state
> >> + * @cppc_cap1_cached: Cached value of the last CPPC Capabilities MSR
> >> + * @update_util: Cpufreq utility callback information
> >> + * @sample: the stored performance sample
> >>    *
> >>    * The amd_cpudata is key private data for each CPU thread in AMD
> >> P-State, and
> >>    * represents all the attributes and goals that AMD P-State requests at
> runtime.
> >> @@ -72,6 +85,74 @@ struct amd_cpudata {
> >>         u64    freq;
> >>       bool    boost_supported;
> >> +
> >> +    /* EPP feature related attributes*/
> >> +    s16    epp_powersave;
> >> +    s16    epp_policy;
> >> +    s16    epp_cached;
> >> +    u32    policy;
> >> +    u32    sched_flags;
> >> +    bool    update_util_set;
> >> +    u64    last_update;
> >> +    u64    last_io_update;
> >> +    u32    cppc_boost_min;
> >> +    u64    cppc_cap1_cached;
> >> +    struct    update_util_data update_util;
> >> +    struct    amd_aperf_mperf sample; };
> >> +
> >> +/**
> >> + * struct amd_pstate_params - global parameters for the performance
> >> +control
> >> + * @ cppc_boost_disabled wheher the core performance boost disabled
> >> +*/ struct amd_pstate_params {
> >> +    bool cppc_boost_disabled;
> >> +};
> >> +
> >> +#define AMD_CPPC_EPP_PERFORMANCE        0x00 #define
> >> +AMD_CPPC_EPP_BALANCE_PERFORMANCE    0x80 #define
> >> +AMD_CPPC_EPP_BALANCE_POWERSAVE        0xBF #define
> >> +AMD_CPPC_EPP_POWERSAVE            0xFF
> >> +
> >> +/*
> >> + * AMD Energy Preference Performance (EPP)
> >> + * The EPP is used in the CCLK DPM controller to drive
> >> + * the frequency that a core is going to operate during
> >> + * short periods of activity. EPP values will be utilized for
> >> + * different OS profiles (balanced, performance, power savings)
> >> + * display strings corresponding to EPP index in the
> >> + * energy_perf_strings[]
> >> + *    index        String
> >> + *-------------------------------------
> >> + *    0        default
> >> + *    1        performance
> >> + *    2        balance_performance
> >> + *    3        balance_power
> >> + *    4        power
> >> + */
> >> +enum energy_perf_value_index {
> >> +    EPP_INDEX_DEFAULT = 0,
> >> +    EPP_INDEX_PERFORMANCE,
> >> +    EPP_INDEX_BALANCE_PERFORMANCE,
> >> +    EPP_INDEX_BALANCE_POWERSAVE,
> >> +    EPP_INDEX_POWERSAVE,
> >> +};
> >> +
> >> +static const char * const energy_perf_strings[] = {
> >> +    [EPP_INDEX_DEFAULT] = "default",
> >> +    [EPP_INDEX_PERFORMANCE] = "performance",
> >> +    [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance",
> >> +    [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power",
> >> +    [EPP_INDEX_POWERSAVE] = "power",
> >> +    NULL
> >> +};
> >> +
> >> +static unsigned int epp_values[] = {
> >> +    [EPP_INDEX_DEFAULT] = 0,
> >> +    [EPP_INDEX_PERFORMANCE] = AMD_CPPC_EPP_PERFORMANCE,
> >> +    [EPP_INDEX_BALANCE_PERFORMANCE] =
> >> +AMD_CPPC_EPP_BALANCE_PERFORMANCE,
> >> +    [EPP_INDEX_BALANCE_POWERSAVE] =
> AMD_CPPC_EPP_BALANCE_POWERSAVE,
> >> +    [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE,
> >>   };
> >>     #endif /* _LINUX_AMD_PSTATE_H */
> >
  

Patch

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 14906431dc15..eb82bc6a7f66 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -60,8 +60,136 @@ 
  * module parameter to be able to enable it manually for debugging.
  */
 static bool shared_mem __read_mostly;
+static int cppc_active __read_mostly;
+static int disable_pstate_load __initdata;
+static int epp_off __initdata;
 
-static struct cpufreq_driver amd_pstate_driver;
+static struct cpufreq_driver *default_pstate_driver;
+static struct amd_cpudata **all_cpu_data;
+
+static struct amd_pstate_params global_params;
+
+static DEFINE_MUTEX(amd_pstate_limits_lock);
+static DEFINE_MUTEX(amd_pstate_driver_lock);
+
+static bool cppc_boost __read_mostly;
+struct kobject *amd_pstate_kobj;
+
+#ifdef CONFIG_ACPI_CPPC_LIB
+static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached)
+{
+	s16 epp;
+	struct cppc_perf_caps perf_caps;
+	int ret;
+
+	if (boot_cpu_has(X86_FEATURE_CPPC)) {
+		if (!cppc_req_cached) {
+			epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
+					    &cppc_req_cached);
+			if (epp)
+				return epp;
+		}
+		epp = (cppc_req_cached >> 24) & 0xFF;
+	} else {
+		ret = cppc_get_epp_caps(cpudata->cpu, &perf_caps);
+		if (ret < 0) {
+			pr_debug("Could not retrieve energy perf value (%d)\n", ret);
+			return -EIO;
+		}
+		epp = (s16) perf_caps.energy_perf;
+	}
+
+	return epp;
+}
+#endif
+
+static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata, int *raw_epp)
+{
+	s16 epp;
+	int index = -EINVAL;
+
+	*raw_epp = 0;
+	epp = amd_pstate_get_epp(cpudata, 0);
+	if (epp < 0)
+		return epp;
+
+	switch (epp) {
+	case AMD_CPPC_EPP_PERFORMANCE:
+		index = EPP_INDEX_PERFORMANCE;
+		break;
+	case AMD_CPPC_EPP_BALANCE_PERFORMANCE:
+		index = EPP_INDEX_BALANCE_PERFORMANCE;
+		break;
+	case AMD_CPPC_EPP_BALANCE_POWERSAVE:
+		index = EPP_INDEX_BALANCE_POWERSAVE;
+		break;
+	case AMD_CPPC_EPP_POWERSAVE:
+		index = EPP_INDEX_POWERSAVE;
+		break;
+	default:
+		*raw_epp = epp;
+		index = 0;
+	}
+
+	return index;
+}
+
+#ifdef CONFIG_ACPI_CPPC_LIB
+static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp)
+{
+	int ret;
+	struct cppc_perf_ctrls perf_ctrls;
+
+	if (boot_cpu_has(X86_FEATURE_CPPC)) {
+		u64 value = READ_ONCE(cpudata->cppc_req_cached);
+
+		value &= ~GENMASK_ULL(31, 24);
+		value |= (u64)epp << 24;
+		WRITE_ONCE(cpudata->cppc_req_cached, value);
+
+		ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
+		if (!ret)
+			cpudata->epp_cached = epp;
+	} else {
+		perf_ctrls.energy_perf = epp;
+		ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls);
+		if (ret) {
+			pr_debug("failed to set energy perf value (%d)\n", ret);
+			return ret;
+		}
+		cpudata->epp_cached = epp;
+	}
+
+	return ret;
+}
+
+static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata,
+					      int pref_index, bool use_raw,
+					      u32 raw_epp)
+{
+	int epp = -EINVAL;
+	int ret;
+
+	if (!pref_index) {
+		pr_debug("EPP pref_index is invalid\n");
+		return -EINVAL;
+	}
+
+	if (use_raw)
+		epp = raw_epp;
+	else if (epp == -EINVAL)
+		epp = epp_values[pref_index];
+
+	if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) {
+		pr_debug("EPP cannot be set under performance policy\n");
+		return -EBUSY;
+	}
+
+	ret = amd_pstate_set_epp(cpudata, epp);
+
+	return ret;
+}
+#endif
 
 static inline int pstate_enable(bool enable)
 {
@@ -71,11 +199,25 @@  static inline int pstate_enable(bool enable)
 static int cppc_enable(bool enable)
 {
 	int cpu, ret = 0;
+	struct cppc_perf_ctrls perf_ctrls;
 
 	for_each_present_cpu(cpu) {
 		ret = cppc_set_enable(cpu, enable);
 		if (ret)
 			return ret;
+
+		/* Enable autonomous mode for EPP */
+		if (!cppc_active) {
+			ret = cppc_set_auto_epp(cpu, enable);
+			if (ret)
+				return ret;
+
+			/* Set desired perf as zero to allow EPP firmware control */
+			perf_ctrls.desired_perf = 0;
+			ret = cppc_set_perf(cpu, &perf_ctrls);
+			if (ret)
+				return ret;
+		}
 	}
 
 	return ret;
@@ -418,7 +560,7 @@  static void amd_pstate_boost_init(struct amd_cpudata *cpudata)
 		return;
 
 	cpudata->boost_supported = true;
-	amd_pstate_driver.boost_enabled = true;
+	default_pstate_driver->boost_enabled = true;
 }
 
 static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
@@ -582,10 +724,74 @@  static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy,
 	return sprintf(&buf[0], "%u\n", perf);
 }
 
+static ssize_t show_energy_performance_available_preferences(
+				struct cpufreq_policy *policy, char *buf)
+{
+	int i = 0;
+	int ret = 0;
+
+	while (energy_perf_strings[i] != NULL)
+		ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]);
+
+	ret += sprintf(&buf[ret], "\n");
+
+	return ret;
+}
+
+static ssize_t store_energy_performance_preference(
+		struct cpufreq_policy *policy, const char *buf, size_t count)
+{
+	struct amd_cpudata *cpudata = policy->driver_data;
+	char str_preference[21];
+	bool raw = false;
+	ssize_t ret;
+	u32 epp = 0;
+
+	ret = sscanf(buf, "%20s", str_preference);
+	if (ret != 1)
+		return -EINVAL;
+
+	ret = match_string(energy_perf_strings, -1, str_preference);
+	if (ret < 0) {
+		ret = kstrtouint(buf, 10, &epp);
+		if (ret)
+			return ret;
+
+		if ((epp > 255) || (epp < 0))
+			return -EINVAL;
+
+		raw = true;
+	}
+
+	mutex_lock(&amd_pstate_limits_lock);
+	ret = amd_pstate_set_energy_pref_index(cpudata, ret, raw, epp);
+	mutex_unlock(&amd_pstate_limits_lock);
+
+	return ret ?: count;
+}
+
+static ssize_t show_energy_performance_preference(
+				struct cpufreq_policy *policy, char *buf)
+{
+	struct amd_cpudata *cpudata = policy->driver_data;
+	int preference, raw_epp;
+
+	preference = amd_pstate_get_energy_pref_index(cpudata, &raw_epp);
+	if (preference < 0)
+		return preference;
+
+	if (raw_epp)
+		return  sprintf(buf, "%d\n", raw_epp);
+	else
+		return  sprintf(buf, "%s\n", energy_perf_strings[preference]);
+}
+
 cpufreq_freq_attr_ro(amd_pstate_max_freq);
 cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq);
 
 cpufreq_freq_attr_ro(amd_pstate_highest_perf);
+cpufreq_freq_attr_rw(energy_performance_preference);
+cpufreq_freq_attr_ro(energy_performance_available_preferences);
 
 static struct freq_attr *amd_pstate_attr[] = {
 	&amd_pstate_max_freq,
@@ -594,6 +800,415 @@  static struct freq_attr *amd_pstate_attr[] = {
 	NULL,
 };
 
+static struct freq_attr *amd_pstate_epp_attr[] = {
+	&amd_pstate_max_freq,
+	&amd_pstate_lowest_nonlinear_freq,
+	&amd_pstate_highest_perf,
+	&energy_performance_preference,
+	&energy_performance_available_preferences,
+	NULL,
+};
+
+static inline void update_boost_state(void)
+{
+	u64 misc_en;
+	struct amd_cpudata *cpudata;
+
+	cpudata = all_cpu_data[0];
+	rdmsrl(MSR_K7_HWCR, misc_en);
+	global_params.cppc_boost_disabled = misc_en & BIT_ULL(25);
+}
+
+static int amd_pstate_init_cpu(unsigned int cpunum)
+{
+	struct amd_cpudata *cpudata;
+
+	cpudata = all_cpu_data[cpunum];
+	if (!cpudata) {
+		cpudata = kzalloc(sizeof(*cpudata), GFP_KERNEL);
+		if (!cpudata)
+			return -ENOMEM;
+		WRITE_ONCE(all_cpu_data[cpunum], cpudata);
+
+		cpudata->cpu = cpunum;
+	}
+	cpudata->epp_powersave = -EINVAL;
+	cpudata->epp_policy = 0;
+	pr_debug("controlling: cpu %d\n", cpunum);
+	return 0;
+}
+
+static int __amd_pstate_cpu_init(struct cpufreq_policy *policy)
+{
+	int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret;
+	struct amd_cpudata *cpudata;
+	struct device *dev;
+	int rc;
+	u64 value;
+
+	rc = amd_pstate_init_cpu(policy->cpu);
+	if (rc)
+		return rc;
+
+	cpudata = all_cpu_data[policy->cpu];
+
+	dev = get_cpu_device(policy->cpu);
+	if (!dev)
+		goto free_cpudata1;
+
+	rc = amd_pstate_init_perf(cpudata);
+	if (rc)
+		goto free_cpudata1;
+
+	min_freq = amd_get_min_freq(cpudata);
+	max_freq = amd_get_max_freq(cpudata);
+	nominal_freq = amd_get_nominal_freq(cpudata);
+	lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata);
+	if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) {
+		dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n",
+				min_freq, max_freq);
+		ret = -EINVAL;
+		goto free_cpudata1;
+	}
+
+	policy->min = min_freq;
+	policy->max = max_freq;
+
+	policy->cpuinfo.min_freq = min_freq;
+	policy->cpuinfo.max_freq = max_freq;
+	/* It will be updated by governor */
+	policy->cur = policy->cpuinfo.min_freq;
+
+	/* Initial processor data capability frequencies */
+	cpudata->max_freq = max_freq;
+	cpudata->min_freq = min_freq;
+	cpudata->nominal_freq = nominal_freq;
+	cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq;
+
+	policy->driver_data = cpudata;
+
+	update_boost_state();
+	cpudata->epp_cached = amd_pstate_get_epp(cpudata, value);
+
+	policy->min = policy->cpuinfo.min_freq;
+	policy->max = policy->cpuinfo.max_freq;
+
+	if (boot_cpu_has(X86_FEATURE_CPPC))
+		policy->fast_switch_possible = true;
+
+	if (!shared_mem && boot_cpu_has(X86_FEATURE_CPPC)) {
+		ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value);
+		if (ret)
+			return ret;
+		WRITE_ONCE(cpudata->cppc_req_cached, value);
+
+		ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &value);
+		if (ret)
+			return ret;
+		WRITE_ONCE(cpudata->cppc_cap1_cached, value);
+	}
+	amd_pstate_boost_init(cpudata);
+
+	return 0;
+
+free_cpudata1:
+	kfree(cpudata);
+	return ret;
+}
+
+static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
+{
+	int ret;
+
+	ret = __amd_pstate_cpu_init(policy);
+	if (ret)
+		return ret;
+	/*
+	 * Set the policy to powersave to provide a valid fallback value in case
+	 * the default cpufreq governor is neither powersave nor performance.
+	 */
+	policy->policy = CPUFREQ_POLICY_POWERSAVE;
+
+	return 0;
+}
+
+static int amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy)
+{
+	pr_debug("amd-pstate: CPU %d exiting\n", policy->cpu);
+	policy->fast_switch_possible = false;
+	return 0;
+}
+
+static void amd_pstate_update_max_freq(unsigned int cpu)
+{
+	struct cpufreq_policy *policy = policy = cpufreq_cpu_get(cpu);
+
+	if (!policy)
+		return;
+
+	refresh_frequency_limits(policy);
+	cpufreq_cpu_put(policy);
+}
+
+static void amd_pstate_epp_update_limits(unsigned int cpu)
+{
+	mutex_lock(&amd_pstate_driver_lock);
+	update_boost_state();
+	if (global_params.cppc_boost_disabled) {
+		for_each_possible_cpu(cpu)
+			amd_pstate_update_max_freq(cpu);
+	} else {
+		cpufreq_update_policy(cpu);
+	}
+	mutex_unlock(&amd_pstate_driver_lock);
+}
+
+static int cppc_boost_hold_time_ns = 3 * NSEC_PER_MSEC;
+
+static inline void amd_pstate_boost_up(struct amd_cpudata *cpudata)
+{
+	u64 hwp_req = READ_ONCE(cpudata->cppc_req_cached);
+	u64 hwp_cap = READ_ONCE(cpudata->cppc_cap1_cached);
+	u32 max_limit = (hwp_req & 0xff);
+	u32 min_limit = (hwp_req & 0xff00) >> 8;
+	u32 boost_level1;
+
+	/* If max and min are equal or already at max, nothing to boost */
+	if (max_limit == min_limit)
+		return;
+
+	/* Set boost max and min to initial value */
+	if (!cpudata->cppc_boost_min)
+		cpudata->cppc_boost_min = min_limit;
+
+	boost_level1 = ((AMD_CPPC_NOMINAL_PERF(hwp_cap) + min_limit) >> 1);
+
+	if (cpudata->cppc_boost_min < boost_level1)
+		cpudata->cppc_boost_min = boost_level1;
+	else if (cpudata->cppc_boost_min < AMD_CPPC_NOMINAL_PERF(hwp_cap))
+		cpudata->cppc_boost_min = AMD_CPPC_NOMINAL_PERF(hwp_cap);
+	else if (cpudata->cppc_boost_min == AMD_CPPC_NOMINAL_PERF(hwp_cap))
+		cpudata->cppc_boost_min = max_limit;
+	else
+		return;
+
+	hwp_req &= ~AMD_CPPC_MIN_PERF(~0L);
+	hwp_req |= AMD_CPPC_MIN_PERF(cpudata->cppc_boost_min);
+	wrmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, hwp_req);
+	cpudata->last_update = cpudata->sample.time;
+}
+
+static inline void amd_pstate_boost_down(struct amd_cpudata *cpudata)
+{
+	bool expired;
+
+	if (cpudata->cppc_boost_min) {
+		expired = time_after64(cpudata->sample.time, cpudata->last_update +
+					cppc_boost_hold_time_ns);
+
+		if (expired) {
+			wrmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
+						cpudata->cppc_req_cached);
+			cpudata->cppc_boost_min = 0;
+		}
+	}
+
+	cpudata->last_update = cpudata->sample.time;
+}
+
+static inline void amd_pstate_boost_update_util(struct amd_cpudata *cpudata,
+						      u64 time)
+{
+	cpudata->sample.time = time;
+	if (smp_processor_id() != cpudata->cpu)
+		return;
+
+	if (cpudata->sched_flags & SCHED_CPUFREQ_IOWAIT) {
+		bool do_io = false;
+
+		cpudata->sched_flags = 0;
+		/*
+		 * Set iowait_boost flag and update time. Since IO WAIT flag
+		 * is set all the time, we can't just conclude that there is
+		 * some IO bound activity is scheduled on this CPU with just
+		 * one occurrence. If we receive at least two in two
+		 * consecutive ticks, then we treat as boost candidate.
+		 * This is leveraged from Intel Pstate driver.
+		 */
+		if (time_before64(time, cpudata->last_io_update + 2 * TICK_NSEC))
+			do_io = true;
+
+		cpudata->last_io_update = time;
+
+		if (do_io)
+			amd_pstate_boost_up(cpudata);
+
+	} else {
+		amd_pstate_boost_down(cpudata);
+	}
+}
+
+static inline void amd_pstate_cppc_update_hook(struct update_util_data *data,
+						u64 time, unsigned int flags)
+{
+	struct amd_cpudata *cpudata = container_of(data,
+				struct amd_cpudata, update_util);
+
+	cpudata->sched_flags |= flags;
+
+	if (smp_processor_id() == cpudata->cpu)
+		amd_pstate_boost_update_util(cpudata, time);
+}
+
+static void amd_pstate_clear_update_util_hook(unsigned int cpu)
+{
+	struct amd_cpudata *cpudata = all_cpu_data[cpu];
+
+	if (!cpudata->update_util_set)
+		return;
+
+	cpufreq_remove_update_util_hook(cpu);
+	cpudata->update_util_set = false;
+	synchronize_rcu();
+}
+
+static void amd_pstate_set_update_util_hook(unsigned int cpu_num)
+{
+	struct amd_cpudata *cpudata = all_cpu_data[cpu_num];
+
+	if (!cppc_boost) {
+		if (cpudata->update_util_set)
+			amd_pstate_clear_update_util_hook(cpudata->cpu);
+		return;
+	}
+
+	if (cpudata->update_util_set)
+		return;
+
+	cpudata->sample.time = 0;
+	cpufreq_add_update_util_hook(cpu_num, &cpudata->update_util,
+						amd_pstate_cppc_update_hook);
+	cpudata->update_util_set = true;
+}
+
+static void amd_pstate_epp_init(unsigned int cpu)
+{
+	struct amd_cpudata *cpudata = all_cpu_data[cpu];
+	u32 max_perf, min_perf;
+	u64 value;
+	s16 epp;
+	int ret;
+
+	max_perf = READ_ONCE(cpudata->highest_perf);
+	min_perf = READ_ONCE(cpudata->lowest_perf);
+
+	value = READ_ONCE(cpudata->cppc_req_cached);
+
+	if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
+		min_perf = max_perf;
+
+	/* Initial min/max values for CPPC Performance Controls Register */
+	value &= ~AMD_CPPC_MIN_PERF(~0L);
+	value |= AMD_CPPC_MIN_PERF(min_perf);
+
+	value &= ~AMD_CPPC_MAX_PERF(~0L);
+	value |= AMD_CPPC_MAX_PERF(max_perf);
+
+	/* CPPC EPP feature require to set zero to the desire perf bit */
+	value &= ~AMD_CPPC_DES_PERF(~0L);
+	value |= AMD_CPPC_DES_PERF(0);
+
+	if (cpudata->epp_policy == cpudata->policy)
+		goto skip_epp;
+
+	cpudata->epp_policy = cpudata->policy;
+
+	if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) {
+		epp = amd_pstate_get_epp(cpudata, value);
+		cpudata->epp_powersave = epp;
+		if (epp < 0)
+			goto skip_epp;
+		/* force the epp value to be zero for performance policy */
+		epp = 0;
+	} else {
+		if (cpudata->epp_powersave < 0)
+			goto skip_epp;
+		/* Get BIOS pre-defined epp value */
+		epp = amd_pstate_get_epp(cpudata, value);
+		if (epp)
+			goto skip_epp;
+		epp = cpudata->epp_powersave;
+	}
+	/* Set initial EPP value */
+	if (boot_cpu_has(X86_FEATURE_CPPC)) {
+		value &= ~GENMASK_ULL(31, 24);
+		value |= (u64)epp << 24;
+	}
+
+skip_epp:
+	WRITE_ONCE(cpudata->cppc_req_cached, value);
+	ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
+	if (!ret)
+		cpudata->epp_cached = epp;
+}
+
+static void amd_pstate_set_max_limits(struct amd_cpudata *cpudata)
+{
+	u64 hwp_cap = READ_ONCE(cpudata->cppc_cap1_cached);
+	u64 hwp_req = READ_ONCE(cpudata->cppc_req_cached);
+	u32 max_limit = (hwp_cap >> 24) & 0xff;
+
+	hwp_req &= ~AMD_CPPC_MIN_PERF(~0L);
+	hwp_req |= AMD_CPPC_MIN_PERF(max_limit);
+	wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, hwp_req);
+}
+
+static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
+{
+	struct amd_cpudata *cpudata;
+
+	if (!policy->cpuinfo.max_freq)
+		return -ENODEV;
+
+	pr_debug("set_policy: cpuinfo.max %u policy->max %u\n",
+				policy->cpuinfo.max_freq, policy->max);
+
+	cpudata = all_cpu_data[policy->cpu];
+	cpudata->policy = policy->policy;
+
+	if (boot_cpu_has(X86_FEATURE_CPPC)) {
+		mutex_lock(&amd_pstate_limits_lock);
+
+		if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) {
+			amd_pstate_clear_update_util_hook(policy->cpu);
+			amd_pstate_set_max_limits(cpudata);
+		} else {
+			amd_pstate_set_update_util_hook(policy->cpu);
+		}
+
+		if (boot_cpu_has(X86_FEATURE_CPPC))
+			amd_pstate_epp_init(policy->cpu);
+
+		mutex_unlock(&amd_pstate_limits_lock);
+	}
+
+	return 0;
+}
+
+static void amd_pstate_verify_cpu_policy(struct amd_cpudata *cpudata,
+					   struct cpufreq_policy_data *policy)
+{
+	update_boost_state();
+	cpufreq_verify_within_cpu_limits(policy);
+}
+
+static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy)
+{
+	amd_pstate_verify_cpu_policy(all_cpu_data[policy->cpu], policy);
+	pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min);
+	return 0;
+}
+
 static struct cpufreq_driver amd_pstate_driver = {
 	.flags		= CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS,
 	.verify		= amd_pstate_verify,
@@ -607,8 +1222,20 @@  static struct cpufreq_driver amd_pstate_driver = {
 	.attr		= amd_pstate_attr,
 };
 
+static struct cpufreq_driver amd_pstate_epp_driver = {
+	.flags		= CPUFREQ_CONST_LOOPS,
+	.verify		= amd_pstate_epp_verify_policy,
+	.setpolicy	= amd_pstate_epp_set_policy,
+	.init		= amd_pstate_epp_cpu_init,
+	.exit		= amd_pstate_epp_cpu_exit,
+	.update_limits	= amd_pstate_epp_update_limits,
+	.name		= "amd_pstate_epp",
+	.attr		= amd_pstate_epp_attr,
+};
+
 static int __init amd_pstate_init(void)
 {
+	static struct amd_cpudata **cpudata;
 	int ret;
 
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
@@ -623,10 +1250,18 @@  static int __init amd_pstate_init(void)
 	if (cpufreq_get_current_driver())
 		return -EEXIST;
 
+	if (!epp_off) {
+		WRITE_ONCE(cppc_active, 1);
+		if (!default_pstate_driver)
+			default_pstate_driver = &amd_pstate_epp_driver;
+	}
+	pr_info("AMD CPPC loading with %s driver instance.\n", default_pstate_driver->name);
+
 	/* capability check */
 	if (boot_cpu_has(X86_FEATURE_CPPC)) {
+		if (!cppc_active)
+			default_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
 		pr_debug("AMD CPPC MSR based functionality is supported\n");
-		amd_pstate_driver.adjust_perf = amd_pstate_adjust_perf;
 	} else if (shared_mem) {
 		static_call_update(amd_pstate_enable, cppc_enable);
 		static_call_update(amd_pstate_init_perf, cppc_init_perf);
@@ -636,6 +1271,10 @@  static int __init amd_pstate_init(void)
 		return -ENODEV;
 	}
 
+	cpudata = vzalloc(array_size(sizeof(void *), num_possible_cpus()));
+	if (!cpudata)
+		return -ENOMEM;
+	WRITE_ONCE(all_cpu_data, cpudata);
 	/* enable amd pstate feature */
 	ret = amd_pstate_enable(true);
 	if (ret) {
@@ -643,9 +1282,9 @@  static int __init amd_pstate_init(void)
 		return ret;
 	}
 
-	ret = cpufreq_register_driver(&amd_pstate_driver);
+	ret = cpufreq_register_driver(default_pstate_driver);
 	if (ret)
-		pr_err("failed to register amd_pstate_driver with return %d\n",
+		pr_err("failed to register amd pstate driver with return %d\n",
 		       ret);
 
 	return ret;
@@ -657,6 +1296,15 @@  static int __init amd_pstate_param(char *str)
 	if (!str)
 		return -EINVAL;
 
+	if (!strcmp(str, "disable"))
+		disable_pstate_load = 1;
+	else if (!strcmp(str, "active")) {
+		default_pstate_driver = &amd_pstate_epp_driver;
+	} else if (!strcmp(str, "passive")) {
+		epp_off = 1;
+		default_pstate_driver = &amd_pstate_driver;
+	}
+
 	/* enable shared memory type CPPC ,if you processor has no MSR, you have to add this
 	 * to your grub to make cppc driver loaded successfully.
 	 */
diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
index 1c4b8659f171..7e6e8cab97b3 100644
--- a/include/linux/amd-pstate.h
+++ b/include/linux/amd-pstate.h
@@ -25,6 +25,7 @@  struct amd_aperf_mperf {
 	u64 aperf;
 	u64 mperf;
 	u64 tsc;
+	u64 time;
 };
 
 /**
@@ -47,6 +48,18 @@  struct amd_aperf_mperf {
  * @prev: Last Aperf/Mperf/tsc count value read from register
  * @freq: current cpu frequency value
  * @boost_supported: check whether the Processor or SBIOS supports boost mode
+ * @epp_powersave: Last saved CPPC energy performance preference
+				when policy switched to performance
+ * @epp_policy: Last saved policy used to set energy-performance preference
+ * @epp_cached: Cached CPPC energy-performance preference value
+ * @policy: Cpufreq policy value
+ * @sched_flags: Store scheduler flags for possible cross CPU update
+ * @update_util_set: CPUFreq utility callback is set
+ * @last_update: Time stamp of the last performance state update
+ * @cppc_boost_min: Last CPPC boosted min performance state
+ * @cppc_cap1_cached: Cached value of the last CPPC Capabilities MSR
+ * @update_util: Cpufreq utility callback information
+ * @sample: the stored performance sample
  *
  * The amd_cpudata is key private data for each CPU thread in AMD P-State, and
  * represents all the attributes and goals that AMD P-State requests at runtime.
@@ -72,6 +85,74 @@  struct amd_cpudata {
 
 	u64	freq;
 	bool	boost_supported;
+
+	/* EPP feature related attributes*/
+	s16	epp_powersave;
+	s16	epp_policy;
+	s16	epp_cached;
+	u32	policy;
+	u32	sched_flags;
+	bool	update_util_set;
+	u64	last_update;
+	u64	last_io_update;
+	u32	cppc_boost_min;
+	u64	cppc_cap1_cached;
+	struct	update_util_data update_util;
+	struct	amd_aperf_mperf sample;
+};
+
+/**
+ * struct amd_pstate_params - global parameters for the performance control
+ * @ cppc_boost_disabled wheher the core performance boost disabled
+ */
+struct amd_pstate_params {
+	bool cppc_boost_disabled;
+};
+
+#define AMD_CPPC_EPP_PERFORMANCE		0x00
+#define AMD_CPPC_EPP_BALANCE_PERFORMANCE	0x80
+#define AMD_CPPC_EPP_BALANCE_POWERSAVE		0xBF
+#define AMD_CPPC_EPP_POWERSAVE			0xFF
+
+/*
+ * AMD Energy Preference Performance (EPP)
+ * The EPP is used in the CCLK DPM controller to drive
+ * the frequency that a core is going to operate during
+ * short periods of activity. EPP values will be utilized for
+ * different OS profiles (balanced, performance, power savings)
+ * display strings corresponding to EPP index in the
+ * energy_perf_strings[]
+ *	index		String
+ *-------------------------------------
+ *	0		default
+ *	1		performance
+ *	2		balance_performance
+ *	3		balance_power
+ *	4		power
+ */
+enum energy_perf_value_index {
+	EPP_INDEX_DEFAULT = 0,
+	EPP_INDEX_PERFORMANCE,
+	EPP_INDEX_BALANCE_PERFORMANCE,
+	EPP_INDEX_BALANCE_POWERSAVE,
+	EPP_INDEX_POWERSAVE,
+};
+
+static const char * const energy_perf_strings[] = {
+	[EPP_INDEX_DEFAULT] = "default",
+	[EPP_INDEX_PERFORMANCE] = "performance",
+	[EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance",
+	[EPP_INDEX_BALANCE_POWERSAVE] = "balance_power",
+	[EPP_INDEX_POWERSAVE] = "power",
+	NULL
+};
+
+static unsigned int epp_values[] = {
+	[EPP_INDEX_DEFAULT] = 0,
+	[EPP_INDEX_PERFORMANCE] = AMD_CPPC_EPP_PERFORMANCE,
+	[EPP_INDEX_BALANCE_PERFORMANCE] = AMD_CPPC_EPP_BALANCE_PERFORMANCE,
+	[EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE,
+	[EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE,
 };
 
 #endif /* _LINUX_AMD_PSTATE_H */