Xen project Mailing List

Re: [PATCH v4 09/15] xen/x86: introduce a new amd cppc driver for cpufreq scaling

Date: Tue, 29 Apr 2025 16:29:16 +0200

Autocrypt: addr=jbeulich@xxxxxxxx; keydata= xsDiBFk3nEQRBADAEaSw6zC/EJkiwGPXbWtPxl2xCdSoeepS07jW8UgcHNurfHvUzogEq5xk hu507c3BarVjyWCJOylMNR98Yd8VqD9UfmX0Hb8/BrA+Hl6/DB/eqGptrf4BSRwcZQM32aZK 7Pj2XbGWIUrZrd70x1eAP9QE3P79Y2oLrsCgbZJfEwCgvz9JjGmQqQkRiTVzlZVCJYcyGGsD /0tbFCzD2h20ahe8rC1gbb3K3qk+LpBtvjBu1RY9drYk0NymiGbJWZgab6t1jM7sk2vuf0Py O9Hf9XBmK0uE9IgMaiCpc32XV9oASz6UJebwkX+zF2jG5I1BfnO9g7KlotcA/v5ClMjgo6Gl MDY4HxoSRu3i1cqqSDtVlt+AOVBJBACrZcnHAUSuCXBPy0jOlBhxPqRWv6ND4c9PH1xjQ3NP nxJuMBS8rnNg22uyfAgmBKNLpLgAGVRMZGaGoJObGf72s6TeIqKJo/LtggAS9qAUiuKVnygo 3wjfkS9A3DRO+SpU7JqWdsveeIQyeyEJ/8PTowmSQLakF+3fote9ybzd880fSmFuIEJldWxp Y2ggPGpiZXVsaWNoQHN1c2UuY29tPsJgBBMRAgAgBQJZN5xEAhsDBgsJCAcDAgQVAggDBBYC AwECHgECF4AACgkQoDSui/t3IH4J+wCfQ5jHdEjCRHj23O/5ttg9r9OIruwAn3103WUITZee e7Sbg12UgcQ5lv7SzsFNBFk3nEQQCACCuTjCjFOUdi5Nm244F+78kLghRcin/awv+IrTcIWF hUpSs1Y91iQQ7KItirz5uwCPlwejSJDQJLIS+QtJHaXDXeV6NI0Uef1hP20+y8qydDiVkv6l IreXjTb7DvksRgJNvCkWtYnlS3mYvQ9NzS9PhyALWbXnH6sIJd2O9lKS1Mrfq+y0IXCP10eS FFGg+Av3IQeFatkJAyju0PPthyTqxSI4lZYuJVPknzgaeuJv/2NccrPvmeDg6Coe7ZIeQ8Yj t0ARxu2xytAkkLCel1Lz1WLmwLstV30g80nkgZf/wr+/BXJW/oIvRlonUkxv+IbBM3dX2OV8 AmRv1ySWPTP7AAMFB/9PQK/VtlNUJvg8GXj9ootzrteGfVZVVT4XBJkfwBcpC/XcPzldjv+3 HYudvpdNK3lLujXeA5fLOH+Z/G9WBc5pFVSMocI71I8bT8lIAzreg0WvkWg5V2WZsUMlnDL9 mpwIGFhlbM3gfDMs7MPMu8YQRFVdUvtSpaAs8OFfGQ0ia3LGZcjA6Ik2+xcqscEJzNH+qh8V m5jjp28yZgaqTaRbg3M/+MTbMpicpZuqF4rnB0AQD12/3BNWDR6bmh+EkYSMcEIpQmBM51qM EKYTQGybRCjpnKHGOxG0rfFY1085mBDZCH5Kx0cl0HVJuQKC+dV2ZY5AqjcKwAxpE75MLFkr wkkEGBECAAkFAlk3nEQCGwwACgkQoDSui/t3IH7nnwCfcJWUDUFKdCsBH/E5d+0ZnMQi+G0A nAuWpQkjM1ASeQwSHEeAWPgskBQL

Cc: ray.huang@xxxxxxx, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxxx

Delivery-date: Tue, 29 Apr 2025 14:29:42 +0000

List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

On 14.04.2025 09:40, Penny Zheng wrote: > --- a/xen/arch/x86/acpi/cpufreq/amd-cppc.c > +++ b/xen/arch/x86/acpi/cpufreq/amd-cppc.c > @@ -14,7 +14,56 @@ > #include <xen/domain.h> > #include <xen/init.h> > #include <xen/param.h> > +#include <xen/percpu.h> > +#include <xen/xvmalloc.h> > #include <acpi/cpufreq/cpufreq.h> > +#include <asm/amd.h> > +#include <asm/msr-index.h> > + > +#define amd_cppc_err(cpu, fmt, args...) \ > + printk(XENLOG_ERR "AMD_CPPC: CPU%u error: " fmt, cpu, ## args) > +#define amd_cppc_warn(cpu, fmt, args...) \ > + printk(XENLOG_WARNING "AMD_CPPC: CPU%u warning: " fmt, cpu, ## args) > +#define amd_cppc_verbose(cpu, fmt, args...) \ > +({ \ > + if ( cpufreq_verbose ) \ > + printk(XENLOG_DEBUG "AMD_CPPC: CPU%u " fmt, cpu, ## args); \ > +}) > + > +struct amd_cppc_drv_data > +{ > + const struct xen_processor_cppc *cppc_data; > + union { > + uint64_t raw; > + struct { > + unsigned int lowest_perf:8; > + unsigned int lowest_nonlinear_perf:8; > + unsigned int nominal_perf:8; > + unsigned int highest_perf:8; > + unsigned int :32; > + }; > + } caps; > + union { > + uint64_t raw; > + struct { > + unsigned int max_perf:8; > + unsigned int min_perf:8; > + unsigned int des_perf:8; > + unsigned int epp:8; > + unsigned int :32; > + }; > + } req; > + > + int err; > +}; > + > +static DEFINE_PER_CPU_READ_MOSTLY(struct amd_cppc_drv_data *, > + amd_cppc_drv_data); > +/* > + * Core max frequency read from PstateDef as anchor point > + * for freq-to-perf transition > + */ > +static DEFINE_PER_CPU_READ_MOSTLY(uint64_t, amd_max_pxfreq_mhz); Throughout here (and maybe also further down) - is the amd_ prefix really needed everywhere? Even the cppc part of the identifiers seems excessive in at least some of the cases. For this last one, also once again see the type related comment on patch 07. > @@ -51,10 +100,354 @@ int __init amd_cppc_cmdline_parse(const char *s, const > char *e) > return 0; > } > > +/* > + * If CPPC lowest_freq and nominal_freq registers are exposed then we can > + * use them to convert perf to freq and vice versa. The conversion is > + * extrapolated as an linear function passing by the 2 points: > + * - (Low perf, Low freq) > + * - (Nominal perf, Nominal freq) > + */ > +static int amd_cppc_khz_to_perf(const struct amd_cppc_drv_data *data, > + unsigned int freq, uint8_t *perf) > +{ > + const struct xen_processor_cppc *cppc_data = data->cppc_data; > + uint64_t mul, div; > + int offset = 0, res; > + > + if ( freq == (cppc_data->cpc.nominal_mhz * 1000) ) I'm pretty sure I commented on this before: The expression here _suggests_ that "freq" is in kHz, but that's not being made explicit anywhere. > + { > + *perf = data->caps.nominal_perf; > + return 0; > + } > + > + if ( freq == (cppc_data->cpc.lowest_mhz * 1000) ) > + { > + *perf = data->caps.lowest_perf; > + return 0; > + } How likely is it that these two early return paths are taken, when the incoming "freq" is 25 or 5 MHz granular? IOW - is it relevant to shortcut these two cases? > + if ( cppc_data->cpc.lowest_mhz && cppc_data->cpc.nominal_mhz && > + cppc_data->cpc.lowest_mhz < cppc_data->cpc.nominal_mhz ) Along the lines of a comment on an earlier patch, the middle part of the condition here is redundant with the 3rd one. Also, don't you check this relation already during init? IOW isn't it the 3rd part which can be dropped? > + { > + mul = data->caps.nominal_perf - data->caps.lowest_perf; > + div = cppc_data->cpc.nominal_mhz - cppc_data->cpc.lowest_mhz; > + > + /* > + * We don't need to convert to kHz for computing offset and can > + * directly use nominal_mhz and lowest_mhz as the division > + * will remove the frequency unit. > + */ > + offset = data->caps.nominal_perf - > + (mul * cppc_data->cpc.nominal_mhz) / div; > + } > + else > + { > + /* Read Processor Max Speed(MHz) as anchor point */ > + mul = data->caps.highest_perf; > + div = this_cpu(amd_max_pxfreq_mhz); > + if ( !div || div == INVAL_FREQ_MHZ ) Seeing this - do we need INVAL_FREQ_MHZ at all? Isn't 0 good enough an indicator of "data not available"? > + return -EINVAL; > + } > + > + res = offset + (mul * freq) / (div * 1000); > + if ( res > UINT8_MAX ) > + { > + printk_once(XENLOG_WARNING > + "Perf value exceeds maximum value 255: %d\n", res); > + *perf = 0xff; > + return 0; > + } else if ( res < 0 ) Nit: Style. And no real need for "else" here anyway (or alternatively for the "return 0", with an "else" added below). > + { > + printk_once(XENLOG_WARNING > + "Perf value smaller than minimum value 0: %d\n", res); > + *perf = 0; > + return 0; > + } > + *perf = (uint8_t)res; We don't normally spell out such conversions (i.e. please drop the cast). > + return 0; > +} > + > +static int amd_get_lowest_or_nominal_freq(const struct amd_cppc_drv_data > *data, Nothing in the function looks to limit it to "nominal" or "lowest", so them being in the identifier feels misleading. > + uint32_t cpc_mhz, uint8_t perf, > + unsigned int *freq) > +{ > + uint64_t mul, div, res; > + > + if ( !freq ) > + return -EINVAL; Is this needed? It's an internal function. > + if ( cpc_mhz ) > + { > + /* Switch to khz */ > + *freq = cpc_mhz * 1000; > + return 0; > + } > + > + /* Read Processor Max Speed(MHz) as anchor point */ > + mul = this_cpu(amd_max_pxfreq_mhz); > + if ( mul == INVAL_FREQ_MHZ || !mul ) > + { > + printk(XENLOG_ERR > + "Failed to read valid processor max frequency as anchor > point: %lu\n", > + mul); > + return -EINVAL; > + } > + div = data->caps.highest_perf; > + res = (mul * perf * 1000) / div; While there is a comment further up, clarifying function-wide that the result is to be in kHz would perhaps be better. > + if ( res > UINT_MAX || !res ) > + { > + printk(XENLOG_ERR > + "Frequeny exceeds maximum value UINT_MAX or being zero value: > %lu\n", Just "out of range"? > + res); > + return -EINVAL; And then -ERANGE here? > + } > + *freq = (unsigned int)res; See above. > +static int amd_get_max_freq(const struct amd_cppc_drv_data *data, > + unsigned int *max_freq) > +{ > + unsigned int nom_freq = 0, boost_ratio; > + int res; > + > + res = amd_get_lowest_or_nominal_freq(data, > + data->cppc_data->cpc.nominal_mhz, > + data->caps.nominal_perf, > + &nom_freq); > + if ( res ) > + return res; > + > + boost_ratio = (unsigned int)(data->caps.highest_perf / > + data->caps.nominal_perf); I may have seen logic ensuring nominal_perf isn't 0, so that part may be fine. What guarantees this division to yield a positive value, though? If it yields zero (say 0xff / 0x80), ... > + *max_freq = nom_freq * boost_ratio; ... zero will be reported back here. I think you want to scale the calculations here to avoid such. > +static void cf_check amd_cppc_init_msrs(void *info) > +{ > + struct cpufreq_policy *policy = info; > + struct amd_cppc_drv_data *data = this_cpu(amd_cppc_drv_data); > + uint64_t val; > + unsigned int min_freq = 0, nominal_freq = 0, max_freq; > + > + /* Package level MSR */ > + rdmsrl(MSR_AMD_CPPC_ENABLE, val); > + /* > + * Only when Enable bit is on, the hardware will calculate the > processor’s > + * performance capabilities and initialize the performance level fields > in > + * the CPPC capability registers. > + */ > + if ( !(val & AMD_CPPC_ENABLE) ) > + { > + val |= AMD_CPPC_ENABLE; > + wrmsrl(MSR_AMD_CPPC_ENABLE, val); > + } > + > + rdmsrl(MSR_AMD_CPPC_CAP1, data->caps.raw); > + > + if ( data->caps.highest_perf == 0 || data->caps.lowest_perf == 0 || > + data->caps.nominal_perf == 0 || data->caps.lowest_nonlinear_perf == > 0 || > + data->caps.lowest_perf > data->caps.lowest_nonlinear_perf || Same question as asked elsewhere - where is this relation specified? > + data->caps.lowest_nonlinear_perf > data->caps.nominal_perf || > + data->caps.nominal_perf > data->caps.highest_perf ) > + { > + amd_cppc_err(policy->cpu, > + "Platform malfunction, read CPPC capability > highest(%u), lowest(%u), nominal(%u), lowest_nonlinear(%u) zero value\n", The message wants shortening if at all possible, and it wants to not be misleading (saying "zero" when the issue may be something else). > + data->caps.highest_perf, data->caps.lowest_perf, > + data->caps.nominal_perf, > data->caps.lowest_nonlinear_perf); > + goto err; > + } > + > + amd_process_freq(cpu_data + policy->cpu, &cpu_data[policy->cpu] is generally preferred in such cases. > +static int cf_check amd_cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) > +{ > + unsigned int cpu = policy->cpu; > + struct amd_cppc_drv_data *data; > + const struct cpuinfo_x86 *c = cpu_data + cpu; > + > + data = xvzalloc(struct amd_cppc_drv_data); > + if ( !data ) > + return -ENOMEM; > + > + data->cppc_data = &processor_pminfo[cpu]->cppc_data; > + > + per_cpu(amd_cppc_drv_data, cpu) = data; > + > + /* Feature CPPC is firstly introduced on Zen2 */ > + if ( c->x86 < 0x17 ) > + { > + printk_once("Unsupported cpu family: %x\n", c->x86); > + return -EOPNOTSUPP; > + } With the cpu_has_cppc check in amd_cppc_register_driver(), is this check really needed? Jan

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.