[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v1 05/11] xen/x86: introduce a new amd pstate driver for cpufreq scaling


  • To: Penny Zheng <Penny.Zheng@xxxxxxx>
  • From: Jan Beulich <jbeulich@xxxxxxxx>
  • Date: Thu, 9 Jan 2025 11:55:17 +0100
  • Autocrypt: addr=jbeulich@xxxxxxxx; keydata= xsDiBFk3nEQRBADAEaSw6zC/EJkiwGPXbWtPxl2xCdSoeepS07jW8UgcHNurfHvUzogEq5xk hu507c3BarVjyWCJOylMNR98Yd8VqD9UfmX0Hb8/BrA+Hl6/DB/eqGptrf4BSRwcZQM32aZK 7Pj2XbGWIUrZrd70x1eAP9QE3P79Y2oLrsCgbZJfEwCgvz9JjGmQqQkRiTVzlZVCJYcyGGsD /0tbFCzD2h20ahe8rC1gbb3K3qk+LpBtvjBu1RY9drYk0NymiGbJWZgab6t1jM7sk2vuf0Py O9Hf9XBmK0uE9IgMaiCpc32XV9oASz6UJebwkX+zF2jG5I1BfnO9g7KlotcA/v5ClMjgo6Gl MDY4HxoSRu3i1cqqSDtVlt+AOVBJBACrZcnHAUSuCXBPy0jOlBhxPqRWv6ND4c9PH1xjQ3NP nxJuMBS8rnNg22uyfAgmBKNLpLgAGVRMZGaGoJObGf72s6TeIqKJo/LtggAS9qAUiuKVnygo 3wjfkS9A3DRO+SpU7JqWdsveeIQyeyEJ/8PTowmSQLakF+3fote9ybzd880fSmFuIEJldWxp Y2ggPGpiZXVsaWNoQHN1c2UuY29tPsJgBBMRAgAgBQJZN5xEAhsDBgsJCAcDAgQVAggDBBYC AwECHgECF4AACgkQoDSui/t3IH4J+wCfQ5jHdEjCRHj23O/5ttg9r9OIruwAn3103WUITZee e7Sbg12UgcQ5lv7SzsFNBFk3nEQQCACCuTjCjFOUdi5Nm244F+78kLghRcin/awv+IrTcIWF hUpSs1Y91iQQ7KItirz5uwCPlwejSJDQJLIS+QtJHaXDXeV6NI0Uef1hP20+y8qydDiVkv6l IreXjTb7DvksRgJNvCkWtYnlS3mYvQ9NzS9PhyALWbXnH6sIJd2O9lKS1Mrfq+y0IXCP10eS FFGg+Av3IQeFatkJAyju0PPthyTqxSI4lZYuJVPknzgaeuJv/2NccrPvmeDg6Coe7ZIeQ8Yj t0ARxu2xytAkkLCel1Lz1WLmwLstV30g80nkgZf/wr+/BXJW/oIvRlonUkxv+IbBM3dX2OV8 AmRv1ySWPTP7AAMFB/9PQK/VtlNUJvg8GXj9ootzrteGfVZVVT4XBJkfwBcpC/XcPzldjv+3 HYudvpdNK3lLujXeA5fLOH+Z/G9WBc5pFVSMocI71I8bT8lIAzreg0WvkWg5V2WZsUMlnDL9 mpwIGFhlbM3gfDMs7MPMu8YQRFVdUvtSpaAs8OFfGQ0ia3LGZcjA6Ik2+xcqscEJzNH+qh8V m5jjp28yZgaqTaRbg3M/+MTbMpicpZuqF4rnB0AQD12/3BNWDR6bmh+EkYSMcEIpQmBM51qM EKYTQGybRCjpnKHGOxG0rfFY1085mBDZCH5Kx0cl0HVJuQKC+dV2ZY5AqjcKwAxpE75MLFkr wkkEGBECAAkFAlk3nEQCGwwACgkQoDSui/t3IH7nnwCfcJWUDUFKdCsBH/E5d+0ZnMQi+G0A nAuWpQkjM1ASeQwSHEeAWPgskBQL
  • Cc: stefano.stabellini@xxxxxxx, Ray.Huang@xxxxxxx, Xenia.Ragiadakou@xxxxxxx, Jason.Andryuk@xxxxxxx, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxxx
  • Delivery-date: Thu, 09 Jan 2025 10:55:30 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

On 03.12.2024 09:11, Penny Zheng wrote:
> --- a/xen/arch/x86/acpi/cpufreq/amd-pstate.c
> +++ b/xen/arch/x86/acpi/cpufreq/amd-pstate.c
> @@ -15,6 +15,53 @@
>  #include <xen/init.h>
>  #include <xen/param.h>
>  #include <acpi/cpufreq/cpufreq.h>
> +#include <asm/msr.h>
> +
> +#define amd_pstate_err(cpu, fmt, args...) \
> +    printk(XENLOG_ERR "AMD_PSTATE: CPU%u error: " fmt, cpu, ## args)
> +#define amd_pstate_verbose(fmt, args...)                         \
> +({                                                               \
> +    if ( cpufreq_verbose )                                       \
> +        printk(XENLOG_DEBUG "AMD_PSTATE: " fmt, ## args);        \
> +})
> +#define amd_pstate_warn(fmt, args...) \
> +    printk(XENLOG_WARNING "AMD_PSTATE: CPU%u warning: " fmt, cpu, ## args)
> +
> +struct amd_pstate_drv_data
> +{
> +    struct xen_processor_cppc *cppc_data;
> +    union
> +    {
> +        uint64_t amd_caps;
> +        struct
> +        {
> +            unsigned int lowest_perf:8;
> +            unsigned int lowest_nonlinear_perf:8;
> +            unsigned int nominal_perf:8;
> +            unsigned int highest_perf:8;
> +            unsigned int :32;
> +        } hw;

Please can this be


    union {
        uint64_t raw;
        struct {
            unsigned int lowest_perf:8;
            unsigned int lowest_nonlinear_perf:8;
            unsigned int nominal_perf:8;
            unsigned int highest_perf:8;
            unsigned int :32;
        };
    } caps;

such that at use sites (e.g. amd_pstate_write_request()) it is possible to
actually spot that the same thing is being accessed through the two parts
of the union?

> +    };
> +    union
> +    {
> +        uint64_t amd_req;
> +        struct
> +        {
> +            unsigned int max_perf:8;
> +            unsigned int min_perf:8;
> +            unsigned int des_perf:8;
> +            unsigned int epp:8;
> +            unsigned int :32;
> +        } req;
> +    };

Along the same lines here then.

> +    int err;
> +
> +    uint32_t max_freq;
> +    uint32_t min_freq;
> +    uint32_t nominal_freq;
> +};
> +
> +static DEFINE_PER_CPU_READ_MOSTLY(struct amd_pstate_drv_data *, 
> amd_pstate_drv_data);
>  
>  uint16_t __read_mostly dmi_max_speed_mhz;
>  
> @@ -52,9 +99,298 @@ int __init amd_pstate_cmdline_parse(const char *s, const 
> char *e)
>      return 0;
>  }
>  
> +/*
> + * If CPPC lowest_freq and nominal_freq registers are exposed then we can
> + * use them to convert perf to freq and vice versa. The conversion is
> + * extrapolated as an affine function passing by the 2 points:
> + *  - (Low perf, Low freq)
> + *  - (Nominal perf, Nominal freq)
> + */
> +static unsigned int amd_pstate_khz_to_perf(struct amd_pstate_drv_data *data, 
> unsigned int freq)

Const?

> +{
> +    struct xen_processor_cppc* cppc_data = data->cppc_data;

Nit: Misplaced *. Const?

> +    uint64_t mul, div, offset = 0;
> +
> +    if ( freq == (cppc_data->nominal_freq * 1000) )
> +        return data->hw.nominal_perf;
> +
> +    if ( freq == (cppc_data->lowest_freq * 1000) )
> +        return data->hw.lowest_perf;
> +
> +    if ( (cppc_data->lowest_freq) && (cppc_data->nominal_freq) )
> +    {
> +        mul = data->hw.nominal_perf - data->hw.lowest_perf;
> +        div = cppc_data->nominal_freq - cppc_data->lowest_freq;
> +        /*
> +         * We don't need to convert to kHz for computing offset and can
> +         * directly use nominal_freq and lowest_freq as the division
> +         * will remove the frequency unit.
> +         */
> +        div = div ?: 1;
> +        offset = data->hw.nominal_perf - (mul * cppc_data->nominal_freq) / 
> div;
> +    }
> +    else
> +    {
> +        /* Read Processor Max Speed(mhz) from DMI table as anchor point */
> +        mul = data->hw.highest_perf;
> +        div = dmi_max_speed_mhz;

What if dmi_max_speed_mhz is still 0?

> +    }
> +
> +    return (unsigned int)(offset + (mul * freq ) / (div * 1000));

Nit: Excess blank before a closing parenthesis.

> +}
> +
> +static unsigned int amd_get_min_freq(struct amd_pstate_drv_data *data)
> +{
> +    struct xen_processor_cppc *cppc_data = data->cppc_data;
> +    uint64_t mul, div;
> +
> +    if ( cppc_data->lowest_freq )
> +        /* Switch to khz */
> +        return cppc_data->lowest_freq * 1000;
> +    else

Please avoid "else" when the earlier if() ends in an unconditional control
flow statement.

> +    {
> +        /* Read Processor Max Speed(mhz) from DMI table as anchor point */
> +        mul = dmi_max_speed_mhz;
> +        div = data->hw.highest_perf;
> +
> +        return (unsigned int)(mul / div) * data->hw.lowest_perf * 1000;

No risk of the cast chopping off set bits?

> +static unsigned int amd_get_nominal_freq(struct amd_pstate_drv_data *data)
> +{
> +    struct xen_processor_cppc *cppc_data = data->cppc_data;
> +    uint64_t mul, div;
> +
> +    if ( cppc_data->nominal_freq )
> +        /* Switch to khz */
> +        return cppc_data->nominal_freq * 1000;
> +    else
> +    {
> +        /* Read Processor Max Speed(mhz) from DMI table as anchor point */
> +        mul = dmi_max_speed_mhz;
> +        div = data->hw.highest_perf;
> +
> +        return (unsigned int)(mul / div) * data->hw.nominal_perf * 1000;
> +    }
> +}
> +
> +static unsigned int amd_get_max_freq(struct amd_pstate_drv_data *data)
> +{
> +    uint64_t max_perf, max_freq, nominal_freq, nominal_perf;
> +    uint64_t boost_ratio;
> +
> +    nominal_freq = amd_get_nominal_freq(data);
> +    nominal_perf = data->hw.nominal_perf;
> +    max_perf = data->hw.highest_perf;
> +
> +    boost_ratio = (uint64_t)(max_perf / nominal_perf);

What's the point of the cast here?

> +    max_freq = nominal_freq * boost_ratio;
> +
> +    return max_freq;
> +}
> +
> +static int cf_check amd_pstate_cpufreq_verify(struct cpufreq_policy *policy)
> +{
> +    struct amd_pstate_drv_data *data = per_cpu(amd_pstate_drv_data, 
> policy->cpu);

Const? (I won't further repeat this. Please make pointers pointer-to-
const wherever possible. We aim at having fully const-correct code.)

> +    cpufreq_verify_within_limits(policy, data->min_freq, data->max_freq);
> +
> +    return 0;
> +}
> +
> +static void amd_pstate_write_request_msrs(void *info)
> +{
> +    struct amd_pstate_drv_data *data =(struct amd_pstate_drv_data *)info;

Nit: Blank after = and no need for a cast.

> +    if ( wrmsr_safe(MSR_AMD_CPPC_REQ, data->amd_req) )
> +    {
> +        amd_pstate_verbose("Failed to wrmsr_safe(MSR_AMD_CPPC_REQ, %lx)\n",
> +                           data->amd_req);

If the was to ever trigger, it would likely get very noisy in the log.

> +        data->err = -EINVAL;
> +        return;
> +    }
> +    data->err = 0;
> +}
> +
> +static int cf_check amd_pstate_write_request(int cpu, uint8_t min_perf,
> +                                             uint8_t des_perf, uint8_t 
> max_perf)
> +{
> +    struct amd_pstate_drv_data *data = per_cpu(amd_pstate_drv_data, cpu);
> +    uint64_t prev = data->amd_req;
> +
> +    data->req.min_perf = min_perf;
> +    data->req.max_perf = max_perf;
> +    data->req.des_perf = des_perf;
> +
> +    if ( prev == data->amd_req )
> +        return 0;
> +
> +    on_selected_cpus(cpumask_of(cpu), amd_pstate_write_request_msrs, data, 
> 1);
> +
> +    return data->err;
> +}
> +
> +static int cf_check amd_pstate_cpufreq_target(struct cpufreq_policy *policy,
> +                                              unsigned int target_freq,
> +                                              unsigned int relation)
> +{
> +    unsigned int cpu = policy->cpu;
> +    struct amd_pstate_drv_data *data = per_cpu(amd_pstate_drv_data, cpu);
> +    uint64_t max_perf, min_perf, des_perf;
> +
> +    if ( unlikely(!target_freq) )
> +    {
> +        amd_pstate_warn("Not setting target frequency to zero\n");

Potentially overly noisy again.

> +        return 0;
> +    }
> +    max_perf = data->hw.highest_perf;
> +    min_perf = data->hw.lowest_nonlinear_perf;
> +    des_perf = amd_pstate_khz_to_perf(data, target_freq);
> +
> +    return amd_pstate_write_request(policy->cpu, min_perf, des_perf, 
> max_perf);

The use of intermadiate variables doesn't look very helpful here. Even more
so that in the course of the call 64-bit quantaties will be truncated to
8-bit ones.

> +}
> +
> +static void cf_check amd_pstate_init_msrs(void *info)
> +{
> +    struct cpufreq_policy *policy = info;
> +    struct amd_pstate_drv_data *data = this_cpu(amd_pstate_drv_data);
> +    uint64_t val;
> +    unsigned int min_freq, nominal_freq, max_freq;
> +
> +    /* Package level MSR */
> +    if ( rdmsr_safe(MSR_AMD_CPPC_ENABLE, val) )

Before trying this, wouldn't you better exclude certain very old families?
Even looking at a random Fam17 PPR I can't spot documentation of this MSR
(nor the other two), despite you merely saying Zen elsewhere (without any
version number).

> +    {
> +        amd_pstate_err(policy->cpu, "rdmsr_safe(MSR_AMD_CPPC_ENABLE)\n");
> +        data->err = -EINVAL;
> +        return;
> +    }
> +
> +    if ( !(val & AMD_CPPC_ENABLE) )
> +    {
> +        val |= AMD_CPPC_ENABLE;
> +        if ( wrmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
> +        {
> +            amd_pstate_err(policy->cpu, "wrmsr_safe(MSR_AMD_CPPC_ENABLE, 
> %lx)\n", val);
> +            data->err = -EINVAL;
> +            return;
> +        }
> +    }

Do you really need to enable befor reading ...

> +    if ( rdmsr_safe(MSR_AMD_CPPC_CAP1, data->amd_caps) )

... capabilities and ...

> +    {
> +        amd_pstate_err(policy->cpu, "rdmsr_safe(MSR_AMD_CPPC_CAP1)\n");
> +        goto error;
> +    }
> +
> +    if ( data->hw.highest_perf == 0 || data->hw.lowest_perf == 0 ||
> +         data->hw.nominal_perf == 0 || data->hw.lowest_nonlinear_perf == 0 )
> +    {
> +        amd_pstate_err(policy->cpu, "Platform malfunction, read CPPC 
> highest_perf: %u, lowest_perf: %u, nominal_perf: %u, lowest_nonlinear_perf: 
> %u zero value\n",
> +                       data->hw.highest_perf, data->hw.lowest_perf,
> +                       data->hw.nominal_perf, 
> data->hw.lowest_nonlinear_perf);
> +        goto error;
> +    }
> +
> +    min_freq = amd_get_min_freq(data);
> +    nominal_freq = amd_get_nominal_freq(data);
> +    max_freq = amd_get_max_freq(data);
> +    if ( min_freq > max_freq )
> +    {
> +        amd_pstate_err(policy->cpu, "min_freq(%u) or max_freq(%u) value is 
> incorrect\n",
> +                       min_freq, max_freq);
> +        goto error;
> +    }

... checking them? Error handling would be easier if you enabled only 
afterwards.

> +    policy->min = min_freq;
> +    policy->max = max_freq;
> +
> +    policy->cpuinfo.min_freq = min_freq;
> +    policy->cpuinfo.max_freq = max_freq;
> +    policy->cpuinfo.perf_freq = nominal_freq;
> +    policy->cur = nominal_freq;
> +
> +    /* Initial processor data capability frequencies */
> +    data->min_freq = min_freq;
> +    data->nominal_freq = nominal_freq;
> +    data->max_freq = max_freq;
> +
> +    data->err = 0;
> +    return;
> +
> + error:
> +    data->err = -EINVAL;
> +    val &= ~AMD_CPPC_ENABLE;
> +    if ( wrmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
> +        amd_pstate_err(policy->cpu, "wrmsr_safe(MSR_AMD_CPPC_ENABLE, 
> %lx)\n", val);
> +}
> +
> +/*
> + * The new AMD P-States driver is different than legacy ACPI hardware 
> P-State,
> + * which has a finer grain frequency range between the highest and lowest
> + * frequency. And boost frequency is actually the frequency which is mapped 
> on
> + * highest performance ratio. The legacy P0 frequency is actually mapped on
> + * nominal performance ratio.
> + */
> +static void amd_pstate_boost_init(struct cpufreq_policy *policy, struct 
> amd_pstate_drv_data *data)
> +{
> +    uint32_t highest_perf, nominal_perf;

Why uint32_t when the fields read are both 8-bit only?

> +    highest_perf = data->hw.highest_perf;
> +    nominal_perf = data->hw.nominal_perf;
> +
> +    if ( highest_perf <= nominal_perf )
> +        return;
> +
> +    policy->turbo = CPUFREQ_TURBO_ENABLED;
> +}

And why the helper variables in the first place?

> +static int cf_check amd_pstate_cpufreq_cpu_init(struct cpufreq_policy 
> *policy)
> +{
> +    unsigned int cpu = policy->cpu;
> +    struct amd_pstate_drv_data *data;
> +
> +    data = xzalloc(struct amd_pstate_drv_data);
> +    if ( !data )
> +        return -ENOMEM;
> +
> +    data->cppc_data = &processor_pminfo[cpu]->cppc_data;
> +
> +    policy->governor = cpufreq_opt_governor ? : CPUFREQ_DEFAULT_GOVERNOR;
> +
> +    per_cpu(amd_pstate_drv_data, cpu) = data;
> +
> +    on_selected_cpus(cpumask_of(cpu), amd_pstate_init_msrs, policy, 1);
> +
> +    if ( data->err )
> +    {
> +        amd_pstate_err(cpu, "Could not initialize AMD CPPC MSR properly\n");
> +        per_cpu(amd_pstate_drv_data, cpu) = NULL;
> +        xfree(data);

Please use XFREE() (really XVFREE() after respective conversion) in such cases.

> +        return -ENODEV;

You're no undoing the policy->governor adjustment - maybe best to delay that
until after all errors were handled?

> --- a/xen/arch/x86/include/asm/msr-index.h
> +++ b/xen/arch/x86/include/asm/msr-index.h
> @@ -455,6 +455,11 @@
>  #define MSR_AMD_PPIN_CTL                0xc00102f0U
>  #define MSR_AMD_PPIN                    0xc00102f1U
>  
> +#define MSR_AMD_CPPC_CAP1               0xc00102b0
> +#define MSR_AMD_CPPC_ENABLE             0xc00102b1
> +#define MSR_AMD_CPPC_REQ                0xc00102b3
> +#define AMD_CPPC_ENABLE                 BIT(0, ULL)

Bits of individual MSRs want to follow their MSR #define directly, with
suitably different indentation. Please further pay attention to this
comment in the file (as well as the bigger one at the top):

/*
 * Legacy MSR constants in need of cleanup.  No new MSRs below this comment.
 */

Jan



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.