[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] tscmode: documentation (in C code) for pvrdtscp algorithm



This code describes the pvrdtscp algorithm and the new
ABI between Xen and applications.

Keir, I'm not sure where to put this... though it is a
C program, it is essentially documentation.  So, I've
just attached it as a file, not as a patch.

Signed-off-by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>

/* pvrdtscp algorithm
 *
 * This sample code demonstrates the use of the paravirtualized rdtscp
 * algorithm.  Using this algorithm, an application may communicate with
 * the Xen hypervisor (version 4.0+) to obtain timestamp information which
 * is both monotonically increasing and has a fixed 1 GHz rate, even across
 * migrations between machines with different TSC rates and offsets.
 * Further, the algorithm provides performance near the performance of a
 * native rdtsc/rdtscp instruction -- much faster than emulation PROVIDED
 * the application is running on a machine on which the rdtscp instruction
 * is supported and TSC is "safe". The application must also be running in a
 * PV domain.  (HVM domains may be supported at a later time.) On machines
 * where TSC is unsafe or the rdtscp instruction is not supported, Xen
 * (v4.0+) provides emulation which is slower but consistent with the pvrdtscp
 * algorithm, thus providing support for the algorithm for live migration
 * across all machines.
 *
 * More information can be found within the Xen (4.0+) source tree at
 *  docs/misc/tscmode.txt
 *
 * Copyright (c) 2009 Oracle Corporation and/or its affiliates.
 * All rights reserved
 * Written by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
 * 
 * This code is derived from code licensed under the GNU
 * General Public License ("GPL") version 2 and is therefore itself
 * also licensed under the GPL version 2.
 *
 * This code is known to compile and run on Oracle Enterprise Linux 5 Update 2
 * using gcc version 4.1.2, but its purpose is to describe the pvrdtscp
 * algorithm and its ABI to Xen version 4.0+ 
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

/*
 * Integer type aliases and NSEC_PER_SEC, selected by data model.
 *
 * When the compiler predefines __LP64__ the 64-bit variant is used and
 * __X86_64__ is defined; otherwise the 32-bit variant is used and
 * __X86_32__ is defined.  scale_delta() tests __X86_32__ to choose which
 * inline-asm sequence to emit.
 *
 * NOTE(review): in the non-LP64 branch u16 is declared as "unsigned int",
 * which is presumably wider than 16 bits on the intended 32-bit targets --
 * confirm whether "unsigned short" was meant.
 */
#ifdef __LP64__
#define __X86_64__
typedef unsigned short u16;
typedef unsigned int u32;
typedef unsigned long u64;
typedef int i32;
typedef long i64;
#define NSEC_PER_SEC 1000000000
#else
#define __X86_32__
typedef unsigned int u16;
typedef unsigned long u32;
typedef unsigned long long u64;
typedef long i32;
typedef long long i64;
#define NSEC_PER_SEC 1000000000L
#endif

/*
 * Execute a plain cpuid instruction.
 *
 * idx is loaded into eax and sub into ecx before the instruction; the four
 * result registers are stored through eax/ebx/ecx/edx afterwards.
 *
 * The leaf and sub-leaf are fed to the asm directly from idx/sub instead of
 * being staged through *eax / *ecx, so the requested leaf is not corrupted
 * when a caller passes the same scratch variable for several outputs.
 * The asm is volatile (as in pv_cpuid) so the compiler does not merge or
 * drop repeated queries.
 */
static inline void hvm_cpuid(u32 idx, u32 sub,
                                u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
{
        u32 a, b, c, d;

        asm volatile("cpuid"
            : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
            : "0" (idx), "2" (sub));

        *eax = a;
        *ebx = b;
        *ecx = c;
        *edx = d;
}

/*
 * Execute cpuid preceded by the byte sequence "ud2a ; .ascii "xen"".
 *
 * idx is loaded into eax and sub into ecx before the instruction; the four
 * result registers are stored through eax/ebx/ecx/edx afterwards.
 *
 * ud2a is an invalid opcode, so this sequence only completes when something
 * (presumably Xen, for a PV guest) recognises the prefix and emulates the
 * cpuid; main() relies on that by probing it in a child process first.
 *
 * The leaf and sub-leaf are fed to the asm directly from idx/sub instead of
 * being staged through *eax / *ecx, so the requested leaf is not corrupted
 * when a caller passes the same scratch variable for several outputs.
 */
static inline void pv_cpuid(u32 idx, u32 sub,
                                u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
{
        u32 a, b, c, d;

        asm volatile ( "ud2a ; .ascii \"xen\"; cpuid"
            : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
            : "0" (idx), "2" (sub));

        *eax = a;
        *ebx = b;
        *ecx = c;
        *edx = d;
}

/*
 * Execute rdtscp (emitted as raw bytes 0f 01 f9) and return the 64-bit
 * counter assembled from edx:eax.  The value the instruction leaves in ecx
 * is stored through aux.
 */
static inline u64 do_rdtscp(u32 *aux)
{
        u32 lo32, hi32;

        asm volatile(".byte 0x0f,0x01,0xf9":"=a"(lo32),"=d"(hi32),"=c" (*aux));
        return lo32 | ((u64)hi32 << 32);
}

/*
 * Return the ebx result of pv_cpuid leaf 0x40000003, sub-leaf 0.
 * main() compares this value against 3 to decide whether tsc_mode is
 * pvrdtscp.
 */
static inline int get_xen_tsc_mode(void)
{
        u32 eax, ebx, ecx, edx;

        pv_cpuid(0x40000003, 0, &eax, &ebx, &ecx, &edx);
        return ebx;
}

/*
 * Return bit 0 of the eax result of pv_cpuid leaf 0x40000003, sub-leaf 0
 * (reported by main() as "vtsc").
 */
static inline int get_xen_vtsc(void)
{
        u32 eax, ebx, ecx, edx;

        pv_cpuid(0x40000003, 0, &eax, &ebx, &ecx, &edx);
        return eax & 1;
}

/*
 * Return the ecx result of pv_cpuid leaf 0x40000003, sub-leaf 0
 * (reported by main() as "vtsc_khz").
 */
static inline int get_xen_vtsc_khz(void)
{
        u32 eax, ebx, ecx, edx;

        pv_cpuid(0x40000003, 0, &eax, &ebx, &ecx, &edx);
        return ecx;
}

/*
 * Return the eax result of pv_cpuid leaf 0x40000003, sub-leaf 2
 * (reported by main() as "phys cpu_khz").
 */
static inline u32 get_xen_cpu_khz(void)
{
        u32 eax, ebx, ecx, edx;

        pv_cpuid(0x40000003, 2, &eax, &ebx, &ecx, &edx);
        return eax;
}

/*
 * Return the edx result of pv_cpuid leaf 0x40000003, sub-leaf 0
 * (reported by main() as "incarn").
 */
static inline u32 get_xen_incarnation(void)
{
        u32 eax, ebx, ecx, edx;

        pv_cpuid(0x40000003, 0, &eax, &ebx, &ecx, &edx);
        return edx;
}

/*
 * Fetch the values used to convert a TSC reading to a timestamp, from
 * pv_cpuid leaf 0x40000003, sub-leaf 1:
 *   eax/ebx -> low/high halves of *offset
 *   ecx     -> *mul_frac
 *   edx     -> *shift
 * get_pvrdtscp_timestamp() feeds mul_frac/shift to scale_delta() and
 * subtracts offset from the result.
 */
static inline void get_xen_time_values(u64 *offset, u32 *mul_frac, u32 *shift)
{
        u32 off_lo, off_hi;

        pv_cpuid(0x40000003,1,&off_lo,&off_hi,mul_frac,shift);
        *offset = off_lo | ((u64)off_hi << 32);
}

/*
 * Scale a 64-bit value by a shift and a 32-bit multiplier.
 *
 * delta is first shifted left by tsc_shift (or right by -tsc_shift when
 * tsc_shift is negative), then multiplied by tsc_mul_frac; the product is
 * shifted right by 32 bits and the low 64 bits of that are returned.
 * get_pvrdtscp_timestamp() uses this to turn a raw TSC reading into the
 * value it treats as nanoseconds.
 *
 * NOTE(review): the shift count is not range-checked here; presumably the
 * caller only supplies shifts smaller than 64 -- confirm.
 */
static inline u64 scale_delta(u64 delta, u32 tsc_mul_frac, i32 tsc_shift)
{
    u64 product;
#ifdef __X86_32__
    u32 tmp1, tmp2;
#endif

    /* apply the pre-shift; a negative shift means shift right */
    if ( tsc_shift < 0 )
        delta >>= -tsc_shift;
    else
        delta <<= tsc_shift;

#ifdef __X86_32__
    /*
     * 32-bit variant: two 32x32 multiplies.  %4 (tmp1) starts as the high
     * half of delta and %5 (tmp2) as tsc_mul_frac.  First low(delta)*mul is
     * computed and its high word kept in %4; then high(delta)*mul is
     * computed and the kept word is added in (with carry), leaving the
     * result in edx:eax.
     */
    asm (
        "mul  %5       ; "
        "mov  %4,%%eax ; "
        "mov  %%edx,%4 ; "
        "mul  %5       ; "
        "xor  %5,%5    ; "
        "add  %4,%%eax ; "
        "adc  %5,%%edx ; "
        : "=A" (product), "=r" (tmp1), "=r" (tmp2)
        : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (tsc_mul_frac) );
#else
    /*
     * 64-bit variant: a single 64x64 multiply into rdx:rax, then shrd moves
     * the 128-bit product right by 32 so rax holds bits 32..95.
     */
    asm (
        "mul %%rdx ; shrd $32,%%rdx,%%rax"
        : "=a" (product) : "0" (delta), "d" ((u64)tsc_mul_frac) );
#endif

    return product;
}

/*
 * Return the current pvrdtscp timestamp.
 *
 * The timestamp is scale_delta(tsc, mul_frac, shift) minus the offset
 * obtained from get_xen_time_values().  Those conversion values are cached
 * in function-local statics together with the last TSC_AUX value seen; they
 * are re-fetched whenever rdtscp reports a different aux value, in which
 * case *discontinuity is set to 1 (otherwise it is set to 0).
 *
 * The returned value is forced to be strictly greater than the previously
 * returned one: if the new value is not ahead, a message is printed and the
 * previous value plus one is returned instead.
 */
static inline u64 get_pvrdtscp_timestamp(int *discontinuity)
{
        static int initialized = 0;
        static u64 prev_stamp = 0;
        static u32 cached_aux;
        static u64 ns_offset;
        static u32 mul_frac, shift;
        u32 aux_now;
        u64 tsc, stamp;

        /* first call: prime the cached aux, conversion values and baseline */
        if (!initialized) {
                tsc = do_rdtscp(&cached_aux);
                get_xen_time_values(&ns_offset, &mul_frac, &shift);
                prev_stamp = scale_delta(tsc, mul_frac, shift) - ns_offset;
                initialized = 1;
        }

        *discontinuity = 0;
        for (tsc = do_rdtscp(&aux_now);
             aux_now != cached_aux;
             tsc = do_rdtscp(&aux_now)) {
                /* aux changed: refresh the conversion values and re-read */
                cached_aux = aux_now;
                get_xen_time_values(&ns_offset, &mul_frac, &shift);
                *discontinuity = 1;
        }

        /* convert the TSC reading using the cached Xen time values */
        stamp = scale_delta(tsc, mul_frac, shift) - ns_offset;

        if ((i64)(stamp - prev_stamp) <= 0) {
                /* not expected; guards against scaling errors around a
                 * very fast migration by stepping one past the last value */
                printf("Time went backwards by %lluns\n",
                    (unsigned long long)(prev_stamp - stamp));
                prev_stamp++;
                return prev_stamp;
        }

        prev_stamp = stamp;
        return stamp;
}

/* Values for the "hvm" argument of running_on_xen(): HVM selects the plain
 * hvm_cpuid() probe, PVM selects the prefixed pv_cpuid() probe. */
#define HVM 1
#define PVM 0

/*
 * Scan cpuid leaves 0x40000000, 0x40000100, ... (below 0x40010000) for the
 * signature "XenVMMXenVMM" in ebx/ecx/edx, also requiring that eax of the
 * signature leaf is at least that leaf number plus 2.
 *
 * hvm selects the probe: HVM uses hvm_cpuid(), anything else pv_cpuid().
 *
 * On a match, the eax result of the following leaf (base + 1) is split into
 * *version_major (upper 16 bits) and *version_minor (lower 16 bits) and 1 is
 * returned.  Returns 0, leaving the outputs untouched, when nothing matches.
 */
static int running_on_xen(int hvm, u16 *version_major, u16 *version_minor)
{
        void (*query)(u32, u32, u32 *, u32 *, u32 *, u32 *);
        union { char csig[16]; u32 u[4]; } sig;
        u32 eax, ebx, ecx, edx, leaf;

        query = (hvm == HVM) ? hvm_cpuid : pv_cpuid;

        leaf = 0x40000000;
        while (leaf < 0x40010000) {
                query(leaf, 0, &eax, &ebx, &ecx, &edx);

                /* assemble the 12 signature bytes and terminate them */
                sig.u[0] = ebx;
                sig.u[1] = ecx;
                sig.u[2] = edx;
                sig.csig[12] = '\0';

                if (strcmp("XenVMMXenVMM", sig.csig) == 0 &&
                    eax >= leaf + 2) {
                        query(leaf + 1, 0, &eax, &ebx, &ecx, &edx);
                        *version_major = (eax >> 16) & 0xffff;
                        *version_minor = eax & 0xffff;
                        return 1;
                }
                leaf += 0x100;
        }
        return 0;
}

main(int ac, char **av)
{
        u32 dummy;
        u16 version_hi, version_lo;
        u64 ts, last_ts;
        int status, discontinuity = 0;
        pid_t pid;

        if (running_on_xen(HVM,&version_hi,&version_lo)) {
                printf("running on Xen v%d.%d as an HVM domain, "
                        "pvrdtsc not supported, exiting\n",
                        (int)version_hi, (int)version_lo);
                exit(0);
        }
        pid = fork();
        if (pid == -1) {
                fprintf(stderr,"Huh? Fork failed\n");
                return 0;
        }
        else if (pid == 0) { /* child */
                pv_cpuid(0x40000000,0,&dummy,&dummy,&dummy,&dummy);
                exit(0);
        }
        waitpid(pid,&status,0);
        if (!WIFEXITED(status))
                exit(0);
        if (!running_on_xen(PVM,&version_hi,&version_lo)) {
                printf("not running on Xen, exiting\n");
                exit(0);
        }
        printf("running on Xen v%d.%d as a PV domain\n",
                (int)version_hi, (int)version_lo);
        if ( version_hi <= 3 ) {
                printf("pvrdtscp requires Xen version 4.0 or greater\n");
                /* exit(0); FIXME after xen-unstable is officially v4.0 */
        }
        if ( get_xen_tsc_mode() != 3 )
                printf("tsc_mode not pvrdtscp, set tsc_mode=3, exiting\n");

        /* OK, we are on Xen, now loop forever checking timestamps */
        ts = get_pvrdtscp_timestamp(&discontinuity);
        printf("Starting with ts=%lluns 0x%llx 
(%llusec)\n",ts,ts,ts/NSEC_PER_SEC);
        printf("incarn=%d: vtsc=%d, vtsc_khz=%lu, phys cpu_khz=%lu\n",
                                (unsigned long)get_xen_incarnation(),
                                (unsigned long)get_xen_vtsc(),
                                (unsigned long)get_xen_vtsc_khz(),
                                (unsigned long)get_xen_cpu_khz());
        ts = get_pvrdtscp_timestamp(&discontinuity);
        last_ts = ts;
        while (1) {
                ts = get_pvrdtscp_timestamp(&discontinuity);
                if (discontinuity)
                        printf("migrated/restored, incarn=%d: "
                               "vtsc now %d, vtsc_khz=%lu, phys cpu_khz=%lu\n",
                                (unsigned long)get_xen_incarnation(),
                                (unsigned long)get_xen_vtsc(),
                                (unsigned long)get_xen_vtsc_khz(),
                                (unsigned long)get_xen_cpu_khz());
                if (ts < last_ts)
                        /* this should NEVER happen, especially since there
                         * is a check for it in get_pvrdtscp_timestamp() */
                        printf("Time went backwards: %lluns (%llusec)\n",
                                last_ts-ts,(last_ts-ts)/NSEC_PER_SEC);
                if (ts > last_ts + 200000000LL)
                        /* this is OK, usually about 2sec for save/restore
                         * and a fraction of a second for live migrate */
                        printf("Time jumped forward %lluns (%llusec)\n",
                                ts-last_ts,(ts-last_ts)/NSEC_PER_SEC);
                last_ts = ts;
        }
}

Attachment: pvrdtscp.c
Description: Binary data

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.