[Xen-merge] [PATCH 23/23] Xen subarch (largely copied files still)
Index: linux-2.6.12-xen0-arch/arch/i386/Kconfig
===================================================================
--- linux-2.6.12-xen0-arch.orig/arch/i386/Kconfig
+++ linux-2.6.12-xen0-arch/arch/i386/Kconfig
@@ -106,6 +106,13 @@ config X86_VISWS
 	  A kernel compiled for the Visual Workstation will not run on PCs
 	  and vice versa. See <file:Documentation/sgi-visws.txt> for details.
 
+config X86_XEN
+	bool "Xen Paravirtualized Linux"
+	select XEN
+	help
+	  This option will build a kernel that will run on a Xen
+	  paravirtualized machine.
+
 config X86_GENERICARCH
 	bool "Generic architecture (Summit, bigsmp, ES7000, default)"
 	depends on SMP
@@ -124,6 +131,10 @@ config X86_ES7000
 
 endchoice
 
+if X86_XEN
+source arch/i386/mach-xen/Kconfig
+endif
+
 config ACPI_SRAT
 	bool
 	default y
@@ -589,12 +600,12 @@ config X86_VISWS_APIC
 
 config X86_TSC
 	bool
-	depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ
+	depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ && !X86_XEN
 	default y
 
 config X86_MCE
 	bool "Machine Check Exception"
-	depends on !X86_VOYAGER
+	depends on !X86_VOYAGER && !X86_XEN
 	---help---
 	  Machine Check Exception support allows the processor to notify the
 	  kernel if it detects a problem (e.g. overheating, component failure).
@@ -701,6 +712,7 @@ config MICROCODE
 
 config X86_MSR
 	tristate "/dev/cpu/*/msr - Model-specific register support"
+	depends on !X86_XEN
 	help
 	  This device gives privileged processes access to the x86
 	  Model-Specific Registers (MSRs).  It is a character device with
Index: linux-2.6.12-xen0-arch/arch/i386/kernel/early_printk.c
===================================================================
--- linux-2.6.12-xen0-arch.orig/arch/i386/kernel/early_printk.c
+++ linux-2.6.12-xen0-arch/arch/i386/kernel/early_printk.c
@@ -1,2 +1,2 @@
-#include "../../x86_64/kernel/early_printk.c"
+#include "../../../x86_64/kernel/early_printk.c"
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/boot/Makefile
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/boot/Makefile
@@ -0,0 +1,8 @@
+
+OBJCOPYFLAGS := -g --strip-unneeded
+
+vmlinuz: vmlinux-stripped FORCE
+	$(call if_changed,gzip)
+
+vmlinux-stripped: vmlinux FORCE
+	$(call if_changed,objcopy)
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/Kconfig
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/Kconfig
@@ -0,0 +1,159 @@
+#
+# For a description of the syntax of this configuration file,
+# see Documentation/kbuild/kconfig-language.txt.
+#
+
+menu "XEN"
+
+config XEN_PRIVILEGED_GUEST
+	bool "Privileged Guest (domain 0)"
+	default n
+	select XEN_PHYSDEV_ACCESS
+	help
+	  Support for privileged operation (domain 0)
+
+config XEN_PHYSDEV_ACCESS
+	bool "Physical device access"
+	default XEN_PRIVILEGED_GUEST
+	help
+	  Assume access is available to physical hardware devices
+	  (e.g., hard drives, network cards). This allows you to configure
+	  such devices and also includes some low-level support that is
+	  otherwise not compiled into the kernel.
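
For readers new to the tree: an option such as XEN_PHYSDEV_ACCESS simply becomes a CONFIG_* symbol that driver code can test at build time. A minimal sketch of the usual guard pattern, assuming a hypothetical probe routine (nothing below is part of this patch):

    #include <linux/config.h>
    #include <linux/errno.h>

    /* Hypothetical helper: only touch raw hardware when the kernel was
     * built with physical-device access; unprivileged guests bail out. */
    static int maybe_probe_hardware(void)
    {
    #ifdef CONFIG_XEN_PHYSDEV_ACCESS
    	return probe_pci_devices();	/* hypothetical probe routine */
    #else
    	return -ENODEV;			/* no raw devices visible to this guest */
    #endif
    }
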
+
+config XEN_BLKDEV_BACKEND
+	bool "Block-device backend driver"
+	depends on XEN_PHYSDEV_ACCESS
+	default y
+	help
+	  The block-device backend driver allows the kernel to export its
+	  block devices to other guests via a high-performance shared-memory
+	  interface.
+
+config XEN_BLKDEV_TAP_BE
+	bool "Block Tap support for backend driver (DANGEROUS)"
+	depends on XEN_BLKDEV_BACKEND
+	default n
+	help
+	  If you intend to use the block tap driver, the backend domain will
+	  not know the domain id of the real frontend, and so will not be able
+	  to map its data pages. This modifies the backend to attempt to map
+	  from both the tap domain and the real frontend. This presents a
+	  security risk, and so should ONLY be used for development
+	  with the blktap. This option will be removed as the block drivers are
+	  modified to use grant tables.
+
+config XEN_BLKDEV_GRANT
+	bool "Grant table substrate for block drivers"
+	depends on !XEN_BLKDEV_TAP_BE
+	default y
+	help
+	  This introduces the use of grant tables as a data exchange mechanism
+	  between the frontend and backend block drivers. This currently
+	  conflicts with the block tap.
+
+config XEN_NETDEV_BACKEND
+	bool "Network-device backend driver"
+	depends on XEN_PHYSDEV_ACCESS
+	default y
+	help
+	  The network-device backend driver allows the kernel to export its
+	  network devices to other guests via a high-performance shared-memory
+	  interface.
+
+config XEN_BLKDEV_FRONTEND
+	bool "Block-device frontend driver"
+	default y
+	help
+	  The block-device frontend driver allows the kernel to access block
+	  devices mounted within another guest OS. Unless you are building a
+	  dedicated device-driver domain or your master control domain
+	  (domain 0), you almost certainly want to say Y here.
+
+config XEN_NETDEV_FRONTEND
+	bool "Network-device frontend driver"
+	default y
+	help
+	  The network-device frontend driver allows the kernel to access
+	  network interfaces within another guest OS. Unless you are building a
+	  dedicated device-driver domain or your master control domain
+	  (domain 0), you almost certainly want to say Y here.
+
+config XEN_NETDEV_GRANT_TX
+	bool "Grant table substrate for net drivers tx path (DANGEROUS)"
+	default n
+	help
+	  This introduces the use of grant tables as a data exchange mechanism
+	  between the frontend and backend network drivers.
+
+config XEN_NETDEV_GRANT_RX
+	bool "Grant table substrate for net drivers rx path (DANGEROUS)"
+	default n
+	help
+	  This introduces the use of grant tables as a data exchange mechanism
+	  between the frontend and backend network drivers.
+
+config XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER
+	bool "Pipelined transmitter (DANGEROUS)"
+	depends on XEN_NETDEV_FRONTEND
+	default n
+	help
+	  The driver will assume that the backend is pipelining packets for
+	  transmission: whenever packets are pending in the remote backend,
+	  the driver will not send asynchronous notifications when it queues
+	  additional packets for transmission.
+	  If the backend is a dumb domain, such as a transparent Ethernet
+	  bridge with no local IP interface, it is safe to say Y here to get
+	  slightly lower network overhead.
+	  If the backend has a local IP interface; or may be doing smart things
+	  like reassembling packets to perform firewall filtering; or if you
+	  are unsure; or if you experience network hangs when this option is
+	  enabled; then you must say N here.
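
Since three of the options above hinge on grant tables, a rough illustration of the idea may help: rather than letting the backend map arbitrary frontend memory, the frontend explicitly grants access to individual pages and passes a small grant reference across the shared ring. This is a hedged sketch only; the helpers make_grant(), send_request_over_ring(), wait_for_response() and revoke_grant() are invented for illustration and are not this patch's API:

    /* Illustrative only: grant one page of this domain to 'backend_dom',
     * hand the reference to the backend, revoke when the I/O completes. */
    int share_page_with_backend(void *page, int backend_dom)
    {
    	int gref = make_grant(backend_dom, page, 0 /* writable */);
    	if (gref < 0)
    		return gref;
    	send_request_over_ring(gref);	/* backend maps the page by gref */
    	wait_for_response();
    	revoke_grant(gref);		/* page is private again */
    	return 0;
    }
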
+
+config XEN_BLKDEV_TAP
+	bool "Block device tap driver"
+	default n
+	help
+	  This driver allows a VM to interact on block device channels
+	  to other VMs. Block messages may be passed through or redirected
+	  to a character device, allowing device prototyping in application
+	  space. Odds are that you want to say N here.
+
+config XEN_SHADOW_MODE
+	bool "Fake shadow mode"
+	default n
+	help
+	  Fakes out a shadow mode kernel.
+
+
+config XEN_SCRUB_PAGES
+	bool "Scrub memory before freeing it to Xen"
+	default y
+	help
+	  Erase memory contents before freeing it back to Xen's global
+	  pool. This ensures that any secrets contained within that
+	  memory (e.g., private keys) cannot be found by other guests that
+	  may be running on the machine. Most people will want to say Y here.
+	  If security is not a concern then you may increase performance by
+	  saying N.
+endmenu
+
+config XEN
+	boolean
+
+config HAVE_ARCH_DEV_ALLOC_SKB
+	bool
+	default y
+
+config NO_IDLE_HZ
+	bool
+	default y
+
+#Placeholder
+#source "drivers/xen/Kconfig.drivers"
+
+#if XEN_PRIVILEGED_GUEST
+#menu "Power management options"
+#source "drivers/acpi/Kconfig"
+#endmenu
+#endif
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/Kconfig.debug
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/Kconfig.debug
@@ -0,0 +1,129 @@
+menu "Kernel hacking"
+
+source "lib/Kconfig.debug"
+
+# X86
+config EARLY_PRINTK
+	bool "Early printk" if EMBEDDED && DEBUG_KERNEL
+	default y
+	depends on X86
+	help
+	  Write kernel log output directly into the VGA buffer or to a serial
+	  port.
+
+	  This is useful for kernel debugging when your machine crashes very
+	  early before the console code is initialized. For normal operation
+	  it is not recommended because it looks ugly and doesn't cooperate
+	  with klogd/syslogd or the X server. You should normally say N here,
+	  unless you want to debug such a crash.
+
+config DEBUG_STACKOVERFLOW
+	bool "Check for stack overflows"
+	depends on DEBUG_KERNEL && X86
+
+config KPROBES
+	bool "Kprobes"
+	depends on DEBUG_KERNEL && X86
+	help
+	  Kprobes allows you to trap at almost any kernel address and
+	  execute a callback function. register_kprobe() establishes
+	  a probepoint and specifies the callback. Kprobes is useful
+	  for kernel debugging, non-intrusive instrumentation and testing.
+	  If in doubt, say "N".
+
+config DEBUG_STACK_USAGE
+	bool "Stack utilization instrumentation"
+	depends on DEBUG_KERNEL && X86
+	help
+	  Enables the display of the minimum amount of free stack which each
+	  task has ever had available in the sysrq-T and sysrq-P debug output.
+
+	  This option will slow down process creation somewhat.
+
+comment "Page alloc debug is incompatible with Software Suspend on i386"
+	depends on DEBUG_KERNEL && SOFTWARE_SUSPEND && X86
+
+config DEBUG_PAGEALLOC
+	bool "Page alloc debugging"
+	depends on DEBUG_KERNEL && !SOFTWARE_SUSPEND && X86
+	help
+	  Unmap pages from the kernel linear mapping after free_pages().
+	  This results in a large slowdown, but helps to find certain types
+	  of memory corruptions.
+
+config 4KSTACKS
+	bool "Use 4Kb for kernel stacks instead of 8Kb"
+	depends on DEBUG_KERNEL && X86
+	help
+	  If you say Y here the kernel will use a 4Kb stacksize for the
+	  kernel stack attached to each process/thread. This facilitates
+	  running more threads on a system and also reduces the pressure
+	  on the VM subsystem for higher order allocations. This option
+	  will also use IRQ stacks to compensate for the reduced stackspace.
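
Returning to XEN_SCRUB_PAGES in the hunk above: what that option amounts to on the page-release path is a wipe before the frame goes back to the hypervisor's free pool. A hedged sketch, not the code this patch adds (release_page_to_xen() is a hypothetical give-back helper):

    #include <linux/config.h>
    #include <linux/string.h>
    #include <asm/page.h>

    static inline void scrub_and_release_page(void *va)
    {
    #ifdef CONFIG_XEN_SCRUB_PAGES
    	/* Wipe the contents so no other guest can recover them once
    	 * Xen reallocates the underlying machine frame. */
    	memset(va, 0, PAGE_SIZE);
    #endif
    	release_page_to_xen(va);	/* hypothetical */
    }
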
+
+config X86_FIND_SMP_CONFIG
+	bool
+	depends on X86_LOCAL_APIC || X86_VOYAGER && X86
+	default y
+
+config X86_MPPARSE
+	bool
+	depends on X86_LOCAL_APIC && !X86_VISWS && X86
+	default y
+
+# X86_64
+
+# !SMP for now because the context switch early causes GPF in segment reloading
+# and the GS base checking does the wrong thing then, causing a hang.
+config CHECKING
+	bool "Additional run-time checks"
+	depends on DEBUG_KERNEL && !SMP && X86_64
+	help
+	  Enables some internal consistency checks for kernel debugging.
+	  You should normally say N.
+
+config INIT_DEBUG
+	bool "Debug __init statements"
+	depends on DEBUG_KERNEL && X86_64
+	help
+	  Fill __init and __initdata at the end of boot. This helps debugging
+	  illegal uses of __init and __initdata after initialization.
+
+config IOMMU_DEBUG
+	depends on GART_IOMMU && DEBUG_KERNEL && X86_64
+	bool "Enable IOMMU debugging"
+	help
+	  Force the IOMMU to on even when you have less than 4GB of
+	  memory and add debugging code. On overflow always panic. And
+	  allow to enable IOMMU leak tracing. Can be disabled at boot
+	  time with iommu=noforce. This will also enable scatter gather
+	  list merging. Currently not recommended for production
+	  code. When you use it make sure you have a big enough
+	  IOMMU/AGP aperture. Most of the options enabled by this can
+	  be set more finegrained using the iommu= command line
+	  options. See Documentation/x86_64/boot-options.txt for more
+	  details.
+
+config IOMMU_LEAK
+	bool "IOMMU leak tracing"
+	depends on DEBUG_KERNEL && X86_64
+	depends on IOMMU_DEBUG
+	help
+	  Add a simple leak tracer to the IOMMU code. This is useful when you
+	  are debugging a buggy device driver that leaks IOMMU mappings.
+
+#config X86_REMOTE_DEBUG
+#	bool "kgdb debugging stub"
+
+# X86 & X86_64
+config KPROBES
+	bool "Kprobes"
+	depends on DEBUG_KERNEL
+	help
+	  Kprobes allows you to trap at almost any kernel address and
+	  execute a callback function. register_kprobe() establishes
+	  a probepoint and specifies the callback. Kprobes is useful
+	  for kernel debugging, non-intrusive instrumentation and testing.
+	  If in doubt, say "N".
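
As the KPROBES help says, register_kprobe() is the whole interface. A minimal sketch of its use in the 2.6 era, where you supply the probe address yourself (the address value here is made up, e.g. read from System.map):

    #include <linux/module.h>
    #include <linux/kprobes.h>
    #include <linux/kernel.h>

    /* Fires just before the probed instruction executes. */
    static int my_pre_handler(struct kprobe *p, struct pt_regs *regs)
    {
    	printk(KERN_INFO "hit probe at %p\n", p->addr);
    	return 0;
    }

    static struct kprobe kp = {
    	.pre_handler = my_pre_handler,
    };

    static int __init probe_init(void)
    {
    	kp.addr = (kprobe_opcode_t *)0xc0123456;	/* hypothetical address */
    	return register_kprobe(&kp);
    }
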
+
+endmenu
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/Kconfig.drivers
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/Kconfig.drivers
@@ -0,0 +1,94 @@
+# arch/xen/Kconfig.drivers
+
+menu "Device Drivers"
+
+source "drivers/base/Kconfig"
+
+if XEN_PHYSDEV_ACCESS
+source "drivers/mtd/Kconfig"
+source "drivers/parport/Kconfig"
+source "drivers/pnp/Kconfig"
+endif
+
+source "drivers/block/Kconfig"
+
+if XEN_PHYSDEV_ACCESS
+source "drivers/ide/Kconfig"
+endif
+
+source "drivers/scsi/Kconfig"
+
+if XEN_PHYSDEV_ACCESS
+source "drivers/cdrom/Kconfig"
+endif
+
+source "drivers/md/Kconfig"
+
+if XEN_PHYSDEV_ACCESS
+source "drivers/message/fusion/Kconfig"
+source "drivers/ieee1394/Kconfig"
+source "drivers/message/i2o/Kconfig"
+endif
+
+source "net/Kconfig"
+
+if XEN_PHYSDEV_ACCESS
+source "drivers/isdn/Kconfig"
+source "drivers/telephony/Kconfig"
+source "drivers/input/Kconfig"
+source "drivers/char/Kconfig"
+source "drivers/i2c/Kconfig"
+source "drivers/w1/Kconfig"
+source "drivers/misc/Kconfig"
+source "drivers/media/Kconfig"
+source "drivers/video/Kconfig"
+source "sound/Kconfig"
+source "drivers/usb/Kconfig"
+source "drivers/mmc/Kconfig"
+source "drivers/infiniband/Kconfig"
+endif
+
+if !XEN_PHYSDEV_ACCESS
+
+menu "Character devices"
+
+config UNIX98_PTYS
+	bool
+	default y
+
+config LEGACY_PTYS
+	bool "Legacy (BSD) PTY support"
+	default y
+	---help---
+	  A pseudo terminal (PTY) is a software device consisting of two
+	  halves: a master and a slave. The slave device behaves identically to
+	  a physical terminal; the master device is used by a process to
+	  read data from and write data to the slave, thereby emulating a
+	  terminal. Typical programs for the master side are telnet servers
+	  and xterms.
+
+	  Linux has traditionally used the BSD-like names /dev/ptyxx
+	  for masters and /dev/ttyxx for slaves of pseudo
+	  terminals. This scheme has a number of problems, including
+	  security. This option enables these legacy devices; on most
+	  systems, it is safe to say N.
+
+
+config LEGACY_PTY_COUNT
+	int "Maximum number of legacy PTY in use"
+	depends on LEGACY_PTYS
+	range 1 256
+	default "256"
+	---help---
+	  The maximum number of legacy PTYs that can be used at any one time.
+	  The default is 256, and should be more than enough. Embedded
+	  systems may want to reduce this to save memory.
+
+	  When not in use, each legacy PTY occupies 12 bytes on 32-bit
+	  architectures and 24 bytes on 64-bit architectures.
+
+endmenu
+
+endif
+
+endmenu
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/acpi/boot.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/acpi/boot.c
@@ -0,0 +1,912 @@
+/*
+ *  boot.c - Architecture-Specific Low-Level ACPI Boot Support
+ *
+ *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@xxxxxxxxx>
+ *  Copyright (C) 2001 Jun Nakajima <jun.nakajima@xxxxxxxxx>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/init.h>
+#include <linux/config.h>
+#include <linux/acpi.h>
+#include <linux/efi.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+
+#include <asm/pgtable.h>
+#include <asm/io_apic.h>
+#include <asm/apic.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/mpspec.h>
+#ifdef CONFIG_XEN
+#include <asm/fixmap.h>
+#endif
+
+void (*pm_power_off)(void) = NULL;
+
+#ifdef CONFIG_X86_64
+
+static inline void acpi_madt_oem_check(char *oem_id, char *oem_table_id) { }
+extern void __init clustered_apic_check(void);
+static inline int ioapic_setup_disabled(void) { return 0; }
+#include <asm/proto.h>
+
+#else	/* X86 */
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <mach_apic.h>
+#include <mach_mpparse.h>
+#endif	/* CONFIG_X86_LOCAL_APIC */
+
+#endif	/* X86 */
+
+#define BAD_MADT_ENTRY(entry, end) (					\
+	(!entry) || (unsigned long)entry + sizeof(*entry) > end ||	\
+	((acpi_table_entry_header *)entry)->length != sizeof(*entry))
+
+#define PREFIX		"ACPI: "
+
+#ifdef CONFIG_ACPI_PCI
+int acpi_noirq __initdata;		/* skip ACPI IRQ initialization */
+int acpi_pci_disabled __initdata;	/* skip ACPI PCI scan and IRQ initialization */
+#else
+int acpi_noirq __initdata = 1;
+int acpi_pci_disabled __initdata = 1;
+#endif
+int acpi_ht __initdata = 1;		/* enable HT */
+
+int acpi_lapic;
+int acpi_ioapic;
+int acpi_strict;
+EXPORT_SYMBOL(acpi_strict);
+
+acpi_interrupt_flags acpi_sci_flags __initdata;
+int acpi_sci_override_gsi __initdata;
+int acpi_skip_timer_override __initdata;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+#endif
+
+#ifndef __HAVE_ARCH_CMPXCHG
+#warning ACPI uses CMPXCHG, i486 and later hardware
+#endif
+
+#define MAX_MADT_ENTRIES	256
+u8 x86_acpiid_to_apicid[MAX_MADT_ENTRIES] =
+			{ [0 ... MAX_MADT_ENTRIES-1] = 0xff };
+EXPORT_SYMBOL(x86_acpiid_to_apicid);
+
+/* --------------------------------------------------------------------------
+                              Boot-time Configuration
+   -------------------------------------------------------------------------- */
+
+/*
+ * The default interrupt routing model is PIC (8259).  This gets
+ * overridden if IOAPICs are enumerated (below).
+ */
+enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+
+/* rely on all ACPI tables being in the direct mapping */
+char *__acpi_map_table(unsigned long phys_addr, unsigned long size)
+{
+	if (!phys_addr || !size)
+		return NULL;
+
+	if (phys_addr < (end_pfn_map << PAGE_SHIFT))
+		return __va(phys_addr);
+
+	return NULL;
+}
+
+#else
+
+/*
+ * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
+ * to map the target physical address. The problem is that set_fixmap()
+ * provides a single page, and it is possible that the page is not
+ * sufficient.
+ * By using this area, we can map up to MAX_IO_APICS pages temporarily,
+ * i.e. until the next __va_range() call.
+ *
+ * Important Safety Note:  The fixed I/O APIC page numbers are *subtracted*
+ * from the fixed base.  That's why we start at FIX_IO_APIC_BASE_END and
+ * count idx down while incrementing the phys address.
+ */
+char *__acpi_map_table(unsigned long phys, unsigned long size)
+{
+	unsigned long base, offset, mapped_size;
+	int idx;
+
+#ifndef CONFIG_XEN
+	if (phys + size < 8*1024*1024)
+		return __va(phys);
+#endif
+
+	offset = phys & (PAGE_SIZE - 1);
+	mapped_size = PAGE_SIZE - offset;
+	set_fixmap(FIX_ACPI_END, phys);
+	base = fix_to_virt(FIX_ACPI_END);
+
+	/*
+	 * Most cases can be covered by the below.
+	 */
+	idx = FIX_ACPI_END;
+	while (mapped_size < size) {
+		if (--idx < FIX_ACPI_BEGIN)
+			return NULL;	/* cannot handle this */
+		phys += PAGE_SIZE;
+		set_fixmap(idx, phys);
+		mapped_size += PAGE_SIZE;
+	}
+
+	return ((unsigned char *) base + offset);
+}
+#endif
+
+#ifdef CONFIG_PCI_MMCONFIG
+static int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
+{
+	struct acpi_table_mcfg *mcfg;
+
+	if (!phys_addr || !size)
+		return -EINVAL;
+
+	mcfg = (struct acpi_table_mcfg *) __acpi_map_table(phys_addr, size);
+	if (!mcfg) {
+		printk(KERN_WARNING PREFIX "Unable to map MCFG\n");
+		return -ENODEV;
+	}
+
+	if (mcfg->base_reserved) {
+		printk(KERN_ERR PREFIX "MMCONFIG not in low 4GB of memory\n");
+		return -ENODEV;
+	}
+
+	pci_mmcfg_base_addr = mcfg->base_address;
+
+	return 0;
+}
+#else
+#define acpi_parse_mcfg NULL
+#endif /* !CONFIG_PCI_MMCONFIG */
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static int __init
+acpi_parse_madt (
+	unsigned long		phys_addr,
+	unsigned long		size)
+{
+	struct acpi_table_madt	*madt = NULL;
+
+	if (!phys_addr || !size)
+		return -EINVAL;
+
+	madt = (struct acpi_table_madt *) __acpi_map_table(phys_addr, size);
+	if (!madt) {
+		printk(KERN_WARNING PREFIX "Unable to map MADT\n");
+		return -ENODEV;
+	}
+
+	if (madt->lapic_address) {
+		acpi_lapic_addr = (u64) madt->lapic_address;
+
+		printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
+			madt->lapic_address);
+	}
+
+	acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
+
+	return 0;
+}
+
+
+static int __init
+acpi_parse_lapic (
+	acpi_table_entry_header *header, const unsigned long end)
+{
+	struct acpi_table_lapic	*processor = NULL;
+
+	processor = (struct acpi_table_lapic*) header;
+
+	if (BAD_MADT_ENTRY(processor, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	/* no utility in registering a disabled processor */
+	if (processor->flags.enabled == 0)
+		return 0;
+
+	x86_acpiid_to_apicid[processor->acpi_id] = processor->id;
+
+	mp_register_lapic (
+		processor->id,			/* APIC ID */
+		processor->flags.enabled);	/* Enabled? */
+
+	return 0;
+}
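
To make the descending-fixmap arithmetic in __acpi_map_table() above concrete, here is a tiny userspace rehearsal of the same page accounting; the input values are invented, and this is plain C you can compile and run, not kernel code:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
    	/* Invented example: a 0x1800-byte table starting 0x800 into a page. */
    	unsigned long phys = 0x0ffff800UL, size = 0x1800UL;
    	unsigned long offset = phys & (PAGE_SIZE - 1);	/* 0x800 */
    	unsigned long mapped = PAGE_SIZE - offset;	/* first slot covers 0x800 bytes */
    	int slots = 1;					/* FIX_ACPI_END */

    	while (mapped < size) {		/* walk down: FIX_ACPI_END-1, -2, ... */
    		mapped += PAGE_SIZE;
    		slots++;
    	}
    	printf("offset=%#lx, fixmap slots used=%d\n", offset, slots);	/* 0x800, 2 */
    	return 0;
    }
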
+
+static int __init
+acpi_parse_lapic_addr_ovr (
+	acpi_table_entry_header *header, const unsigned long end)
+{
+	struct acpi_table_lapic_addr_ovr *lapic_addr_ovr = NULL;
+
+	lapic_addr_ovr = (struct acpi_table_lapic_addr_ovr*) header;
+
+	if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
+		return -EINVAL;
+
+	acpi_lapic_addr = lapic_addr_ovr->address;
+
+	return 0;
+}
+
+static int __init
+acpi_parse_lapic_nmi (
+	acpi_table_entry_header *header, const unsigned long end)
+{
+	struct acpi_table_lapic_nmi *lapic_nmi = NULL;
+
+	lapic_nmi = (struct acpi_table_lapic_nmi*) header;
+
+	if (BAD_MADT_ENTRY(lapic_nmi, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	if (lapic_nmi->lint != 1)
+		printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+
+	return 0;
+}
+
+
+#endif /*CONFIG_X86_LOCAL_APIC*/
+
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_ACPI_INTERPRETER)
+
+static int __init
+acpi_parse_ioapic (
+	acpi_table_entry_header *header, const unsigned long end)
+{
+	struct acpi_table_ioapic *ioapic = NULL;
+
+	ioapic = (struct acpi_table_ioapic*) header;
+
+	if (BAD_MADT_ENTRY(ioapic, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	mp_register_ioapic (
+		ioapic->id,
+		ioapic->address,
+		ioapic->global_irq_base);
+
+	return 0;
+}
+
+/*
+ * Parse Interrupt Source Override for the ACPI SCI
+ */
+static void
+acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
+{
+	if (trigger == 0)	/* compatible SCI trigger is level */
+		trigger = 3;
+
+	if (polarity == 0)	/* compatible SCI polarity is low */
+		polarity = 3;
+
+	/* Command-line over-ride via acpi_sci= */
+	if (acpi_sci_flags.trigger)
+		trigger = acpi_sci_flags.trigger;
+
+	if (acpi_sci_flags.polarity)
+		polarity = acpi_sci_flags.polarity;
+
+	/*
+	 * mp_config_acpi_legacy_irqs() already setup IRQs < 16
+	 * If GSI is < 16, this will update its flags,
+	 * else it will create a new mp_irqs[] entry.
+	 */
+	mp_override_legacy_irq(gsi, polarity, trigger, gsi);
+
+	/*
+	 * stash over-ride to indicate we've been here
+	 * and for later update of acpi_fadt
+	 */
+	acpi_sci_override_gsi = gsi;
+	return;
+}
+
+static int __init
+acpi_parse_int_src_ovr (
+	acpi_table_entry_header *header, const unsigned long end)
+{
+	struct acpi_table_int_src_ovr *intsrc = NULL;
+
+	intsrc = (struct acpi_table_int_src_ovr*) header;
+
+	if (BAD_MADT_ENTRY(intsrc, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	if (intsrc->bus_irq == acpi_fadt.sci_int) {
+		acpi_sci_ioapic_setup(intsrc->global_irq,
+			intsrc->flags.polarity, intsrc->flags.trigger);
+		return 0;
+	}
+
+	if (acpi_skip_timer_override &&
+		intsrc->bus_irq == 0 && intsrc->global_irq == 2) {
+			printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
+			return 0;
+	}
+
+	mp_override_legacy_irq (
+		intsrc->bus_irq,
+		intsrc->flags.polarity,
+		intsrc->flags.trigger,
+		intsrc->global_irq);
+
+	return 0;
+}
+
+
+static int __init
+acpi_parse_nmi_src (
+	acpi_table_entry_header *header, const unsigned long end)
+{
+	struct acpi_table_nmi_src *nmi_src = NULL;
+
+	nmi_src = (struct acpi_table_nmi_src*) header;
+
+	if (BAD_MADT_ENTRY(nmi_src, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	/* TBD: Support nmi_src entries? */
+
+	return 0;
+}
+
+#endif /* CONFIG_X86_IO_APIC */
+
+#ifdef CONFIG_ACPI_BUS
+
+/*
+ * acpi_pic_sci_set_trigger()
+ *
+ * use ELCR to set PIC-mode trigger type for SCI
+ *
+ * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
+ * it may require Edge Trigger -- use "acpi_sci=edge"
+ *
+ * Port 0x4d0-4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers
+ * for the 8259 PIC.  bit[n] = 1 means irq[n] is Level, otherwise Edge.
+ * ELCR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0)
+ * ELCR2 is IRQs 8-15 (IRQ 8, 13 must be 0)
+ */
+
+void __init
+acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
+{
+	unsigned int mask = 1 << irq;
+	unsigned int old, new;
+
+	/* Real old ELCR mask */
+	old = inb(0x4d0) | (inb(0x4d1) << 8);
+
+	/*
+	 * If we use ACPI to set PCI irq's, then we should clear ELCR
+	 * since we will set it correctly as we enable the PCI irq
+	 * routing.
+	 */
+	new = acpi_noirq ? old : 0;
+
+	/*
+	 * Update SCI information in the ELCR, it isn't in the PCI
+	 * routing tables..
+	 */
+	switch (trigger) {
+	case 1:	/* Edge - clear */
+		new &= ~mask;
+		break;
+	case 3: /* Level - set */
+		new |= mask;
+		break;
+	}
+
+	if (old == new)
+		return;
+
+	printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
+	outb(new, 0x4d0);
+	outb(new >> 8, 0x4d1);
+}
+
+
+#endif /* CONFIG_ACPI_BUS */
+
+int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
+{
+#ifdef CONFIG_X86_IO_APIC
+	if (use_pci_vector() && !platform_legacy_irq(gsi))
+		*irq = IO_APIC_VECTOR(gsi);
+	else
+#endif
+		*irq = gsi;
+	return 0;
+}
+
+unsigned int acpi_register_gsi(u32 gsi, int edge_level, int active_high_low)
+{
+	unsigned int irq;
+	unsigned int plat_gsi = gsi;
+
+#ifdef CONFIG_X86_IO_APIC
+	if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
+		plat_gsi = mp_register_gsi(gsi, edge_level, active_high_low);
+	}
+#endif
+	acpi_gsi_to_irq(plat_gsi, &irq);
+	return irq;
+}
+EXPORT_SYMBOL(acpi_register_gsi);
+
+/*
+ * ACPI based hotplug support for CPU
+ */
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+int
+acpi_map_lsapic(acpi_handle handle, int *pcpu)
+{
+	/* TBD */
+	return -EINVAL;
+}
+EXPORT_SYMBOL(acpi_map_lsapic);
+
+
+int
+acpi_unmap_lsapic(int cpu)
+{
+	/* TBD */
+	return -EINVAL;
+}
+EXPORT_SYMBOL(acpi_unmap_lsapic);
+#endif /* CONFIG_ACPI_HOTPLUG_CPU */
+
+static unsigned long __init
+acpi_scan_rsdp (
+	unsigned long		start,
+	unsigned long		length)
+{
+	unsigned long		offset = 0;
+	unsigned long		sig_len = sizeof("RSD PTR ") - 1;
+	unsigned long		vstart = (unsigned long)isa_bus_to_virt(start);
+
+	/*
+	 * Scan all 16-byte boundaries of the physical memory region for the
+	 * RSDP signature.
+	 */
+	for (offset = 0; offset < length; offset += 16) {
+		if (strncmp((char *) (vstart + offset), "RSD PTR ", sig_len))
+			continue;
+		return (start + offset);
+	}
+
+	return 0;
+}
+
+static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size)
+{
+	struct acpi_table_sbf *sb;
+
+	if (!phys_addr || !size)
+		return -EINVAL;
+
+	sb = (struct acpi_table_sbf *) __acpi_map_table(phys_addr, size);
+	if (!sb) {
+		printk(KERN_WARNING PREFIX "Unable to map SBF\n");
+		return -ENODEV;
+	}
+
+	sbf_port = sb->sbf_cmos; /* Save CMOS port */
+
+	return 0;
+}
+
+
+#ifdef CONFIG_HPET_TIMER
+
+static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
+{
+	struct acpi_table_hpet *hpet_tbl;
+
+	if (!phys || !size)
+		return -EINVAL;
+
+	hpet_tbl = (struct acpi_table_hpet *) __acpi_map_table(phys, size);
+	if (!hpet_tbl) {
+		printk(KERN_WARNING PREFIX "Unable to map HPET\n");
+		return -ENODEV;
+	}
+
+	if (hpet_tbl->addr.space_id != ACPI_SPACE_MEM) {
+		printk(KERN_WARNING PREFIX "HPET timers must be located in "
+		       "memory.\n");
+		return -1;
+	}
+
+#ifdef CONFIG_X86_64
+	vxtime.hpet_address = hpet_tbl->addr.addrl |
+		((long) hpet_tbl->addr.addrh << 32);
+
+	printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
+	       hpet_tbl->id, vxtime.hpet_address);
+#else	/* X86 */
+	{
+		extern unsigned long hpet_address;
+
+		hpet_address = hpet_tbl->addr.addrl;
+		printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
+		       hpet_tbl->id, hpet_address);
+	}
+#endif	/* X86 */
+
+	return 0;
+}
+#else
+#define acpi_parse_hpet NULL
+#endif
+
+#ifdef CONFIG_X86_PM_TIMER
+extern u32 pmtmr_ioport;
+#endif
+
+static int __init acpi_parse_fadt(unsigned long phys, unsigned long size)
+{
+	struct fadt_descriptor_rev2 *fadt = NULL;
+
+	fadt = (struct fadt_descriptor_rev2*) __acpi_map_table(phys,size);
+	if(!fadt) {
+		printk(KERN_WARNING PREFIX "Unable to map FADT\n");
+		return 0;
+	}
+
+#ifdef CONFIG_ACPI_INTERPRETER
+	/* initialize sci_int early for INT_SRC_OVR MADT parsing */
+	acpi_fadt.sci_int = fadt->sci_int;
+#endif
+
+#ifdef CONFIG_ACPI_BUS
+	/* initialize rev and apic_phys_dest_mode for x86_64 genapic */
+	acpi_fadt.revision = fadt->revision;
+	acpi_fadt.force_apic_physical_destination_mode =
+		fadt->force_apic_physical_destination_mode;
+#endif
+
+#ifdef CONFIG_X86_PM_TIMER
+	/* detect the location of the ACPI PM Timer */
+	if (fadt->revision >= FADT2_REVISION_ID) {
+		/* FADT rev. 2 */
+		if (fadt->xpm_tmr_blk.address_space_id != ACPI_ADR_SPACE_SYSTEM_IO)
+			return 0;
+
+		pmtmr_ioport = fadt->xpm_tmr_blk.address;
+	} else {
+		/* FADT rev. 1 */
+		pmtmr_ioport = fadt->V1_pm_tmr_blk;
+	}
+	if (pmtmr_ioport)
+		printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", pmtmr_ioport);
+#endif
+	return 0;
+}
+
+
+unsigned long __init
+acpi_find_rsdp (void)
+{
+	unsigned long		rsdp_phys = 0;
+
+	if (efi_enabled) {
+		if (efi.acpi20)
+			return __pa(efi.acpi20);
+		else if (efi.acpi)
+			return __pa(efi.acpi);
+	}
+	/*
+	 * Scan memory looking for the RSDP signature. First search EBDA (low
+	 * memory) paragraphs and then search upper memory (E0000-FFFFF).
+	 */
+	rsdp_phys = acpi_scan_rsdp (0, 0x400);
+	if (!rsdp_phys)
+		rsdp_phys = acpi_scan_rsdp (0xE0000, 0x20000);
+
+	set_fixmap(FIX_ACPI_RSDP_PAGE, rsdp_phys);
+
+	return rsdp_phys;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Parse LAPIC entries in MADT
+ * returns 0 on success, < 0 on error
+ */
+static int __init
+acpi_parse_madt_lapic_entries(void)
+{
+	int count;
+
+	/*
+	 * Note that the LAPIC address is obtained from the MADT (32-bit value)
+	 * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
+	 */
+
+	count = acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR, acpi_parse_lapic_addr_ovr, 0);
+	if (count < 0) {
+		printk(KERN_ERR PREFIX "Error parsing LAPIC address override entry\n");
+		return count;
+	}
+
+	mp_register_lapic_address(acpi_lapic_addr);
+
+	count = acpi_table_parse_madt(ACPI_MADT_LAPIC, acpi_parse_lapic,
+				       MAX_APICS);
+	if (!count) {
+		printk(KERN_ERR PREFIX "No LAPIC entries present\n");
+		/* TBD: Cleanup to allow fallback to MPS */
+		return -ENODEV;
+	}
+	else if (count < 0) {
+		printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
+		/* TBD: Cleanup to allow fallback to MPS */
+		return count;
+	}
+
+	count = acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0);
+	if (count < 0) {
+		printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
+		/* TBD: Cleanup to allow fallback to MPS */
+		return count;
+	}
+	return 0;
+}
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_ACPI_INTERPRETER)
+/*
+ * Parse IOAPIC related entries in MADT
+ * returns 0 on success, < 0 on error
+ */
+static int __init
+acpi_parse_madt_ioapic_entries(void)
+{
+	int count;
+
+	/*
+	 * ACPI interpreter is required to complete interrupt setup,
+	 * so if it is off, don't enumerate the io-apics with ACPI.
+	 * If MPS is present, it will handle them,
+	 * otherwise the system will stay in PIC mode
+	 */
+	if (acpi_disabled || acpi_noirq) {
+		return -ENODEV;
+	}
+
+	/*
+	 * if "noapic" boot option, don't look for IO-APICs
+	 */
+	if (skip_ioapic_setup) {
+		printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
+			"due to 'noapic' option.\n");
+		return -ENODEV;
+	}
+
+	count = acpi_table_parse_madt(ACPI_MADT_IOAPIC, acpi_parse_ioapic, MAX_IO_APICS);
+	if (!count) {
+		printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
+		return -ENODEV;
+	}
+	else if (count < 0) {
+		printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
+		return count;
+	}
+
+	count = acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr, NR_IRQ_VECTORS);
+	if (count < 0) {
+		printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n");
+		/* TBD: Cleanup to allow fallback to MPS */
+		return count;
+	}
+
+	/*
+	 * If BIOS did not supply an INT_SRC_OVR for the SCI
+	 * pretend we got one so we can set the SCI flags.
+	 */
+	if (!acpi_sci_override_gsi)
+		acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0);
+
+	/* Fill in identity legacy mappings where no override */
+	mp_config_acpi_legacy_irqs();
+
+	count = acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src, NR_IRQ_VECTORS);
+	if (count < 0) {
+		printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
+		/* TBD: Cleanup to allow fallback to MPS */
+		return count;
+	}
+
+	return 0;
+}
+#else
+static inline int acpi_parse_madt_ioapic_entries(void)
+{
+	return -1;
+}
+#endif /* !(CONFIG_X86_IO_APIC && CONFIG_ACPI_INTERPRETER) */
+
+
+static void __init
+acpi_process_madt(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+	int count, error;
+
+	count = acpi_table_parse(ACPI_APIC, acpi_parse_madt);
+	if (count >= 1) {
+
+		/*
+		 * Parse MADT LAPIC entries
+		 */
+		error = acpi_parse_madt_lapic_entries();
+		if (!error) {
+			acpi_lapic = 1;
+
+			/*
+			 * Parse MADT IO-APIC entries
+			 */
+			error = acpi_parse_madt_ioapic_entries();
+			if (!error) {
+				acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
+				acpi_irq_balance_set(NULL);
+				acpi_ioapic = 1;
+
+				smp_found_config = 1;
+				clustered_apic_check();
+			}
+		}
+		if (error == -EINVAL) {
+			/*
+			 * Dell Precision Workstation 410, 610 come here.
+			 */
+			printk(KERN_ERR PREFIX "Invalid BIOS MADT, disabling ACPI\n");
+			disable_acpi();
+		}
+	}
+#endif
+	return;
+}
+
+/*
+ * acpi_boot_table_init() and acpi_boot_init()
+ *  called from setup_arch(), always.
+ *	1. checksums all tables
+ *	2. enumerates lapics
+ *	3. enumerates io-apics
+ *
+ * acpi_table_init() is separate to allow reading SRAT without
+ * other side effects.
+ *
+ * side effects of acpi_boot_init:
+ *	acpi_lapic = 1 if LAPIC found
+ *	acpi_ioapic = 1 if IOAPIC found
+ *	if (acpi_lapic && acpi_ioapic) smp_found_config = 1;
+ *	if acpi_blacklisted() acpi_disabled = 1;
+ *	acpi_irq_model=...
+ *	...
+ *
+ * return value: (currently ignored)
+ *	0: success
+ *	!0: failure
+ */
+
+int __init
+acpi_boot_table_init(void)
+{
+	int error;
+
+	/*
+	 * If acpi_disabled, bail out
+	 * One exception: acpi=ht continues far enough to enumerate LAPICs
+	 */
+	if (acpi_disabled && !acpi_ht)
+		 return 1;
+
+	/*
+	 * Initialize the ACPI boot-time table parser.
+	 */
+	error = acpi_table_init();
+	if (error) {
+		disable_acpi();
+		return error;
+	}
+
+#ifdef __i386__
+	check_acpi_pci();
+#endif
+
+	acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
+
+	/*
+	 * blacklist may disable ACPI entirely
+	 */
+	error = acpi_blacklisted();
+	if (error) {
+		extern int acpi_force;
+
+		if (acpi_force) {
+			printk(KERN_WARNING PREFIX "acpi=force override\n");
+		} else {
+			printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
+			disable_acpi();
+			return error;
+		}
+	}
+
+	return 0;
+}
+
+
+int __init acpi_boot_init(void)
+{
+	/*
+	 * If acpi_disabled, bail out
+	 * One exception: acpi=ht continues far enough to enumerate LAPICs
+	 */
+	if (acpi_disabled && !acpi_ht)
+		 return 1;
+
+	acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
+
+	/*
+	 * set sci_int and PM timer address
+	 */
+	acpi_table_parse(ACPI_FADT, acpi_parse_fadt);
+
+	/*
+	 * Process the Multiple APIC Description Table (MADT), if present
+	 */
+	acpi_process_madt();
+
+	acpi_table_parse(ACPI_HPET, acpi_parse_hpet);
+	acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg);
+
+	return 0;
+}
+
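
For orientation, the comment block above describes a two-stage flow. In the mainline i386 boot path, which this port follows, the two entry points are driven from setup_arch() in roughly this order; a sketch of the expected calling sequence, not code from this patch:

    /* Sketch of the boot-time ordering assumed by boot.c above. */
    void __init setup_arch_acpi_part(void)
    {
    	acpi_boot_table_init();	/* map and checksum tables, honour blacklist */
    	/* ... memory and platform setup runs in between ... */
    	acpi_boot_init();	/* parse FADT, MADT, HPET, MCFG */
    }
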
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/acpi/Makefile
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/acpi/Makefile
@@ -0,0 +1,13 @@
+obj-$(CONFIG_ACPI_BOOT)		:= boot.o
+c-obj-$(CONFIG_X86_IO_APIC)	+= earlyquirk.o
+c-obj-$(CONFIG_ACPI_SLEEP)	+= sleep.o wakeup.o
+
+c-link :=
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
+	@ln -fsn $(srctree)/arch/i386/kernel/acpi/$(notdir $@) $@
+
+obj-y	+= $(c-obj-y) $(s-obj-y)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
+clean-files += $(patsubst %.o,%.S,$(s-obj-y) $(s-obj-) $(s-link))
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/apic.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/apic.c
@@ -0,0 +1,83 @@
+/*
+ *	Local APIC handling, local APIC timers
+ *
+ *	(c) 1999, 2000 Ingo Molnar <mingo@xxxxxxxxxx>
+ *
+ *	Fixes
+ *	Maciej W. Rozycki	:	Bits for genuine 82489DX APICs;
+ *					thanks to Eric Gilmore
+ *					and Rolf G. Tews
+ *					for testing these extensively.
+ *	Maciej W. Rozycki	:	Various updates and fixes.
+ *	Mikael Pettersson	:	Power Management for UP-APIC.
+ *	Pavel Machek and
+ *	Mikael Pettersson	:	PM converted to driver model.
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/mc146818rtc.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+
+#include <asm/atomic.h>
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/desc.h>
+#include <asm/arch_hooks.h>
+#include <asm/hpet.h>
+
+#include <mach_apic.h>
+
+#include "io_ports.h"
+
+/*
+ * Debug level
+ */
+int apic_verbosity;
+
+int get_physical_broadcast(void)
+{
+	return 0xff;
+}
+
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+	printk("unexpected IRQ trap at vector %02x\n", irq);
+	/*
+	 * Currently unexpected vectors happen only on SMP and APIC.
+	 * We _must_ ack these because every local APIC has only N
+	 * irq slots per priority level, and a 'hanging, unacked' IRQ
+	 * holds up an irq slot - in excessive cases (when multiple
+	 * unexpected vectors occur) that might lock up the APIC
+	 * completely.
+	 */
+	ack_APIC_irq();
+}
+
+/*
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
+ */
+int __init APIC_init_uniprocessor (void)
+{
+#ifdef CONFIG_X86_IO_APIC
+	if (smp_found_config)
+		if (!skip_ioapic_setup && nr_ioapics)
+			setup_IO_APIC();
+#endif
+
+	return 0;
+}
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/common.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/common.c
@@ -0,0 +1,650 @@
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <asm/semaphore.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/msr.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <mach_apic.h>
+#endif
+#include <asm-xen/hypervisor.h>
+
+#include "cpu.h"
+
+DEFINE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);
+EXPORT_PER_CPU_SYMBOL(cpu_gdt_table);
+
+DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
+EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
+
+static int cachesize_override __initdata = -1;
+static int disable_x86_fxsr __initdata = 0;
+static int disable_x86_serial_nr __initdata = 1;
+
+struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
+
+extern void mcheck_init(struct cpuinfo_x86 *c);
+
+extern void machine_specific_modify_cpu_capabilities(struct cpuinfo_x86 *c);
+
+extern int disable_pse;
+
+static void default_init(struct cpuinfo_x86 * c)
+{
+	/* Not much we can do here... */
+	/* Check if at least it has cpuid */
+	if (c->cpuid_level == -1) {
+		/* No cpuid. It must be an ancient CPU */
+		if (c->x86 == 4)
+			strcpy(c->x86_model_id, "486");
+		else if (c->x86 == 3)
+			strcpy(c->x86_model_id, "386");
+	}
+}
+
+static struct cpu_dev default_cpu = {
+	.c_init	= default_init,
+};
+static struct cpu_dev * this_cpu = &default_cpu;
+
+static int __init cachesize_setup(char *str)
+{
+	get_option (&str, &cachesize_override);
+	return 1;
+}
+__setup("cachesize=", cachesize_setup);
+
+int __init get_model_name(struct cpuinfo_x86 *c)
+{
+	unsigned int *v;
+	char *p, *q;
+
+	if (cpuid_eax(0x80000000) < 0x80000004)
+		return 0;
+
+	v = (unsigned int *) c->x86_model_id;
+	cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+	cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+	cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
+	c->x86_model_id[48] = 0;
+
+	/* Intel chips right-justify this string for some dumb reason;
+	   undo that brain damage */
+	p = q = &c->x86_model_id[0];
+	while ( *p == ' ' )
+		p++;
+	if ( p != q ) {
+		while ( *p )
+			*q++ = *p++;
+		while ( q <= &c->x86_model_id[48] )
+			*q++ = '\0';	/* Zero-pad the rest */
+	}
+
+	return 1;
+}
+
+
+void __init display_cacheinfo(struct cpuinfo_x86 *c)
+{
+	unsigned int n, dummy, ecx, edx, l2size;
+
+	n = cpuid_eax(0x80000000);
+
+	if (n >= 0x80000005) {
+		cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
+		printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
+			edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+		c->x86_cache_size=(ecx>>24)+(edx>>24);
+	}
+
+	if (n < 0x80000006)	/* Some chips just have a large L1. */
+		return;
+
+	ecx = cpuid_ecx(0x80000006);
+	l2size = ecx >> 16;
+
+	/* do processor-specific cache resizing */
+	if (this_cpu->c_size_cache)
+		l2size = this_cpu->c_size_cache(c,l2size);
+
+	/* Allow user to override all this if necessary. */
+	if (cachesize_override != -1)
+		l2size = cachesize_override;
+
+	if ( l2size == 0 )
+		return;		/* Again, no L2 cache is possible */
+
+	c->x86_cache_size = l2size;
+
+	printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
+	       l2size, ecx & 0xFF);
+}
+
+/* Naming convention should be: <Name> [(<Codename>)] */
+/* This table only is used unless init_<vendor>() below doesn't set it; */
+/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
+
+/* Look up CPU names by table lookup. */
+static char __init *table_lookup_model(struct cpuinfo_x86 *c)
+{
+	struct cpu_model_info *info;
+
+	if ( c->x86_model >= 16 )
+		return NULL;	/* Range check */
+
+	if (!this_cpu)
+		return NULL;
+
+	info = this_cpu->c_models;
+
+	while (info && info->family) {
+		if (info->family == c->x86)
+			return info->model_names[c->x86_model];
+		info++;
+	}
+	return NULL;		/* Not found */
+}
+
+
+void __init get_cpu_vendor(struct cpuinfo_x86 *c, int early)
+{
+	char *v = c->x86_vendor_id;
+	int i;
+
+	for (i = 0; i < X86_VENDOR_NUM; i++) {
+		if (cpu_devs[i]) {
+			if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
+			    (cpu_devs[i]->c_ident[1] &&
+			     !strcmp(v,cpu_devs[i]->c_ident[1]))) {
+				c->x86_vendor = i;
+				if (!early)
+					this_cpu = cpu_devs[i];
+				break;
+			}
+		}
+	}
+}
+
+
+static int __init x86_fxsr_setup(char * s)
+{
+	disable_x86_fxsr = 1;
+	return 1;
+}
+__setup("nofxsr", x86_fxsr_setup);
+
+
+/* Standard macro to see if a specific flag is changeable */
+static inline int flag_is_changeable_p(u32 flag)
+{
+	u32 f1, f2;
+
+	asm("pushfl\n\t"
+	    "pushfl\n\t"
+	    "popl %0\n\t"
+	    "movl %0,%1\n\t"
+	    "xorl %2,%0\n\t"
+	    "pushl %0\n\t"
+	    "popfl\n\t"
+	    "pushfl\n\t"
+	    "popl %0\n\t"
+	    "popfl\n\t"
+	    : "=&r" (f1), "=&r" (f2)
+	    : "ir" (flag));
+
+	return ((f1^f2) & flag) != 0;
+}
+
+
+/* Probe for the CPUID instruction */
+static int __init have_cpuid_p(void)
+{
+	return flag_is_changeable_p(X86_EFLAGS_ID);
+}
+
+/* Do minimum CPU detection early.
+   Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
+   The others are not touched to avoid unwanted side effects. */
+static void __init early_cpu_detect(void)
+{
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	c->x86_cache_alignment = 32;
+
+	if (!have_cpuid_p())
+		return;
+
+	/* Get vendor name */
+	cpuid(0x00000000, &c->cpuid_level,
+	      (int *)&c->x86_vendor_id[0],
+	      (int *)&c->x86_vendor_id[8],
+	      (int *)&c->x86_vendor_id[4]);
+
+	get_cpu_vendor(c, 1);
+
+	c->x86 = 4;
+	if (c->cpuid_level >= 0x00000001) {
+		u32 junk, tfms, cap0, misc;
+		cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
+		c->x86 = (tfms >> 8) & 15;
+		c->x86_model = (tfms >> 4) & 15;
+		if (c->x86 == 0xf) {
+			c->x86 += (tfms >> 20) & 0xff;
+			c->x86_model += ((tfms >> 16) & 0xF) << 4;
+		}
+		c->x86_mask = tfms & 15;
+		if (cap0 & (1<<19))
+			c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
+	}
+
+	early_intel_workaround(c);
+
+#ifdef CONFIG_X86_HT
+	phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
+#endif
+}
+
+void __init generic_identify(struct cpuinfo_x86 * c)
+{
+	u32 tfms, xlvl;
+	int junk;
+
+	if (have_cpuid_p()) {
+		/* Get vendor name */
+		cpuid(0x00000000, &c->cpuid_level,
+		      (int *)&c->x86_vendor_id[0],
+		      (int *)&c->x86_vendor_id[8],
+		      (int *)&c->x86_vendor_id[4]);
+
+		get_cpu_vendor(c, 0);
+		/* Initialize the standard set of capabilities */
+		/* Note that the vendor-specific code below might override */
+
+		/* Intel-defined flags: level 0x00000001 */
+		if ( c->cpuid_level >= 0x00000001 ) {
+			u32 capability, excap;
+			cpuid(0x00000001, &tfms, &junk, &excap, &capability);
+			c->x86_capability[0] = capability;
+			c->x86_capability[4] = excap;
+			c->x86 = (tfms >> 8) & 15;
+			c->x86_model = (tfms >> 4) & 15;
+			if (c->x86 == 0xf) {
+				c->x86 += (tfms >> 20) & 0xff;
+				c->x86_model += ((tfms >> 16) & 0xF) << 4;
+			}
+			c->x86_mask = tfms & 15;
+		} else {
+			/* Have CPUID level 0 only - unheard of */
+			c->x86 = 4;
+		}
+
+		/* AMD-defined flags: level 0x80000001 */
+		xlvl = cpuid_eax(0x80000000);
+		if ( (xlvl & 0xffff0000) == 0x80000000 ) {
+			if ( xlvl >= 0x80000001 ) {
+				c->x86_capability[1] = cpuid_edx(0x80000001);
+				c->x86_capability[6] = cpuid_ecx(0x80000001);
+			}
+			if ( xlvl >= 0x80000004 )
+				get_model_name(c); /* Default name */
+		}
+	}
+}
+
+static void __init squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+	if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
+		/* Disable processor serial number */
+		unsigned long lo,hi;
+		rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
+		lo |= 0x200000;
+		wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
+		printk(KERN_NOTICE "CPU serial number disabled.\n");
+		clear_bit(X86_FEATURE_PN, c->x86_capability);
+
+		/* Disabling the serial number may affect the cpuid level */
+		c->cpuid_level = cpuid_eax(0);
+	}
+}
+
+static int __init x86_serial_nr_setup(char *s)
+{
+	disable_x86_serial_nr = 0;
+	return 1;
+}
+__setup("serialnumber", x86_serial_nr_setup);
+
+
+
+/*
+ * This does the hard work of actually picking apart the CPU stuff...
+ */
+void __init identify_cpu(struct cpuinfo_x86 *c)
+{
+	int i;
+
+	c->loops_per_jiffy = loops_per_jiffy;
+	c->x86_cache_size = -1;
+	c->x86_vendor = X86_VENDOR_UNKNOWN;
+	c->cpuid_level = -1;	/* CPUID not detected */
+	c->x86_model = c->x86_mask = 0;	/* So far unknown... */
+	c->x86_vendor_id[0] = '\0'; /* Unset */
+	c->x86_model_id[0] = '\0';  /* Unset */
+	c->x86_num_cores = 1;
+	memset(&c->x86_capability, 0, sizeof c->x86_capability);
+
+	if (!have_cpuid_p()) {
+		/* First of all, decide if this is a 486 or higher */
+		/* It's a 486 if we can modify the AC flag */
+		if ( flag_is_changeable_p(X86_EFLAGS_AC) )
+			c->x86 = 4;
+		else
+			c->x86 = 3;
+	}
+
+	generic_identify(c);
+
+	printk(KERN_DEBUG "CPU: After generic identify, caps:");
+	for (i = 0; i < NCAPINTS; i++)
+		printk(" %08lx", c->x86_capability[i]);
+	printk("\n");
+
+	if (this_cpu->c_identify) {
+		this_cpu->c_identify(c);
+
+		printk(KERN_DEBUG "CPU: After vendor identify, caps:");
+		for (i = 0; i < NCAPINTS; i++)
+			printk(" %08lx", c->x86_capability[i]);
+		printk("\n");
+	}
+
+	/*
+	 * Vendor-specific initialization.  In this section we
+	 * canonicalize the feature flags, meaning if there are
+	 * features a certain CPU supports which CPUID doesn't
+	 * tell us, CPUID claiming incorrect flags, or other bugs,
+	 * we handle them here.
+	 *
+	 * At the end of this section, c->x86_capability better
+	 * indicate the features this CPU genuinely supports!
+	 */
+	if (this_cpu->c_init)
+		this_cpu->c_init(c);
+
+	/* Disable the PN if appropriate */
+	squash_the_stupid_serial_number(c);
+
+	/*
+	 * The vendor-specific functions might have changed features.  Now
+	 * we do "generic changes."
+	 */
+
+	/* TSC disabled? */
+	if ( tsc_disable )
+		clear_bit(X86_FEATURE_TSC, c->x86_capability);
+
+	/* FXSR disabled? */
+	if (disable_x86_fxsr) {
+		clear_bit(X86_FEATURE_FXSR, c->x86_capability);
+		clear_bit(X86_FEATURE_XMM, c->x86_capability);
+	}
+
+	if (disable_pse)
+		clear_bit(X86_FEATURE_PSE, c->x86_capability);
+
+	/* If the model name is still unset, do table lookup. */
+	if ( !c->x86_model_id[0] ) {
+		char *p;
+		p = table_lookup_model(c);
+		if ( p )
+			strcpy(c->x86_model_id, p);
+		else
+			/* Last resort... */
+			sprintf(c->x86_model_id, "%02x/%02x",
+				c->x86_vendor, c->x86_model);
+	}
+
+	machine_specific_modify_cpu_capabilities(c);
+
+	/* Now the feature flags better reflect actual CPU features! */
+
+	printk(KERN_DEBUG "CPU: After all inits, caps:");
+	for (i = 0; i < NCAPINTS; i++)
+		printk(" %08lx", c->x86_capability[i]);
+	printk("\n");
+
+	/*
+	 * On SMP, boot_cpu_data holds the common feature set between
+	 * all CPUs; so make sure that we indicate which features are
+	 * common between the CPUs.  The first time this routine gets
+	 * executed, c == &boot_cpu_data.
+	 */
+	if ( c != &boot_cpu_data ) {
+		/* AND the already accumulated flags with these */
+		for ( i = 0 ; i < NCAPINTS ; i++ )
+			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+	}
+
+	/* Init Machine Check Exception if available. */
+#ifdef CONFIG_X86_MCE
+	mcheck_init(c);
+#endif
+}
+
+#ifdef CONFIG_X86_HT
+void __init detect_ht(struct cpuinfo_x86 *c)
+{
+	u32 	eax, ebx, ecx, edx;
+	int 	index_msb, tmp;
+	int 	cpu = smp_processor_id();
+
+	if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
+		return;
+
+	cpuid(1, &eax, &ebx, &ecx, &edx);
+	smp_num_siblings = (ebx & 0xff0000) >> 16;
+
+	if (smp_num_siblings == 1) {
+		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
+	} else if (smp_num_siblings > 1 ) {
+		index_msb = 31;
+
+		if (smp_num_siblings > NR_CPUS) {
+			printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
+			smp_num_siblings = 1;
+			return;
+		}
+		tmp = smp_num_siblings;
+		while ((tmp & 0x80000000 ) == 0) {
+			tmp <<=1 ;
+			index_msb--;
+		}
+		if (smp_num_siblings & (smp_num_siblings - 1))
+			index_msb++;
+		phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
+
+		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+		       phys_proc_id[cpu]);
+
+		smp_num_siblings = smp_num_siblings / c->x86_num_cores;
+
+		tmp = smp_num_siblings;
+		index_msb = 31;
+		while ((tmp & 0x80000000) == 0) {
+			tmp <<=1 ;
+			index_msb--;
+		}
+
+		if (smp_num_siblings & (smp_num_siblings - 1))
+			index_msb++;
+
+		cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
+
+		if (c->x86_num_cores > 1)
+			printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
+			       cpu_core_id[cpu]);
+	}
+}
+#endif
+
+void __init print_cpu_info(struct cpuinfo_x86 *c)
+{
+	char *vendor = NULL;
+
+	if (c->x86_vendor < X86_VENDOR_NUM)
+		vendor = this_cpu->c_vendor;
+	else if (c->cpuid_level >= 0)
+		vendor = c->x86_vendor_id;
+
+	if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
+		printk("%s ", vendor);
+
+	if (!c->x86_model_id[0])
+		printk("%d86", c->x86);
+	else
+		printk("%s", c->x86_model_id);
+
+	if (c->x86_mask || c->cpuid_level >= 0)
+		printk(" stepping %02x\n", c->x86_mask);
+	else
+		printk("\n");
+}
+
+cpumask_t cpu_initialized __initdata = CPU_MASK_NONE;
+
+/* This is hacky. :)
+ * We're emulating future behavior.
+ * In the future, the cpu-specific init functions will be called implicitly
+ * via the magic of initcalls.
+ * They will insert themselves into the cpu_devs structure.
+ * Then, when cpu_init() is called, we can just iterate over that array.
+ */
+
+extern int intel_cpu_init(void);
+extern int cyrix_init_cpu(void);
+extern int nsc_init_cpu(void);
+extern int amd_init_cpu(void);
+extern int centaur_init_cpu(void);
+extern int transmeta_init_cpu(void);
+extern int rise_init_cpu(void);
+extern int nexgen_init_cpu(void);
+extern int umc_init_cpu(void);
+
+void __init early_cpu_init(void)
+{
+	intel_cpu_init();
+	cyrix_init_cpu();
+	nsc_init_cpu();
+	amd_init_cpu();
+	centaur_init_cpu();
+	transmeta_init_cpu();
+	rise_init_cpu();
+	nexgen_init_cpu();
+	umc_init_cpu();
+	early_cpu_detect();
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/* pse is not compatible with on-the-fly unmapping,
+	 * disable it even if the cpus claim to support it.
+	 */
+	clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+	disable_pse = 1;
+#endif
+}
+
+void __init cpu_gdt_init(struct Xgt_desc_struct *gdt_descr)
+{
+	unsigned long frames[16];
+	unsigned long va;
+	int f;
+
+	for (va = gdt_descr->address, f = 0;
+	     va < gdt_descr->address + gdt_descr->size;
+	     va += PAGE_SIZE, f++) {
+		frames[f] = virt_to_machine(va) >> PAGE_SHIFT;
+		make_page_readonly((void *)va);
+	}
+	if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8))
+		BUG();
+	lgdt_finish();
+}
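
cpu_gdt_init() above is a nice illustration of the paravirtual pattern: a guest may not load a descriptor table from writable memory, so it pins the pages read-only and asks Xen to install them instead of executing lgdt. A stripped-down rehearsal of the same frame-list construction for the single-page case; a sketch assuming the helpers visible in the function above, not code this patch adds:

    /* Sketch: register a one-page GDT with the hypervisor.  The guest must
     * (1) convert the virtual address to a machine frame number,
     * (2) drop write access to the page, (3) issue the hypercall. */
    static void register_small_gdt(unsigned long gdt_va, unsigned int entries)
    {
    	unsigned long frame = virt_to_machine(gdt_va) >> PAGE_SHIFT;

    	make_page_readonly((void *)gdt_va);	/* Xen refuses writable GDTs */
    	if (HYPERVISOR_set_gdt(&frame, entries))
    		BUG();				/* no native lgdt fallback here */
    }
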
Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + */ +void __init cpu_init (void) +{ + int cpu = smp_processor_id(); + struct tss_struct * t = &per_cpu(init_tss, cpu); + struct thread_struct *thread = ¤t->thread; + + if (cpu_test_and_set(cpu, cpu_initialized)) { + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); + for (;;) local_irq_enable(); + } + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + if (cpu_has_vme || cpu_has_de) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + if (tsc_disable && cpu_has_tsc) { + printk(KERN_NOTICE "Disabling TSC...\n"); + /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/ + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); + set_in_cr4(X86_CR4_TSD); + } + + /* + * Set up the per-thread TLS descriptor cache: + */ + memcpy(thread->tls_array, &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN], + GDT_ENTRY_TLS_ENTRIES * 8); + + cpu_gdt_init(&cpu_gdt_descr[cpu]); + + /* + * Delete NT + */ + __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl"); + + /* + * Set up and load the per-CPU TSS and LDT + */ + atomic_inc(&init_mm.mm_count); + current->active_mm = &init_mm; + if (current->mm) + BUG(); + enter_lazy_tlb(&init_mm, current); + + load_esp0(t, thread); + + load_LDT(&init_mm.context); + + /* Clear %fs and %gs. */ + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); + + /* Clear all 6 debug registers: */ + +#define CD(register) HYPERVISOR_set_debugreg(register, 0) + + CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); + +#undef CD + + /* + * Force FPU initialization: + */ + current_thread_info()->status = 0; + clear_used_math(); + mxcsr_feature_mask_init(); +} Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/Makefile =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/Makefile @@ -0,0 +1,31 @@ +# +# Makefile for x86-compatible CPU details and quirks +# + +CFLAGS += -Iarch/i386/kernel/cpu + +obj-y := common.o +c-obj-y += proc.o + +c-obj-y += amd.o +c-obj-y += cyrix.o +c-obj-y += centaur.o +c-obj-y += transmeta.o +c-obj-y += intel.o intel_cacheinfo.o +c-obj-y += rise.o +c-obj-y += nexgen.o +c-obj-y += umc.o + +#obj-$(CONFIG_X86_MCE) += ../../../../i386/kernel/cpu/mcheck/ + +obj-$(CONFIG_MTRR) += mtrr/ +#obj-$(CONFIG_CPU_FREQ) += ../../../../i386/kernel/cpu/cpufreq/ + +c-link := + +$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): + @ln -fsn $(srctree)/arch/i386/kernel/cpu/$(notdir $@) $@ + +obj-y += $(c-obj-y) + +clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/mtrr/main.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/mtrr/main.c @@ -0,0 +1,165 @@ +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <asm/uaccess.h> + +#include <asm/mtrr.h> +#include "mtrr.h" + +void generic_get_mtrr(unsigned int reg, unsigned long *base, + unsigned int *size, mtrr_type * type) +{ + dom0_op_t op; + + op.cmd = DOM0_READ_MEMTYPE; + op.u.read_memtype.reg = reg; + (void)HYPERVISOR_dom0_op(&op); + + *size = op.u.read_memtype.nr_pfns; + *base = op.u.read_memtype.pfn; + *type = op.u.read_memtype.type; +} + +struct mtrr_ops 
generic_mtrr_ops = { + .use_intel_if = 1, + .get = generic_get_mtrr, +}; + +struct mtrr_ops *mtrr_if = &generic_mtrr_ops; +unsigned int num_var_ranges; +unsigned int *usage_table; + +static void __init set_num_var_ranges(void) +{ + dom0_op_t op; + + for (num_var_ranges = 0; ; num_var_ranges++) { + op.cmd = DOM0_READ_MEMTYPE; + op.u.read_memtype.reg = num_var_ranges; + if (HYPERVISOR_dom0_op(&op) != 0) + break; + } +} + +static void __init init_table(void) +{ + int i, max; + + max = num_var_ranges; + if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL)) + == NULL) { + printk(KERN_ERR "mtrr: could not allocate\n"); + return; + } + for (i = 0; i < max; i++) + usage_table[i] = 0; +} + +int mtrr_add_page(unsigned long base, unsigned long size, + unsigned int type, char increment) +{ + int error; + dom0_op_t op; + + op.cmd = DOM0_ADD_MEMTYPE; + op.u.add_memtype.pfn = base; + op.u.add_memtype.nr_pfns = size; + op.u.add_memtype.type = type; + if ((error = HYPERVISOR_dom0_op(&op))) + return error; + + if (increment) + ++usage_table[op.u.add_memtype.reg]; + + return op.u.add_memtype.reg; +} + +int +mtrr_add(unsigned long base, unsigned long size, unsigned int type, + char increment) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + printk(KERN_WARNING "mtrr: size and base must be multiples of 4 kiB\n"); + printk(KERN_DEBUG "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + return -EINVAL; + } + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, + increment); +} + +int mtrr_del_page(int reg, unsigned long base, unsigned long size) +{ + int i, max; + mtrr_type ltype; + unsigned long lbase; + unsigned int lsize; + int error = -EINVAL; + dom0_op_t op; + + max = num_var_ranges; + if (reg < 0) { + /* Search for existing MTRR */ + for (i = 0; i < max; ++i) { + mtrr_if->get(i, &lbase, &lsize, &ltype); + if (lbase == base && lsize == size) { + reg = i; + break; + } + } + if (reg < 0) { + printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, + size); + goto out; + } + } + if (usage_table[reg] < 1) { + printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); + goto out; + } + if (--usage_table[reg] < 1) { + op.cmd = DOM0_DEL_MEMTYPE; + op.u.del_memtype.handle = 0; + op.u.del_memtype.reg = reg; + (void)HYPERVISOR_dom0_op(&op); + } + error = reg; + out: + return error; +} + +int +mtrr_del(int reg, unsigned long base, unsigned long size) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + printk(KERN_INFO "mtrr: size and base must be multiples of 4 kiB\n"); + printk(KERN_DEBUG "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + return -EINVAL; + } + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); +} + +EXPORT_SYMBOL(mtrr_add); +EXPORT_SYMBOL(mtrr_del); + +static int __init mtrr_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (!(xen_start_info.flags & SIF_PRIVILEGED)) + return -ENODEV; + + if ((!cpu_has(c, X86_FEATURE_MTRR)) && + (!cpu_has(c, X86_FEATURE_K6_MTRR)) && + (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && + (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) + return -ENODEV; + + set_num_var_ranges(); + init_table(); + + return 0; +} + +subsys_initcall(mtrr_init); Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/mtrr/Makefile =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/mtrr/Makefile @@ -0,0 +1,16 @@ +obj-y := main.o +c-obj-y := if.o + +c-link := + +$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): $(obj)/mtrr.h + @ln
-fsn $(srctree)/arch/i386/kernel/cpu/mtrr/$(notdir $@) $@ + +$(patsubst %.o,$(obj)/%.c,$(obj-y)): $(obj)/mtrr.h + +$(obj)/mtrr.h: + @ln -fsn $(srctree)/arch/i386/kernel/cpu/mtrr/mtrr.h $@ + +obj-y += $(c-obj-y) + +clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/entry.S =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/entry.S @@ -0,0 +1,753 @@ +/* + * linux/arch/i386/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'ret_from_system_call': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - orig_eax + * 28(%esp) - %eip + * 2C(%esp) - %cs + * 30(%esp) - %eflags + * 34(%esp) - %oldesp + * 38(%esp) - %oldss + * + * "current" is in register %ebx during any slow entries. + */ + +#include <linux/config.h> +#include <linux/linkage.h> +#include <asm/thread_info.h> +#include <asm/errno.h> +#include <asm/segment.h> +#include <asm/smp.h> +#include <asm/page.h> +#include "irq_vectors.h" +#include <xen-public/xen.h> + +#define nr_syscalls ((syscall_table_size)/4) + +EBX = 0x00 +ECX = 0x04 +EDX = 0x08 +ESI = 0x0C +EDI = 0x10 +EBP = 0x14 +EAX = 0x18 +DS = 0x1C +ES = 0x20 +ORIG_EAX = 0x24 +EIP = 0x28 +CS = 0x2C +EVENT_MASK = 0x2E +EFLAGS = 0x30 +OLDESP = 0x34 +OLDSS = 0x38 + +CF_MASK = 0x00000001 +TF_MASK = 0x00000100 +IF_MASK = 0x00000200 +DF_MASK = 0x00000400 +NT_MASK = 0x00004000 +VM_MASK = 0x00020000 + +/* Offsets into shared_info_t. 
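These index the per-VCPU upcall mask and pending bytes in vcpu_info; writing the mask byte is this code's stand-in for cli/sti, so no privileged instruction is needed on the common path.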
*/ +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 + +#define sizeof_vcpu_shift 3 + +#ifdef CONFIG_SMP +#define preempt_disable(reg) incl TI_preempt_count(reg) +#define preempt_enable(reg) decl TI_preempt_count(reg) +#define XEN_GET_VCPU_INFO(reg) preempt_disable(%ebp) ; \ + movl TI_cpu(%ebp),reg ; \ + shl $sizeof_vcpu_shift,reg ; \ + addl HYPERVISOR_shared_info,reg +#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%ebp) +#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff +#else +#define XEN_GET_VCPU_INFO(reg) movl HYPERVISOR_shared_info,reg +#define XEN_PUT_VCPU_INFO(reg) +#define XEN_PUT_VCPU_INFO_fixup +#endif + +#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg) +#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg) +#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \ + XEN_LOCKED_BLOCK_EVENTS(reg) ; \ + XEN_PUT_VCPU_INFO(reg) +#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \ + XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \ + XEN_PUT_VCPU_INFO(reg) +#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg) + +#ifdef CONFIG_PREEMPT +#define preempt_stop GET_THREAD_INFO(%ebp) ; \ + XEN_BLOCK_EVENTS(%esi) +#else +#define preempt_stop +#define resume_kernel restore_all +#endif + +#define SAVE_ALL \ + cld; \ + pushl %es; \ + pushl %ds; \ + pushl %eax; \ + pushl %ebp; \ + pushl %edi; \ + pushl %esi; \ + pushl %edx; \ + pushl %ecx; \ + pushl %ebx; \ + movl $(__USER_DS), %edx; \ + movl %edx, %ds; \ + movl %edx, %es; + +#define RESTORE_INT_REGS \ + popl %ebx; \ + popl %ecx; \ + popl %edx; \ + popl %esi; \ + popl %edi; \ + popl %ebp; \ + popl %eax + +#define RESTORE_REGS \ + RESTORE_INT_REGS; \ +1: popl %ds; \ +2: popl %es; \ +.section .fixup,"ax"; \ +3: movl $0,(%esp); \ + jmp 1b; \ +4: movl $0,(%esp); \ + jmp 2b; \ +.previous; \ +.section __ex_table,"a";\ + .align 4; \ + .long 1b,3b; \ + .long 2b,4b; \ +.previous + + +#define RESTORE_ALL \ + RESTORE_REGS \ + addl $4, %esp; \ +1: iret; \ +.section .fixup,"ax"; \ +2: movl $(__USER_DS), %edx; \ + movl %edx, %ds; \ + movl %edx, %es; \ + movl $11,%eax; \ + call do_exit; \ +.previous; \ +.section __ex_table,"a";\ + .align 4; \ + .long 1b,2b; \ +.previous + + +ENTRY(ret_from_fork) + pushl %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + jmp syscall_exit + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN +ret_from_exception: + preempt_stop +ret_from_intr: + GET_THREAD_INFO(%ebp) + movl EFLAGS(%esp), %eax # mix EFLAGS and CS + movb CS(%esp), %al + testl $(VM_MASK | 2), %eax + jz resume_kernel # returning to kernel or vm86-space +ENTRY(resume_userspace) + XEN_BLOCK_EVENTS(%esi) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + XEN_BLOCK_EVENTS(%esi) + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? + jnz restore_all +need_resched: + movl TI_flags(%ebp), %ecx # need_resched set ? + testb $_TIF_NEED_RESCHED, %cl + jz restore_all + testb $0xFF,EVENT_MASK(%esp) # interrupts off (exception path) ? 
+ jnz restore_all + call preempt_schedule_irq + jmp need_resched +#endif + +/* SYSENTER_RETURN points to after the "sysenter" instruction in + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ + + # sysenter call handler stub +ENTRY(sysenter_entry) + movl TSS_sysenter_esp0(%esp),%esp +sysenter_past_esp: + sti + pushl $(__USER_DS) + pushl %ebp + pushfl + pushl $(__USER_CS) + pushl $SYSENTER_RETURN + +/* + * Load the potential sixth argument from user stack. + * Careful about security. + */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + + pushl %eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys + call *sys_call_table(,%eax,4) + movl %eax,EAX(%esp) + cli + movl TI_flags(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx + jne syscall_exit_work +/* if something modifies registers it must also disable sysexit */ + movl EIP(%esp), %edx + movl OLDESP(%esp), %ecx + xorl %ebp,%ebp + sti + sysexit + + + # system call handler stub +ENTRY(system_call) + pushl %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + # system call tracing in operation + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(,%eax,4) + movl %eax,EAX(%esp) # store the return value +syscall_exit: + XEN_BLOCK_EVENTS(%esi) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + movl TI_flags(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx # current->work + jne syscall_exit_work +restore_all: + testl $VM_MASK, EFLAGS(%esp) + jnz resume_vm86 + movb EVENT_MASK(%esp), %al + notb %al # %al == ~saved_mask + XEN_GET_VCPU_INFO(%esi) + andb evtchn_upcall_mask(%esi),%al + andb $1,%al # %al == mask & ~saved_mask + jnz restore_all_enable_events # != 0 => reenable event delivery + XEN_PUT_VCPU_INFO(%esi) + RESTORE_ALL + +resume_vm86: + XEN_UNBLOCK_EVENTS(%esi) + RESTORE_REGS + movl %eax,(%esp) + movl $__HYPERVISOR_switch_vm86,%eax + int $0x82 + ud2 + + # perform work that needs to be done immediately before resumption + ALIGN +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + XEN_BLOCK_EVENTS(%esi) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? 
+ jz restore_all + testb $_TIF_NEED_RESCHED, %cl + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests + testl $VM_MASK, EFLAGS(%esp) + movl %esp, %eax + jne work_notifysig_v86 # returning to kernel-space or + # vm86-space + xorl %edx, %edx + call do_notify_resume + jmp restore_all + + ALIGN +work_notifysig_v86: + pushl %ecx # save ti_flags for do_notify_resume + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + movl %eax, %esp + xorl %edx, %edx + call do_notify_resume + jmp restore_all + + # perform syscall exit tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS,EAX(%esp) + movl %esp, %eax + xorl %edx,%edx + call do_syscall_trace + movl ORIG_EAX(%esp), %eax + cmpl $(nr_syscalls), %eax + jnae syscall_call + jmp syscall_exit + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl + jz work_pending + XEN_UNBLOCK_EVENTS(%esi) # could let do_syscall_trace() call + # schedule() instead + movl %esp, %eax + movl $1, %edx + call do_syscall_trace + jmp resume_userspace + + ALIGN +syscall_fault: + pushl %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + movl $-EFAULT,EAX(%esp) + jmp resume_userspace + + ALIGN +syscall_badsys: + movl $-ENOSYS,EAX(%esp) + jmp resume_userspace + +#if 0 /* XEN */ +/* + * Build the entry stubs and pointer table with + * some assembler magic. + */ +.data +ENTRY(interrupt) +.text + +vector=0 +ENTRY(irq_entries_start) +.rept NR_IRQS + ALIGN +1: pushl $vector-256 + jmp common_interrupt +.data + .long 1b +.text +vector=vector+1 +.endr + + ALIGN +common_interrupt: + SAVE_ALL + movl %esp,%eax + call do_IRQ + jmp ret_from_intr + +#define BUILD_INTERRUPT(name, nr) \ +ENTRY(name) \ + pushl $nr-256; \ + SAVE_ALL \ + movl %esp,%eax; \ + call smp_/**/name; \ + jmp ret_from_intr; + +/* The include is where all of the SMP etc. interrupts come from */ +#include "entry_arch.h" +#endif /* XEN */ + +ENTRY(divide_error) + pushl $0 # no error code + pushl $do_divide_error + ALIGN +error_code: + pushl %ds + pushl %eax + xorl %eax, %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + decl %eax # eax = -1 + pushl %ecx + pushl %ebx + cld + movl %es, %ecx + movl ES(%esp), %edi # get the function address + movl ORIG_EAX(%esp), %edx # get the error code + movl %eax, ORIG_EAX(%esp) + movl %ecx, ES(%esp) + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until we've done all processing. HOWEVER, we must enable events before +# popping the stack frame (can't be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so we'd +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. 
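# In rough C-style pseudo-code the window looks like this ('vcpu' names
# the vcpu_info block addressed via XEN_GET_VCPU_INFO, and
# pop_frame_and_iret() stands for the RESTORE_ALL sequence):
#
#	vcpu->evtchn_upcall_mask = 0;		/* scrit: delivery on      */
#	if (vcpu->evtchn_upcall_pending) {	/* event already arrived?  */
#		vcpu->evtchn_upcall_mask = 1;	/* re-block and handle it  */
#		goto process_events;
#	}
#	pop_frame_and_iret();			/* ecrit: an upcall taken  */
#						/* mid-pop needs the fixup */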
+ENTRY(hypervisor_callback) + pushl %eax + SAVE_ALL + movl EIP(%esp),%eax + cmpl $scrit,%eax + jb 11f + cmpl $ecrit,%eax + jb critical_region_fixup +11: push %esp + call evtchn_do_upcall + add $4,%esp + jmp ret_from_intr + + ALIGN +restore_all_enable_events: + XEN_LOCKED_UNBLOCK_EVENTS(%esi) +scrit: /**** START OF CRITICAL REGION ****/ + XEN_TEST_PENDING(%esi) + jnz 14f # process more events if necessary... + XEN_PUT_VCPU_INFO(%esi) + RESTORE_ALL +14: XEN_LOCKED_BLOCK_EVENTS(%esi) + XEN_PUT_VCPU_INFO(%esi) + jmp 11b +ecrit: /**** END OF CRITICAL REGION ****/ +# [How we do the fixup]. We want to merge the current stack frame with the +# just-interrupted frame. How we do this depends on where in the critical +# region the interrupted handler was executing, and so how many saved +# registers are in each frame. We do this quickly using the lookup table +# 'critical_fixup_table'. For each byte offset in the critical region, it +# provides the number of bytes which have already been popped from the +# interrupted stack frame. +critical_region_fixup: + addl $critical_fixup_table-scrit,%eax + movzbl (%eax),%eax # %eax contains num bytes popped + cmpb $0xff,%al # 0xff => vcpu_info critical region + jne 15f + GET_THREAD_INFO(%ebp) + XEN_PUT_VCPU_INFO(%esi) # abort vcpu_info critical region + xorl %eax,%eax +15: mov %esp,%esi + add %eax,%esi # %esi points at end of src region + mov %esp,%edi + add $0x34,%edi # %edi points at end of dst region + mov %eax,%ecx + shr $2,%ecx # convert words to bytes + je 17f # skip loop if nothing to copy +16: subl $4,%esi # pre-decrementing copy loop + subl $4,%edi + movl (%esi),%eax + movl %eax,(%edi) + loop 16b +17: movl %edi,%esp # final %edi is top of merged stack + jmp 11b + +critical_fixup_table: + .byte 0xff,0xff,0xff # testb $0xff,(%esi) = XEN_TEST_PENDING + .byte 0xff,0xff # jnz 14f + XEN_PUT_VCPU_INFO_fixup + .byte 0x00 # pop %ebx + .byte 0x04 # pop %ecx + .byte 0x08 # pop %edx + .byte 0x0c # pop %esi + .byte 0x10 # pop %edi + .byte 0x14 # pop %ebp + .byte 0x18 # pop %eax + .byte 0x1c # pop %ds + .byte 0x20 # pop %es + .byte 0x24,0x24,0x24 # add $4,%esp + .byte 0x28 # iret + .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi) + XEN_PUT_VCPU_INFO_fixup + .byte 0x00,0x00 # jmp 11b + +# Hypervisor uses this for application faults while it executes. +ENTRY(failsafe_callback) +1: popl %ds +2: popl %es +3: popl %fs +4: popl %gs + subl $4,%esp + SAVE_ALL + jmp ret_from_exception +.section .fixup,"ax"; \ +6: movl $0,(%esp); \ + jmp 1b; \ +7: movl $0,(%esp); \ + jmp 2b; \ +8: movl $0,(%esp); \ + jmp 3b; \ +9: movl $0,(%esp); \ + jmp 4b; \ +.previous; \ +.section __ex_table,"a";\ + .align 4; \ + .long 1b,6b; \ + .long 2b,7b; \ + .long 3b,8b; \ + .long 4b,9b; \ +.previous + +ENTRY(coprocessor_error) + pushl $0 + pushl $do_coprocessor_error + jmp error_code + +ENTRY(simd_coprocessor_error) + pushl $0 + pushl $do_simd_coprocessor_error + jmp error_code + +ENTRY(device_not_available) + pushl $-1 # mark this as an int + SAVE_ALL + preempt_stop + call math_state_restore + jmp ret_from_exception + +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. 
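+ * (Three 4-byte words, hence the 12-byte offset passed to FIX_STACK
+ * below.)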
+ * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +#define FIX_STACK(offset, ok, label) \ + cmpw $__KERNEL_CS,4(%esp); \ + jne ok; \ +label: \ + movl TSS_sysenter_esp0+offset(%esp),%esp; \ + pushfl; \ + pushl $__KERNEL_CS; \ + pushl $sysenter_past_esp + +ENTRY(debug) + cmpl $sysenter_entry,(%esp) + jne debug_stack_correct + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) +debug_stack_correct: + pushl $-1 # mark this as an int + SAVE_ALL + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + +#if 0 /* XEN */ +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + cmpl $sysenter_entry,(%esp) + je nmi_stack_fixup + pushl %eax + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + jae nmi_stack_correct + cmpl $sysenter_entry,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + pushl %eax + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + RESTORE_ALL + +nmi_stack_fixup: + FIX_STACK(12,nmi_stack_correct, 1) + jmp nmi_stack_correct +nmi_debug_stack_check: + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug - 1,(%esp) + jle nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + jle nmi_debug_stack_fixup +nmi_debug_stack_fixup: + FIX_STACK(24,nmi_stack_correct, 1) + jmp nmi_stack_correct +#endif /* XEN */ + +ENTRY(int3) + pushl $-1 # mark this as an int + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception + +ENTRY(overflow) + pushl $0 + pushl $do_overflow + jmp error_code + +ENTRY(bounds) + pushl $0 + pushl $do_bounds + jmp error_code + +ENTRY(invalid_op) + pushl $0 + pushl $do_invalid_op + jmp error_code + +ENTRY(coprocessor_segment_overrun) + pushl $0 + pushl $do_coprocessor_segment_overrun + jmp error_code + +ENTRY(invalid_TSS) + pushl $do_invalid_TSS + jmp error_code + +ENTRY(segment_not_present) + pushl $do_segment_not_present + jmp error_code + +ENTRY(stack_segment) + pushl $do_stack_segment + jmp error_code + +ENTRY(general_protection) + pushl $do_general_protection + jmp error_code + +ENTRY(alignment_check) + pushl $do_alignment_check + jmp error_code + +# This handler is special, because it gets an extra value on its stack, +# which is the linear faulting address. 
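+# (The address arrives on the stack because reading %cr2 is a
+# privileged operation that a paravirtualized guest cannot perform
+# directly.)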
+# fastcall register usage: %eax = pt_regs, %edx = error code, +# %ecx = fault address +ENTRY(page_fault) + pushl %ds + pushl %eax + xorl %eax, %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + decl %eax /* eax = -1 */ + pushl %ecx + pushl %ebx + cld + movl %es,%edi + movl ES(%esp), %ecx /* get the faulting address */ + movl ORIG_EAX(%esp), %edx /* get the error code */ + movl %eax, ORIG_EAX(%esp) + movl %edi, ES(%esp) + movl $(__KERNEL_DS),%eax + movl %eax, %ds + movl %eax, %es + movl %esp,%eax /* pt_regs pointer */ + call do_page_fault + jmp ret_from_exception + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + pushl $0 + pushl machine_check_vector + jmp error_code +#endif + +ENTRY(fixup_4gb_segment) + pushl $do_fixup_4gb_segment + jmp error_code + +#include "syscall_table.S" + +syscall_table_size=(.-sys_call_table) Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/head.S =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/head.S @@ -0,0 +1,198 @@ + +#include <linux/config.h> + +.section __xen_guest + .ascii "GUEST_OS=linux,GUEST_VER=2.6" + .ascii ",XEN_VER=3.0" + .ascii ",VIRT_BASE=0xC0000000" +#ifdef CONFIG_X86_PAE + .ascii ",PAE=yes" +#else + .ascii ",PAE=no" +#endif + .ascii ",LOADER=generic" + .byte 0 + +.text +#include <linux/threads.h> +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/thread_info.h> +#include <asm/asm_offsets.h> +#include <xen-public/arch-x86_32.h> + +/* + * References to members of the new_cpu_data structure. + */ + +#define X86 new_cpu_data+CPUINFO_x86 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor +#define X86_MODEL new_cpu_data+CPUINFO_x86_model +#define X86_MASK new_cpu_data+CPUINFO_x86_mask +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id + +ENTRY(startup_32) + cld + + /* Copy the necessary stuff from xen_start_info structure. */ + mov $xen_start_info_union,%edi + mov $512,%ecx + rep movsl + +#ifdef CONFIG_SMP +ENTRY(startup_32_smp) + cld +#endif /* CONFIG_SMP */ + + /* Set up the stack pointer */ + lss stack_start,%esp + +checkCPUtype: + + /* get vendor info */ + xorl %eax,%eax # call CPUID with 0 -> return vendor ID + cpuid + movl %eax,X86_CPUID # save CPUID level + movl %ebx,X86_VENDOR_ID # lo 4 chars + movl %edx,X86_VENDOR_ID+4 # next 4 chars + movl %ecx,X86_VENDOR_ID+8 # last 4 chars + + movl $1,%eax # Use the CPUID instruction to get CPU type + cpuid + movb %al,%cl # save reg for future use + andb $0x0f,%ah # mask processor family + movb %ah,X86 + andb $0xf0,%al # mask model + shrb $4,%al + movb %al,X86_MODEL + andb $0x0f,%cl # mask mask revision + movb %cl,X86_MASK + movl %edx,X86_CAPABILITY + + incb ready + + xorl %eax,%eax # Clear FS/GS and LDT + movl %eax,%fs + movl %eax,%gs + cld # gcc2 wants the direction flag cleared at all times + +#ifdef CONFIG_SMP + movb ready, %cl + cmpb $1,%cl + je 1f # the first CPU calls start_kernel + # all other CPUs call initialize_secondary + call initialize_secondary + jmp L6 +1: +#endif /* CONFIG_SMP */ + call start_kernel +L6: + jmp L6 # main should never return here, but + # just in case, we know what happens. + +ENTRY(lgdt_finish) + movl $(__KERNEL_DS),%eax # reload all the segment registers + movw %ax,%ss # after changing gdt. 
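+	# (the remaining segment registers follow; %cs is reloaded last,
+	# via the intersegment return below, so that every selector refers
+	# to the GDT just installed by HYPERVISOR_set_gdt)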
+ + movl $(__USER_DS),%eax # DS/ES contains default USER segment + movw %ax,%ds + movw %ax,%es + + popl %eax # reload CS by intersegment return + pushl $(__KERNEL_CS) + pushl %eax + lret + +ENTRY(stack_start) + .long init_thread_union+THREAD_SIZE + .long __BOOT_DS + +ready: .byte 0 + +.globl idt_descr +.globl cpu_gdt_descr + + ALIGN + .word 0 # 32-bit align idt_desc.address +idt_descr: + .word IDT_ENTRIES*8-1 # idt contains 256 entries + .long idt_table + +# boot GDT descriptor (later on used by CPU#0): + .word 0 # 32 bit align gdt_desc.address +cpu_gdt_descr: + .word GDT_SIZE + .long cpu_gdt_table + + .fill NR_CPUS-1,8,0 # space for the other GDT descriptors + +.org 0x1000 +ENTRY(empty_zero_page) + +.org 0x2000 +ENTRY(swapper_pg_dir) + +.org 0x3000 +ENTRY(cpu_gdt_table) + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x0000000000000000 /* 0x0b reserved */ + .quad 0x0000000000000000 /* 0x13 reserved */ + .quad 0x0000000000000000 /* 0x1b reserved */ + .quad 0x0000000000000000 /* 0x20 unused */ + .quad 0x0000000000000000 /* 0x28 unused */ + .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ + .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ + .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ + .quad 0x0000000000000000 /* 0x4b reserved */ + .quad 0x0000000000000000 /* 0x53 reserved */ + .quad 0x0000000000000000 /* 0x5b reserved */ + +#ifdef CONFIG_X86_PAE + .quad 0x00cfbb00000067ff /* 0x60 kernel 4GB code at 0x00000000 */ + .quad 0x00cfb300000067ff /* 0x68 kernel 4GB data at 0x00000000 */ + .quad 0x00cffb00000067ff /* 0x73 user 4GB code at 0x00000000 */ + .quad 0x00cff300000067ff /* 0x7b user 4GB data at 0x00000000 */ +#else + .quad 0x00cfbb000000c3ff /* 0x60 kernel 4GB code at 0x00000000 */ + .quad 0x00cfb3000000c3ff /* 0x68 kernel 4GB data at 0x00000000 */ + .quad 0x00cffb000000c3ff /* 0x73 user 4GB code at 0x00000000 */ + .quad 0x00cff3000000c3ff /* 0x7b user 4GB data at 0x00000000 */ +#endif + + .quad 0x0000000000000000 /* 0x80 TSS descriptor */ + .quad 0x0000000000000000 /* 0x88 LDT descriptor */ + + /* Segments used for calling PnP BIOS */ + .quad 0x0000000000000000 /* 0x90 32-bit code */ + .quad 0x0000000000000000 /* 0x98 16-bit code */ + .quad 0x0000000000000000 /* 0xa0 16-bit data */ + .quad 0x0000000000000000 /* 0xa8 16-bit data */ + .quad 0x0000000000000000 /* 0xb0 16-bit data */ + /* + * The APM segments have byte granularity and their bases + * and limits are set at run time. 
+ */ + .quad 0x0000000000000000 /* 0xb8 APM CS code */ + .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */ + .quad 0x0000000000000000 /* 0xc8 APM DS data */ + + .quad 0x0000000000000000 /* 0xd0 - unused */ + .quad 0x0000000000000000 /* 0xd8 - unused */ + .quad 0x0000000000000000 /* 0xe0 - unused */ + .quad 0x0000000000000000 /* 0xe8 - unused */ + .quad 0x0000000000000000 /* 0xf0 - unused */ + .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ + .fill GDT_ENTRIES-32,8,0 + +.org 0x4000 +ENTRY(default_ldt) + +.org 0x5000 +/* + * Real beginning of normal "text" segment + */ +ENTRY(stext) +ENTRY(_stext) Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/i386_ksyms.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/i386_ksyms.c @@ -0,0 +1,193 @@ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/user.h> +#include <linux/elfcore.h> +#include <linux/mca.h> +#include <linux/sched.h> +#include <linux/in6.h> +#include <linux/interrupt.h> +#include <linux/smp_lock.h> +#include <linux/pm.h> +#include <linux/pci.h> +#include <linux/apm_bios.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/tty.h> +#include <linux/highmem.h> +#include <linux/time.h> + +#include <asm/semaphore.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/uaccess.h> +#include <asm/checksum.h> +#include <asm/io.h> +#include <asm/delay.h> +#include <asm/irq.h> +#include <asm/mmx.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/nmi.h> +#include <asm/ist.h> +#include <asm/kdebug.h> + +extern void dump_thread(struct pt_regs *, struct user *); +extern spinlock_t rtc_lock; + +/* This is definitely a GPL-only symbol */ +EXPORT_SYMBOL_GPL(cpu_gdt_table); + +#if defined(CONFIG_APM_MODULE) +extern void machine_real_restart(unsigned char *, int); +EXPORT_SYMBOL(machine_real_restart); +extern void default_idle(void); +EXPORT_SYMBOL(default_idle); +#endif + +#ifdef CONFIG_SMP +extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); +extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); +#endif + +#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) +extern struct drive_info_struct drive_info; +EXPORT_SYMBOL(drive_info); +#endif + +extern unsigned long cpu_khz; +extern unsigned long get_cmos_time(void); + +/* platform dependent support */ +EXPORT_SYMBOL(boot_cpu_data); +#ifdef CONFIG_DISCONTIGMEM +EXPORT_SYMBOL(node_data); +EXPORT_SYMBOL(physnode_map); +#endif +#ifdef CONFIG_X86_NUMAQ +EXPORT_SYMBOL(xquad_portio); +#endif +EXPORT_SYMBOL(dump_thread); +EXPORT_SYMBOL(dump_fpu); +EXPORT_SYMBOL_GPL(kernel_fpu_begin); +EXPORT_SYMBOL(__ioremap); +EXPORT_SYMBOL(ioremap_nocache); +EXPORT_SYMBOL(iounmap); +EXPORT_SYMBOL(kernel_thread); +EXPORT_SYMBOL(pm_idle); +#ifdef CONFIG_ACPI_BOOT +EXPORT_SYMBOL(pm_power_off); +#endif +EXPORT_SYMBOL(get_cmos_time); +EXPORT_SYMBOL(cpu_khz); +EXPORT_SYMBOL(apm_info); + +EXPORT_SYMBOL(__down_failed); +EXPORT_SYMBOL(__down_failed_interruptible); +EXPORT_SYMBOL(__down_failed_trylock); +EXPORT_SYMBOL(__up_wakeup); +/* Networking helper routines. 
*/ +EXPORT_SYMBOL(csum_partial_copy_generic); +/* Delay loops */ +EXPORT_SYMBOL(__ndelay); +EXPORT_SYMBOL(__udelay); +EXPORT_SYMBOL(__delay); +EXPORT_SYMBOL(__const_udelay); + +EXPORT_SYMBOL(__get_user_1); +EXPORT_SYMBOL(__get_user_2); +EXPORT_SYMBOL(__get_user_4); + +EXPORT_SYMBOL(__put_user_1); +EXPORT_SYMBOL(__put_user_2); +EXPORT_SYMBOL(__put_user_4); +EXPORT_SYMBOL(__put_user_8); + +EXPORT_SYMBOL(strpbrk); +EXPORT_SYMBOL(strstr); + +EXPORT_SYMBOL(strncpy_from_user); +EXPORT_SYMBOL(__strncpy_from_user); +EXPORT_SYMBOL(clear_user); +EXPORT_SYMBOL(__clear_user); +EXPORT_SYMBOL(__copy_from_user_ll); +EXPORT_SYMBOL(__copy_to_user_ll); +EXPORT_SYMBOL(strnlen_user); + +EXPORT_SYMBOL(dma_alloc_coherent); +EXPORT_SYMBOL(dma_free_coherent); + +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif + +#ifdef CONFIG_PCI_BIOS +EXPORT_SYMBOL(pcibios_set_irq_routing); +EXPORT_SYMBOL(pcibios_get_irq_routing_table); +#endif + +#ifdef CONFIG_X86_USE_3DNOW +EXPORT_SYMBOL(_mmx_memcpy); +EXPORT_SYMBOL(mmx_clear_page); +EXPORT_SYMBOL(mmx_copy_page); +#endif + +#ifdef CONFIG_X86_HT +EXPORT_SYMBOL(smp_num_siblings); +EXPORT_SYMBOL(cpu_sibling_map); +#endif + +#ifdef CONFIG_SMP +EXPORT_SYMBOL(cpu_data); +EXPORT_SYMBOL(cpu_online_map); +EXPORT_SYMBOL(cpu_callout_map); +EXPORT_SYMBOL(__write_lock_failed); +EXPORT_SYMBOL(__read_lock_failed); + +/* Global SMP stuff */ +EXPORT_SYMBOL(smp_call_function); + +/* TLB flushing */ +EXPORT_SYMBOL(flush_tlb_page); +#endif + +#ifdef CONFIG_X86_IO_APIC +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); +#endif + +#ifdef CONFIG_MCA +EXPORT_SYMBOL(machine_id); +#endif + +#ifdef CONFIG_VT +EXPORT_SYMBOL(screen_info); +#endif + +EXPORT_SYMBOL(get_wchan); + +EXPORT_SYMBOL(rtc_lock); + +EXPORT_SYMBOL_GPL(set_nmi_callback); +EXPORT_SYMBOL_GPL(unset_nmi_callback); + +EXPORT_SYMBOL(register_die_notifier); +#ifdef CONFIG_HAVE_DEC_LOCK +EXPORT_SYMBOL(_atomic_dec_and_lock); +#endif + +EXPORT_SYMBOL(__PAGE_KERNEL); + +#ifdef CONFIG_HIGHMEM +EXPORT_SYMBOL(kmap); +EXPORT_SYMBOL(kunmap); +EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kmap_atomic_to_page); +#endif + +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) +EXPORT_SYMBOL(ist_info); +#endif + +EXPORT_SYMBOL(csum_partial); Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/io_apic.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/io_apic.c @@ -0,0 +1,2609 @@ +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku <yaku@xxxxxxxxxxxxxxxxxxxx> and + * Hidemi Kishimoto <kisimoto@xxxxxxxxxxxxxxxxxxxx>, + * further tested and cleaned up by Zach Brown <zab@xxxxxxxxxx> + * and Ingo Molnar <mingo@xxxxxxxxxx> + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. 
Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + */ + +#include <linux/mm.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/config.h> +#include <linux/smp_lock.h> +#include <linux/mc146818rtc.h> +#include <linux/compiler.h> +#include <linux/acpi.h> + +#include <linux/sysdev.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/desc.h> +#include <asm/timer.h> + +#include <mach_apic.h> + +#include "io_ports.h" + +#ifdef CONFIG_X86_XEN + +#include <xen-public/xen.h> +#include <xen-public/physdev.h> + +/* Fake i8259 */ +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) +#define disable_8259A_irq(_irq) ((void)0) +#define i8259A_irq_pending(_irq) (0) + +unsigned long io_apic_irqs; + +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg) +{ + physdev_op_t op; + int ret; + + op.cmd = PHYSDEVOP_APIC_READ; + op.u.apic_op.apic = mp_ioapics[apic].mpc_apicid; + op.u.apic_op.offset = reg; + ret = HYPERVISOR_physdev_op(&op); + if (ret) + return ret; + return op.u.apic_op.value; +} + +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + physdev_op_t op; + + op.cmd = PHYSDEVOP_APIC_WRITE; + op.u.apic_op.apic = mp_ioapics[apic].mpc_apicid; + op.u.apic_op.offset = reg; + op.u.apic_op.value = value; + HYPERVISOR_physdev_op(&op); +} + +#define io_apic_read(a,r) xen_io_apic_read(a,r) +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v) + +#endif /* CONFIG_XEN */ + +int (*ioapic_renumber_irq)(int ioapic, int irq); +atomic_t irq_mis_count; + +static DEFINE_SPINLOCK(ioapic_lock); + +/* + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes + */ +int sis_apic_bug = -1; + +/* + * # of IRQ routing registers + */ +int nr_ioapic_registers[MAX_IO_APICS]; + +/* + * Rough estimation of how many shared IRQs there are, can + * be changed anytime. + */ +#define MAX_PLUS_SHARED_IRQS NR_IRQS +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +static struct irq_pin_list { + int apic, pin, next; +} irq_2_pin[PIN_MAP_SIZE]; + +int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1}; +#ifdef CONFIG_PCI_MSI +#define vector_to_irq(vector) \ + (platform_legacy_irq(vector) ? vector : vector_irq[vector]) +#else +#define vector_to_irq(vector) (vector) +#endif + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. + */ +static void add_pin_to_irq(unsigned int irq, int apic, int pin) +{ + static int first_free_entry = NR_IRQS; + struct irq_pin_list *entry = irq_2_pin + irq; + + while (entry->next) + entry = irq_2_pin + entry->next; + + if (entry->pin != -1) { + entry->next = first_free_entry; + entry = irq_2_pin + entry->next; + if (++first_free_entry >= PIN_MAP_SIZE) + panic("io_apic.c: whoops"); + } + entry->apic = apic; + entry->pin = pin; +} + +#ifndef CONFIG_XEN +/* + * Reroute an IRQ to a different pin. 
+ */ +static void __init replace_pin_at_irq(unsigned int irq, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_pin_list *entry = irq_2_pin + irq; + + while (1) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + } + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) +{ + struct irq_pin_list *entry = irq_2_pin + irq; + unsigned int pin, reg; + + for (;;) { + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg &= ~disable; + reg |= enable; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +/* mask = 1 */ +static void __mask_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00010000, 0); +} + +/* mask = 0 */ +static void __unmask_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0, 0x00010000); +} + +/* mask = 1, trigger = 0 */ +static void __mask_and_edge_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); +} + +/* mask = 0, trigger = 1 */ +static void __unmask_and_level_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); +} + +static void mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + spin_lock_irqsave(&ioapic_lock, flags); + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (entry.delivery_mode == dest_SMI) + return; + + /* + * Disable it in the IO-APIC irq-routing table: + */ + memset(&entry, 0, sizeof(entry)); + entry.mask = 1; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC (void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + clear_IO_APIC_pin(apic, pin); +} + +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) +{ + unsigned long flags; + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + unsigned int apicid_value; + + apicid_value = cpu_mask_to_apicid(cpumask); + /* Prepare to do the io_apic_write */ + apicid_value = apicid_value << 24; + spin_lock_irqsave(&ioapic_lock, flags); + for (;;) { + pin = entry->pin; + if (pin == -1) + break; + io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + spin_unlock_irqrestore(&ioapic_lock, flags); +} +#else +#define clear_IO_APIC() ((void)0) +#endif + +#if defined(CONFIG_IRQBALANCE) +# include <asm/processor.h> /* kernel_thread() */ +# include <linux/kernel_stat.h> /* kstat */ +# include <linux/slab.h> /* kmalloc() 
*/ +# include <linux/timer.h> /* time_after() */ + +# ifdef CONFIG_BALANCED_IRQ_DEBUG +# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0) +# define Dprintk(x...) do { TDprintk(x); } while (0) +# else +# define TDprintk(x...) +# define Dprintk(x...) +# endif + +cpumask_t __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS]; + +#define IRQBALANCE_CHECK_ARCH -999 +static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH; +static int physical_balance = 0; + +static struct irq_cpu_info { + unsigned long * last_irq; + unsigned long * irq_delta; + unsigned long irq; +} irq_cpu_data[NR_CPUS]; + +#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) +#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq]) +#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) + +#define IDLE_ENOUGH(cpu,now) \ + (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) + +#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) + +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) + +#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) +#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) +#define BALANCED_IRQ_MORE_DELTA (HZ/10) +#define BALANCED_IRQ_LESS_DELTA (HZ) + +static long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL; + +static unsigned long move(int curr_cpu, cpumask_t allowed_mask, + unsigned long now, int direction) +{ + int search_idle = 1; + int cpu = curr_cpu; + + goto inside; + + do { + if (unlikely(cpu == curr_cpu)) + search_idle = 0; +inside: + if (direction == 1) { + cpu++; + if (cpu >= NR_CPUS) + cpu = 0; + } else { + cpu--; + if (cpu == -1) + cpu = NR_CPUS-1; + } + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || + (search_idle && !IDLE_ENOUGH(cpu,now))); + + return cpu; +} + +static inline void balance_irq(int cpu, int irq) +{ + unsigned long now = jiffies; + cpumask_t allowed_mask; + unsigned int new_cpu; + + if (irqbalance_disabled) + return; + + cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]); + new_cpu = move(cpu, allowed_mask, now, 1); + if (cpu != new_cpu) { + irq_desc_t *desc = irq_desc + irq; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + pending_irq_balance_cpumask[irq] = cpumask_of_cpu(new_cpu); + spin_unlock_irqrestore(&desc->lock, flags); + } +} + +static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) +{ + int i, j; + Dprintk("Rotating IRQs among CPUs.\n"); + for (i = 0; i < NR_CPUS; i++) { + for (j = 0; cpu_online(i) && (j < NR_IRQS); j++) { + if (!irq_desc[j].action) + continue; + /* Is it a significant load ? */ + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < + useful_load_threshold) + continue; + balance_irq(i, j); + } + } + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); + return; +} + +static void do_irq_balance(void) +{ + int i, j; + unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); + unsigned long move_this_load = 0; + int max_loaded = 0, min_loaded = 0; + int load; + unsigned long useful_load_threshold = balanced_irq_interval + 10; + int selected_irq; + int tmp_loaded, first_attempt = 1; + unsigned long tmp_cpu_irq; + unsigned long imbalance = 0; + cpumask_t allowed_mask, target_cpu_mask, tmp; + + for (i = 0; i < NR_CPUS; i++) { + int package_index; + CPU_IRQ(i) = 0; + if (!cpu_online(i)) + continue; + package_index = CPU_TO_PACKAGEINDEX(i); + for (j = 0; j < NR_IRQS; j++) { + unsigned long value_now, delta; + /* Is this an active IRQ? 
*/ + if (!irq_desc[j].action) + continue; + if ( package_index == i ) + IRQ_DELTA(package_index,j) = 0; + /* Determine the total count per processor per IRQ */ + value_now = (unsigned long) kstat_cpu(i).irqs[j]; + + /* Determine the activity per processor per IRQ */ + delta = value_now - LAST_CPU_IRQ(i,j); + + /* Update last_cpu_irq[][] for the next time */ + LAST_CPU_IRQ(i,j) = value_now; + + /* Ignore IRQs whose rate is less than the clock */ + if (delta < useful_load_threshold) + continue; + /* update the load for the processor or package total */ + IRQ_DELTA(package_index,j) += delta; + + /* Keep track of the higher numbered sibling as well */ + if (i != package_index) + CPU_IRQ(i) += delta; + /* + * We have sibling A and sibling B in the package + * + * cpu_irq[A] = load for cpu A + load for cpu B + * cpu_irq[B] = load for cpu B + */ + CPU_IRQ(package_index) += delta; + } + } + /* Find the least loaded processor package */ + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + if (i != CPU_TO_PACKAGEINDEX(i)) + continue; + if (min_cpu_irq > CPU_IRQ(i)) { + min_cpu_irq = CPU_IRQ(i); + min_loaded = i; + } + } + max_cpu_irq = ULONG_MAX; + +tryanothercpu: + /* Look for heaviest loaded processor. + * We may come back to get the next heaviest loaded processor. + * Skip processors with trivial loads. + */ + tmp_cpu_irq = 0; + tmp_loaded = -1; + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + if (i != CPU_TO_PACKAGEINDEX(i)) + continue; + if (max_cpu_irq <= CPU_IRQ(i)) + continue; + if (tmp_cpu_irq < CPU_IRQ(i)) { + tmp_cpu_irq = CPU_IRQ(i); + tmp_loaded = i; + } + } + + if (tmp_loaded == -1) { + /* In the case of small number of heavy interrupt sources, + * loading some of the cpus too much. We use Ingo's original + * approach to rotate them around. + */ + if (!first_attempt && imbalance >= useful_load_threshold) { + rotate_irqs_among_cpus(useful_load_threshold); + return; + } + goto not_worth_the_effort; + } + + first_attempt = 0; /* heaviest search */ + max_cpu_irq = tmp_cpu_irq; /* load */ + max_loaded = tmp_loaded; /* processor */ + imbalance = (max_cpu_irq - min_cpu_irq) / 2; + + Dprintk("max_loaded cpu = %d\n", max_loaded); + Dprintk("min_loaded cpu = %d\n", min_loaded); + Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq); + Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq); + Dprintk("load imbalance = %lu\n", imbalance); + + /* if imbalance is less than approx 10% of max load, then + * observe diminishing returns action. - quit + */ + if (imbalance < (max_cpu_irq >> 3)) { + Dprintk("Imbalance too trivial\n"); + goto not_worth_the_effort; + } + +tryanotherirq: + /* if we select an IRQ to move that can't go where we want, then + * see if there is another one to try. + */ + move_this_load = 0; + selected_irq = -1; + for (j = 0; j < NR_IRQS; j++) { + /* Is this an active IRQ? */ + if (!irq_desc[j].action) + continue; + if (imbalance <= IRQ_DELTA(max_loaded,j)) + continue; + /* Try to find the IRQ that is closest to the imbalance + * without going over. + */ + if (move_this_load < IRQ_DELTA(max_loaded,j)) { + move_this_load = IRQ_DELTA(max_loaded,j); + selected_irq = j; + } + } + if (selected_irq == -1) { + goto tryanothercpu; + } + + imbalance = move_this_load; + + /* For physical_balance case, we accumlated both load + * values in the one of the siblings cpu_irq[], + * to use the same code for physical and logical processors + * as much as possible. 
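+ *
+ * Worked example with made-up numbers: if sibling A handled 100
+ * interrupts and sibling B handled 40, then cpu_irq[A] = 140 and
+ * cpu_irq[B] = 40; the comparison described below weighs
+ * 140/2 = 70 against 40 and picks B as the less loaded sibling.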
+ * + * NOTE: the cpu_irq[] array holds the sum of the load for + * sibling A and sibling B in the slot for the lowest numbered + * sibling (A), _AND_ the load for sibling B in the slot for + * the higher numbered sibling. + * + * We seek the least loaded sibling by making the comparison + * (A+B)/2 vs B + */ + load = CPU_IRQ(min_loaded) >> 1; + for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { + if (load > CPU_IRQ(j)) { + /* This won't change cpu_sibling_map[min_loaded] */ + load = CPU_IRQ(j); + min_loaded = j; + } + } + + cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]); + target_cpu_mask = cpumask_of_cpu(min_loaded); + cpus_and(tmp, target_cpu_mask, allowed_mask); + + if (!cpus_empty(tmp)) { + irq_desc_t *desc = irq_desc + selected_irq; + unsigned long flags; + + Dprintk("irq = %d moved to cpu = %d\n", + selected_irq, min_loaded); + /* mark for change destination */ + spin_lock_irqsave(&desc->lock, flags); + pending_irq_balance_cpumask[selected_irq] = + cpumask_of_cpu(min_loaded); + spin_unlock_irqrestore(&desc->lock, flags); + /* Since we made a change, come back sooner to + * check for more variation. + */ + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); + return; + } + goto tryanotherirq; + +not_worth_the_effort: + /* + * if we did not find an IRQ to move, then adjust the time interval + * upward + */ + balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); + Dprintk("IRQ worth rotating not found\n"); + return; +} + +static int balanced_irq(void *unused) +{ + int i; + unsigned long prev_balance_time = jiffies; + long time_remaining = balanced_irq_interval; + + daemonize("kirqd"); + + /* push everything to CPU 0 to give us a starting point. */ + for (i = 0 ; i < NR_IRQS ; i++) { + pending_irq_balance_cpumask[i] = cpumask_of_cpu(0); + } + + for ( ; ; ) { + set_current_state(TASK_INTERRUPTIBLE); + time_remaining = schedule_timeout(time_remaining); + try_to_freeze(PF_FREEZE); + if (time_after(jiffies, + prev_balance_time+balanced_irq_interval)) { + do_irq_balance(); + prev_balance_time = jiffies; + time_remaining = balanced_irq_interval; + } + } + return 0; +} + +static int __init balanced_irq_init(void) +{ + int i; + struct cpuinfo_x86 *c; + cpumask_t tmp; + + cpus_shift_right(tmp, cpu_online_map, 2); + c = &boot_cpu_data; + /* When not overwritten by the command line ask subarchitecture. 
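The answer comes back as NO_BALANCE_IRQ from the subarchitecture headers.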
*/ + if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) + irqbalance_disabled = NO_BALANCE_IRQ; + if (irqbalance_disabled) + return 0; + + /* disable irqbalance completely if there is only one processor online */ + if (num_online_cpus() < 2) { + irqbalance_disabled = 1; + return 0; + } + /* + * Enable physical balance only if more than 1 physical processor + * is present + */ + if (smp_num_siblings > 1 && !cpus_empty(tmp)) + physical_balance = 1; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); + irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); + if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { + printk(KERN_ERR "balanced_irq_init: out of memory"); + goto failed; + } + memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS); + memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS); + } + + printk(KERN_INFO "Starting balanced_irq\n"); + if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) + return 0; + else + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); +failed: + for (i = 0; i < NR_CPUS; i++) { + if(irq_cpu_data[i].irq_delta) + kfree(irq_cpu_data[i].irq_delta); + if(irq_cpu_data[i].last_irq) + kfree(irq_cpu_data[i].last_irq); + } + return 0; +} + +int __init irqbalance_disable(char *str) +{ + irqbalance_disabled = 1; + return 0; +} + +__setup("noirqbalance", irqbalance_disable); + +static inline void move_irq(int irq) +{ + /* note - we hold the desc->lock */ + if (unlikely(!cpus_empty(pending_irq_balance_cpumask[irq]))) { + set_ioapic_affinity_irq(irq, pending_irq_balance_cpumask[irq]); + cpus_clear(pending_irq_balance_cpumask[irq]); + } +} + +late_initcall(balanced_irq_init); + +#else /* !CONFIG_IRQBALANCE */ +static inline void move_irq(int irq) { } +#endif /* CONFIG_IRQBALANCE */ + +#ifndef CONFIG_SMP +void fastcall send_IPI_self(int vector) +{ +#ifndef CONFIG_XEN + unsigned int cfg; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write_around(APIC_ICR, cfg); +#endif +} +#endif /* !CONFIG_SMP */ + + +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; +int skip_ioapic_setup; + +static int __init ioapic_setup(char *str) +{ + skip_ioapic_setup = 1; + return 1; +} + +__setup("noapic", ioapic_setup); + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + pirqs_enabled = 1; + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); + +/* + * Find the IRQ entry number of a certain pin. 
+ */ +static int find_irq_entry(int apic, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_irqtype == type && + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && + mp_irqs[i].mpc_dstirq == pin) + return i; + + return -1; +} + +#ifndef CONFIG_XEN +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || + mp_bus_id_to_type[lbus] == MP_BUS_EISA || + mp_bus_id_to_type[lbus] == MP_BUS_MCA || + mp_bus_id_to_type[lbus] == MP_BUS_NEC98 + ) && + (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_srcbusirq == irq)) + + return mp_irqs[i].mpc_dstirq; + } + return -1; +} +#endif + +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +static int pin_2_irq(int idx, int apic, int pin); + +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " + "slot:%d, pin:%d.\n", bus, slot, pin); + if (mp_bus_id_to_pci_bus[bus] == -1) { + printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) + break; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && + !mp_irqs[i].mpc_irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) + return irq; + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) + best_guess = irq; + } + } + return best_guess; +} + +#ifndef CONFIG_XEN +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be TARGET_CPUS + */ +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + set_ioapic_affinity_irq(irq, TARGET_CPUS); + } + + } +} +#endif /* !CONFIG_XEN */ + +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < 16) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) +#define default_EISA_polarity(idx) (0) + +/* ISA interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. 
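For reference, the full set of defaults used here is, written as (trigger, polarity): ISA (0,0), EISA (ELCR,0), PCI (1,1), MCA (1,0), NEC98 (0,0).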
*/ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) + +/* PCI interrupts are always polarity one level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (1) +#define default_PCI_polarity(idx) (1) + +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) (0) + +/* NEC98 interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_NEC98_trigger(idx) (0) +#define default_NEC98_polarity(idx) (0) + +static int __init MPBIOS_polarity(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int polarity; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].mpc_irqflag & 3) + { + case 0: /* conforms, ie. bus-type dependent polarity */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + polarity = default_ISA_polarity(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + polarity = default_EISA_polarity(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + polarity = default_PCI_polarity(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + polarity = default_MCA_polarity(idx); + break; + } + case MP_BUS_NEC98: /* NEC 98 pin */ + { + polarity = default_NEC98_polarity(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + break; + } + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + return polarity; +} + +static int MPBIOS_trigger(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) + { + case 0: /* conforms, ie. bus-type dependent */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + trigger = default_ISA_trigger(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + trigger = default_PCI_trigger(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + case MP_BUS_NEC98: /* NEC 98 pin */ + { + trigger = default_NEC98_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } + break; + } + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 0; + break; + } + } + return trigger; +} + +static inline int irq_polarity(int idx) +{ + return MPBIOS_polarity(idx); +} + +static inline int irq_trigger(int idx) +{ + return MPBIOS_trigger(idx); +} + +static int pin_2_irq(int idx, int apic, int pin) +{ + int irq, i; + int bus = mp_irqs[idx].mpc_srcbus; + + /* + * Debugging check, we are in big trouble if this message pops up! 
+ */ + if (mp_irqs[idx].mpc_dstirq != pin) + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + case MP_BUS_EISA: + case MP_BUS_MCA: + case MP_BUS_NEC98: + { + irq = mp_irqs[idx].mpc_srcbusirq; + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + + /* + * For MPS mode, so far only needed by ES7000 platform + */ + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); + + break; + } + default: + { + printk(KERN_ERR "unknown bus type %d.\n",bus); + irq = 0; + break; + } + } + + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. + */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } + return irq; +} + +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic,pin,mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} + +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ +u8 irq_vector[NR_IRQ_VECTORS]; /* = { FIRST_DEVICE_VECTOR , 0 }; */ + +int assign_irq_vector(int irq) +{ + static int current_vector = FIRST_DEVICE_VECTOR; + physdev_op_t op; + + BUG_ON(irq >= NR_IRQ_VECTORS); + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) + return IO_APIC_VECTOR(irq); + + op.cmd = PHYSDEVOP_ASSIGN_VECTOR; + op.u.irq_op.irq = irq; + if (HYPERVISOR_physdev_op(&op)) + return -ENOSPC; + current_vector = op.u.irq_op.vector; + + vector_irq[current_vector] = irq; + if (irq != AUTO_ASSIGN) + IO_APIC_VECTOR(irq) = current_vector; + + return current_vector; +} + +#ifndef CONFIG_XEN +static struct hw_interrupt_type ioapic_level_type; +static struct hw_interrupt_type ioapic_edge_type; + +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger) +{ + if (use_pci_vector() && !platform_legacy_irq(irq)) { + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + irq_desc[vector].handler = &ioapic_level_type; + else + irq_desc[vector].handler = &ioapic_edge_type; + set_intr_gate(vector, interrupt[vector]); + } else { + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + irq_desc[irq].handler = &ioapic_level_type; + else + irq_desc[irq].handler = &ioapic_edge_type; + set_intr_gate(vector, interrupt[irq]); + } +} +#else +#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0) +#endif + +static void __init setup_IO_APIC_irqs(void) +{ + struct IO_APIC_route_entry entry; + int apic, pin, idx, irq, first_notcon = 1, vector; + unsigned long flags; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + + /* + * add it to the IO-APIC irq-routing table: + */ + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.mask 
= 0; /* enable IRQ */ + entry.dest.logical.logical_dest = + cpu_mask_to_apicid(TARGET_CPUS); + + idx = find_irq_entry(apic,pin,mp_INT); + if (idx == -1) { + if (first_notcon) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + " IO-APIC (apicid-pin) %d-%d", + mp_ioapics[apic].mpc_apicid, + pin); + first_notcon = 0; + } else + apic_printk(APIC_VERBOSE, ", %d-%d", + mp_ioapics[apic].mpc_apicid, pin); + continue; + } + + entry.trigger = irq_trigger(idx); + entry.polarity = irq_polarity(idx); + + if (irq_trigger(idx)) { + entry.trigger = 1; + entry.mask = 1; + } + + irq = pin_2_irq(idx, apic, pin); + /* + * skip adding the timer int on secondary nodes, which causes + * a small but painful rift in the time-space continuum + */ + if (multi_timer_check(apic, irq)) + continue; + else + add_pin_to_irq(irq, apic, pin); + + if (/*!apic &&*/ !IO_APIC_IRQ(irq)) + continue; + + if (IO_APIC_IRQ(irq)) { + vector = assign_irq_vector(irq); + entry.vector = vector; + ioapic_register_intr(irq, vector, IOAPIC_AUTO); + + if (!apic && (irq < 16)) + disable_8259A_irq(irq); + } + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + } + + if (!first_notcon) + apic_printk(APIC_VERBOSE, " not connected.\n"); +} + +/* + * Set up the 8259A-master output pin: + */ +#ifndef CONFIG_XEN +static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry,0,sizeof(entry)); + + disable_8259A_irq(0); + + /* mask LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU. + */ + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* unmask IRQ now */ + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.delivery_mode = INT_DELIVERY_MODE; + entry.polarity = 0; + entry.trigger = 0; + entry.vector = vector; + + /* + * The timer IRQ doesn't have to know that behind the + * scene we have a 8259A-master in AEOI mode ... + */ + irq_desc[0].handler = &ioapic_edge_type; + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + enable_8259A_irq(0); +} + +static inline void UNEXPECTED_IO_APIC(void) +{ +} + +void __init print_IO_APIC(void) +{ + int apic, i; + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (i = 0; i < nr_ioapics; i++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. 
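+	 * All of the sanity checks below funnel through the (deliberately
+	 * empty) UNEXPECTED_IO_APIC() hook above, so a debug printk for an
+	 * unknown register layout only has to be added in one place.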
+ */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + reg_01.raw = io_apic_read(apic, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(apic, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); + if (reg_00.bits.ID >= get_physical_broadcast()) + UNEXPECTED_IO_APIC(); + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ + (reg_01.bits.entries != 0x2E) && + (reg_01.bits.entries != 0x3F) + ) + UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ + ) + UNEXPECTED_IO_APIC(); + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + } + + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + if (reg_03.bits.__reserved_1) + UNEXPECTED_IO_APIC(); + } + + printk(KERN_DEBUG ".... 
IRQ redirection table:\n"); + + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" + " Stat Dest Deli Vect: \n"); + + for (i = 0; i <= reg_01.bits.entries; i++) { + struct IO_APIC_route_entry entry; + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk(KERN_DEBUG " %02x %03X %02X ", + i, + entry.dest.logical.logical_dest, + entry.dest.physical.physical_dest + ); + + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector + ); + } + } + if (use_pci_vector()) + printk(KERN_INFO "Using vector-based indexing\n"); + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for (i = 0; i < NR_IRQS; i++) { + struct irq_pin_list *entry = irq_2_pin + i; + if (entry->pin < 0) + continue; + if (use_pci_vector() && !platform_legacy_irq(i)) + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); + else + printk(KERN_DEBUG "IRQ%d ", i); + for (;;) { + printk("-> %d:%d", entry->apic, entry->pin); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + printk("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); + + return; +} + +static void print_APIC_bitfield (int base) +{ + unsigned int v; + int i, j; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); + for (i = 0; i < 8; i++) { + v = apic_read(base + i*0x10); + for (j = 0; j < 32; j++) { + if (v & (1<<j)) + printk("1"); + else + printk("0"); + } + printk("\n"); + } +} + +void /*__init*/ print_local_APIC(void * dummy) +{ + unsigned int v, ver, maxlvt; + + if (apic_verbosity == APIC_QUIET) + return; + + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", + smp_processor_id(), hard_smp_processor_id()); + v = apic_read(APIC_ID); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); + v = apic_read(APIC_LVR); + printk(KERN_INFO "... APIC VERSION: %08x\n", v); + ver = GET_APIC_VERSION(v); + maxlvt = get_maxlvt(); + + v = apic_read(APIC_TASKPRI); + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + v = apic_read(APIC_PROCPRI); + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + } + + v = apic_read(APIC_EOI); + printk(KERN_DEBUG "... APIC EOI: %08x\n", v); + v = apic_read(APIC_RRR); + printk(KERN_DEBUG "... APIC RRR: %08x\n", v); + v = apic_read(APIC_LDR); + printk(KERN_DEBUG "... APIC LDR: %08x\n", v); + v = apic_read(APIC_DFR); + printk(KERN_DEBUG "... APIC DFR: %08x\n", v); + v = apic_read(APIC_SPIV); + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); + + printk(KERN_DEBUG "... APIC ISR field:\n"); + print_APIC_bitfield(APIC_ISR); + printk(KERN_DEBUG "... APIC TMR field:\n"); + print_APIC_bitfield(APIC_TMR); + printk(KERN_DEBUG "... APIC IRR field:\n"); + print_APIC_bitfield(APIC_IRR); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } + + v = apic_read(APIC_ICR); + printk(KERN_DEBUG "... APIC ICR: %08x\n", v); + v = apic_read(APIC_ICR2); + printk(KERN_DEBUG "... 
APIC ICR2: %08x\n", v); + + v = apic_read(APIC_LVTT); + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + printk("\n"); +} + +void print_all_local_APICs (void) +{ + on_each_cpu(print_local_APIC, NULL, 1, 1); +} + +void /*__init*/ print_PIC(void) +{ + extern spinlock_t i8259A_lock; + unsigned int v; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "\nprinting PIC contents\n"); + + spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); + + outb(0x0b,0xa0); + outb(0x0b,0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); + + spin_unlock_irqrestore(&i8259A_lock, flags); + + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); +} +#else +void __init print_IO_APIC(void) { } +#endif /* !CONFIG_XEN */ + +static void __init enable_IO_APIC(void) +{ + union IO_APIC_reg_01 reg_01; + int i; + unsigned long flags; + + for (i = 0; i < PIN_MAP_SIZE; i++) { + irq_2_pin[i].pin = -1; + irq_2_pin[i].next = 0; + } + if (!pirqs_enabled) + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (i = 0; i < nr_ioapics; i++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(i, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[i] = reg_01.bits.entries+1; + } + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +} + +/* + * Not an __init, needed by the reboot code + */ +void disable_IO_APIC(void) +{ + /* + * Clear the IO-APIC before rebooting: + */ + clear_IO_APIC(); + +#ifndef CONFIG_XEN + disconnect_bsp_APIC(); +#endif +} + +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch <Matt_Domsch@xxxxxxxx> Tue Dec 21 12:25:05 CST 1999 + */ + +#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ) +static void __init setup_ioapic_ids_from_mpc(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int apic; + int i; + unsigned char old_id; + unsigned long flags; + + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. 
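+	 * (Note the #if above: the whole routine is compiled out for Xen
+	 * and NUMAQ builds -- on Xen, presumably, because the hypervisor
+	 * owns the physical APICs and a guest must not rewrite their IDs.)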
+ */ + for (apic = 0; apic < nr_ioapics; apic++) { + + /* Read the register 0 value */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mp_ioapics[apic].mpc_apicid; + + if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + apic, mp_ioapics[apic].mpc_apicid); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; + } + + /* Don't check I/O APIC IDs for some xAPIC systems. They have + * no meaning without the serial APIC bus. */ + if (NO_IOAPIC_CHECK) + continue; + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(phys_id_present_map, + mp_ioapics[apic].mpc_apicid)) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + apic, mp_ioapics[apic].mpc_apicid); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + mp_ioapics[apic].mpc_apicid = i; + } else { + physid_mask_t tmp; + tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mp_ioapics[apic].mpc_apicid); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mp_ioapics[apic].mpc_apicid) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_dstapic == old_id) + mp_irqs[i].mpc_dstapic + = mp_ioapics[apic].mpc_apicid; + + /* + * Read the right value from the MPC table and + * write it into the ID register. + */ + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mp_ioapics[apic].mpc_apicid); + + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0, reg_00.raw); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} +#else +static void __init setup_ioapic_ids_from_mpc(void) { } +#endif + +#ifndef CONFIG_XEN +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + + local_irq_enable(); + /* Let ten ticks pass... */ + mdelay((10 * 1000) / HZ); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. 
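+	 * Concretely: the mdelay() above busy-waits for ten ticks' worth
+	 * of wall time, and we call the timer working if more than four
+	 * of those ticks (jiffies delta > 4) were actually delivered.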
+	 */
+	if (jiffies - t1 > 4)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * In the SMP+IOAPIC case it might happen that there are an unspecified
+ * number of pending IRQ events unhandled. These cases are very rare,
+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
+ * better to do it this way as we then do not have to be aware of
+ * 'pending' interrupts in the IRQ path, except at this point.
+ */
+/*
+ * Edge triggered needs to resend any interrupt
+ * that was delayed but this is now handled in the device
+ * independent code.
+ */
+
+/*
+ * Starting up an edge-triggered IO-APIC interrupt is
+ * nasty - we need to make sure that we get the edge.
+ * If it is already asserted for some reason, we need
+ * to return 1 to indicate that it was pending.
+ *
+ * This is not complete - we should be able to fake
+ * an edge even if it isn't on the 8259A...
+ */
+static unsigned int startup_edge_ioapic_irq(unsigned int irq)
+{
+	int was_pending = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	if (irq < 16) {
+		disable_8259A_irq(irq);
+		if (i8259A_irq_pending(irq))
+			was_pending = 1;
+	}
+	__unmask_IO_APIC_irq(irq);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return was_pending;
+}
+
+/*
+ * Once we have recorded IRQ_PENDING already, we can mask the
+ * interrupt for real. This prevents IRQ storms from unhandled
+ * devices.
+ */
+static void ack_edge_ioapic_irq(unsigned int irq)
+{
+	move_irq(irq);
+	if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
+					== (IRQ_PENDING | IRQ_DISABLED))
+		mask_IO_APIC_irq(irq);
+	ack_APIC_irq();
+}
+
+/*
+ * Level triggered interrupts can just be masked,
+ * and shutting down and starting up the interrupt
+ * is the same as enabling and disabling them -- except
+ * with a startup need to return a "was pending" value.
+ *
+ * Level triggered interrupts are special because we
+ * do not touch any IO-APIC register while handling
+ * them. We ack the APIC in the end-IRQ handler, not
+ * in the start-IRQ-handler. Protection against reentrance
+ * from the same interrupt is still provided, both by the
+ * generic IRQ layer and by the fact that an unacked local
+ * APIC does not accept IRQs.
+ */
+static unsigned int startup_level_ioapic_irq (unsigned int irq)
+{
+	unmask_IO_APIC_irq(irq);
+
+	return 0; /* don't check for pending */
+}
+
+static void end_level_ioapic_irq (unsigned int irq)
+{
+	unsigned long v;
+	int i;
+
+	move_irq(irq);
+/*
+ * It appears there is an erratum which affects at least version 0x11
+ * of I/O APIC (that's the 82093AA and cores integrated into various
+ * chipsets). Under certain conditions a level-triggered interrupt is
+ * erroneously delivered as an edge-triggered one but the respective IRR
+ * bit gets set nevertheless. As a result the I/O unit expects an EOI
+ * message but it will never arrive and further interrupts are blocked
+ * from the source. The exact reason is so far unknown, but the
+ * phenomenon was observed when two consecutive interrupt requests
+ * from a given source get delivered to the same CPU and the source is
+ * temporarily disabled in between.
+ *
+ * A workaround is to simulate an EOI message manually. We achieve it
+ * by setting the trigger mode to edge and then to level when the edge
+ * trigger mode gets detected in the TMR of a local APIC for a
+ * level-triggered interrupt. We mask the source for the time of the
+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
+ * The idea is from Manfred Spraul.
--macro + */ + i = IO_APIC_VECTOR(irq); + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + + ack_APIC_irq(); + + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(irq); + __unmask_and_level_IO_APIC_irq(irq); + spin_unlock(&ioapic_lock); + } +} + +#ifdef CONFIG_PCI_MSI +static unsigned int startup_edge_ioapic_vector(unsigned int vector) +{ + int irq = vector_to_irq(vector); + + return startup_edge_ioapic_irq(irq); +} + +static void ack_edge_ioapic_vector(unsigned int vector) +{ + int irq = vector_to_irq(vector); + + ack_edge_ioapic_irq(irq); +} + +static unsigned int startup_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + return startup_level_ioapic_irq (irq); +} + +static void end_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + end_level_ioapic_irq(irq); +} + +static void mask_IO_APIC_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + mask_IO_APIC_irq(irq); +} + +static void unmask_IO_APIC_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + unmask_IO_APIC_irq(irq); +} + +static void set_ioapic_affinity_vector (unsigned int vector, + cpumask_t cpu_mask) +{ + int irq = vector_to_irq(vector); + + set_ioapic_affinity_irq(irq, cpu_mask); +} +#endif + +/* + * Level and edge triggered IO-APIC interrupts need different handling, + * so we use two separate IRQ descriptors. Edge triggered IRQs can be + * handled with the level-triggered descriptor, but that one has slightly + * more overhead. Level-triggered interrupts cannot be handled with the + * edge-triggered handler, without risking IRQ storms and other ugly + * races. + */ +static struct hw_interrupt_type ioapic_edge_type = { + .typename = "IO-APIC-edge", + .startup = startup_edge_ioapic, + .shutdown = shutdown_edge_ioapic, + .enable = enable_edge_ioapic, + .disable = disable_edge_ioapic, + .ack = ack_edge_ioapic, + .end = end_edge_ioapic, + .set_affinity = set_ioapic_affinity, +}; + +static struct hw_interrupt_type ioapic_level_type = { + .typename = "IO-APIC-level", + .startup = startup_level_ioapic, + .shutdown = shutdown_level_ioapic, + .enable = enable_level_ioapic, + .disable = disable_level_ioapic, + .ack = mask_and_ack_level_ioapic, + .end = end_level_ioapic, + .set_affinity = set_ioapic_affinity, +}; +#endif /* !CONFIG_XEN */ + +static inline void init_IO_APIC_traps(void) +{ + int irq; + + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + for (irq = 0; irq < NR_IRQS ; irq++) { + int tmp = irq; + if (use_pci_vector()) { + if (!platform_legacy_irq(tmp)) + if ((tmp = vector_to_irq(tmp)) == -1) + continue; + } + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. + */ + if (irq < 16) + make_8259A_irq(irq); +#ifndef CONFIG_XEN + else + /* Strange. Oh, well.. 
+				 */
+				irq_desc[irq].handler = &no_irq_type;
+#endif
+		}
+	}
+}
+
+#ifndef CONFIG_XEN
+static void enable_lapic_irq (unsigned int irq)
+{
+	unsigned long v;
+
+	v = apic_read(APIC_LVT0);
+	apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+}
+
+static void disable_lapic_irq (unsigned int irq)
+{
+	unsigned long v;
+
+	v = apic_read(APIC_LVT0);
+	apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+}
+
+static void ack_lapic_irq (unsigned int irq)
+{
+	ack_APIC_irq();
+}
+
+static void end_lapic_irq (unsigned int i) { /* nothing */ }
+
+static struct hw_interrupt_type lapic_irq_type = {
+	.typename	= "local-APIC-edge",
+	.startup	= NULL, /* startup_irq() not used for IRQ0 */
+	.shutdown	= NULL, /* shutdown_irq() not used for IRQ0 */
+	.enable		= enable_lapic_irq,
+	.disable	= disable_lapic_irq,
+	.ack		= ack_lapic_irq,
+	.end		= end_lapic_irq
+};
+
+static void setup_nmi (void)
+{
+	/*
+	 * Dirty trick to enable the NMI watchdog ...
+	 * We put the 8259A master into AEOI mode and
+	 * unmask LVT0 as NMI on all local APICs.
+	 *
+	 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
+	 * is from Maciej W. Rozycki - so we do not have to EOI from
+	 * the NMI handler or the timer interrupt.
+	 */
+	apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
+
+	on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
+
+	apic_printk(APIC_VERBOSE, " done.\n");
+}
+
+/*
+ * This looks a bit hackish but it's about the only way of sending
+ * a few INTA cycles to 8259As and any associated glue logic. ICR does
+ * not support the ExtINT mode, unfortunately. We need to send these
+ * cycles as some i82489DX-based boards have glue logic that keeps the
+ * 8259A interrupt line asserted until INTA. --macro
+ */
+static inline void unlock_ExtINT_logic(void)
+{
+	int pin, i;
+	struct IO_APIC_route_entry entry0, entry1;
+	unsigned char save_control, save_freq_select;
+	unsigned long flags;
+
+	pin = find_isa_irq_pin(8, mp_INT);
+	if (pin == -1)
+		return;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	*(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin);
+	*(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+	clear_IO_APIC_pin(0, pin);
+
+	memset(&entry1, 0, sizeof(entry1));
+
+	entry1.dest_mode = 0;			/* physical delivery */
+	entry1.mask = 0;			/* unmask IRQ now */
+	entry1.dest.physical.physical_dest = hard_smp_processor_id();
+	entry1.delivery_mode = dest_ExtINT;
+	entry1.polarity = entry0.polarity;
+	entry1.trigger = 0;
+	entry1.vector = 0;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
+	io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	save_control = CMOS_READ(RTC_CONTROL);
+	save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+	CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
+		   RTC_FREQ_SELECT);
+	CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
+
+	i = 100;
+	while (i-- > 0) {
+		mdelay(10);
+		if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
+			i -= 10;
+	}
+
+	CMOS_WRITE(save_control, RTC_CONTROL);
+	CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
+	clear_IO_APIC_pin(0, pin);
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
+	io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * This code may look a bit paranoid, but it's supposed to cooperate with
+ * a wide range of boards and BIOS bugs.
Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + */ +static inline void check_timer(void) +{ + int pin1, pin2; + int vector; + + /* + * get/set the timer IRQ vector: + */ + disable_8259A_irq(0); + vector = assign_irq_vector(0); + set_intr_gate(vector, interrupt[0]); + + /* + * Subtle, code in do_timer_interrupt() expects an AEOI + * mode for the 8259A whenever interrupts are routed + * through I/O APICs. Also IRQ0 has to be enabled in + * the 8259A which implies the virtual wire has to be + * disabled in the local APIC. + */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + init_8259A(1); + timer_ack = 1; + enable_8259A_irq(0); + + pin1 = find_isa_irq_pin(0, mp_INT); + pin2 = find_isa_irq_pin(0, mp_ExtINT); + + printk(KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2); + + if (pin1 != -1) { + /* + * Ok, does IRQ0 through the IOAPIC work? + */ + unmask_IO_APIC_irq(0); + if (timer_irq_works()) { + if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); + setup_nmi(); + enable_8259A_irq(0); + } + return; + } + clear_IO_APIC_pin(0, pin1); + printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); + } + + printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); + if (pin2 != -1) { + printk("\n..... (found pin %d) ...", pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + setup_ExtINT_IRQ0_pin(pin2, vector); + if (timer_irq_works()) { + printk("works.\n"); + if (pin1 != -1) + replace_pin_at_irq(0, 0, pin1, 0, pin2); + else + add_pin_to_irq(0, 0, pin2); + if (nmi_watchdog == NMI_IO_APIC) { + setup_nmi(); + } + return; + } + /* + * Cleanup, just in case ... + */ + clear_IO_APIC_pin(0, pin2); + } + printk(" failed.\n"); + + if (nmi_watchdog == NMI_IO_APIC) { + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + nmi_watchdog = 0; + } + + printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + + disable_8259A_irq(0); + irq_desc[0].handler = &lapic_irq_type; + apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ + enable_8259A_irq(0); + + if (timer_irq_works()) { + printk(" works.\n"); + return; + } + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); + printk(" failed.\n"); + + printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + + timer_ack = 0; + init_8259A(0); + make_8259A_irq(0); + apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + printk(" works.\n"); + return; + } + printk(" failed :(.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option"); +} +#else +#define check_timer() ((void)0) +#endif + +/* + * + * IRQ's that are handled by the PIC in the MPS IOAPIC case. + * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. + * Linux doesn't really care, as it's not actually used + * for any interrupt handling anyway. + */ +#define PIC_IRQS (1 << PIC_CASCADE_IR) + +void __init setup_IO_APIC(void) +{ + enable_IO_APIC(); + + if (acpi_ioapic) + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ + else + io_apic_irqs = ~PIC_IRQS; + + printk("ENABLING IO-APIC IRQs\n"); + + /* + * Set up IO-APIC IRQ routing. 
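+	 * If ACPI enumerated the IO-APICs the IDs are trusted as-is;
+	 * otherwise they are first rewritten from the MP table. Note that
+	 * the Xen build also skips sync_Arb_IDs(), which would touch the
+	 * physical local APIC.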
+	 */
+	if (!acpi_ioapic)
+		setup_ioapic_ids_from_mpc();
+#ifndef CONFIG_XEN
+	sync_Arb_IDs();
+#endif
+	setup_IO_APIC_irqs();
+	init_IO_APIC_traps();
+	check_timer();
+	if (!acpi_ioapic)
+		print_IO_APIC();
+}
+
+/*
+ * Called after all the initialization is done. If we didn't find any
+ * APIC bugs then we can allow the modify fast path
+ */
+
+static int __init io_apic_bug_finalize(void)
+{
+	if (sis_apic_bug == -1)
+		sis_apic_bug = 0;
+	return 0;
+}
+
+late_initcall(io_apic_bug_finalize);
+
+struct sysfs_ioapic_data {
+	struct sys_device dev;
+	struct IO_APIC_route_entry entry[0];
+};
+static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+
+static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
+{
+	struct IO_APIC_route_entry *entry;
+	struct sysfs_ioapic_data *data;
+	unsigned long flags;
+	int i;
+
+	data = container_of(dev, struct sysfs_ioapic_data, dev);
+	entry = data->entry;
+	spin_lock_irqsave(&ioapic_lock, flags);
+	for (i = 0; i < nr_ioapic_registers[dev->id]; i++, entry++) {
+		*(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
+		*(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
+	}
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return 0;
+}
+
+static int ioapic_resume(struct sys_device *dev)
+{
+	struct IO_APIC_route_entry *entry;
+	struct sysfs_ioapic_data *data;
+	unsigned long flags;
+	union IO_APIC_reg_00 reg_00;
+	int i;
+
+	data = container_of(dev, struct sysfs_ioapic_data, dev);
+	entry = data->entry;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	reg_00.raw = io_apic_read(dev->id, 0);
+	if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
+		reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+		io_apic_write(dev->id, 0, reg_00.raw);
+	}
+	for (i = 0; i < nr_ioapic_registers[dev->id]; i++, entry++) {
+		io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
+		io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
+	}
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return 0;
+}
+
+static struct sysdev_class ioapic_sysdev_class = {
+	set_kset_name("ioapic"),
+	.suspend = ioapic_suspend,
+	.resume = ioapic_resume,
+};
+
+static int __init ioapic_init_sysfs(void)
+{
+	struct sys_device * dev;
+	int i, size, error = 0;
+
+	error = sysdev_class_register(&ioapic_sysdev_class);
+	if (error)
+		return error;
+
+	for (i = 0; i < nr_ioapics; i++ ) {
+		size = sizeof(struct sys_device) + nr_ioapic_registers[i]
+			* sizeof(struct IO_APIC_route_entry);
+		mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
+		if (!mp_ioapic_data[i]) {
+			printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+			continue;
+		}
+		memset(mp_ioapic_data[i], 0, size);
+		dev = &mp_ioapic_data[i]->dev;
+		dev->id = i;
+		dev->cls = &ioapic_sysdev_class;
+		error = sysdev_register(dev);
+		if (error) {
+			kfree(mp_ioapic_data[i]);
+			mp_ioapic_data[i] = NULL;
+			printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+			continue;
+		}
+	}
+
+	return 0;
+}
+
+device_initcall(ioapic_init_sysfs);
+
+/* --------------------------------------------------------------------------
+	ACPI-based IOAPIC Configuration
+   -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI_BOOT
+
+int __init io_apic_get_unique_id (int ioapic, int apic_id)
+{
+#ifndef CONFIG_XEN
+	union IO_APIC_reg_00 reg_00;
+	static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
+	physid_mask_t tmp;
+	unsigned long flags;
+	int i = 0;
+
+	/*
+	 * The P4 platform supports up to 256 APIC IDs on two separate APIC
+	 * buses (one for LAPICs, one for IOAPICs), where predecessors only
+	 * support up to 16 on one shared APIC bus.
+	 *
+	 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
+	 * advantage of new APIC bus architecture.
+	 */
+
+	if (physids_empty(apic_id_map))
+		apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	reg_00.raw = io_apic_read(ioapic, 0);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	if (apic_id >= get_physical_broadcast()) {
+		printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
+			"%d\n", ioapic, apic_id, reg_00.bits.ID);
+		apic_id = reg_00.bits.ID;
+	}
+
+	/*
+	 * Every APIC in a system must have a unique ID or we get lots of nice
+	 * 'stuck on smp_invalidate_needed IPI wait' messages.
+	 */
+	if (check_apicid_used(apic_id_map, apic_id)) {
+
+		for (i = 0; i < get_physical_broadcast(); i++) {
+			if (!check_apicid_used(apic_id_map, i))
+				break;
+		}
+
+		if (i == get_physical_broadcast())
+			panic("Max apic_id exceeded!\n");
+
+		printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
+			"trying %d\n", ioapic, apic_id, i);
+
+		apic_id = i;
+	}
+
+	tmp = apicid_to_cpu_present(apic_id);
+	physids_or(apic_id_map, apic_id_map, tmp);
+
+	if (reg_00.bits.ID != apic_id) {
+		reg_00.bits.ID = apic_id;
+
+		spin_lock_irqsave(&ioapic_lock, flags);
+		io_apic_write(ioapic, 0, reg_00.raw);
+		reg_00.raw = io_apic_read(ioapic, 0);
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+
+		/* Sanity check */
+		if (reg_00.bits.ID != apic_id)
+			panic("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+	}
+
+	apic_printk(APIC_VERBOSE, KERN_INFO
+			"IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+#endif /* !CONFIG_XEN */
+
+	return apic_id;
+}
+
+
+int __init io_apic_get_version (int ioapic)
+{
+	union IO_APIC_reg_01 reg_01;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	reg_01.raw = io_apic_read(ioapic, 1);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return reg_01.bits.version;
+}
+
+
+int __init io_apic_get_redir_entries (int ioapic)
+{
+	union IO_APIC_reg_01 reg_01;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	reg_01.raw = io_apic_read(ioapic, 1);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return reg_01.bits.entries;
+}
+
+
+int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
+{
+	struct IO_APIC_route_entry entry;
+	unsigned long flags;
+
+	if (!IO_APIC_IRQ(irq)) {
+		printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+			ioapic);
+		return -EINVAL;
+	}
+
+	/*
+	 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
+	 * Note that we mask (disable) IRQs now -- these get enabled when the
+	 * corresponding device driver registers for this IRQ.
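+	 * For an ordinary PCI interrupt source the caller passes
+	 * edge_level=1 and active_high_low=1, matching the level-triggered,
+	 * active-low PCI defaults earlier in this file.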
+	 */
+
+	memset(&entry,0,sizeof(entry));
+
+	entry.delivery_mode = INT_DELIVERY_MODE;
+	entry.dest_mode = INT_DEST_MODE;
+	entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+	entry.trigger = edge_level;
+	entry.polarity = active_high_low;
+	entry.mask = 1;
+
+	/*
+	 * IRQs < 16 are already in the irq_2_pin[] map
+	 */
+	if (irq >= 16)
+		add_pin_to_irq(irq, ioapic, pin);
+
+	entry.vector = assign_irq_vector(irq);
+
+	apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
+		"(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
+		mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+		edge_level, active_high_low);
+
+	ioapic_register_intr(irq, entry.vector, edge_level);
+
+	if (!ioapic && (irq < 16))
+		disable_8259A_irq(irq);
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
+	io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return 0;
+}
+
+#endif /*CONFIG_ACPI_BOOT*/
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/ioport.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/ioport.c
@@ -0,0 +1,129 @@
+/*
+ * linux/arch/i386/kernel/ioport.c
+ *
+ * This contains the io-permission bitmap code - written by obz, with changes
+ * by Linus.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/ioport.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <asm-xen/xen-public/physdev.h>
+
+/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
+static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
+{
+	unsigned long mask;
+	unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
+	unsigned int low_index = base & (BITS_PER_LONG-1);
+	int length = low_index + extent;
+
+	if (low_index != 0) {
+		mask = (~0UL << low_index);
+		if (length < BITS_PER_LONG)
+			mask &= ~(~0UL << length);
+		if (new_value)
+			*bitmap_base++ |= mask;
+		else
+			*bitmap_base++ &= ~mask;
+		length -= BITS_PER_LONG;
+	}
+
+	mask = (new_value ? ~0UL : 0UL);
+	while (length >= BITS_PER_LONG) {
+		*bitmap_base++ = mask;
+		length -= BITS_PER_LONG;
+	}
+
+	if (length > 0) {
+		mask = ~(~0UL << length);
+		if (new_value)
+			*bitmap_base++ |= mask;
+		else
+			*bitmap_base++ &= ~mask;
+	}
+}
+
+
+/*
+ * this changes the io permissions bitmap in the current task.
+ */
+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+	struct thread_struct * t = &current->thread;
+	unsigned long *bitmap;
+	physdev_op_t op;
+
+	if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+		return -EINVAL;
+	if (turn_on && !capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	/*
+	 * If it's the first ioperm() call in this thread's lifetime, set the
+	 * IO bitmap up.
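+	 * (Under Xen the bitmap is handed to the hypervisor via
+	 * PHYSDEVOP_SET_IOBITMAP below, rather than being copied into the
+	 * TSS as on native i386.)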
ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + + op.cmd = PHYSDEVOP_SET_IOBITMAP; + op.u.set_iobitmap.bitmap = (unsigned long)bitmap; + op.u.set_iobitmap.nr_ports = IO_BITMAP_BITS; + HYPERVISOR_physdev_op(&op); + } + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + * + * Here we just change the eflags value on the stack: we allow + * only the super-user to do it. This depends on the stack-layout + * on system-call entry - see also fork() and the signal handling + * code. + */ + +asmlinkage long sys_iopl(unsigned int new_io_pl) +{ + unsigned int old_io_pl = current->thread.io_pl; + physdev_op_t op; + + if (new_io_pl > 3) + return -EINVAL; + + /* Need "raw I/O" privileges for direct port access. */ + if ((new_io_pl > old_io_pl) && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* Maintain OS privileges even if user attempts to relinquish them. */ + if (new_io_pl == 0) + new_io_pl = 1; + + /* Change our version of the privilege levels. */ + current->thread.io_pl = new_io_pl; + + /* Force the change at ring 0. */ + op.cmd = PHYSDEVOP_SET_IOPL; + op.u.set_iopl.iopl = new_io_pl; + HYPERVISOR_physdev_op(&op); + + return 0; +} Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/irq.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/irq.c @@ -0,0 +1,299 @@ +/* + * linux/arch/i386/kernel/irq.c + * + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86-specific interrupt + * entry, irq-stacks and irq statistics code. All the remaining + * irq logic is done by the generic kernel/irq/ code and + * by the x86-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) + */ + +#include <asm/uaccess.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/delay.h> + +DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp; +EXPORT_PER_CPU_SYMBOL(irq_stat); + +#ifndef CONFIG_X86_LOCAL_APIC +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk("unexpected IRQ trap at vector %02x\n", irq); +} +#endif + +#ifdef CONFIG_4KSTACKS +/* + * per-CPU IRQ handling contexts (thread information and stack) + */ +union irq_ctx { + struct thread_info tinfo; + u32 stack[THREAD_SIZE/sizeof(u32)]; +}; + +static union irq_ctx *hardirq_ctx[NR_CPUS]; +static union irq_ctx *softirq_ctx[NR_CPUS]; +#endif + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). 
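+ * The IRQ number arrives in the low bits of orig_eax; the high bits
+ * belong to the return path, hence the masking on entry. With
+ * CONFIG_4KSTACKS the handler also hops onto a per-CPU interrupt
+ * stack; a user-space sketch of that trick follows the 4KSTACKS
+ * block below.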
+ */ +fastcall unsigned int do_IRQ(struct pt_regs *regs) +{ + /* high bits used in ret_from_ code */ + int irq = regs->orig_eax & __IRQ_MASK(HARDIRQ_BITS); +#ifdef CONFIG_4KSTACKS + union irq_ctx *curctx, *irqctx; + u32 *isp; +#endif + + irq_enter(); +#ifdef CONFIG_DEBUG_STACKOVERFLOW + /* Debugging check for stack overflow: is there less than 1KB free? */ + { + long esp; + + __asm__ __volatile__("andl %%esp,%0" : + "=r" (esp) : "0" (THREAD_SIZE - 1)); + if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { + printk("do_IRQ: stack overflow: %ld\n", + esp - sizeof(struct thread_info)); + dump_stack(); + } + } +#endif + +#ifdef CONFIG_4KSTACKS + + curctx = (union irq_ctx *) current_thread_info(); + irqctx = hardirq_ctx[smp_processor_id()]; + + /* + * this is where we switch to the IRQ stack. However, if we are + * already using the IRQ stack (because we interrupted a hardirq + * handler) we can't do that and just have to keep using the + * current stack (which is the irq stack already after all) + */ + if (curctx != irqctx) { + int arg1, arg2, ebx; + + /* build the stack frame on the IRQ stack */ + isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); + irqctx->tinfo.task = curctx->tinfo.task; + irqctx->tinfo.previous_esp = current_stack_pointer; + + asm volatile( + " xchgl %%ebx,%%esp \n" + " call __do_IRQ \n" + " movl %%ebx,%%esp \n" + : "=a" (arg1), "=d" (arg2), "=b" (ebx) + : "0" (irq), "1" (regs), "2" (isp) + : "memory", "cc", "ecx" + ); + } else +#endif + __do_IRQ(irq, regs); + + irq_exit(); + + return 1; +} + +#ifdef CONFIG_4KSTACKS + +/* + * These should really be __section__(".bss.page_aligned") as well, but + * gcc's 3.0 and earlier don't handle that correctly. + */ +static char softirq_stack[NR_CPUS * THREAD_SIZE] + __attribute__((__aligned__(THREAD_SIZE))); + +static char hardirq_stack[NR_CPUS * THREAD_SIZE] + __attribute__((__aligned__(THREAD_SIZE))); + +/* + * allocate per-cpu stacks for hardirq and for softirq processing + */ +void irq_ctx_init(int cpu) +{ + union irq_ctx *irqctx; + + if (hardirq_ctx[cpu]) + return; + + irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + + hardirq_ctx[cpu] = irqctx; + + irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = SOFTIRQ_OFFSET; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + + softirq_ctx[cpu] = irqctx; + + printk("CPU %u irqstacks, hard=%p soft=%p\n", + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); +} + +extern asmlinkage void __do_softirq(void); + +asmlinkage void do_softirq(void) +{ + unsigned long flags; + struct thread_info *curctx; + union irq_ctx *irqctx; + u32 *isp; + + if (in_interrupt()) + return; + + local_irq_save(flags); + + if (local_softirq_pending()) { + curctx = current_thread_info(); + irqctx = softirq_ctx[smp_processor_id()]; + irqctx->tinfo.task = curctx->task; + irqctx->tinfo.previous_esp = current_stack_pointer; + + /* build the stack frame on the softirq stack */ + isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); + + asm volatile( + " xchgl %%ebx,%%esp \n" + " call __do_softirq \n" + " movl %%ebx,%%esp \n" + : "=b"(isp) + : "0"(isp) + : "memory", "cc", "edx", "ecx", "eax" + ); + } + + local_irq_restore(flags); +} + +EXPORT_SYMBOL(do_softirq); +#endif + +/* + * Interrupt statistics: + */ + 
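(Aside, not part of the patch: the xchgl-based stack switch in do_IRQ() and
do_softirq() above is easier to follow next to a user-space analogue.
Everything below -- the names, the stack size, the choice of the ucontext
API -- is ours, a minimal sketch of "run this function on a private stack",
not how the kernel actually does it:)

#include <stdio.h>
#include <ucontext.h>

static ucontext_t main_ctx, irq_ctx;
static char irq_stack[64 * 1024];		/* stands in for hardirq_stack */

static void fake_handler(void)
{
	int probe;
	/* prints an address inside irq_stack[], not the main stack */
	printf("handler running near %p\n", (void *)&probe);
}						/* returning resumes main_ctx via uc_link */

int main(void)
{
	int probe;

	getcontext(&irq_ctx);
	irq_ctx.uc_stack.ss_sp = irq_stack;	/* dedicated stack ... */
	irq_ctx.uc_stack.ss_size = sizeof(irq_stack);
	irq_ctx.uc_link = &main_ctx;		/* ... and the way back */
	makecontext(&irq_ctx, fake_handler, 0);

	printf("main running near %p\n", (void *)&probe);
	swapcontext(&main_ctx, &irq_ctx);	/* the xchgl %ebx,%esp dance */
	return 0;
}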
+atomic_t irq_err_count; + +/* + * /proc/interrupts printing: + */ + +int show_interrupts(struct seq_file *p, void *v) +{ + int i = *(loff_t *) v, j; + struct irqaction * action; + unsigned long flags; + + if (i == 0) { + seq_printf(p, " "); + for_each_cpu(j) + seq_printf(p, "CPU%d ",j); + seq_putc(p, '\n'); + } + + if (i < NR_IRQS) { + spin_lock_irqsave(&irq_desc[i].lock, flags); + action = irq_desc[i].action; + if (!action) + goto skip; + seq_printf(p, "%3d: ",i); +#ifndef CONFIG_SMP + seq_printf(p, "%10u ", kstat_irqs(i)); +#else + for_each_cpu(j) + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); +#endif + seq_printf(p, " %14s", irq_desc[i].handler->typename); + seq_printf(p, " %s", action->name); + + for (action=action->next; action; action = action->next) + seq_printf(p, ", %s", action->name); + + seq_putc(p, '\n'); +skip: + spin_unlock_irqrestore(&irq_desc[i].lock, flags); + } else if (i == NR_IRQS) { + seq_printf(p, "NMI: "); + for_each_cpu(j) + seq_printf(p, "%10u ", nmi_count(j)); + seq_putc(p, '\n'); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); + for_each_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).apic_timer_irqs); + seq_putc(p, '\n'); +#endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); +#endif + } + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; + if (irq == 2) + continue; + + cpus_and(mask, irq_affinity[irq], map); + if (any_online_cpu(mask) == NR_CPUS) { + printk("Breaking affinity for irq %i\n", irq); + mask = map; + } + if (irq_desc[irq].handler->set_affinity) + irq_desc[irq].handler->set_affinity(irq, mask); + else if (irq_desc[irq].action) + printk("Cannot set affinity for irq %i\n", irq); + } + +#if 0 + barrier(); + /* Ingo Molnar says: "after the IO-APIC masks have been redirected + [note the nop - the interrupt-enable boundary on x86 is two + instructions from sti] - to flush out pending hardirqs and + IPIs. After this point nothing is supposed to reach this CPU." */ + __asm__ __volatile__("sti; nop; cli"); + barrier(); +#else + /* That doesn't seem sufficient. Give it 1ms. 
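+	 * I.e. run briefly with interrupts enabled so that anything still
+	 * routed at this CPU can fire and be redirected before the CPU
+	 * actually goes offline.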
+	 */
+	local_irq_enable();
+	mdelay(1);
+	local_irq_disable();
+#endif
+}
+#endif
+
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/ldt.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/ldt.c
@@ -0,0 +1,275 @@
+/*
+ * linux/kernel/ldt.c
+ *
+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
+ * Copyright (C) 1999 Ingo Molnar <mingo@xxxxxxxxxx>
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#include <asm/mmu_context.h>
+
+#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
+static void flush_ldt(void *null)
+{
+	if (current->active_mm)
+		load_LDT(&current->active_mm->context);
+}
+#endif
+
+static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+{
+	void *oldldt;
+	void *newldt;
+	int oldsize;
+
+	if (mincount <= pc->size)
+		return 0;
+	oldsize = pc->size;
+	mincount = (mincount+511)&(~511);
+	if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
+		newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
+	else
+		newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
+
+	if (!newldt)
+		return -ENOMEM;
+
+	if (oldsize)
+		memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
+	oldldt = pc->ldt;
+	memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
+	pc->ldt = newldt;
+	wmb();
+	pc->size = mincount;
+	wmb();
+
+	if (reload) {
+#ifdef CONFIG_SMP
+		cpumask_t mask;
+		preempt_disable();
+#endif
+		make_pages_readonly(pc->ldt, (pc->size * LDT_ENTRY_SIZE) /
+				    PAGE_SIZE);
+		load_LDT(pc);
+#ifdef CONFIG_SMP
+		mask = cpumask_of_cpu(smp_processor_id());
+		if (!cpus_equal(current->mm->cpu_vm_mask, mask))
+			smp_call_function(flush_ldt, NULL, 1, 1);
+		preempt_enable();
+#endif
+	}
+	if (oldsize) {
+		make_pages_writable(oldldt, (oldsize * LDT_ENTRY_SIZE) /
+				    PAGE_SIZE);
+		if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
+			vfree(oldldt);
+		else
+			kfree(oldldt);
+	}
+	return 0;
+}
+
+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+{
+	int err = alloc_ldt(new, old->size, 0);
+	if (err < 0)
+		return err;
+	memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
+	make_pages_readonly(new->ldt, (new->size * LDT_ENTRY_SIZE) /
+			    PAGE_SIZE);
+	return 0;
+}
+
+/*
+ * we do not have to muck with descriptors here, that is
+ * done in switch_mm() as needed.
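+ * Under Xen the LDT pages themselves are mapped read-only (see
+ * make_pages_readonly() above); individual entries are only ever
+ * changed via HYPERVISOR_update_descriptor() in write_ldt() below.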
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+ struct mm_struct * old_mm;
+ int retval = 0;
+
+ memset(&mm->context, 0, sizeof(mm->context));
+ init_MUTEX(&mm->context.sem);
+ old_mm = current->mm;
+ if (old_mm && old_mm->context.size > 0) {
+ down(&old_mm->context.sem);
+ retval = copy_ldt(&mm->context, &old_mm->context);
+ up(&old_mm->context.sem);
+ }
+ if (retval == 0) {
+ spin_lock(&mm_unpinned_lock);
+ list_add(&mm->context.unpinned, &mm_unpinned);
+ spin_unlock(&mm_unpinned_lock);
+ }
+ return retval;
+}
+
+/*
+ * No need to lock the MM as we are the last user
+ */
+void destroy_context(struct mm_struct *mm)
+{
+ if (mm->context.size) {
+ if (mm == current->active_mm)
+ clear_LDT();
+ make_pages_writable(mm->context.ldt,
+ (mm->context.size * LDT_ENTRY_SIZE) /
+ PAGE_SIZE);
+ if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(mm->context.ldt);
+ else
+ kfree(mm->context.ldt);
+ mm->context.size = 0;
+ }
+ if (!mm->context.pinned) {
+ spin_lock(&mm_unpinned_lock);
+ list_del(&mm->context.unpinned);
+ spin_unlock(&mm_unpinned_lock);
+ }
+}
+
+static int read_ldt(void __user * ptr, unsigned long bytecount)
+{
+ int err;
+ unsigned long size;
+ struct mm_struct * mm = current->mm;
+
+ if (!mm->context.size)
+ return 0;
+ if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
+ bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
+
+ down(&mm->context.sem);
+ size = mm->context.size*LDT_ENTRY_SIZE;
+ if (size > bytecount)
+ size = bytecount;
+
+ err = 0;
+ if (copy_to_user(ptr, mm->context.ldt, size))
+ err = -EFAULT;
+ up(&mm->context.sem);
+ if (err < 0)
+ goto error_return;
+ if (size != bytecount) {
+ /* zero-fill the rest */
+ if (clear_user(ptr+size, bytecount-size) != 0) {
+ err = -EFAULT;
+ goto error_return;
+ }
+ }
+ return bytecount;
+error_return:
+ return err;
+}
+
+static int read_default_ldt(void __user * ptr, unsigned long bytecount)
+{
+ int err;
+ unsigned long size;
+ void *address;
+
+ err = 0;
+ address = &default_ldt[0];
+ size = 5*sizeof(struct desc_struct);
+ if (size > bytecount)
+ size = bytecount;
+
+ err = size;
+ if (copy_to_user(ptr, address, size))
+ err = -EFAULT;
+
+ return err;
+}
+
+static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
+{
+ struct mm_struct * mm = current->mm;
+ __u32 entry_1, entry_2, *lp;
+ unsigned long mach_lp;
+ int error;
+ struct user_desc ldt_info;
+
+ error = -EINVAL;
+ if (bytecount != sizeof(ldt_info))
+ goto out;
+ error = -EFAULT;
+ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
+ goto out;
+
+ error = -EINVAL;
+ if (ldt_info.entry_number >= LDT_ENTRIES)
+ goto out;
+ if (ldt_info.contents == 3) {
+ if (oldmode)
+ goto out;
+ if (ldt_info.seg_not_present == 0)
+ goto out;
+ }
+
+ down(&mm->context.sem);
+ if (ldt_info.entry_number >= mm->context.size) {
+ error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
+ if (error < 0)
+ goto out_unlock;
+ }
+
+ lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
+ mach_lp = arbitrary_virt_to_machine(lp);
+
+ /* Allow LDTs to be cleared by the user. */
+ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
+ if (oldmode || LDT_empty(&ldt_info)) {
+ entry_1 = 0;
+ entry_2 = 0;
+ goto install;
+ }
+ }
+
+ entry_1 = LDT_entry_a(&ldt_info);
+ entry_2 = LDT_entry_b(&ldt_info);
+ if (oldmode)
+ entry_2 &= ~(1 << 20);
+
+ /* Install the new entry ...
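+ * (on native i386 this would be two direct stores, lp[0] = entry_1 and
+ * lp[1] = entry_2; under Xen the LDT page is read-only to the guest, so
+ * the update is a hypercall on the machine address mach_lp instead)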
*/ +install: + error = HYPERVISOR_update_descriptor(mach_lp, entry_1, entry_2); + +out_unlock: + up(&mm->context.sem); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/Makefile =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/Makefile @@ -0,0 +1,99 @@ +# +# Makefile for the linux kernel. +# + +XENARCH := $(subst ",,$(CONFIG_XENARCH)) + +extra-y := head.o init_task.o + +obj-y := process.o signal.o entry.o traps.o \ + time.o ioport.o ldt.o setup.o \ + pci-dma.o i386_ksyms.o irq.o quirks.o + +c-obj-y := semaphore.o vm86.o \ + ptrace.o sys_i386.o \ + i387.o dmi_scan.o bootflag.o \ + doublefault.o +s-obj-y := + +obj-y += cpu/ +#obj-y += timers/ +obj-$(CONFIG_ACPI_BOOT) += acpi/ +#c-obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o +c-obj-$(CONFIG_MCA) += mca.o +c-obj-$(CONFIG_X86_MSR) += msr.o +c-obj-$(CONFIG_X86_CPUID) += cpuid.o +obj-$(CONFIG_MICROCODE) += microcode.o +c-obj-$(CONFIG_APM) += apm.o +obj-$(CONFIG_X86_SMP) += smp.o smpboot.o +#obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-$(CONFIG_X86_MPPARSE) += mpparse.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o +c-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o +obj-$(CONFIG_X86_IO_APIC) += io_apic.o +c-obj-$(CONFIG_X86_REBOOTFIXUPS)+= reboot_fixups.o +c-obj-$(CONFIG_X86_NUMAQ) += numaq.o +c-obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o +c-obj-$(CONFIG_MODULES) += module.o +c-obj-y += sysenter.o +obj-y += vsyscall.o +c-obj-$(CONFIG_ACPI_SRAT) += srat.o +c-obj-$(CONFIG_HPET_TIMER) += time_hpet.o +c-obj-$(CONFIG_EFI) += efi.o efi_stub.o +c-obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +c-obj-$(CONFIG_SMP_ALTERNATIVES)+= smpalts.o + +EXTRA_AFLAGS := -traditional + +c-obj-$(CONFIG_SCx200) += scx200.o + +# vsyscall.o contains the vsyscall DSO images as __initdata. +# We must build both images before we can assemble it. +# Note: kbuild does not track this dependency due to usage of .incbin +$(obj)/vsyscall.o: $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so +targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so) +targets += vsyscall-note.o vsyscall.lds + +# The DSO images are built using a special linker script. +quiet_cmd_syscall = SYSCALL $@ + cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \ + -Wl,-T,$(filter-out FORCE,$^) -o $@ + +export CPPFLAGS_vsyscall.lds += -P -C -U$(ARCH) + +vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 +SYSCFLAGS_vsyscall-sysenter.so = $(vsyscall-flags) +SYSCFLAGS_vsyscall-int80.so = $(vsyscall-flags) + +$(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \ +$(obj)/vsyscall-%.so: $(src)/vsyscall.lds \ + $(obj)/vsyscall-%.o FORCE + $(call if_changed,syscall) + +# We also create a special relocatable object that should mirror the symbol +# table and layout of the linked DSO. With ld -R we can then refer to +# these symbols in the kernel code rather than hand-coded addresses. 
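+# (Illustrative: "ld -R file" reads symbol values from file without
+# linking it in, so kernel C code can refer to DSO symbols such as
+# __kernel_vsyscall by name instead of by hand-coded address.)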
+extra-y += vsyscall-syms.o
+$(obj)/built-in.o: $(obj)/vsyscall-syms.o
+$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o
+
+SYSCFLAGS_vsyscall-syms.o = -r
+$(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
+ $(obj)/vsyscall-sysenter.o FORCE
+ $(call if_changed,syscall)
+
+c-link := init_task.o
+s-link := vsyscall-int80.o vsyscall-sysenter.o vsyscall-sigreturn.o vsyscall.lds.o syscall_table.o
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-obj-m) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)):
+ @ln -fsn $(srctree)/arch/i386/kernel/$(notdir $@) $@
+
+$(obj)/vsyscall-int80.S: $(obj)/vsyscall-sigreturn.S
+
+$(obj)/entry.o: $(src)/entry.S $(src)/syscall_table.S
+
+obj-y += $(c-obj-y) $(s-obj-y)
+obj-m += $(c-obj-m)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-m) $(c-obj-) $(c-link))
+clean-files += $(patsubst %.o,%.S,$(s-obj-y) $(s-obj-) $(s-link))
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/microcode.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/microcode.c
@@ -0,0 +1,163 @@
+/*
+ * Intel CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2004 Tigran Aivazian
+ *
+ * This driver allows you to upgrade microcode on Intel processors
+ * belonging to the IA-32 family - PentiumPro, Pentium II,
+ * Pentium III, Xeon, Pentium 4, etc.
+ *
+ * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
+ * Order Number 245472 or free download from:
+ *
+ * http://developer.intel.com/design/pentium4/manuals/245472.htm
+ *
+ * For more information, go to http://www.urbanmyth.org/microcode
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+//#define DEBUG /* pr_debug */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/syscalls.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+
+MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian <tigran@xxxxxxxxxxx>");
+MODULE_LICENSE("GPL");
+
+#define MICROCODE_VERSION "1.14-xen"
+
+#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
+#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
+
+/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
+static DECLARE_MUTEX(microcode_sem);
+
+static void __user *user_buffer; /* user area microcode data buffer */
+static unsigned int user_buffer_size; /* its size */
+
+static int microcode_open (struct inode *unused1, struct file *unused2)
+{
+ return capable(CAP_SYS_RAWIO) ?
0 : -EPERM;
+}
+
+
+static int do_microcode_update (void)
+{
+ int err;
+ dom0_op_t op;
+
+ err = sys_mlock((unsigned long)user_buffer, user_buffer_size);
+ if (err != 0)
+ return err;
+
+ op.cmd = DOM0_MICROCODE;
+ op.u.microcode.data = user_buffer;
+ op.u.microcode.length = user_buffer_size;
+ err = HYPERVISOR_dom0_op(&op);
+
+ (void)sys_munlock((unsigned long)user_buffer, user_buffer_size);
+
+ return err;
+}
+
+static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
+{
+ ssize_t ret;
+
+ if (len < DEFAULT_UCODE_TOTALSIZE) {
+ printk(KERN_ERR "microcode: not enough data\n");
+ return -EINVAL;
+ }
+
+ if ((len >> PAGE_SHIFT) > num_physpages) {
+ printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
+ return -EINVAL;
+ }
+
+ down(&microcode_sem);
+
+ user_buffer = (void __user *) buf;
+ user_buffer_size = (int) len;
+
+ ret = do_microcode_update();
+ if (!ret)
+ ret = (ssize_t)len;
+
+ up(&microcode_sem);
+
+ return ret;
+}
+
+static int microcode_ioctl (struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ /*
+ * XXX: will be removed after microcode_ctl
+ * is updated to ignore failure of this ioctl()
+ */
+ case MICROCODE_IOCFREE:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ return -EINVAL;
+}
+
+static struct file_operations microcode_fops = {
+ .owner = THIS_MODULE,
+ .write = microcode_write,
+ .ioctl = microcode_ioctl,
+ .open = microcode_open,
+};
+
+static struct miscdevice microcode_dev = {
+ .minor = MICROCODE_MINOR,
+ .name = "microcode",
+ .devfs_name = "cpu/microcode",
+ .fops = &microcode_fops,
+};
+
+static int __init microcode_init (void)
+{
+ int error;
+
+ error = misc_register(&microcode_dev);
+ if (error) {
+ printk(KERN_ERR
+ "microcode: can't misc_register on minor=%d\n",
+ MICROCODE_MINOR);
+ return error;
+ }
+
+ printk(KERN_INFO
+ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@xxxxxxxxxxx>\n");
+ return 0;
+}
+
+static void __exit microcode_exit (void)
+{
+ misc_deregister(&microcode_dev);
+ printk(KERN_INFO "IA-32 Microcode Update Driver v" MICROCODE_VERSION " unregistered\n");
+}
+
+module_init(microcode_init)
+module_exit(microcode_exit)
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/mpparse.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/mpparse.c
@@ -0,0 +1,1124 @@
+/*
+ * Intel Multiprocessor Specification 1.1 and 1.4
+ * compliant MP-table parsing routines.
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@xxxxxxxxxx>
+ * (c) 1998, 1999, 2000 Ingo Molnar <mingo@xxxxxxxxxx>
+ *
+ * Fixes
+ * Erich Boleyn : MP v1.4 and additional changes.
+ * Alan Cox : Added EBDA scanning
+ * Ingo Molnar : various cleanups and rewrites
+ * Maciej W.
Rozycki: Bits for default MP configurations + * Paul Diefenbaugh: Added full ACPI support + */ + +#include <linux/mm.h> +#include <linux/irq.h> +#include <linux/init.h> +#include <linux/acpi.h> +#include <linux/delay.h> +#include <linux/config.h> +#include <linux/bootmem.h> +#include <linux/smp_lock.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/bitops.h> + +#include <asm/smp.h> +#include <asm/acpi.h> +#include <asm/mtrr.h> +#include <asm/mpspec.h> +#include <asm/io_apic.h> + +#include <mach_apic.h> +#include <mach_mpparse.h> +#include <bios_ebda.h> + +/* Have we found an MP table */ +int smp_found_config; +unsigned int __initdata maxcpus = NR_CPUS; + +/* + * Various Linux-internal data structures created from the + * MP-table. + */ +int apic_version [MAX_APICS]; +int mp_bus_id_to_type [MAX_MP_BUSSES]; +int mp_bus_id_to_node [MAX_MP_BUSSES]; +int mp_bus_id_to_local [MAX_MP_BUSSES]; +int quad_local_to_mp_bus_id [NR_CPUS/4][4]; +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; +static int mp_current_pci_id; + +/* I/O APIC entries */ +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; + +/* # of MP IRQ source entries */ +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* MP IRQ source entries */ +int mp_irq_entries; + +int nr_ioapics; + +int pic_mode; +unsigned long mp_lapic_addr; + +/* Processor that is doing the boot up */ +unsigned int boot_cpu_physical_apicid = -1U; +unsigned int boot_cpu_logical_apicid = -1U; +/* Internal processor count */ +static unsigned int __initdata num_processors; + +/* Bitmask of physically existing CPUs */ +physid_mask_t phys_cpu_present_map; + +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + +/* + * Intel MP BIOS table parsing routines: + */ + + +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +/* + * Have to match translation table entries to main table entries by counter + * hence the mpc_record variable .... can't see a less disgusting way of + * doing this .... 
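+ * (mpc_record is incremented once per parsed entry in smp_read_mpc(),
+ * so translation_table[mpc_record] stays in step with whichever
+ * main-table entry is currently being decoded)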
+ */ + +static int mpc_record; +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata; + +#ifdef CONFIG_X86_NUMAQ +static int MP_valid_apicid(int apicid, int version) +{ + return hweight_long(apicid & 0xf) == 1 && (apicid >> 4) != 0xf; +} +#elif !defined(CONFIG_XEN) +static int MP_valid_apicid(int apicid, int version) +{ + if (version >= 0x14) + return apicid < 0xff; + else + return apicid < 0xf; +} +#endif + +#ifndef CONFIG_XEN +static void __init MP_processor_info (struct mpc_config_processor *m) +{ + int ver, apicid; + physid_mask_t tmp; + + if (!(m->mpc_cpuflag & CPU_ENABLED)) + return; + + apicid = mpc_apic_id(m, translation_table[mpc_record]); + + if (m->mpc_featureflag&(1<<0)) + Dprintk(" Floating point unit present.\n"); + if (m->mpc_featureflag&(1<<7)) + Dprintk(" Machine Exception supported.\n"); + if (m->mpc_featureflag&(1<<8)) + Dprintk(" 64 bit compare & exchange supported.\n"); + if (m->mpc_featureflag&(1<<9)) + Dprintk(" Internal APIC present.\n"); + if (m->mpc_featureflag&(1<<11)) + Dprintk(" SEP present.\n"); + if (m->mpc_featureflag&(1<<12)) + Dprintk(" MTRR present.\n"); + if (m->mpc_featureflag&(1<<13)) + Dprintk(" PGE present.\n"); + if (m->mpc_featureflag&(1<<14)) + Dprintk(" MCA present.\n"); + if (m->mpc_featureflag&(1<<15)) + Dprintk(" CMOV present.\n"); + if (m->mpc_featureflag&(1<<16)) + Dprintk(" PAT present.\n"); + if (m->mpc_featureflag&(1<<17)) + Dprintk(" PSE present.\n"); + if (m->mpc_featureflag&(1<<18)) + Dprintk(" PSN present.\n"); + if (m->mpc_featureflag&(1<<19)) + Dprintk(" Cache Line Flush Instruction present.\n"); + /* 20 Reserved */ + if (m->mpc_featureflag&(1<<21)) + Dprintk(" Debug Trace and EMON Store present.\n"); + if (m->mpc_featureflag&(1<<22)) + Dprintk(" ACPI Thermal Throttle Registers present.\n"); + if (m->mpc_featureflag&(1<<23)) + Dprintk(" MMX present.\n"); + if (m->mpc_featureflag&(1<<24)) + Dprintk(" FXSR present.\n"); + if (m->mpc_featureflag&(1<<25)) + Dprintk(" XMM present.\n"); + if (m->mpc_featureflag&(1<<26)) + Dprintk(" Willamette New Instructions present.\n"); + if (m->mpc_featureflag&(1<<27)) + Dprintk(" Self Snoop present.\n"); + if (m->mpc_featureflag&(1<<28)) + Dprintk(" HT present.\n"); + if (m->mpc_featureflag&(1<<29)) + Dprintk(" Thermal Monitor present.\n"); + /* 30, 31 Reserved */ + + + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + Dprintk(" Bootup CPU\n"); + boot_cpu_physical_apicid = m->mpc_apicid; + boot_cpu_logical_apicid = apicid; + } + + if (num_processors >= NR_CPUS) { + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." + " Processor ignored.\n", NR_CPUS); + return; + } + + if (num_processors >= maxcpus) { + printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." + " Processor ignored.\n", maxcpus); + return; + } + num_processors++; + ver = m->mpc_apicver; + + if (!MP_valid_apicid(apicid, ver)) { + printk(KERN_WARNING "Processor #%d INVALID. (Max ID: %d).\n", + m->mpc_apicid, MAX_APICS); + --num_processors; + return; + } + + tmp = apicid_to_cpu_present(apicid); + physids_or(phys_cpu_present_map, phys_cpu_present_map, tmp); + + /* + * Validate version + */ + if (ver == 0x0) { + printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. 
(tell your hw vendor)\n", m->mpc_apicid); + ver = 0x10; + } + apic_version[m->mpc_apicid] = ver; + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; +} +#else +void __init MP_processor_info (struct mpc_config_processor *m) +{ + num_processors++; +} +#endif /* CONFIG_XEN */ + +static void __init MP_bus_info (struct mpc_config_bus *m) +{ + char str[7]; + + memcpy(str, m->mpc_bustype, 6); + str[6] = 0; + + mpc_oem_bus_info(m, str, translation_table[mpc_record]); + + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) { + mpc_oem_pci_bus(m, translation_table[mpc_record]); + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; + mp_current_pci_id++; + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; + } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98; + } else { + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); + } +} + +static void __init MP_ioapic_info (struct mpc_config_ioapic *m) +{ + if (!(m->mpc_flags & MPC_APIC_USABLE)) + return; + + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", + MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); + } + if (!m->mpc_apicaddr) { + printk(KERN_ERR "WARNING: bogus zero I/O APIC address" + " found in MP table, skipping!\n"); + return; + } + mp_ioapics[nr_ioapics] = *m; + nr_ioapics++; +} + +static void __init MP_intsrc_info (struct mpc_config_intsrc *m) +{ + mp_irqs [mp_irq_entries] = *m; + Dprintk("Int: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) +{ + Dprintk("Lint: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); + /* + * Well it seems all SMP boards in existence + * use ExtINT/LVT1 == LINT0 and + * NMI/LVT2 == LINT1 - the following check + * will show us if this assumptions is false. + * Until then we do not have to add baggage. 
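+ * Concretely: an ExtINT entry is expected to target LINT0
+ * (mpc_destapiclint == 0) and an NMI entry LINT1 (== 1); the BUG()
+ * calls below assert exactly that.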
+ */ + if ((m->mpc_irqtype == mp_ExtINT) && + (m->mpc_destapiclint != 0)) + BUG(); + if ((m->mpc_irqtype == mp_NMI) && + (m->mpc_destapiclint != 1)) + BUG(); +} + +#ifdef CONFIG_X86_NUMAQ +static void __init MP_translation_info (struct mpc_config_translation *m) +{ + printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local); + + if (mpc_record >= MAX_MPC_ENTRY) + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); + else + translation_table[mpc_record] = m; /* stash this for later */ + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) + node_set_online(m->trans_quad); +} + +/* + * Read/parse the MPC oem tables + */ + +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \ + unsigned short oemsize) +{ + int count = sizeof (*oemtable); /* the header size */ + unsigned char *oemptr = ((unsigned char *)oemtable)+count; + + mpc_record = 0; + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); + if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4)) + { + printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", + oemtable->oem_signature[0], + oemtable->oem_signature[1], + oemtable->oem_signature[2], + oemtable->oem_signature[3]); + return; + } + if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length)) + { + printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); + return; + } + while (count < oemtable->oem_length) { + switch (*oemptr) { + case MP_TRANSLATION: + { + struct mpc_config_translation *m= + (struct mpc_config_translation *)oemptr; + MP_translation_info(m); + oemptr += sizeof(*m); + count += sizeof(*m); + ++mpc_record; + break; + } + default: + { + printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr); + return; + } + } + } +} + +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) +{ + if (strncmp(oem, "IBM NUMA", 8)) + printk("Warning! May not be a NUMA-Q system!\n"); + if (mpc->mpc_oemptr) + smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr, + mpc->mpc_oemsize); +} +#endif /* CONFIG_X86_NUMAQ */ + +/* + * Read/parse the MPC + */ + +static int __init smp_read_mpc(struct mp_config_table *mpc) +{ + char str[16]; + char oem[10]; + int count=sizeof(*mpc); + unsigned char *mpt=((unsigned char *)mpc)+count; + + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { + printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n", + *(u32 *)mpc->mpc_signature); + return 0; + } + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { + printk(KERN_ERR "SMP mptable: checksum error!\n"); + return 0; + } + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", + mpc->mpc_spec); + return 0; + } + if (!mpc->mpc_lapic) { + printk(KERN_ERR "SMP mptable: null local APIC address!\n"); + return 0; + } + memcpy(oem,mpc->mpc_oem,8); + oem[8]=0; + printk(KERN_INFO "OEM ID: %s ",oem); + + memcpy(str,mpc->mpc_productid,12); + str[12]=0; + printk("Product ID: %s ",str); + + mps_oem_check(mpc, oem, str); + + printk("APIC at: 0x%lX\n",mpc->mpc_lapic); + + /* + * Save the local APIC address (it might be non-default) -- but only + * if we're not using ACPI. + */ + if (!acpi_lapic) + mp_lapic_addr = mpc->mpc_lapic; + + /* + * Now process the configuration blocks. 
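+ * Each block begins with a one-byte type tag; the walk below advances
+ * mpt and count by the size of the matching mpc_config_* structure and
+ * stops at mpc->mpc_length (an unknown tag ends the scan early).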
+ */ + mpc_record = 0; + while (count < mpc->mpc_length) { + switch(*mpt) { + case MP_PROCESSOR: + { + struct mpc_config_processor *m= + (struct mpc_config_processor *)mpt; + /* ACPI may have already provided this data */ + if (!acpi_lapic) + MP_processor_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_BUS: + { + struct mpc_config_bus *m= + (struct mpc_config_bus *)mpt; + MP_bus_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_IOAPIC: + { + struct mpc_config_ioapic *m= + (struct mpc_config_ioapic *)mpt; + MP_ioapic_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_INTSRC: + { + struct mpc_config_intsrc *m= + (struct mpc_config_intsrc *)mpt; + + MP_intsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_LINTSRC: + { + struct mpc_config_lintsrc *m= + (struct mpc_config_lintsrc *)mpt; + MP_lintsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + default: + { + count = mpc->mpc_length; + break; + } + } + ++mpc_record; + } + clustered_apic_check(); + if (!num_processors) + printk(KERN_ERR "SMP mptable: no processors registered!\n"); + return num_processors; +} + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_config_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* conforming */ + intsrc.mpc_srcbus = 0; + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; + + intsrc.mpc_irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. + */ + if (mpc_default_type == 5) { + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) + printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n"); + else { + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + /* fall through */ + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) + intsrc.mpc_irqflag = 13; + else + intsrc.mpc_irqflag = 0; + } + + intsrc.mpc_srcbusirq = i; + intsrc.mpc_dstirq = i ? 
i : 2; /* IRQ0 to INTIN2 */ + MP_intsrc_info(&intsrc); + } + + intsrc.mpc_irqtype = mp_ExtINT; + intsrc.mpc_srcbusirq = 0; + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ + MP_intsrc_info(&intsrc); +} + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_config_processor processor; + struct mpc_config_bus bus; + struct mpc_config_ioapic ioapic; + struct mpc_config_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.mpc_type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.mpc_cpuflag = CPU_ENABLED; + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | + boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.mpc_apicid = i; + MP_processor_info(&processor); + } + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + switch (mpc_default_type) { + default: + printk("???\n"); + printk(KERN_ERR "Unknown standard configuration %d\n", + mpc_default_type); + /* fall through */ + case 1: + case 5: + memcpy(bus.mpc_bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.mpc_bustype, "EISA ", 6); + break; + case 4: + case 7: + memcpy(bus.mpc_bustype, "MCA ", 6); + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.mpc_busid = 1; + memcpy(bus.mpc_bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.mpc_type = MP_IOAPIC; + ioapic.mpc_apicid = 2; + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.mpc_flags = MPC_APIC_USABLE; + ioapic.mpc_apicaddr = 0xFEC00000; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. + */ + construct_default_ioirq_mptable(mpc_default_type); + + lintsrc.mpc_type = MP_LINTSRC; + lintsrc.mpc_irqflag = 0; /* conforming */ + lintsrc.mpc_srcbusid = 0; + lintsrc.mpc_srcbusirq = 0; + lintsrc.mpc_destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.mpc_irqtype = linttypes[i]; + lintsrc.mpc_destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static struct intel_mp_floating *mpf_found; + +/* + * Scan the memory blocks for an SMP configuration block. + */ +void __init get_smp_config (void) +{ + struct intel_mp_floating *mpf = mpf_found; + + /* + * ACPI may be used to obtain the entire SMP configuration or just to + * enumerate/configure processors (CONFIG_ACPI_BOOT). Note that + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. + */ + if (acpi_lapic && acpi_ioapic) { + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); + return; + } + else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); + + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); + if (mpf->mpf_feature2 & (1<<7)) { + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + printk(KERN_INFO " Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } + + /* + * Now see if we need to read further. 
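+ * (a non-zero mpf_feature1 selects one of the canned default
+ * configurations; otherwise mpf_physptr points at a full MP
+ * configuration table to be parsed)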
+ */
+ if (mpf->mpf_feature1 != 0) {
+
+ printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
+ construct_default_ISA_mptable(mpf->mpf_feature1);
+
+ } else if (mpf->mpf_physptr) {
+
+ /*
+ * Read the physical hardware table. Anything here will
+ * override the defaults.
+ */
+ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
+ smp_found_config = 0;
+ printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
+ printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
+ return;
+ }
+ /*
+ * If there are no explicit MP IRQ entries, then we are
+ * broken. We set up most of the low 16 IO-APIC pins to
+ * ISA defaults and hope it will work.
+ */
+ if (!mp_irq_entries) {
+ struct mpc_config_bus bus;
+
+ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
+
+ bus.mpc_type = MP_BUS;
+ bus.mpc_busid = 0;
+ memcpy(bus.mpc_bustype, "ISA ", 6);
+ MP_bus_info(&bus);
+
+ construct_default_ioirq_mptable(0);
+ }
+
+ } else
+ BUG();
+
+ printk(KERN_INFO "Processors: %d\n", num_processors);
+ /*
+ * Only use the first configuration found.
+ */
+}
+
+static int __init smp_scan_config (unsigned long base, unsigned long length)
+{
+ unsigned long *bp = isa_bus_to_virt(base);
+ struct intel_mp_floating *mpf;
+
+ Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
+ if (sizeof(*mpf) != 16)
+ printk("Error: MPF size\n");
+
+ while (length > 0) {
+ mpf = (struct intel_mp_floating *)bp;
+ if ((*bp == SMP_MAGIC_IDENT) &&
+ (mpf->mpf_length == 1) &&
+ !mpf_checksum((unsigned char *)bp, 16) &&
+ ((mpf->mpf_specification == 1)
+ || (mpf->mpf_specification == 4)) ) {
+
+ smp_found_config = 1;
+#ifndef CONFIG_XEN
+ printk(KERN_INFO "found SMP MP-table at %08lx\n",
+ virt_to_phys(mpf));
+ reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
+ if (mpf->mpf_physptr) {
+ /*
+ * We cannot access the MPC table to compute
+ * its size yet, as only a few megabytes from
+ * the bottom are mapped now.
+ * The PC-9800's MPC table is placed at the very
+ * end of physical memory, so simply reserving
+ * PAGE_SIZE from mpf->mpf_physptr can yield BUG()
+ * in reserve_bootmem.
+ */
+ unsigned long size = PAGE_SIZE;
+ unsigned long end = max_low_pfn * PAGE_SIZE;
+ if (mpf->mpf_physptr + size > end)
+ size = end - mpf->mpf_physptr;
+ reserve_bootmem(mpf->mpf_physptr, size);
+ }
+#else
+ printk(KERN_INFO "found SMP MP-table at %08lx\n",
+ ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
+#endif
+
+ mpf_found = mpf;
+ return 1;
+ }
+ bp += 4;
+ length -= 16;
+ }
+ return 0;
+}
+
+void __init find_smp_config (void)
+{
+ unsigned int address;
+
+ /*
+ * FIXME: Linux assumes you have 640K of base ram..
+ * this continues the error...
+ *
+ * 1) Scan the bottom 1K for a signature
+ * 2) Scan the top 1K of base RAM
+ * 3) Scan the 64K of bios
+ */
+ if (smp_scan_config(0x0,0x400) ||
+ smp_scan_config(639*0x400,0x400) ||
+ smp_scan_config(0xF0000,0x10000))
+ return;
+ /*
+ * If it is an SMP machine we should know now, unless the
+ * configuration is in an EISA/MCA bus machine with an
+ * extended bios data area.
+ *
+ * there is a real-mode segmented pointer pointing to the
+ * 4K EBDA area at 0x40E, calculate and scan it here.
+ *
+ * NOTE! There are Linux loaders that will corrupt the EBDA
+ * area, and as such this kind of SMP config may be less
+ * trustworthy, simply because the SMP table may have been
+ * stomped on during early boot. These loaders are buggy and
+ * should be fixed.
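+ * (The word at 0x40E is a real-mode segment value; get_bios_ebda()
+ * shifts it left by 4 to form the physical EBDA address that is
+ * scanned below.)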
+ * + * MP1.4 SPEC states to only scan first 1K of 4K EBDA. + */ + +#ifndef CONFIG_XEN + address = get_bios_ebda(); + if (address) + smp_scan_config(address, 0x400); +#endif +} + +/* -------------------------------------------------------------------------- + ACPI-based MP Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI_BOOT + +void __init mp_register_lapic_address ( + u64 address) +{ +#ifndef CONFIG_XEN + mp_lapic_addr = (unsigned long) address; + + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); + + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); +#endif +} + + +void __init mp_register_lapic ( + u8 id, + u8 enabled) +{ + struct mpc_config_processor processor; + int boot_cpu = 0; + + if (MAX_APICS - id <= 0) { + printk(KERN_WARNING "Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + if (id == boot_cpu_physical_apicid) + boot_cpu = 1; + +#ifndef CONFIG_XEN + processor.mpc_type = MP_PROCESSOR; + processor.mpc_apicid = id; + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; +#endif + + MP_processor_info(&processor); +} + +#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_ACPI_INTERPRETER) || defined(CONFIG_ACPI_BOOT)) + +#define MP_ISA_BUS 0 +#define MP_MAX_IOAPIC_PIN 127 + +static struct mp_ioapic_routing { + int apic_id; + int gsi_base; + int gsi_end; + u32 pin_programmed[4]; +} mp_ioapic_routing[MAX_IO_APICS]; + + +static int mp_find_ioapic ( + int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. */ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_ioapic_routing[i].gsi_base) + && (gsi <= mp_ioapic_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + + return -1; +} + + +void __init mp_register_ioapic ( + u8 id, + u32 address, + u32 gsi_base) +{ + int idx = 0; + + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in MADT table, skipping!\n"); + return; + } + + idx = nr_ioapics++; + + mp_ioapics[idx].mpc_type = MP_IOAPIC; + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; + mp_ioapics[idx].mpc_apicaddr = address; + + mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); + + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 
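+ * (For example, a 24-pin IOAPIC registered at gsi_base 0 covers
+ * GSI 0-23 and a second one at gsi_base 24 covers GSI 24-47;
+ * mp_find_ioapic() above is a linear lookup over these ranges.)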
+ */ + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; + mp_ioapic_routing[idx].gsi_base = gsi_base; + mp_ioapic_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_base, + mp_ioapic_routing[idx].gsi_end); + + return; +} + + +void __init mp_override_legacy_irq ( + u8 bus_irq, + u8 polarity, + u8 trigger, + u32 gsi) +{ + struct mpc_config_intsrc intsrc; + int ioapic = -1; + int pin = -1; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; + pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_irqflag = (trigger << 2) | polarity; + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ + intsrc.mpc_dstirq = pin; /* INTIN# */ + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + + return; +} + +int es7000_plat; + +void __init mp_config_acpi_legacy_irqs (void) +{ + struct mpc_config_intsrc intsrc; + int i = 0; + int ioapic = -1; + + /* + * Fabricate the legacy ISA bus (bus #31). + */ + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + + /* + * Older generations of ES7000 have no legacy identity mappings + */ + if (es7000_plat == 1) + return; + + /* + * Locate the IOAPIC that manages the ISA IRQs (0-15). + */ + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + return; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* Conforming */ + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; + + /* + * Use the default configuration for the IRQs 0-15. Unless + * overriden by (MADT) interrupt source override entries. + */ + for (i = 0; i < 16; i++) { + int idx; + + for (idx = 0; idx < mp_irq_entries; idx++) { + struct mpc_config_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? 
*/ + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && + (irq->mpc_dstirq == i)) + break; + } + + if (idx != mp_irq_entries) { + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } + + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_srcbusirq = i; /* Identity mapped */ + intsrc.mpc_dstirq = i; + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, + intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + } +} + +int mp_register_gsi (u32 gsi, int edge_level, int active_high_low) +{ + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + +#ifdef CONFIG_ACPI_BUS + /* Don't set up the ACPI SCI because it's already set up */ + if (acpi_fadt.sci_int == gsi) + return gsi; +#endif + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); + return gsi; + } + + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + + if (ioapic_renumber_irq) + gsi = ioapic_renumber_irq(ioapic, gsi); + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + bit = ioapic_pin % 32; + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); + if (idx > 3) { + printk(KERN_ERR "Invalid reference to IOAPIC pin " + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + ioapic_pin); + return gsi; + } + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", + mp_ioapic_routing[ioapic].apic_id, ioapic_pin); + return gsi; + } + + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); + + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, + edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1, + active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1); + return gsi; +} + +#endif /*CONFIG_X86_IO_APIC && (CONFIG_ACPI_INTERPRETER || CONFIG_ACPI_BOOT)*/ +#endif /*CONFIG_ACPI_BOOT*/ Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/pci-dma.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/pci-dma.c @@ -0,0 +1,282 @@ +/* + * Dynamic DMA mapping support. + * + * On i386 there is no hardware dynamic DMA address translation, + * so consistent alloc/free are merely page allocation/freeing. + * The rest of the dynamic DMA mapping interface is implemented + * in asm/pci.h. + */ + +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/pci.h> +#include <linux/version.h> +#include <asm/io.h> +#include <asm-xen/balloon.h> +#include <asm/tlbflush.h> + +struct dma_coherent_mem { + void *virt_base; + u32 device_base; + int size; + int flags; + unsigned long *bitmap; +}; + +void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, unsigned int __nocast gfp) +{ + void *ret; + struct dma_coherent_mem *mem = dev ? 
dev->dma_mem : NULL; + unsigned int order = get_order(size); + unsigned long vstart; + /* ignore region specifiers */ + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); + + if (mem) { + int page = bitmap_find_free_region(mem->bitmap, mem->size, + order); + if (page >= 0) { + *dma_handle = mem->device_base + (page << PAGE_SHIFT); + ret = mem->virt_base + (page << PAGE_SHIFT); + memset(ret, 0, size); + return ret; + } + if (mem->flags & DMA_MEMORY_EXCLUSIVE) + return NULL; + } + + if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) + gfp |= GFP_DMA; + + vstart = __get_free_pages(gfp, order); + ret = (void *)vstart; + + if (ret != NULL) { + xen_contig_memory(vstart, order); + + memset(ret, 0, size); + *dma_handle = virt_to_bus(ret); + } + return ret; +} + +void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + int order = get_order(size); + + if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; + + bitmap_release_region(mem->bitmap, page, order); + } else + free_pages((unsigned long)vaddr, order); +} + +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, + dma_addr_t device_addr, size_t size, int flags) +{ + void __iomem *mem_base; + int pages = size >> PAGE_SHIFT; + int bitmap_size = (pages + 31)/32; + + if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) + goto out; + if (!size) + goto out; + if (dev->dma_mem) + goto out; + + /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ + + mem_base = ioremap(bus_addr, size); + if (!mem_base) + goto out; + + dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + if (!dev->dma_mem) + goto out; + memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); + dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL); + if (!dev->dma_mem->bitmap) + goto free1_out; + memset(dev->dma_mem->bitmap, 0, bitmap_size); + + dev->dma_mem->virt_base = mem_base; + dev->dma_mem->device_base = device_addr; + dev->dma_mem->size = pages; + dev->dma_mem->flags = flags; + + if (flags & DMA_MEMORY_MAP) + return DMA_MEMORY_MAP; + + return DMA_MEMORY_IO; + + free1_out: + kfree(dev->dma_mem->bitmap); + out: + return 0; +} +EXPORT_SYMBOL(dma_declare_coherent_memory); + +void dma_release_declared_memory(struct device *dev) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + + if(!mem) + return; + dev->dma_mem = NULL; + iounmap(mem->virt_base); + kfree(mem->bitmap); + kfree(mem); +} +EXPORT_SYMBOL(dma_release_declared_memory); + +void *dma_mark_declared_memory_occupied(struct device *dev, + dma_addr_t device_addr, size_t size) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; + int pos, err; + + if (!mem) + return ERR_PTR(-EINVAL); + + pos = (device_addr - mem->device_base) >> PAGE_SHIFT; + err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); + if (err != 0) + return ERR_PTR(err); + return mem->virt_base + (pos << PAGE_SHIFT); +} +EXPORT_SYMBOL(dma_mark_declared_memory_occupied); + +static LIST_HEAD(dma_map_head); +static DEFINE_SPINLOCK(dma_map_lock); +struct dma_map_entry { + struct list_head list; + dma_addr_t dma; + char *bounce, *host; + size_t size; +}; +#define DMA_MAP_MATCHES(e,d) (((e)->dma<=(d)) && (((e)->dma+(e)->size)>(d))) + +dma_addr_t +dma_map_single(struct device *dev, void *ptr, size_t size, + enum dma_data_direction direction) +{ + 
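+ /*
+ * Strategy sketch (as implemented below): a buffer that fits inside
+ * one page is mapped directly with virt_to_bus(); anything that may
+ * straddle a page boundary is copied through a machine-contiguous
+ * bounce buffer from dma_alloc_coherent(), and a dma_map_entry is
+ * kept on dma_map_head so that unmap/sync can find the bounce pages
+ * again via DMA_MAP_MATCHES, i.e. dma <= addr < dma + size.
+ */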
struct dma_map_entry *ent; + void *bnc; + dma_addr_t dma; + unsigned long flags; + + BUG_ON(direction == DMA_NONE); + + /* + * Even if size is sub-page, the buffer may still straddle a page + * boundary. Take into account buffer start offset. All other calls are + * conservative and always search the dma_map list if it's non-empty. + */ + if ((((unsigned int)ptr & ~PAGE_MASK) + size) <= PAGE_SIZE) { + dma = virt_to_bus(ptr); + } else { + BUG_ON((bnc = dma_alloc_coherent(dev, size, &dma, 0)) == NULL); + BUG_ON((ent = kmalloc(sizeof(*ent), GFP_KERNEL)) == NULL); + if (direction != DMA_FROM_DEVICE) + memcpy(bnc, ptr, size); + ent->dma = dma; + ent->bounce = bnc; + ent->host = ptr; + ent->size = size; + spin_lock_irqsave(&dma_map_lock, flags); + list_add(&ent->list, &dma_map_head); + spin_unlock_irqrestore(&dma_map_lock, flags); + } + + flush_write_buffers(); + return dma; +} +EXPORT_SYMBOL(dma_map_single); + +void +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, + enum dma_data_direction direction) +{ + struct dma_map_entry *ent; + unsigned long flags; + + BUG_ON(direction == DMA_NONE); + + /* Fast-path check: are there any multi-page DMA mappings? */ + if (!list_empty(&dma_map_head)) { + spin_lock_irqsave(&dma_map_lock, flags); + list_for_each_entry ( ent, &dma_map_head, list ) { + if (DMA_MAP_MATCHES(ent, dma_addr)) { + list_del(&ent->list); + break; + } + } + spin_unlock_irqrestore(&dma_map_lock, flags); + if (&ent->list != &dma_map_head) { + BUG_ON(dma_addr != ent->dma); + BUG_ON(size != ent->size); + if (direction != DMA_TO_DEVICE) + memcpy(ent->host, ent->bounce, size); + dma_free_coherent(dev, size, ent->bounce, ent->dma); + kfree(ent); + } + } +} +EXPORT_SYMBOL(dma_unmap_single); + +void +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + struct dma_map_entry *ent; + unsigned long flags, off; + + /* Fast-path check: are there any multi-page DMA mappings? */ + if (!list_empty(&dma_map_head)) { + spin_lock_irqsave(&dma_map_lock, flags); + list_for_each_entry ( ent, &dma_map_head, list ) + if (DMA_MAP_MATCHES(ent, dma_handle)) + break; + spin_unlock_irqrestore(&dma_map_lock, flags); + if (&ent->list != &dma_map_head) { + off = dma_handle - ent->dma; + BUG_ON((off + size) > ent->size); + /*if (direction != DMA_TO_DEVICE)*/ + memcpy(ent->host+off, ent->bounce+off, size); + } + } +} +EXPORT_SYMBOL(dma_sync_single_for_cpu); + +void +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + struct dma_map_entry *ent; + unsigned long flags, off; + + /* Fast-path check: are there any multi-page DMA mappings? 
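+ (dma_map_head is only ever non-empty when some mapping needed a
+ bounce buffer, so in the common single-page case the sync paths skip
+ the lock and the list walk entirely)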
*/ + if (!list_empty(&dma_map_head)) { + spin_lock_irqsave(&dma_map_lock, flags); + list_for_each_entry ( ent, &dma_map_head, list ) + if (DMA_MAP_MATCHES(ent, dma_handle)) + break; + spin_unlock_irqrestore(&dma_map_lock, flags); + if (&ent->list != &dma_map_head) { + off = dma_handle - ent->dma; + BUG_ON((off + size) > ent->size); + /*if (direction != DMA_FROM_DEVICE)*/ + memcpy(ent->bounce+off, ent->host+off, size); + } + } + + flush_write_buffers(); +} +EXPORT_SYMBOL(dma_sync_single_for_device); Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/process.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/process.c @@ -0,0 +1,793 @@ +/* + * linux/arch/i386/kernel/process.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@xxxxxxxxxxx>, May 2000 + */ + +/* + * This file handles the architecture-dependent parts of process handling.. + */ + +#include <stdarg.h> + +#include <linux/cpu.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/elfcore.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/user.h> +#include <linux/a.out.h> +#include <linux/interrupt.h> +#include <linux/config.h> +#include <linux/utsname.h> +#include <linux/delay.h> +#include <linux/reboot.h> +#include <linux/init.h> +#include <linux/mc146818rtc.h> +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/ptrace.h> +#include <linux/random.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/ldt.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/irq.h> +#include <asm/desc.h> +#include <asm-xen/xen-public/physdev.h> +#ifdef CONFIG_MATH_EMULATION +#include <asm/math_emu.h> +#endif + +#include <linux/irq.h> +#include <linux/err.h> + +#include <asm/tlbflush.h> +#include <asm/cpu.h> + +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); + +static int hlt_counter; + +unsigned long boot_option_idle_override = 0; +EXPORT_SYMBOL(boot_option_idle_override); + +/* + * Return saved PC of a blocked thread. + */ +unsigned long thread_saved_pc(struct task_struct *tsk) +{ + return ((unsigned long *)tsk->thread.esp)[3]; +} + +/* + * Powermanagement idle function, if any.. + */ +void (*pm_idle)(void); +static DEFINE_PER_CPU(unsigned int, cpu_idle_state); + +void disable_hlt(void) +{ + hlt_counter++; +} + +EXPORT_SYMBOL(disable_hlt); + +void enable_hlt(void) +{ + hlt_counter--; +} + +EXPORT_SYMBOL(enable_hlt); + +/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */ +extern void stop_hz_timer(void); +extern void start_hz_timer(void); +void xen_idle(void) +{ + local_irq_disable(); + + if (need_resched()) { + local_irq_enable(); + } else { + stop_hz_timer(); + HYPERVISOR_block(); /* implicit local_irq_enable() */ + start_hz_timer(); + } +} + +#ifdef CONFIG_HOTPLUG_CPU +#include <asm/nmi.h> +/* We don't actually take CPU down, just spin without interrupts. */ +static inline void play_dead(void) +{ + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + /* We shouldn't have to disable interrupts while dead, but + * some interrupts just don't seem to go away, and this makes + * it "work" for testing purposes. 
*/
+ /* Death loop */
+ while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
+ HYPERVISOR_yield();
+
+ local_irq_disable();
+ __flush_tlb_all();
+ cpu_set(smp_processor_id(), cpu_online_map);
+ local_irq_enable();
+}
+#else
+static inline void play_dead(void)
+{
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * The idle thread. There's no useful work to be
+ * done, so just try to conserve power and have a
+ * low exit latency (ie sit in a loop waiting for
+ * somebody to say that they'd like to reschedule)
+ */
+void cpu_idle (void)
+{
+ int cpu = _smp_processor_id();
+
+ /* endless idle loop with no priority at all */
+ while (1) {
+ while (!need_resched()) {
+
+ if (__get_cpu_var(cpu_idle_state))
+ __get_cpu_var(cpu_idle_state) = 0;
+ rmb();
+
+ if (cpu_is_offline(cpu)) {
+#if defined(CONFIG_XEN) && defined(CONFIG_HOTPLUG_CPU)
+ /* Tell hypervisor to take vcpu down. */
+ HYPERVISOR_vcpu_down(cpu);
+#endif
+ play_dead();
+ }
+
+ __get_cpu_var(irq_stat).idle_timestamp = jiffies;
+ xen_idle();
+ }
+ schedule();
+ }
+}
+
+void cpu_idle_wait(void)
+{
+ unsigned int cpu, this_cpu = get_cpu();
+ cpumask_t map;
+
+ set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
+ put_cpu();
+
+ cpus_clear(map);
+ for_each_online_cpu(cpu) {
+ per_cpu(cpu_idle_state, cpu) = 1;
+ cpu_set(cpu, map);
+ }
+
+ __get_cpu_var(cpu_idle_state) = 0;
+
+ wmb();
+ do {
+ ssleep(1);
+ for_each_online_cpu(cpu) {
+ if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
+ cpu_clear(cpu, map);
+ }
+ cpus_and(map, map, cpu_online_map);
+ } while (!cpus_empty(map));
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
+/* Always use xen_idle() instead. */
+void __init select_idle_routine(const struct cpuinfo_x86 *c) {}
+
+void show_regs(struct pt_regs * regs)
+{
+ printk("\n");
+ printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
+ printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
+ print_symbol("EIP is at %s\n", regs->eip);
+
+ if (regs->xcs & 2)
+ printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
+ printk(" EFLAGS: %08lx %s (%s)\n",
+ regs->eflags, print_tainted(), system_utsname.release);
+ printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+ regs->eax,regs->ebx,regs->ecx,regs->edx);
+ printk("ESI: %08lx EDI: %08lx EBP: %08lx",
+ regs->esi, regs->edi, regs->ebp);
+ printk(" DS: %04x ES: %04x\n",
+ 0xffff & regs->xds,0xffff & regs->xes);
+
+ show_trace(NULL, &regs->esp);
+}
+
+/*
+ * This gets run with %ebx containing the
+ * function to call, and %edx containing
+ * the "args".
+ */
+extern void kernel_thread_helper(void);
+__asm__(".section .text\n"
+ ".align 4\n"
+ "kernel_thread_helper:\n\t"
+ "movl %edx,%eax\n\t"
+ "pushl %edx\n\t"
+ "call *%ebx\n\t"
+ "pushl %eax\n\t"
+ "call do_exit\n"
+ ".previous");
+
+/*
+ * Create a kernel thread
+ */
+int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+{
+ struct pt_regs regs;
+
+ memset(&regs, 0, sizeof(regs));
+
+ regs.ebx = (unsigned long) fn;
+ regs.edx = (unsigned long) arg;
+
+ regs.xds = __USER_DS;
+ regs.xes = __USER_DS;
+ regs.orig_eax = -1;
+ regs.eip = (unsigned long) kernel_thread_helper;
+ regs.xcs = __KERNEL_CS;
+ regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
+
+ /* Ok, create the new process.. */
+ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
+}
+
+/*
+ * Free current thread data structures etc..
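+ * (under Xen, dropping a thread's I/O bitmap means handing the
+ * hypervisor a zeroed PHYSDEVOP_SET_IOBITMAP op - null bitmap, zero
+ * ports - before the kernel copy is freed, as exit_thread() does below)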
+ */ +void exit_thread(void) +{ + struct task_struct *tsk = current; + struct thread_struct *t = &tsk->thread; + + /* The process may have allocated an io port bitmap... nuke it. */ + if (unlikely(NULL != t->io_bitmap_ptr)) { + physdev_op_t op = { 0 }; + op.cmd = PHYSDEVOP_SET_IOBITMAP; + HYPERVISOR_physdev_op(&op); + kfree(t->io_bitmap_ptr); + t->io_bitmap_ptr = NULL; + } +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + + memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + /* + * Forget coprocessor state.. + */ + clear_fpu(tsk); + clear_used_math(); +} + +void release_thread(struct task_struct *dead_task) +{ + if (dead_task->mm) { + // temporary debugging check + if (dead_task->mm->context.size) { + printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); + BUG(); + } + } + + release_vm86_irqs(dead_task); +} + +/* + * This gets called before we allocate a new thread and copy + * the current task into it. + */ +void prepare_to_copy(struct task_struct *tsk) +{ + unlazy_fpu(tsk); +} + +int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, + unsigned long unused, + struct task_struct * p, struct pt_regs * regs) +{ + struct pt_regs * childregs; + struct task_struct *tsk; + int err; + + childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; + /* + * The below -8 is to reserve 8 bytes on top of the ring0 stack. + * This is necessary to guarantee that the entire "struct pt_regs" + * is accessable even if the CPU haven't stored the SS/ESP registers + * on the stack (interrupt gate does not save these registers + * when switching to the same priv ring). + * Therefore beware: accessing the xss/esp fields of the + * "struct pt_regs" is possible, but they may contain the + * completely wrong values. + */ + childregs = (struct pt_regs *) ((unsigned long) childregs - 8); + *childregs = *regs; + childregs->eax = 0; + childregs->esp = esp; + + p->thread.esp = (unsigned long) childregs; + p->thread.esp0 = (unsigned long) (childregs+1); + + p->thread.eip = (unsigned long) ret_from_fork; + + savesegment(fs,p->thread.fs); + savesegment(gs,p->thread.gs); + + tsk = current; + if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) { + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { + p->thread.io_bitmap_max = 0; + return -ENOMEM; + } + memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr, + IO_BITMAP_BYTES); + } + + /* + * Set a new TLS for the child thread? + */ + if (clone_flags & CLONE_SETTLS) { + struct desc_struct *desc; + struct user_desc info; + int idx; + + err = -EFAULT; + if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) + goto out; + err = -EINVAL; + if (LDT_empty(&info)) + goto out; + + idx = info.entry_number; + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + goto out; + + desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + desc->a = LDT_entry_a(&info); + desc->b = LDT_entry_b(&info); + } + + p->thread.io_pl = current->thread.io_pl; + + err = 0; + out: + if (err && p->thread.io_bitmap_ptr) { + kfree(p->thread.io_bitmap_ptr); + p->thread.io_bitmap_max = 0; + } + return err; +} + +/* + * fill in the user structure for a core dump.. + */ +void dump_thread(struct pt_regs * regs, struct user * dump) +{ + int i; + +/* changed the size calculations - should hopefully work better. 
lbt */
+	dump->magic = CMAGIC;
+	dump->start_code = 0;
+	dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
+	dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
+	dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
+	dump->u_dsize -= dump->u_tsize;
+	dump->u_ssize = 0;
+	for (i = 0; i < 8; i++)
+		dump->u_debugreg[i] = current->thread.debugreg[i];
+
+	if (dump->start_stack < TASK_SIZE)
+		dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
+
+	dump->regs.ebx = regs->ebx;
+	dump->regs.ecx = regs->ecx;
+	dump->regs.edx = regs->edx;
+	dump->regs.esi = regs->esi;
+	dump->regs.edi = regs->edi;
+	dump->regs.ebp = regs->ebp;
+	dump->regs.eax = regs->eax;
+	dump->regs.ds = regs->xds;
+	dump->regs.es = regs->xes;
+	savesegment(fs,dump->regs.fs);
+	savesegment(gs,dump->regs.gs);
+	dump->regs.orig_eax = regs->orig_eax;
+	dump->regs.eip = regs->eip;
+	dump->regs.cs = regs->xcs;
+	dump->regs.eflags = regs->eflags;
+	dump->regs.esp = regs->esp;
+	dump->regs.ss = regs->xss;
+
+	dump->u_fpvalid = dump_fpu (regs, &dump->i387);
+}
+
+/*
+ * Capture the user space registers if the task is not running (in user space)
+ */
+int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
+{
+	struct pt_regs ptregs;
+
+	ptregs = *(struct pt_regs *)
+		((unsigned long)tsk->thread_info+THREAD_SIZE - sizeof(ptregs));
+	ptregs.xcs &= 0xffff;
+	ptregs.xds &= 0xffff;
+	ptregs.xes &= 0xffff;
+	ptregs.xss &= 0xffff;
+
+	elf_core_copy_regs(regs, &ptregs);
+
+	boot_option_idle_override = 1;
+	return 1;
+}
+
+
+/*
+ * switch_to(x,y) should switch tasks from x to y.
+ *
+ * We fsave/fwait so that an exception goes off at the right time
+ * (as a call from the fsave or fwait in effect) rather than to
+ * the wrong process. Lazy FP saving no longer makes any sense
+ * with modern CPUs, and this simplifies a lot of things (SMP
+ * and UP become the same).
+ *
+ * NOTE! We used to use the x86 hardware context switching. The
+ * reason for not using it any more becomes apparent when you
+ * try to recover gracefully from saved state that is no longer
+ * valid (stale segment register values in particular). With the
+ * hardware task-switch, there is no way to fix up bad state in
+ * a reasonable manner.
+ *
+ * The fact that Intel documents the hardware task-switching to
+ * be slow is a fairly red herring - this code is not noticeably
+ * faster. However, there _is_ some room for improvement here,
+ * so the performance issues may eventually be a valid point.
+ * More important, however, is the fact that this allows us much
+ * more flexibility.
+ *
+ * The return value (in %eax) will be the "prev" task after
+ * the task-switch, and shows up in ret_from_fork in entry.S,
+ * for example.
+ */
+struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+	struct thread_struct *prev = &prev_p->thread,
+				 *next = &next_p->thread;
+	int cpu = smp_processor_id();
+	struct tss_struct *tss = &per_cpu(init_tss, cpu);
+	physdev_op_t iopl_op, iobmp_op;
+	multicall_entry_t _mcl[8], *mcl = _mcl;
+
+	/* XEN NOTE: FS/GS saved in switch_mm(), not here. */
+
+	/*
+	 * This is basically '__unlazy_fpu', except that we queue a
+	 * multicall to indicate FPU task switch, rather than
+	 * synchronously trapping to Xen.
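A toy model of the multicall batching described above, in case the flow is unclear: queue entries into an array, then pay one boundary crossing for the whole batch. The struct and flush() below are illustrative stand-ins, not the real Xen multicall ABI:

#include <stdio.h>

struct mc_entry { int op; unsigned long args[3]; };

/* stands in for HYPERVISOR_multicall(): one boundary crossing
   executes every queued operation */
static void flush(struct mc_entry *mc, int n)
{
	for (int i = 0; i < n; i++)
		printf("op %d(%lu, %lu)\n",
		       mc[i].op, mc[i].args[0], mc[i].args[1]);
}

int main(void)
{
	struct mc_entry _mcl[8], *mcl = _mcl;

	/* queue instead of trapping per call, as __switch_to() does */
	mcl->op = 1; mcl->args[0] = 1; mcl++;                   /* fpu switch   */
	mcl->op = 2; mcl->args[0] = 0x68;
	mcl->args[1] = 0xc0000000UL; mcl++;                     /* stack switch */

	flush(_mcl, (int)(mcl - _mcl));
	return 0;
}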
+ */ + if (prev_p->thread_info->status & TS_USEDFPU) { + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 1; + mcl++; + } + + /* + * Reload esp0, LDT and the page table pointer: + * This is load_esp0(tss, next) with a multicall. + */ + tss->esp0 = next->esp0; + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = tss->ss0; + mcl->args[1] = tss->esp0; + mcl++; + + /* + * Load the per-thread Thread-Local Storage descriptor. + * This is load_TLS(next, cpu) with multicalls. + */ +#define C(i) do { \ + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ + next->tls_array[i].b != prev->tls_array[i].b)) { \ + mcl->op = __HYPERVISOR_update_descriptor; \ + mcl->args[0] = virt_to_machine(&get_cpu_gdt_table(cpu) \ + [GDT_ENTRY_TLS_MIN + i]); \ + mcl->args[1] = ((u32 *)&next->tls_array[i])[0]; \ + mcl->args[2] = ((u32 *)&next->tls_array[i])[1]; \ + mcl++; \ + } \ +} while (0) + C(0); C(1); C(2); +#undef C + + if (unlikely(prev->io_pl != next->io_pl)) { + iopl_op.cmd = PHYSDEVOP_SET_IOPL; + iopl_op.u.set_iopl.iopl = next->io_pl; + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = (unsigned long)&iopl_op; + mcl++; + } + + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { + iobmp_op.cmd = + PHYSDEVOP_SET_IOBITMAP; + iobmp_op.u.set_iobitmap.bitmap = + (unsigned long)next->io_bitmap_ptr; + iobmp_op.u.set_iobitmap.nr_ports = + next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = (unsigned long)&iobmp_op; + mcl++; + } + + (void)HYPERVISOR_multicall(_mcl, mcl - _mcl); + + /* + * Restore %fs and %gs if needed. + */ + if (unlikely(next->fs | next->gs)) { + loadsegment(fs, next->fs); + loadsegment(gs, next->gs); + } + + /* + * Now maybe reload the debug registers + */ + if (unlikely(next->debugreg[7])) { + loaddebug(next, 0); + loaddebug(next, 1); + loaddebug(next, 2); + loaddebug(next, 3); + /* no 4 and 5 */ + loaddebug(next, 6); + loaddebug(next, 7); + } + + return prev_p; +} + +asmlinkage int sys_fork(struct pt_regs regs) +{ + return do_fork(SIGCHLD, regs.esp, ®s, 0, NULL, NULL); +} + +asmlinkage int sys_clone(struct pt_regs regs) +{ + unsigned long clone_flags; + unsigned long newsp; + int __user *parent_tidptr, *child_tidptr; + + clone_flags = regs.ebx; + newsp = regs.ecx; + parent_tidptr = (int __user *)regs.edx; + child_tidptr = (int __user *)regs.edi; + if (!newsp) + newsp = regs.esp; + return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); +} + +/* + * This is trivial, and on the face of it looks like it + * could equally well be done in user mode. + * + * Not so, for quite unobvious reasons - register pressure. + * In user mode vfork() cannot have a stack frame, and if + * done by calling the "clone()" system call directly, you + * do not have enough call-clobbered registers to hold all + * the information you need. + */ +asmlinkage int sys_vfork(struct pt_regs regs) +{ + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, ®s, 0, NULL, NULL); +} + +/* + * sys_execve() executes a new program. + */ +asmlinkage int sys_execve(struct pt_regs regs) +{ + int error; + char * filename; + + filename = getname((char __user *) regs.ebx); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + goto out; + error = do_execve(filename, + (char __user * __user *) regs.ecx, + (char __user * __user *) regs.edx, + ®s); + if (error == 0) { + task_lock(current); + current->ptrace &= ~PT_DTRACE; + task_unlock(current); + /* Make sure we don't return using sysenter.. 
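As an aside on the sys_vfork() entry point above: from user space the CLONE_VFORK | CLONE_VM path looks like the classic vfork-then-exec idiom. A minimal sketch (/bin/echo is just an example target):

#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
	pid_t pid = vfork();    /* the CLONE_VFORK | CLONE_VM | SIGCHLD path */

	if (pid == 0) {
		/* child borrows the parent's VM and stack: only exec
		   or _exit are safe here */
		execl("/bin/echo", "echo", "child alive", (char *)NULL);
		_exit(127);     /* reached only if exec failed */
	}
	if (pid > 0)
		waitpid(pid, NULL, 0);
	return 0;
}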
*/ + set_thread_flag(TIF_IRET); + } + putname(filename); +out: + return error; +} + +#define top_esp (THREAD_SIZE - sizeof(unsigned long)) +#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long ebp, esp, eip; + unsigned long stack_page; + int count = 0; + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + stack_page = (unsigned long)p->thread_info; + esp = p->thread.esp; + if (!stack_page || esp < stack_page || esp > top_esp+stack_page) + return 0; + /* include/asm-i386/system.h:switch_to() pushes ebp last. */ + ebp = *(unsigned long *) esp; + do { + if (ebp < stack_page || ebp > top_ebp+stack_page) + return 0; + eip = *(unsigned long *) (ebp+4); + if (!in_sched_functions(eip)) + return eip; + ebp = *(unsigned long *) ebp; + } while (count++ < 16); + return 0; +} + +/* + * sys_alloc_thread_area: get a yet unused TLS descriptor index. + */ +static int get_free_idx(void) +{ + struct thread_struct *t = ¤t->thread; + int idx; + + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) + if (desc_empty(t->tls_array + idx)) + return idx + GDT_ENTRY_TLS_MIN; + return -ESRCH; +} + +/* + * Set a given TLS descriptor: + */ +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) +{ + struct thread_struct *t = ¤t->thread; + struct user_desc info; + struct desc_struct *desc; + int cpu, idx; + + if (copy_from_user(&info, u_info, sizeof(info))) + return -EFAULT; + idx = info.entry_number; + + /* + * index -1 means the kernel should try to find and + * allocate an empty descriptor: + */ + if (idx == -1) { + idx = get_free_idx(); + if (idx < 0) + return idx; + if (put_user(idx, &u_info->entry_number)) + return -EFAULT; + } + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; + + /* + * We must not get preempted while modifying the TLS. 
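sys_set_thread_area() is reachable directly from user space, and passing entry_number == -1 is what triggers the get_free_idx() search above. A minimal caller, assuming an x86 Linux where <asm/ldt.h> supplies struct user_desc (the base and limit values are arbitrary examples):

#include <asm/ldt.h>            /* struct user_desc */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct user_desc d = {
		.entry_number   = -1,   /* kernel picks a free TLS slot */
		.base_addr      = 0x1000,
		.limit          = 0xfffff,
		.seg_32bit      = 1,
		.limit_in_pages = 1,
		.useable        = 1,
	};

	if (syscall(SYS_set_thread_area, &d) == 0)
		printf("got TLS entry %u\n", d.entry_number);
	else
		perror("set_thread_area");
	return 0;
}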
+ */ + cpu = get_cpu(); + + if (LDT_empty(&info)) { + desc->a = 0; + desc->b = 0; + } else { + desc->a = LDT_entry_a(&info); + desc->b = LDT_entry_b(&info); + } + load_TLS(t, cpu); + + put_cpu(); + + return 0; +} + +/* + * Get the current Thread-Local Storage area: + */ + +#define GET_BASE(desc) ( \ + (((desc)->a >> 16) & 0x0000ffff) | \ + (((desc)->b << 16) & 0x00ff0000) | \ + ( (desc)->b & 0xff000000) ) + +#define GET_LIMIT(desc) ( \ + ((desc)->a & 0x0ffff) | \ + ((desc)->b & 0xf0000) ) + +#define GET_32BIT(desc) (((desc)->b >> 22) & 1) +#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) +#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) +#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) +#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) +#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) + +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) +{ + struct user_desc info; + struct desc_struct *desc; + int idx; + + if (get_user(idx, &u_info->entry_number)) + return -EFAULT; + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + + info.entry_number = idx; + info.base_addr = GET_BASE(desc); + info.limit = GET_LIMIT(desc); + info.seg_32bit = GET_32BIT(desc); + info.contents = GET_CONTENTS(desc); + info.read_exec_only = !GET_WRITABLE(desc); + info.limit_in_pages = GET_LIMIT_PAGES(desc); + info.seg_not_present = !GET_PRESENT(desc); + info.useable = GET_USEABLE(desc); + + if (copy_to_user(u_info, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +unsigned long arch_align_stack(unsigned long sp) +{ + if (randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/quirks.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/quirks.c @@ -0,0 +1,49 @@ +/* + * This file contains work-arounds for x86 and x86_64 platform bugs. + */ +#include <linux/config.h> +#include <linux/pci.h> +#include <linux/irq.h> + +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) + +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) +{ + u8 config, rev; + u32 word; + + /* BIOS may enable hardware IRQ balancing for + * E7520/E7320/E7525(revision ID 0x9 and below) + * based platforms. + * Disable SW irqbalance/affinity on those platforms. 
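The GET_BASE()/GET_LIMIT() macros above just reassemble fields that the descriptor format scatters across the two 32-bit words. A self-contained check against a hypothetical descriptor (flag bits left zero for simplicity):

#include <stdio.h>
#include <stdint.h>

static uint32_t get_base(uint32_t a, uint32_t b)
{
	/* same slicing as GET_BASE() */
	return ((a >> 16) & 0x0000ffff) |
	       ((b << 16) & 0x00ff0000) |
	       ( b        & 0xff000000);
}

static uint32_t get_limit(uint32_t a, uint32_t b)
{
	/* same slicing as GET_LIMIT() */
	return (a & 0x0ffff) | (b & 0xf0000);
}

int main(void)
{
	/* encodes base 0x12345678, limit 0xabcde */
	uint32_t a = 0x5678bcde, b = 0x120a0034;

	printf("base  %#x\n", (unsigned)get_base(a, b));   /* 0x12345678 */
	printf("limit %#x\n", (unsigned)get_limit(a, b));  /* 0xabcde */
	return 0;
}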
+ */ + pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); + if (rev > 0x9) + return; + + printk(KERN_INFO "Intel E7520/7320/7525 detected."); + + /* enable access to config space*/ + pci_read_config_byte(dev, 0xf4, &config); + config |= 0x2; + pci_write_config_byte(dev, 0xf4, config); + + /* read xTPR register */ + raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); + + if (!(word & (1 << 13))) { + dom0_op_t op; + printk(KERN_INFO "Disabling irq balancing and affinity\n"); + op.cmd = DOM0_PLATFORM_QUIRK; + op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING; + (void)HYPERVISOR_dom0_op(&op); + } + + config &= ~0x2; + /* disable access to config space*/ + pci_write_config_byte(dev, 0xf4, config); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); +#endif Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/reboot.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/reboot.c @@ -0,0 +1,262 @@ + +#define __KERNEL_SYSCALLS__ +static int errno; +#include <linux/errno.h> +#include <linux/version.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/unistd.h> +#include <linux/module.h> +#include <linux/reboot.h> +#include <linux/sysrq.h> +#include <asm/irq.h> +#include <asm/mmu_context.h> + +#include <ctrl_if.h> +#include <hypervisor.h> +#include <xen-public/dom0_ops.h> +#include <mach_suspend.h> +#include <queues.h> + +void machine_restart(char * __unused) +{ + /* We really want to get pending console data out before we die. */ + extern void xencons_force_flush(void); + xencons_force_flush(); + HYPERVISOR_reboot(); +} + +void machine_halt(void) +{ + machine_power_off(); +} + +void machine_power_off(void) +{ + /* We really want to get pending console data out before we die. */ + extern void xencons_force_flush(void); + xencons_force_flush(); + HYPERVISOR_shutdown(); +} + +int reboot_thru_bios = 0; /* for dmi_scan.c */ +EXPORT_SYMBOL(machine_restart); +EXPORT_SYMBOL(machine_halt); +EXPORT_SYMBOL(machine_power_off); + + +/* FIXME move all the rest, doesn't belong here */ + +/****************************************************************************** + * Stop/pickle callback handling. + */ + +/* Ignore multiple shutdown requests. */ +static int shutting_down = -1; + +static void __do_suspend(void) +{ + int i, j; + suspend_record_t *suspend_record; + + /* Hmmm... a cleaner interface to suspend/resume blkdevs would be nice. 
*/ + /* XXX SMH: yes it would :-( */ +#ifdef CONFIG_XEN_BLKDEV_FRONTEND + extern void blkdev_suspend(void); + extern void blkdev_resume(void); +#else +#define blkdev_suspend() do{}while(0) +#define blkdev_resume() do{}while(0) +#endif + +#ifdef CONFIG_XEN_NETDEV_FRONTEND + extern void netif_suspend(void); + extern void netif_resume(void); +#else +#define netif_suspend() do{}while(0) +#define netif_resume() do{}while(0) +#endif + +#ifdef CONFIG_XEN_USB_FRONTEND + extern void usbif_resume(); +#else +#define usbif_resume() do{}while(0) +#endif + +#ifdef CONFIG_XEN_BLKDEV_GRANT + extern int gnttab_suspend(void); + extern int gnttab_resume(void); +#else +#define gnttab_suspend() do{}while(0) +#define gnttab_resume() do{}while(0) +#endif + + extern void time_suspend(void); + extern void time_resume(void); + extern unsigned long max_pfn; + extern unsigned int *pfn_to_mfn_frame_list; + + suspend_record = (suspend_record_t *)__get_free_page(GFP_KERNEL); + if ( suspend_record == NULL ) + goto out; + + suspend_record->nr_pfns = max_pfn; /* final number of pfns */ + + __cli(); + +#ifdef __i386__ + mm_pin_all(); + kmem_cache_shrink(pgd_cache); +#endif + + netif_suspend(); + + blkdev_suspend(); + + time_suspend(); + + ctrl_if_suspend(); + + irq_suspend(); + + gnttab_suspend(); + + HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; + clear_fixmap(FIX_SHARED_INFO); + + memcpy(&suspend_record->resume_info, &xen_start_info, + sizeof(xen_start_info)); + + HYPERVISOR_suspend(virt_to_machine(suspend_record) >> PAGE_SHIFT); + + shutting_down = -1; + + memcpy(&xen_start_info, &suspend_record->resume_info, + sizeof(xen_start_info)); + + set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info); + + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); + + memset(empty_zero_page, 0, PAGE_SIZE); + + for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ ) + { + pfn_to_mfn_frame_list[j] = + virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT; + } + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list = + virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT; + + gnttab_resume(); + + irq_resume(); + + ctrl_if_resume(); + + time_resume(); + + blkdev_resume(); + + netif_resume(); + + usbif_resume(); + + __sti(); + + out: + if ( suspend_record != NULL ) + free_page((unsigned long)suspend_record); +} + +static int shutdown_process(void *__unused) +{ + static char *envp[] = { "HOME=/", "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; + static char *restart_argv[] = { "/sbin/reboot", NULL }; + static char *poweroff_argv[] = { "/sbin/poweroff", NULL }; + + extern asmlinkage long sys_reboot(int magic1, int magic2, + unsigned int cmd, void *arg); + + daemonize("shutdown"); + + switch ( shutting_down ) + { + case CMSG_SHUTDOWN_POWEROFF: + if ( execve("/sbin/poweroff", poweroff_argv, envp) < 0 ) + { + sys_reboot(LINUX_REBOOT_MAGIC1, + LINUX_REBOOT_MAGIC2, + LINUX_REBOOT_CMD_POWER_OFF, + NULL); + } + break; + + case CMSG_SHUTDOWN_REBOOT: + if ( execve("/sbin/reboot", restart_argv, envp) < 0 ) + { + sys_reboot(LINUX_REBOOT_MAGIC1, + LINUX_REBOOT_MAGIC2, + LINUX_REBOOT_CMD_RESTART, + NULL); + } + break; + } + + shutting_down = -1; /* could try again */ + + return 0; +} + +static void __shutdown_handler(void *unused) +{ + int err; + + if ( shutting_down != CMSG_SHUTDOWN_SUSPEND ) + { + err = kernel_thread(shutdown_process, NULL, CLONE_FS | CLONE_FILES); + if ( err < 0 ) + printk(KERN_ALERT "Error creating shutdown process!\n"); + } + else + { + __do_suspend(); + } +} + 
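One detail of the resume path above that is easy to miss: pfn_to_mfn_frame_list gets one entry per page of the p2m table, so the loop strides by PAGE_SIZE/sizeof(unsigned long) pfns. The arithmetic, with a made-up guest size:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PER_FRAME (PAGE_SIZE / sizeof(unsigned long))  /* 1024 with i386's 4-byte longs */

int main(void)
{
	unsigned long max_pfn = 262144;         /* hypothetical 1GB guest */
	unsigned long i, frames = 0;

	/* same stride as the rebuild loop in __do_suspend() */
	for (i = 0; i < max_pfn; i += PER_FRAME)
		frames++;

	printf("%lu frame-list entries for %lu pfns\n", frames, max_pfn);
	return 0;
}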
+static void shutdown_handler(ctrl_msg_t *msg, unsigned long id) +{ + static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL); + + if ( msg->subtype == CMSG_SHUTDOWN_SYSRQ ) + { + int sysrq = ((shutdown_sysrq_t *)&msg->msg[0])->key; + +#ifdef CONFIG_MAGIC_SYSRQ + handle_sysrq(sysrq, NULL, NULL); +#endif + } + else if ( (shutting_down == -1) && + ((msg->subtype == CMSG_SHUTDOWN_POWEROFF) || + (msg->subtype == CMSG_SHUTDOWN_REBOOT) || + (msg->subtype == CMSG_SHUTDOWN_SUSPEND)) ) + { + shutting_down = msg->subtype; + schedule_work(&shutdown_work); + } + else + { + printk("Ignore spurious shutdown request\n"); + } + + ctrl_if_send_response(msg); +} + +static int __init setup_shutdown_event(void) +{ + ctrl_if_register_receiver(CMSG_SHUTDOWN, shutdown_handler, 0); + return 0; +} + +__initcall(setup_shutdown_event); Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/setup.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/setup.c @@ -0,0 +1,1712 @@ +/* + * linux/arch/i386/kernel/setup.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * + * Memory region support + * David Parsons <orc@xxxxxxxxxxxxxx>, July-August 1999 + * + * Added E820 sanitization routine (removes overlapping memory regions); + * Brian Moyle <bmoyle@xxxxxxxxxx>, February 2001 + * + * Moved CPU detection code to cpu/${cpu}.c + * Patrick Mochel <mochel@xxxxxxxx>, March 2002 + * + * Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach <xela@xxxxxxx>, December 2002. + * + */ + +/* + * This file handles the architecture-dependent parts of initialization + */ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/tty.h> +#include <linux/ioport.h> +#include <linux/acpi.h> +#include <linux/apm_bios.h> +#include <linux/initrd.h> +#include <linux/bootmem.h> +#include <linux/seq_file.h> +#include <linux/console.h> +#include <linux/mca.h> +#include <linux/root_dev.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/efi.h> +#include <linux/init.h> +#include <linux/edd.h> +#include <linux/nodemask.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <video/edid.h> +#include <asm/e820.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/arch_hooks.h> +#include <asm/sections.h> +#include <asm/io_apic.h> +#include <asm/ist.h> +#include <asm/io.h> +#include <xen_hypervisor.h> +#include <xen-public/physdev.h> +#include "setup_arch_pre.h" +#include <bios_ebda.h> + +/* Allows setting of maximum possible memory size */ +static unsigned long xen_override_max_pfn; + +static int xen_panic_event(struct notifier_block *, unsigned long, void *); +static struct notifier_block xen_panic_block = { + xen_panic_event, NULL, 0 /* try to go last */ +}; + +int disable_pse __initdata = 0; + +/* + * Machine setup.. 
+ */ + +#ifdef CONFIG_EFI +int efi_enabled = 0; +EXPORT_SYMBOL(efi_enabled); +#endif + +/* cpu data as detected by the assembly code in head.S */ +struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 0, 1, 0, -1 }; +/* common cpu data for all cpus */ +struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 0, 1, 0, -1 }; + +unsigned long mmu_cr4_features; + +#ifdef CONFIG_ACPI_INTERPRETER + int acpi_disabled = 0; +#else + int acpi_disabled = 1; +#endif +EXPORT_SYMBOL(acpi_disabled); + +#ifdef CONFIG_ACPI_BOOT +int __initdata acpi_force = 0; +extern acpi_interrupt_flags acpi_sci_flags; +#endif + +/* for MCA, but anyone else can use it if they want */ +unsigned int machine_id; +unsigned int machine_submodel_id; +unsigned int BIOS_revision; +unsigned int mca_pentium_flag; + +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0x10000000; + +/* Boot loader ID as an integer, for the benefit of proc_dointvec */ +int bootloader_type; + +/* user-defined highmem size */ +static unsigned int highmem_pages = -1; + +/* + * Setup options + */ +struct drive_info_struct { char dummy[32]; } drive_info; +struct screen_info screen_info; +struct apm_info apm_info; +struct sys_desc_table_struct { + unsigned short length; + unsigned char table[0]; +}; +struct edid_info edid_info; +struct ist_info ist_info; +struct e820map e820; + +extern void early_cpu_init(void); +extern void dmi_scan_machine(void); +extern void generic_apic_probe(char *); +extern int root_mountflags; + +unsigned long saved_videomode; + +#define RAMDISK_IMAGE_START_MASK 0x07FF +#define RAMDISK_PROMPT_FLAG 0x8000 +#define RAMDISK_LOAD_FLAG 0x4000 + +static char command_line[COMMAND_LINE_SIZE]; + +unsigned char __initdata boot_params[PARAM_SIZE]; + +static struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource code_resource = { + .name = "Kernel code", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +#define ADAPTER_ROM_RESOURCES \ + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; +#endif + +static struct 
resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource standard_io_resources[] = { { + .name = "dma1", + .start = 0x0000, + .end = 0x001f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic1", + .start = 0x0020, + .end = 0x0021, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer0", + .start = 0x0040, + .end = 0x0043, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer1", + .start = 0x0050, + .end = 0x0053, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "keyboard", + .start = 0x0060, + .end = 0x006f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma page reg", + .start = 0x0080, + .end = 0x008f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic2", + .start = 0x00a0, + .end = 0x00a1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma2", + .start = 0x00c0, + .end = 0x00df, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "fpu", + .start = 0x00f0, + .end = 0x00ff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +} }; + +#define STANDARD_IO_RESOURCES \ + (sizeof standard_io_resources / sizeof standard_io_resources[0]) + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) + +static int __init romchecksum(unsigned char *rom, unsigned long length) +{ + unsigned char *p, sum = 0; + + for (p = rom; p < rom + length; p++) + sum += *p; + return sum == 0; +} + +static void __init probe_roms(void) +{ + unsigned long start, length, upper; + unsigned char *rom; + int i; + + /* Nothing to do if not running in dom0. */ + if (!(xen_start_info.flags & SIF_INITDOMAIN)) + return; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) */ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} +#endif + +/* + * Point at the empty zero page to start with. 
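For reference, the ROM probing further up accepts an image only if it starts with the 0xaa55 signature, its length byte at offset 2 is counted in 512-byte units, and all bytes sum to zero modulo 256. The same checks applied to a synthetic image:

#include <stdio.h>

static int romsignature(const unsigned char *rom)
{
	return rom[0] == 0x55 && rom[1] == 0xaa;   /* 0xaa55, little-endian */
}

static int romchecksum(const unsigned char *rom, unsigned long length)
{
	unsigned char sum = 0;

	while (length--)
		sum += *rom++;
	return sum == 0;
}

int main(void)
{
	static unsigned char rom[1024] = { 0x55, 0xaa, 0x02 };  /* 2 * 512 bytes */

	rom[3] = (unsigned char)-(0x55 + 0xaa + 0x02);  /* make the sum zero */

	if (romsignature(rom) && romchecksum(rom, rom[2] * 512))
		printf("valid ROM, %d bytes\n", rom[2] * 512);
	return 0;
}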
We map the real shared_info + * page as soon as fixmap is up and running. + */ +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; +EXPORT_SYMBOL(HYPERVISOR_shared_info); + +unsigned int *phys_to_machine_mapping, *pfn_to_mfn_frame_list; +EXPORT_SYMBOL(phys_to_machine_mapping); + +/* Raw start-of-day parameters from the hypervisor. */ +union xen_start_info_union xen_start_info_union; + +static void __init limit_regions(unsigned long long size) +{ + unsigned long long current_addr = 0; + int i; + + if (efi_enabled) { + for (i = 0; i < memmap.nr_map; i++) { + current_addr = memmap.map[i].phys_addr + + (memmap.map[i].num_pages << 12); + if (memmap.map[i].type == EFI_CONVENTIONAL_MEMORY) { + if (current_addr >= size) { + memmap.map[i].num_pages -= + (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); + memmap.nr_map = i + 1; + return; + } + } + } + } + for (i = 0; i < e820.nr_map; i++) { + if (e820.map[i].type == E820_RAM) { + current_addr = e820.map[i].addr + e820.map[i].size; + if (current_addr >= size) { + e820.map[i].size -= current_addr-size; + e820.nr_map = i + 1; + return; + } + } + } +} + +static void __init add_memory_region(unsigned long long start, + unsigned long long size, int type) +{ + int x; + + if (!efi_enabled) { + x = e820.nr_map; + + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; + } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; + } +} /* add_memory_region */ + +#define E820_DEBUG 1 + +static void __init print_memory_map(char *who) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + printk(" %s: %016Lx - %016Lx ", who, + e820.map[i].addr, + e820.map[i].addr + e820.map[i].size); + switch (e820.map[i].type) { + case E820_RAM: printk("(usable)\n"); + break; + case E820_RESERVED: + printk("(reserved)\n"); + break; + case E820_ACPI: + printk("(ACPI data)\n"); + break; + case E820_NVS: + printk("(ACPI NVS)\n"); + break; + default: printk("type %lu\n", e820.map[i].type); + break; + } + } +} + +#if 0 +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps. + * + */ +struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ +}; +static struct change_member change_point_list[2*E820MAX] __initdata; +static struct change_member *change_point[2*E820MAX] __initdata; +static struct e820entry *overlap_list[E820MAX] __initdata; +static struct e820entry new_bios[E820MAX] __initdata; + +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +{ + struct change_member *change_tmp; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx, still_changing; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* + Visually we're performing the following (1,2,3,4 = memory types)... 
+ + Sample memory map (w/overlaps): + ____22__________________ + ______________________4_ + ____1111________________ + _44_____________________ + 11111111________________ + ____________________33__ + ___________44___________ + __________33333_________ + ______________22________ + ___________________2222_ + _________111111111______ + _____________________11_ + _________________4______ + + Sanitized equivalent (no overlap): + 1_______________________ + _44_____________________ + ___1____________________ + ____22__________________ + ______11________________ + _________1______________ + __________3_____________ + ___________44___________ + _____________33_________ + _______________2________ + ________________1_______ + _________________4______ + ___________________2____ + ____________________33__ + ______________________4_ + */ + + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) + return -1; + + old_nr = *pnr_map; + + /* bail out if we find any unreasonable addresses in bios map */ + for (i=0; i<old_nr; i++) + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + return -1; + + /* create pointers for initial change-point information (for sorting) */ + for (i=0; i < 2*old_nr; i++) + change_point[i] = &change_point_list[i]; + + /* record all known change-points (starting and ending addresses), + omitting those that are for empty memory regions */ + chgidx = 0; + for (i=0; i < old_nr; i++) { + if (biosmap[i].size != 0) { + change_point[chgidx]->addr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; /* true number of change-points */ + + /* sort change-point list by memory addresses (low -> high) */ + still_changing = 1; + while (still_changing) { + still_changing = 0; + for (i=1; i < chg_nr; i++) { + /* if <current_addr> > <last_addr>, swap */ + /* or, if current=<start_addr> & last=<end_addr>, swap */ + if ((change_point[i]->addr < change_point[i-1]->addr) || + ((change_point[i]->addr == change_point[i-1]->addr) && + (change_point[i]->addr == change_point[i]->pbios->addr) && + (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) + ) + { + change_tmp = change_point[i]; + change_point[i] = change_point[i-1]; + change_point[i-1] = change_tmp; + still_changing=1; + } + } + } + + /* create a new bios memory map, removing overlaps */ + overlap_entries=0; /* number of entries in the overlap table */ + new_bios_entry=0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ + for (chgidx=0; chgidx < chg_nr; chgidx++) + { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) + { + /* add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++]=change_point[chgidx]->pbios; + } + else + { + /* remove entry from list (order independent, so swap with last) */ + for (i=0; i<overlap_entries; i++) + { + if (overlap_list[i] == change_point[chgidx]->pbios) + overlap_list[i] = overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* if there are overlapping entries, decide which "type" to use */ + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + current_type = 0; + for (i=0; i<overlap_entries; i++) + if 
(overlap_list[i]->type > current_type) + current_type = overlap_list[i]->type; + /* continue building up new bios map based on this information */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* move forward only if the new size was non-zero */ + if (new_bios[new_bios_entry].size != 0) + if (++new_bios_entry >= E820MAX) + break; /* no more space left for new bios entries */ + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr=change_point[chgidx]->addr; + } + last_type = current_type; + } + } + new_nr = new_bios_entry; /* retain count for new bios entries */ + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + *pnr_map = new_nr; + + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + * + * We check to see that the memory map contains at least 2 elements + * before we'll use it, because the detection code in setup.S may + * not be perfect and most every PC known to man has two memory + * regions: one from 0 to 640k, and one from 1mb up. (The IBM + * thinkpad 560x, for example, does not cooperate with the memory + * detection code.) + */ +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +{ + /* Only one memory region (or negative)? Ignore it */ + if (nr_map < 2) + return -1; + + do { + unsigned long long start = biosmap->addr; + unsigned long long size = biosmap->size; + unsigned long long end = start + size; + unsigned long type = biosmap->type; + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + + /* + * Some BIOSes claim RAM in the 640k - 1M region. + * Not right. Fix it up. + */ + if (type == E820_RAM) { + if (start < 0x100000ULL && end > 0xA0000ULL) { + if (start < 0xA0000ULL) + add_memory_region(start, 0xA0000ULL-start, type); + if (end <= 0x100000ULL) + continue; + start = 0x100000ULL; + size = end - start; + } + } + add_memory_region(start, size, type); + } while (biosmap++,--nr_map); + return 0; +} +#endif + +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) +struct edd edd; +#ifdef CONFIG_EDD_MODULE +EXPORT_SYMBOL(edd); +#endif +/** + * copy_edd() - Copy the BIOS EDD information + * from boot_params into a safe place. + * + */ +static inline void copy_edd(void) +{ + memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature)); + memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info)); + edd.mbr_signature_nr = EDD_MBR_SIG_NR; + edd.edd_info_nr = EDD_NR; +} +#else +static inline void copy_edd(void) +{ +} +#endif + +/* + * Do NOT EVER look at the BIOS memory size location. + * It does not work on many machines. 
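The (currently #if 0'd) copy_e820_map() above clips any RAM claim that straddles the 640K-1M hole. With a hypothetical BIOS entry covering 0-2MB, the fixup yields two regions:

#include <stdio.h>

static void add_region(unsigned long long s, unsigned long long n)
{
	printf("RAM %#llx - %#llx\n", s, s + n);
}

int main(void)
{
	/* hypothetical BIOS entry claiming RAM across the 640K-1M hole */
	unsigned long long start = 0, size = 0x200000, end = start + size;

	if (start < 0x100000ULL && end > 0xA0000ULL) {
		if (start < 0xA0000ULL)
			add_region(start, 0xA0000ULL - start);      /* low 640K */
		if (end > 0x100000ULL)
			add_region(0x100000ULL, end - 0x100000ULL); /* above 1M */
	} else {
		add_region(start, size);
	}
	return 0;
}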
+ */ +#define LOWMEMSIZE() (0x9f000) + +static void __init parse_cmdline_early (char ** cmdline_p) +{ + char c = ' ', *to = command_line, *from = saved_command_line; + int len = 0, max_cmdline; + int userdef = 0; + + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) + max_cmdline = COMMAND_LINE_SIZE; + memcpy(saved_command_line, xen_start_info.cmd_line, max_cmdline); + /* Save unparsed command line copy for /proc/cmdline */ + saved_command_line[max_cmdline-1] = '\0'; + + for (;;) { + if (c != ' ') + goto next_char; + /* + * "mem=nopentium" disables the 4MB page tables. + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM + * to <mem>, overriding the bios size. + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from + * <start> to <start>+<mem>, overriding the bios size. + * + * HPA tells me bootloaders need to parse mem=, so no new + * option should be mem= [also see Documentation/i386/boot.txt] + */ + if (!memcmp(from, "mem=", 4)) { + if (to != command_line) + to--; + if (!memcmp(from+4, "nopentium", 9)) { + from += 9+4; + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long mem_size; + + mem_size = memparse(from+4, &from); +#if 0 + limit_regions(mem_size); + userdef=1; +#else + xen_override_max_pfn = + (unsigned long)(mem_size>>PAGE_SHIFT); +#endif + } + } + + else if (!memcmp(from, "memmap=", 7)) { + if (to != command_line) + to--; + if (!memcmp(from+7, "exactmap", 8)) { + from += 8+7; + e820.nr_map = 0; + userdef = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long start_at, mem_size; + + mem_size = memparse(from+7, &from); + if (*from == '@') { + start_at = memparse(from+1, &from); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*from == '#') { + start_at = memparse(from+1, &from); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*from == '$') { + start_at = memparse(from+1, &from); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + limit_regions(mem_size); + userdef=1; + } + } + } + + else if (!memcmp(from, "noexec=", 7)) + noexec_setup(from + 7); + + +#ifdef CONFIG_X86_MPPARSE + /* + * If the BIOS enumerates physical processors before logical, + * maxcpus=N at enumeration-time can be used to disable HT. 
+ */ + else if (!memcmp(from, "maxcpus=", 8)) { + extern unsigned int maxcpus; + + maxcpus = simple_strtoul(from + 8, NULL, 0); + } +#endif + +#ifdef CONFIG_ACPI_BOOT + /* "acpi=off" disables both ACPI table parsing and interpreter */ + else if (!memcmp(from, "acpi=off", 8)) { + disable_acpi(); + } + + /* acpi=force to over-ride black-list */ + else if (!memcmp(from, "acpi=force", 10)) { + acpi_force = 1; + acpi_ht = 1; + acpi_disabled = 0; + } + + /* acpi=strict disables out-of-spec workarounds */ + else if (!memcmp(from, "acpi=strict", 11)) { + acpi_strict = 1; + } + + /* Limit ACPI just to boot-time to enable HT */ + else if (!memcmp(from, "acpi=ht", 7)) { + if (!acpi_force) + disable_acpi(); + acpi_ht = 1; + } + + /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */ + else if (!memcmp(from, "pci=noacpi", 10)) { + acpi_disable_pci(); + } + /* "acpi=noirq" disables ACPI interrupt routing */ + else if (!memcmp(from, "acpi=noirq", 10)) { + acpi_noirq_set(); + } + + else if (!memcmp(from, "acpi_sci=edge", 13)) + acpi_sci_flags.trigger = 1; + + else if (!memcmp(from, "acpi_sci=level", 14)) + acpi_sci_flags.trigger = 3; + + else if (!memcmp(from, "acpi_sci=high", 13)) + acpi_sci_flags.polarity = 1; + + else if (!memcmp(from, "acpi_sci=low", 12)) + acpi_sci_flags.polarity = 3; + +#ifdef CONFIG_X86_IO_APIC + else if (!memcmp(from, "acpi_skip_timer_override", 24)) + acpi_skip_timer_override = 1; +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + /* disable IO-APIC */ + else if (!memcmp(from, "noapic", 6)) + disable_ioapic_setup(); +#endif /* CONFIG_X86_LOCAL_APIC */ +#endif /* CONFIG_ACPI_BOOT */ + + /* + * highmem=size forces highmem to be exactly 'size' bytes. + * This works even on boxes that have no highmem otherwise. + * This also works to reduce highmem size on bigger boxes. + */ + else if (!memcmp(from, "highmem=", 8)) + highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT; + + /* + * vmalloc=size forces the vmalloc area to be exactly 'size' + * bytes. This can be used to increase (or decrease) the + * vmalloc area - the default is 128m. + */ + else if (!memcmp(from, "vmalloc=", 8)) + __VMALLOC_RESERVE = memparse(from+8, &from); + + next_char: + c = *(from++); + if (!c) + break; + if (COMMAND_LINE_SIZE <= ++len) + break; + *(to++) = c; + } + *to = '\0'; + *cmdline_p = command_line; + if (userdef) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + print_memory_map("user"); + } +} + +#if 0 /* !XEN */ +/* + * Callback for efi_memory_walk. + */ +static int __init +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) +{ + unsigned long *max_pfn = arg, pfn; + + if (start < end) { + pfn = PFN_UP(end -1); + if (pfn > *max_pfn) + *max_pfn = pfn; + } + return 0; +} + + +/* + * Find the highest page frame number we have available + */ +void __init find_max_pfn(void) +{ + int i; + + max_pfn = 0; + if (efi_enabled) { + efi_memmap_walk(efi_find_max_pfn, &max_pfn); + return; + } + + for (i = 0; i < e820.nr_map; i++) { + unsigned long start, end; + /* RAM? */ + if (e820.map[i].type != E820_RAM) + continue; + start = PFN_UP(e820.map[i].addr); + end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + if (start >= end) + continue; + if (end > max_pfn) + max_pfn = end; + } +} +#else +/* We don't use the fake e820 because we need to respond to user override. 
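So on Xen the mem= handling above never trims e820 entries; it only records a pfn limit that find_max_pfn() below weighs against xen_start_info.nr_pages. A sketch of the suffix parsing and the resulting pfn count (memparse_sketch is a stand-in for the kernel's memparse()):

#include <stdio.h>
#include <stdlib.h>

static unsigned long long memparse_sketch(const char *s, char **ret)
{
	unsigned long long v = strtoull(s, ret, 0);

	switch (**ret) {
	case 'G': case 'g': v <<= 10;   /* fall through */
	case 'M': case 'm': v <<= 10;   /* fall through */
	case 'K': case 'k': v <<= 10; (*ret)++;
	}
	return v;
}

int main(void)
{
	char *rest;
	unsigned long long bytes = memparse_sketch("512M", &rest);

	printf("mem=512M -> %llu bytes, override_max_pfn = %llu\n",
	       bytes, bytes >> 12);     /* PAGE_SHIFT == 12 */
	return 0;
}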
*/ +void __init find_max_pfn(void) +{ + if ( xen_override_max_pfn < xen_start_info.nr_pages ) + xen_override_max_pfn = xen_start_info.nr_pages; + max_pfn = xen_override_max_pfn; +} +#endif /* XEN */ + +/* + * Determine low and high memory ranges: + */ +unsigned long __init find_max_low_pfn(void) +{ + unsigned long max_low_pfn; + + max_low_pfn = max_pfn; + if (max_low_pfn > MAXMEM_PFN) { + if (highmem_pages == -1) + highmem_pages = max_pfn - MAXMEM_PFN; + if (highmem_pages + MAXMEM_PFN < max_pfn) + max_pfn = MAXMEM_PFN + highmem_pages; + if (highmem_pages + MAXMEM_PFN > max_pfn) { + printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn = MAXMEM_PFN; +#ifndef CONFIG_HIGHMEM + /* Maximum memory usable is what is directly addressable */ + printk(KERN_WARNING "Warning only %ldMB will be used.\n", + MAXMEM>>20); + if (max_pfn > MAX_NONPAE_PFN) + printk(KERN_WARNING "Use a PAE enabled kernel.\n"); + else + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + max_pfn = MAXMEM_PFN; +#else /* !CONFIG_HIGHMEM */ +#ifndef CONFIG_X86_PAE + if (max_pfn > MAX_NONPAE_PFN) { + max_pfn = MAX_NONPAE_PFN; + printk(KERN_WARNING "Warning only 4GB will be used.\n"); + printk(KERN_WARNING "Use a PAE enabled kernel.\n"); + } +#endif /* !CONFIG_X86_PAE */ +#endif /* !CONFIG_HIGHMEM */ + } else { + if (highmem_pages == -1) + highmem_pages = 0; +#ifdef CONFIG_HIGHMEM + if (highmem_pages >= max_pfn) { + printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); + highmem_pages = 0; + } + if (highmem_pages) { + if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){ + printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn -= highmem_pages; + } +#else + if (highmem_pages) + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); +#endif + } + return max_low_pfn; +} + +/* + * Free all available memory for boot time allocation. Used + * as a callback function by efi_memory_walk() + */ + +static int __init +free_available_memory(unsigned long start, unsigned long end, void *arg) +{ + /* check max_low_pfn */ + if (start >= ((max_low_pfn + 1) << PAGE_SHIFT)) + return 0; + if (end >= ((max_low_pfn + 1) << PAGE_SHIFT)) + end = (max_low_pfn + 1) << PAGE_SHIFT; + if (start < end) + free_bootmem(start, end - start); + + return 0; +} +/* + * Register fully available low RAM pages with the bootmem allocator. + */ +static void __init register_bootmem_low_pages(unsigned long max_low_pfn) +{ + int i; + + if (efi_enabled) { + efi_memmap_walk(free_available_memory, NULL); + return; + } + for (i = 0; i < e820.nr_map; i++) { + unsigned long curr_pfn, last_pfn, size; + /* + * Reserve usable low memory + */ + if (e820.map[i].type != E820_RAM) + continue; + /* + * We are rounding up the start address of usable memory: + */ + curr_pfn = PFN_UP(e820.map[i].addr); + if (curr_pfn >= max_low_pfn) + continue; + /* + * ... and at the end of the usable range downwards: + */ + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + + if (last_pfn > max_low_pfn) + last_pfn = max_low_pfn; + + /* + * .. finally, did all the rounding and playing + * around just make the area go away? 
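The rounding being checked here is deliberately conservative: the start of a usable range rounds up to a page boundary and the end rounds down, so a region smaller than a page vanishes. A standalone illustration (addresses invented):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

int main(void)
{
	unsigned long addr = 0x9f800, size = 0x60800;  /* not page-aligned */
	unsigned long curr = PFN_UP(addr);             /* start rounds up  */
	unsigned long last = PFN_DOWN(addr + size);    /* end rounds down  */

	if (last <= curr)
		printf("region vanished after rounding\n");
	else
		printf("pfns %lu..%lu usable\n", curr, last - 1);
	return 0;
}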
+ */ + if (last_pfn <= curr_pfn) + continue; + + size = last_pfn - curr_pfn; + free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); + } +} + +#ifndef CONFIG_XEN +/* + * workaround for Dell systems that neglect to reserve EBDA + */ +static void __init reserve_ebda_region(void) +{ + unsigned int addr; + addr = get_bios_ebda(); + if (addr) + reserve_bootmem(addr, PAGE_SIZE); +} +#endif + +#ifndef CONFIG_DISCONTIGMEM +void __init setup_bootmem_allocator(void); +static unsigned long __init setup_memory(void) +{ + + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + min_low_pfn = PFN_UP(__pa(xen_start_info.pt_base)) + xen_start_info.nr_pt_frames; + + find_max_pfn(); + + max_low_pfn = find_max_low_pfn(); + +#ifdef CONFIG_HIGHMEM + highstart_pfn = highend_pfn = max_pfn; + if (max_pfn > max_low_pfn) { + highstart_pfn = max_low_pfn; + } + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); +#endif + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + + setup_bootmem_allocator(); + + return max_low_pfn; +} + +void __init zone_sizes_init(void) +{ + unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; + unsigned int max_dma, low; + + /* + * XEN: Our notion of "DMA memory" is fake when running over Xen. + * We simply put all RAM in the DMA zone so that those drivers which + * needlessly specify GFP_DMA do not get starved of RAM unnecessarily. + * Those drivers that *do* require lowmem are screwed anyway when + * running over Xen! + */ + max_dma = max_low_pfn; + low = max_low_pfn; + + if (low < max_dma) + zones_size[ZONE_DMA] = low; + else { + zones_size[ZONE_DMA] = max_dma; + zones_size[ZONE_NORMAL] = low - max_dma; +#ifdef CONFIG_HIGHMEM + zones_size[ZONE_HIGHMEM] = highend_pfn - low; +#endif + } + free_area_init(zones_size); +} +#else +extern unsigned long setup_memory(void); +extern void zone_sizes_init(void); +#endif /* !CONFIG_DISCONTIGMEM */ + +void __init setup_bootmem_allocator(void) +{ + unsigned long bootmap_size; + /* + * Initialize the boot-time allocator (with low memory only): + */ + bootmap_size = init_bootmem(min_low_pfn, max_low_pfn); + + register_bootmem_low_pages(max_low_pfn); + + /* + * Reserve the bootmem bitmap itself as well. We do this in two + * steps (first step was init_bootmem()) because this catches + * the (very unlikely) case of us accidentally initializing the + * bootmem allocator with an invalid RAM area. + */ + reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(min_low_pfn) + + bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); + +#ifndef CONFIG_XEN + /* + * reserve physical page 0 - it's a special BIOS page on many boxes, + * enabling clean reboots, SMP operation, laptop functions. + */ + reserve_bootmem(0, PAGE_SIZE); + + /* reserve EBDA region, it's a 4K region */ + reserve_ebda_region(); + + /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent + PCI prefetch into it (errata #56). Usually the page is reserved anyways, + unless you have no PS/2 mouse plugged in. */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 == 6) + reserve_bootmem(0xa0000 - 4096, 4096); + +#ifdef CONFIG_SMP + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + reserve_bootmem(PAGE_SIZE, PAGE_SIZE); +#endif +#ifdef CONFIG_ACPI_SLEEP + /* + * Reserve low memory region for sleep support. 
+ */
+	acpi_reserve_bootmem();
+#endif
+#endif /* !CONFIG_XEN */
+
+#ifdef CONFIG_BLK_DEV_INITRD
+	if (xen_start_info.mod_start) {
+		if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
+			/*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
+			initrd_start = INITRD_START + PAGE_OFFSET;
+			initrd_end = initrd_start+INITRD_SIZE;
+			initrd_below_start_ok = 1;
+		}
+		else {
+			printk(KERN_ERR "initrd extends beyond end of memory "
+				"(0x%08lx > 0x%08lx)\ndisabling initrd\n",
+				INITRD_START + INITRD_SIZE,
+				max_low_pfn << PAGE_SHIFT);
+			initrd_start = 0;
+		}
+	}
+#endif
+
+	phys_to_machine_mapping = (unsigned int *)xen_start_info.mfn_list;
+}
+
+/*
+ * The node 0 pgdat is initialized before all of these because
+ * it's needed for bootmem. node>0 pgdats have their virtual
+ * space allocated before the pagetables are in place to access
+ * them, so they can't be cleared then.
+ *
+ * This should all compile down to nothing when NUMA is off.
+ */
+void __init remapped_pgdat_init(void)
+{
+	int nid;
+
+	for_each_online_node(nid) {
+		if (nid != 0)
+			memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+	}
+}
+
+/*
+ * Request address space for all standard RAM and ROM resources
+ * and also for regions reported as reserved by the e820.
+ */
+static void __init
+legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
+{
+	int i;
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+	probe_roms();
+#endif
+	for (i = 0; i < e820.nr_map; i++) {
+		struct resource *res;
+		if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
+			continue;
+		res = alloc_bootmem_low(sizeof(struct resource));
+		switch (e820.map[i].type) {
+		case E820_RAM:	res->name = "System RAM"; break;
+		case E820_ACPI:	res->name = "ACPI Tables"; break;
+		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
+		default:	res->name = "reserved";
+		}
+		res->start = e820.map[i].addr;
+		res->end = res->start + e820.map[i].size - 1;
+		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+		request_resource(&iomem_resource, res);
+		if (e820.map[i].type == E820_RAM) {
+			/*
+			 * We don't know which RAM region contains kernel data,
+			 * so we try it repeatedly and let the resource manager
+			 * test it.
+			 */
+			request_resource(res, code_resource);
+			request_resource(res, data_resource);
+		}
+	}
+}
+
+/*
+ * Request address space for all standard resources
+ */
+static void __init register_memory(void)
+{
+	unsigned long gapstart, gapsize;
+	unsigned long long last;
+	int i;
+
+	if (efi_enabled)
+		efi_initialize_iomem_resources(&code_resource, &data_resource);
+	else
+		legacy_init_iomem_resources(&code_resource, &data_resource);
+
+	if (xen_start_info.flags & SIF_INITDOMAIN)
+		/* EFI systems may still have VGA */
+		request_resource(&iomem_resource, &video_ram_resource);
+
+	/* request I/O space for devices used on all i[345]86 PCs */
+	for (i = 0; i < STANDARD_IO_RESOURCES; i++)
+		request_resource(&ioport_resource, &standard_io_resources[i]);
+
+	/*
+	 * Search for the biggest gap in the low 32 bits of the e820
+	 * memory space.
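The loop below walks the e820 map backwards tracking the largest hole under 4GB, and pci_mem_start then becomes the start of that hole rounded up to a megabyte. The same scan on a toy map (entries invented):

#include <stdio.h>

struct region { unsigned long long addr, size; };

int main(void)
{
	/* toy e820: RAM below 1GB plus a reserved chipset range */
	struct region map[] = {
		{ 0x00000000ULL, 0x000a0000ULL },
		{ 0x00100000ULL, 0x3ff00000ULL },
		{ 0xfec00000ULL, 0x00140000ULL },
	};
	unsigned long long last = 0x100000000ULL;
	unsigned long long gapstart = 0x10000000, gapsize = 0x400000;
	int i = sizeof(map) / sizeof(map[0]);

	while (--i >= 0) {
		unsigned long long start = map[i].addr;
		unsigned long long end = start + map[i].size;

		if (last > end && last - end > gapsize) {
			gapsize = last - end;
			gapstart = end;
		}
		if (start < last)
			last = start;
	}
	/* round up to the nearest megabyte, as register_memory() does */
	printf("gap at %#llx, size %#llx, pci_mem_start %#llx\n",
	       gapstart, gapsize, (gapstart + 0xfffff) & ~0xfffffULL);
	return 0;
}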
+ */ + last = 0x100000000ull; + gapstart = 0x10000000; + gapsize = 0x400000; + i = e820.nr_map; + while (--i >= 0) { + unsigned long long start = e820.map[i].addr; + unsigned long long end = start + e820.map[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap > gapsize) { + gapsize = gap; + gapstart = end; + } + } + if (start < last) + last = start; + } + + /* + * Start allocating dynamic PCI memory a bit into the gap, + * aligned up to the nearest megabyte. + * + * Question: should we try to pad it up a bit (do something + * like " + (gapsize >> 3)" in there too?). We now have the + * technology. + */ + pci_mem_start = (gapstart + 0xfffff) & ~0xfffff; + + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", + pci_mem_start, gapstart, gapsize); +} + +/* Use inline assembly to define this because the nops are defined + as inline assembly strings in the include files and we cannot + get them easily into strings. */ +asm("\t.data\nintelnops: " + GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 + GENERIC_NOP7 GENERIC_NOP8); +asm("\t.data\nk8nops: " + K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 + K8_NOP7 K8_NOP8); +asm("\t.data\nk7nops: " + K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 + K7_NOP7 K7_NOP8); + +extern unsigned char intelnops[], k8nops[], k7nops[]; +static unsigned char *intel_nops[ASM_NOP_MAX+1] = { + NULL, + intelnops, + intelnops + 1, + intelnops + 1 + 2, + intelnops + 1 + 2 + 3, + intelnops + 1 + 2 + 3 + 4, + intelnops + 1 + 2 + 3 + 4 + 5, + intelnops + 1 + 2 + 3 + 4 + 5 + 6, + intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +}; +static unsigned char *k8_nops[ASM_NOP_MAX+1] = { + NULL, + k8nops, + k8nops + 1, + k8nops + 1 + 2, + k8nops + 1 + 2 + 3, + k8nops + 1 + 2 + 3 + 4, + k8nops + 1 + 2 + 3 + 4 + 5, + k8nops + 1 + 2 + 3 + 4 + 5 + 6, + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +}; +static unsigned char *k7_nops[ASM_NOP_MAX+1] = { + NULL, + k7nops, + k7nops + 1, + k7nops + 1 + 2, + k7nops + 1 + 2 + 3, + k7nops + 1 + 2 + 3 + 4, + k7nops + 1 + 2 + 3 + 4 + 5, + k7nops + 1 + 2 + 3 + 4 + 5 + 6, + k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +}; +static struct nop { + int cpuid; + unsigned char **noptable; +} noptypes[] = { + { X86_FEATURE_K8, k8_nops }, + { X86_FEATURE_K7, k7_nops }, + { -1, NULL } +}; + +/* Replace instructions with better alternatives for this CPU type. + + This runs before SMP is initialized to avoid SMP problems with + self modifying code. This implies that assymetric systems where + APs have less capabilities than the boot processor are not handled. + In this case boot with "noreplacement". 
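When apply_alternatives() below substitutes a shorter sequence, the leftover bytes are padded from the nop tables in chunks of at most ASM_NOP_MAX. The chunking logic in isolation:

#include <stdio.h>

#define ASM_NOP_MAX 8

int main(void)
{
	int diff = 11, i = 0, k;        /* e.g. 11 bytes left to pad */

	/* same loop shape as the padding in apply_alternatives() */
	for (; diff > 0; diff -= k, i += k) {
		k = diff > ASM_NOP_MAX ? ASM_NOP_MAX : diff;
		printf("emit %d-byte nop at offset %d\n", k, i);
	}
	return 0;
}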
*/ +void apply_alternatives(void *start, void *end) +{ + struct alt_instr *a; + int diff, i, k; + unsigned char **noptable = intel_nops; + for (i = 0; noptypes[i].cpuid >= 0; i++) { + if (boot_cpu_has(noptypes[i].cpuid)) { + noptable = noptypes[i].noptable; + break; + } + } + for (a = start; (void *)a < end; a++) { + if (!boot_cpu_has(a->cpuid)) + continue; + BUG_ON(a->replacementlen > a->instrlen); + memcpy(a->instr, a->replacement, a->replacementlen); + diff = a->instrlen - a->replacementlen; + /* Pad the rest with nops */ + for (i = a->replacementlen; diff > 0; diff -= k, i += k) { + k = diff; + if (k > ASM_NOP_MAX) + k = ASM_NOP_MAX; + memcpy(a->instr + i, noptable[k], k); + } + } +} + +static int no_replacement __initdata = 0; + +void __init alternative_instructions(void) +{ + extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; + if (no_replacement) + return; + apply_alternatives(__alt_instructions, __alt_instructions_end); +} + +static int __init noreplacement_setup(char *s) +{ + no_replacement = 1; + return 0; +} + +__setup("noreplacement", noreplacement_setup); + +static char * __init machine_specific_memory_setup(void); + +#ifdef CONFIG_MCA +static void set_mca_bus(int x) +{ + MCA_bus = x; +} +#else +static void set_mca_bus(int x) { } +#endif + +/* + * Determine if we were loaded by an EFI loader. If so, then we have also been + * passed the efi memmap, systab, etc., so we should use these data structures + * for initialization. Note, the efi init code path is determined by the + * global efi_enabled. This allows the same kernel image to be used on existing + * systems (with a traditional BIOS) as well as on EFI systems. + */ +void __init setup_arch(char **cmdline_p) +{ + int i, j; + physdev_op_t op; + unsigned long max_low_pfn; + + /* Force a quick death if the kernel panics. */ + extern int panic_timeout; + if (panic_timeout == 0) + panic_timeout = 1; + + /* Register a call for panic conditions. */ + notifier_chain_register(&panic_notifier_list, &xen_panic_block); + + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); + HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_writable_pagetables); + + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); + early_cpu_init(); + + /* + * FIXME: This isn't an official loader_type right + * now but does currently work with elilo. + * If we were configured as an EFI kernel, check to make + * sure that we were loaded correctly from elilo and that + * the system table is valid. If not, then initialize normally. + */ +#ifdef CONFIG_EFI + if ((LOADER_TYPE == 0x50) && EFI_SYSTAB) + efi_enabled = 1; +#endif + + /* This must be initialized to UNNAMED_MAJOR for ipconfig to work + properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd. + */ + ROOT_DEV = MKDEV(UNNAMED_MAJOR,0); + drive_info = DRIVE_INFO; + screen_info = SCREEN_INFO; + edid_info = EDID_INFO; + apm_info.bios = APM_BIOS_INFO; + ist_info = IST_INFO; + saved_videomode = VIDEO_MODE; + if( SYS_DESC_TABLE.length != 0 ) { + set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2); + machine_id = SYS_DESC_TABLE.table[0]; + machine_submodel_id = SYS_DESC_TABLE.table[1]; + BIOS_revision = SYS_DESC_TABLE.table[2]; + } + bootloader_type = LOADER_TYPE; + +#ifdef CONFIG_XEN_PHYSDEV_ACCESS + /* This is drawn from a dump from vgacon:startup in standard Linux. 
*/ + screen_info.orig_video_mode = 3; + screen_info.orig_video_isVGA = 1; + screen_info.orig_video_lines = 25; + screen_info.orig_video_cols = 80; + screen_info.orig_video_ega_bx = 3; + screen_info.orig_video_points = 16; +#endif + +#ifdef CONFIG_BLK_DEV_RAM + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); +#endif + ARCH_SETUP + if (efi_enabled) + efi_init(); + else { + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + print_memory_map(machine_specific_memory_setup()); + } + + copy_edd(); + + if (!MOUNT_ROOT_RDONLY) + root_mountflags &= ~MS_RDONLY; + init_mm.start_code = (unsigned long) _text; + init_mm.end_code = (unsigned long) _etext; + init_mm.end_data = (unsigned long) _edata; + init_mm.brk = (PFN_UP(__pa(xen_start_info.pt_base)) + + xen_start_info.nr_pt_frames) << PAGE_SHIFT; + + /* XEN: This is nonsense: kernel may not even be contiguous in RAM. */ + /*code_resource.start = virt_to_phys(_text);*/ + /*code_resource.end = virt_to_phys(_etext)-1;*/ + /*data_resource.start = virt_to_phys(_etext);*/ + /*data_resource.end = virt_to_phys(_edata)-1;*/ + + parse_cmdline_early(cmdline_p); + + max_low_pfn = setup_memory(); + + /* + * NOTE: before this point _nobody_ is allowed to allocate + * any memory using the bootmem allocator. Although the + * alloctor is now initialised only the first 8Mb of the kernel + * virtual address space has been mapped. All allocations before + * paging_init() has completed must use the alloc_bootmem_low_pages() + * variant (which allocates DMA'able memory) and care must be taken + * not to exceed the 8Mb limit. + */ + +#ifdef CONFIG_SMP + smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ +#endif + paging_init(); + remapped_pgdat_init(); + zone_sizes_init(); + +#ifdef CONFIG_X86_FIND_SMP_CONFIG + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); +#endif + + /* Make sure we have a correctly sized P->M table. */ + if (max_pfn != xen_start_info.nr_pages) { + phys_to_machine_mapping = alloc_bootmem_low_pages( + max_pfn * sizeof(unsigned long)); + + if (max_pfn > xen_start_info.nr_pages) { + /* set to INVALID_P2M_ENTRY */ + memset(phys_to_machine_mapping, ~0, + max_pfn * sizeof(unsigned long)); + memcpy(phys_to_machine_mapping, + (unsigned long *)xen_start_info.mfn_list, + xen_start_info.nr_pages * sizeof(unsigned long)); + } else { + memcpy(phys_to_machine_mapping, + (unsigned long *)xen_start_info.mfn_list, + max_pfn * sizeof(unsigned long)); + if (HYPERVISOR_dom_mem_op( + MEMOP_decrease_reservation, + (unsigned long *)xen_start_info.mfn_list + max_pfn, + xen_start_info.nr_pages - max_pfn, 0) != + (xen_start_info.nr_pages - max_pfn)) BUG(); + } + free_bootmem( + __pa(xen_start_info.mfn_list), + PFN_PHYS(PFN_UP(xen_start_info.nr_pages * + sizeof(unsigned long)))); + } + + pfn_to_mfn_frame_list = alloc_bootmem_low_pages(PAGE_SIZE); + for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ ) + { + pfn_to_mfn_frame_list[j] = + virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT; + } + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list = + virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT; + + + /* + * NOTE: at this point the bootmem allocator is fully available. 
+ */ + +#ifdef CONFIG_EARLY_PRINTK + { + char *s = strstr(*cmdline_p, "earlyprintk="); + if (s) { + extern void setup_early_printk(char *); + + setup_early_printk(s); + printk("early console enabled\n"); + } + } +#endif + + + dmi_scan_machine(); + +#ifdef CONFIG_X86_GENERICARCH + generic_apic_probe(*cmdline_p); +#endif + if (efi_enabled) + efi_map_memmap(); + + op.cmd = PHYSDEVOP_SET_IOPL; + op.u.set_iopl.iopl = current->thread.io_pl = 1; + HYPERVISOR_physdev_op(&op); + +#ifdef CONFIG_ACPI_BOOT + if (!(xen_start_info.flags & SIF_INITDOMAIN)) { + printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); + acpi_disabled = 1; + acpi_ht = 0; + } +#endif + +#ifdef CONFIG_ACPI_BOOT + /* + * Parse the ACPI tables for possible boot-time SMP configuration. + */ + acpi_boot_table_init(); + acpi_boot_init(); +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + if (smp_found_config) + get_smp_config(); +#endif + + /* XXX Disable irqdebug until we have a way to avoid interrupt + * conflicts. */ + noirqdebug_setup(""); + + register_memory(); + + if (xen_start_info.flags & SIF_INITDOMAIN) { + if (!(xen_start_info.flags & SIF_PRIVILEGED)) + panic("Xen granted us console access " + "but not privileged status"); + +#ifdef CONFIG_VT +#if defined(CONFIG_VGA_CONSOLE) + if (!efi_enabled || + (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) + conswitchp = &vga_con; +#elif defined(CONFIG_DUMMY_CONSOLE) + conswitchp = &dummy_con; +#endif +#endif + } else { +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + extern const struct consw xennull_con; + extern int console_use_vt; +#if defined(CONFIG_VGA_CONSOLE) + /* disable VGA driver */ + ORIG_VIDEO_ISVGA = VIDEO_TYPE_VLFB; +#endif + conswitchp = &xennull_con; + console_use_vt = 0; +#endif + } +} + +static int +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + HYPERVISOR_crash(); + /* we're never actually going to get here... */ + return NOTIFY_DONE; +} + +#include "setup_arch_post.h" +/* + * Local Variables: + * mode:c + * c-file-style:"k&r" + * c-basic-offset:8 + * End: + */ Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/signal.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/signal.c @@ -0,0 +1,665 @@ +/* + * linux/arch/i386/kernel/signal.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes + */ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/kernel.h> +#include <linux/signal.h> +#include <linux/errno.h> +#include <linux/wait.h> +#include <linux/unistd.h> +#include <linux/stddef.h> +#include <linux/personality.h> +#include <linux/suspend.h> +#include <linux/ptrace.h> +#include <linux/elf.h> +#include <asm/processor.h> +#include <asm/ucontext.h> +#include <asm/uaccess.h> +#include <asm/i387.h> +#include "../../kernel/sigframe.h" + +#define DEBUG_SIG 0 + +#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) + +/* + * Atomically swap in the new signal mask, and wait for a signal. 
+ */
+asmlinkage int
+sys_sigsuspend(int history0, int history1, old_sigset_t mask)
+{
+ struct pt_regs * regs = (struct pt_regs *) &history0;
+ sigset_t saveset;
+
+ mask &= _BLOCKABLE;
+ spin_lock_irq(&current->sighand->siglock);
+ saveset = current->blocked;
+ siginitset(&current->blocked, mask);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ regs->eax = -EINTR;
+ while (1) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ if (do_signal(regs, &saveset))
+ return -EINTR;
+ }
+}
+
+asmlinkage int
+sys_rt_sigsuspend(struct pt_regs regs)
+{
+ sigset_t saveset, newset;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (regs.ecx != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&newset, (sigset_t __user *)regs.ebx, sizeof(newset)))
+ return -EFAULT;
+ sigdelsetmask(&newset, ~_BLOCKABLE);
+
+ spin_lock_irq(&current->sighand->siglock);
+ saveset = current->blocked;
+ current->blocked = newset;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ regs.eax = -EINTR;
+ while (1) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ if (do_signal(&regs, &saveset))
+ return -EINTR;
+ }
+}
+
+asmlinkage int
+sys_sigaction(int sig, const struct old_sigaction __user *act,
+ struct old_sigaction __user *oact)
+{
+ struct k_sigaction new_ka, old_ka;
+ int ret;
+
+ if (act) {
+ old_sigset_t mask;
+ if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+ __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
+ __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
+ return -EFAULT;
+ __get_user(new_ka.sa.sa_flags, &act->sa_flags);
+ __get_user(mask, &act->sa_mask);
+ siginitset(&new_ka.sa.sa_mask, mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+ if (!ret && oact) {
+ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+ __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
+ __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
+ return -EFAULT;
+ __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
+ __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
+ }
+
+ return ret;
+}
+
+asmlinkage int
+sys_sigaltstack(unsigned long ebx)
+{
+ /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */
+ struct pt_regs *regs = (struct pt_regs *)&ebx;
+ const stack_t __user *uss = (const stack_t __user *)ebx;
+ stack_t __user *uoss = (stack_t __user *)regs->ecx;
+
+ return do_sigaltstack(uss, uoss, regs->esp);
+}
+
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+
+static int
+restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax)
+{
+ unsigned int err = 0;
+
+ /* Always make any pending restarted system calls return -EINTR */
+ current_thread_info()->restart_block.fn = do_no_restart_syscall;
+
+#define COPY(x) err |= __get_user(regs->x, &sc->x)
+
+#define COPY_SEG(seg) \
+ { unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ regs->x##seg = tmp; }
+
+#define COPY_SEG_STRICT(seg) \
+ { unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ regs->x##seg = tmp|3; }
+
+#define GET_SEG(seg) \
+ { unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ loadsegment(seg,tmp); }
+
+#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | X86_EFLAGS_DF | \
+ X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
+ X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
+
+ GET_SEG(gs);
+ GET_SEG(fs);
+ COPY_SEG(es);
+ COPY_SEG(ds);
+ COPY(edi);
+ COPY(esi);
+ COPY(ebp);
+ COPY(esp);
+ COPY(ebx);
+ COPY(edx);
+ COPY(ecx);
+ COPY(eip);
+ COPY_SEG_STRICT(cs);
+ COPY_SEG_STRICT(ss);
+
+ {
+ unsigned int tmpflags;
+ err |= __get_user(tmpflags, &sc->eflags);
+ regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+ regs->orig_eax = -1; /* disable syscall checks */
+ }
+
+ {
+ struct _fpstate __user * buf;
+ err |= __get_user(buf, &sc->fpstate);
+ if (buf) {
+ if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
+ goto badframe;
+ err |= restore_i387(buf);
+ } else {
+ struct task_struct *me = current;
+ if (used_math()) {
+ clear_fpu(me);
+ clear_used_math();
+ }
+ }
+ }
+
+ err |= __get_user(*peax, &sc->eax);
+ return err;
+
+badframe:
+ return 1;
+}
+
+asmlinkage int sys_sigreturn(unsigned long __unused)
+{
+ struct pt_regs *regs = (struct pt_regs *) &__unused;
+ struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8);
+ sigset_t set;
+ int eax;
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(set.sig[0], &frame->sc.oldmask)
+ || (_NSIG_WORDS > 1
+ && __copy_from_user(&set.sig[1], &frame->extramask,
+ sizeof(frame->extramask))))
+ goto badframe;
+
+ sigdelsetmask(&set, ~_BLOCKABLE);
+ spin_lock_irq(&current->sighand->siglock);
+ current->blocked = set;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (restore_sigcontext(regs, &frame->sc, &eax))
+ goto badframe;
+ return eax;
+
+badframe:
+ force_sig(SIGSEGV, current);
+ return 0;
+}
+
+asmlinkage int sys_rt_sigreturn(unsigned long __unused)
+{
+ struct pt_regs *regs = (struct pt_regs *) &__unused;
+ struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4);
+ sigset_t set;
+ int eax;
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+ goto badframe;
+
+ sigdelsetmask(&set, ~_BLOCKABLE);
+ spin_lock_irq(&current->sighand->siglock);
+ current->blocked = set;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
+ goto badframe;
+
+ if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT)
+ goto badframe;
+
+ return eax;
+
+badframe:
+ force_sig(SIGSEGV, current);
+ return 0;
+}
+
+/*
+ * Set up a signal frame.
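
Note how restore_sigcontext() above refuses to take eflags wholesale from the
user-writable sigcontext: only the bits in FIX_EFLAGS (arithmetic flags, DF,
TF, AC) come from userspace, while privileged bits such as IF and IOPL keep
their kernel-side values. The masking idiom on its own:

    /* Merge user-controllable flag bits, preserving privileged ones. */
    static unsigned long merge_eflags(unsigned long kernel_eflags,
                                      unsigned long user_eflags,
                                      unsigned long allowed)
    {
        return (kernel_eflags & ~allowed) | (user_eflags & allowed);
    }

Without this, a handler could attempt to return with interrupts masked or an
elevated IOPL smuggled in through the signal frame.
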
+ */ + +static int +setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, + struct pt_regs *regs, unsigned long mask) +{ + int tmp, err = 0; + + tmp = 0; + __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); + err |= __put_user(tmp, (unsigned int __user *)&sc->gs); + __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); + err |= __put_user(tmp, (unsigned int __user *)&sc->fs); + + err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); + err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); + err |= __put_user(regs->edi, &sc->edi); + err |= __put_user(regs->esi, &sc->esi); + err |= __put_user(regs->ebp, &sc->ebp); + err |= __put_user(regs->esp, &sc->esp); + err |= __put_user(regs->ebx, &sc->ebx); + err |= __put_user(regs->edx, &sc->edx); + err |= __put_user(regs->ecx, &sc->ecx); + err |= __put_user(regs->eax, &sc->eax); + err |= __put_user(current->thread.trap_no, &sc->trapno); + err |= __put_user(current->thread.error_code, &sc->err); + err |= __put_user(regs->eip, &sc->eip); + err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs); + err |= __put_user(regs->eflags, &sc->eflags); + err |= __put_user(regs->esp, &sc->esp_at_signal); + err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss); + + tmp = save_i387(fpstate); + if (tmp < 0) + err = 1; + else + err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate); + + /* non-iBCS2 extensions.. */ + err |= __put_user(mask, &sc->oldmask); + err |= __put_user(current->thread.cr2, &sc->cr2); + + return err; +} + +/* + * Determine which stack to use.. + */ +static inline void __user * +get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) +{ + unsigned long esp; + + /* Default to using normal stack */ + esp = regs->esp; + + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (sas_ss_flags(esp) == 0) + esp = current->sas_ss_sp + current->sas_ss_size; + } + + /* This is the legacy signal stack switching. */ + else if ((regs->xss & 0xffff) != __USER_DS && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) { + esp = (unsigned long) ka->sa.sa_restorer; + } + + return (void __user *)((esp - frame_size) & -8ul); +} + +/* These symbols are defined with the addresses in the vsyscall page. + See vsyscall-sigreturn.S. */ +extern void __user __kernel_sigreturn; +extern void __user __kernel_rt_sigreturn; + +static void setup_frame(int sig, struct k_sigaction *ka, + sigset_t *set, struct pt_regs * regs) +{ + void __user *restorer; + struct sigframe __user *frame; + int err = 0; + int usig; + + frame = get_sigframe(ka, regs, sizeof(*frame)); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + goto give_sigsegv; + + usig = current_thread_info()->exec_domain + && current_thread_info()->exec_domain->signal_invmap + && sig < 32 + ? current_thread_info()->exec_domain->signal_invmap[sig] + : sig; + + err = __put_user(usig, &frame->sig); + if (err) + goto give_sigsegv; + + err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]); + if (err) + goto give_sigsegv; + + if (_NSIG_WORDS > 1) { + err = __copy_to_user(&frame->extramask, &set->sig[1], + sizeof(frame->extramask)); + if (err) + goto give_sigsegv; + } + + restorer = &__kernel_sigreturn; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + + /* Set up to return from userspace. */ + err |= __put_user(restorer, &frame->pretcode); + + /* + * This is popl %eax ; movl $,%eax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! 
It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); + err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2)); + err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); + + if (err) + goto give_sigsegv; + + /* Set up registers for signal handler */ + regs->esp = (unsigned long) frame; + regs->eip = (unsigned long) ka->sa.sa_handler; + regs->eax = (unsigned long) sig; + regs->edx = (unsigned long) 0; + regs->ecx = (unsigned long) 0; + + set_fs(USER_DS); + regs->xds = __USER_DS; + regs->xes = __USER_DS; + regs->xss = __USER_DS; + regs->xcs = __USER_CS; + + /* + * Clear TF when entering the signal handler, but + * notify any tracer that was single-stepping it. + * The tracer may want to single-step inside the + * handler too. + */ + regs->eflags &= ~TF_MASK; + if (test_thread_flag(TIF_SINGLESTEP)) + ptrace_notify(SIGTRAP); + +#if DEBUG_SIG + printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", + current->comm, current->pid, frame, regs->eip, frame->pretcode); +#endif + + return; + +give_sigsegv: + force_sigsegv(sig, current); +} + +static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs * regs) +{ + void __user *restorer; + struct rt_sigframe __user *frame; + int err = 0; + int usig; + + frame = get_sigframe(ka, regs, sizeof(*frame)); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + goto give_sigsegv; + + usig = current_thread_info()->exec_domain + && current_thread_info()->exec_domain->signal_invmap + && sig < 32 + ? current_thread_info()->exec_domain->signal_invmap[sig] + : sig; + + err |= __put_user(usig, &frame->sig); + err |= __put_user(&frame->info, &frame->pinfo); + err |= __put_user(&frame->uc, &frame->puc); + err |= copy_siginfo_to_user(&frame->info, info); + if (err) + goto give_sigsegv; + + /* Create the ucontext. */ + err |= __put_user(0, &frame->uc.uc_flags); + err |= __put_user(0, &frame->uc.uc_link); + err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(sas_ss_flags(regs->esp), + &frame->uc.uc_stack.ss_flags); + err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) + goto give_sigsegv; + + /* Set up to return from userspace. */ + restorer = &__kernel_rt_sigreturn; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + err |= __put_user(restorer, &frame->pretcode); + + /* + * This is movl $,%eax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. 
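
Decoding the retcode stores above: for the non-RT frame the bytes are
58 b8 <imm32> cd 80, i.e. "popl %eax; movl $__NR_sigreturn,%eax; int $0x80"
(0xb858 and 0x80cd are little-endian shorts); the rt variant just below drops
the popl. A byte-level model of the same trampoline, over a hypothetical
buffer (illustration only):

    #include <stdint.h>
    #include <string.h>

    static void build_sigreturn_trampoline(unsigned char buf[8],
                                           uint32_t nr_sigreturn)
    {
        uint16_t popl_movl = 0xb858;  /* 58 = popl %eax, b8 = movl $,%eax */
        uint16_t int80 = 0x80cd;      /* cd 80 = int $0x80 */

        memcpy(buf + 0, &popl_movl, 2);
        memcpy(buf + 2, &nr_sigreturn, 4);  /* immediate for the movl */
        memcpy(buf + 6, &int80, 2);
    }

As the comment says, the vsyscall page's __kernel_sigreturn is what actually
runs; these bytes survive only as a signature for gdb.
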
+ */
+ err |= __put_user(0xb8, (char __user *)(frame->retcode+0));
+ err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
+ err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
+
+ if (err)
+ goto give_sigsegv;
+
+ /* Set up registers for signal handler */
+ regs->esp = (unsigned long) frame;
+ regs->eip = (unsigned long) ka->sa.sa_handler;
+ regs->eax = (unsigned long) usig;
+ regs->edx = (unsigned long) &frame->info;
+ regs->ecx = (unsigned long) &frame->uc;
+
+ set_fs(USER_DS);
+ regs->xds = __USER_DS;
+ regs->xes = __USER_DS;
+ regs->xss = __USER_DS;
+ regs->xcs = __USER_CS;
+
+ /*
+ * Clear TF when entering the signal handler, but
+ * notify any tracer that was single-stepping it.
+ * The tracer may want to single-step inside the
+ * handler too.
+ */
+ regs->eflags &= ~TF_MASK;
+ if (test_thread_flag(TIF_SINGLESTEP))
+ ptrace_notify(SIGTRAP);
+
+#if DEBUG_SIG
+ printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+ current->comm, current->pid, frame, regs->eip, frame->pretcode);
+#endif
+
+ return;
+
+give_sigsegv:
+ force_sigsegv(sig, current);
+}
+
+/*
+ * OK, we're invoking a handler
+ */
+
+static void
+handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
+ sigset_t *oldset, struct pt_regs * regs)
+{
+ /* Are we from a system call? */
+ if (regs->orig_eax >= 0) {
+ /* If so, check system call restarting.. */
+ switch (regs->eax) {
+ case -ERESTART_RESTARTBLOCK:
+ case -ERESTARTNOHAND:
+ regs->eax = -EINTR;
+ break;
+
+ case -ERESTARTSYS:
+ if (!(ka->sa.sa_flags & SA_RESTART)) {
+ regs->eax = -EINTR;
+ break;
+ }
+ /* fallthrough */
+ case -ERESTARTNOINTR:
+ regs->eax = regs->orig_eax;
+ regs->eip -= 2;
+ }
+ }
+
+ /*
+ * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so
+ * that register information in the sigcontext is correct.
+ */
+ if (unlikely(regs->eflags & TF_MASK)
+ && likely(current->ptrace & PT_DTRACE)) {
+ current->ptrace &= ~PT_DTRACE;
+ regs->eflags &= ~TF_MASK;
+ }
+
+ /* Set up the stack frame */
+ if (ka->sa.sa_flags & SA_SIGINFO)
+ setup_rt_frame(sig, ka, info, oldset, regs);
+ else
+ setup_frame(sig, ka, oldset, regs);
+
+ if (!(ka->sa.sa_flags & SA_NODEFER)) {
+ spin_lock_irq(&current->sighand->siglock);
+ sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
+ sigaddset(&current->blocked,sig);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ }
+}
+
+/*
+ * Note that 'init' is a special process: it doesn't get signals it doesn't
+ * want to handle. Thus you cannot kill init even with a SIGKILL even by
+ * mistake.
+ */
+int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset)
+{
+ siginfo_t info;
+ int signr;
+ struct k_sigaction ka;
+
+ /*
+ * We want the common case to go fast, which
+ * is why we may in certain cases get here from
+ * kernel mode. Just return without doing anything
+ * if so.
+ */
+ if ((regs->xcs & 2) != 2)
+ return 1;
+
+ if (current->flags & PF_FREEZE) {
+ refrigerator(0);
+ goto no_signal;
+ }
+
+ if (!oldset)
+ oldset = &current->blocked;
+
+ signr = get_signal_to_deliver(&info, &ka, regs, NULL);
+ if (signr > 0) {
+ /* Reenable any watchpoints before delivering the
+ * signal to user space. The processor register will
+ * have been cleared if the watchpoint triggered
+ * inside the kernel.
+ */
+ if (unlikely(current->thread.debugreg[7])) {
+ loaddebug(&current->thread, 7);
+ }
+
+ /* Whee! Actually deliver the signal. */
+ handle_signal(signr, &info, &ka, oldset, regs);
+ return 1;
+ }
+
+ no_signal:
+ /* Did we come from a system call?
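
The restart logic here (and in do_signal() just below) leans on "int $0x80"
being a two-byte instruction: restoring eax from orig_eax and rewinding eip
by two re-executes the original system call. In miniature, over a
hypothetical register struct:

    struct regs { long eax, orig_eax; unsigned long eip; };

    static void restart_syscall(struct regs *r)
    {
        r->eax = r->orig_eax;  /* put the syscall number back ... */
        r->eip -= 2;           /* ... and re-point eip at "int $0x80" */
    }
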
*/ + if (regs->orig_eax >= 0) { + /* Restart the system call - no handlers present */ + if (regs->eax == -ERESTARTNOHAND || + regs->eax == -ERESTARTSYS || + regs->eax == -ERESTARTNOINTR) { + regs->eax = regs->orig_eax; + regs->eip -= 2; + } + if (regs->eax == -ERESTART_RESTARTBLOCK){ + regs->eax = __NR_restart_syscall; + regs->eip -= 2; + } + } + return 0; +} + +/* + * notification of userspace execution resumption + * - triggered by current->work.notify_resume + */ +__attribute__((regparm(3))) +void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, + __u32 thread_info_flags) +{ + /* Pending single-step? */ + if (thread_info_flags & _TIF_SINGLESTEP) { + regs->eflags |= TF_MASK; + clear_thread_flag(TIF_SINGLESTEP); + } + /* deal with pending signal delivery */ + if (thread_info_flags & _TIF_SIGPENDING) + do_signal(regs,oldset); + + clear_thread_flag(TIF_IRET); +} Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/syscall_table.S =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/syscall_table.S @@ -0,0 +1,291 @@ +.data +ENTRY(sys_call_table) + .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ + .long sys_exit + .long sys_fork + .long sys_read + .long sys_write + .long sys_open /* 5 */ + .long sys_close + .long sys_waitpid + .long sys_creat + .long sys_link + .long sys_unlink /* 10 */ + .long sys_execve + .long sys_chdir + .long sys_time + .long sys_mknod + .long sys_chmod /* 15 */ + .long sys_lchown16 + .long sys_ni_syscall /* old break syscall holder */ + .long sys_stat + .long sys_lseek + .long sys_getpid /* 20 */ + .long sys_mount + .long sys_oldumount + .long sys_setuid16 + .long sys_getuid16 + .long sys_stime /* 25 */ + .long sys_ptrace + .long sys_alarm + .long sys_fstat + .long sys_pause + .long sys_utime /* 30 */ + .long sys_ni_syscall /* old stty syscall holder */ + .long sys_ni_syscall /* old gtty syscall holder */ + .long sys_access + .long sys_nice + .long sys_ni_syscall /* 35 - old ftime syscall holder */ + .long sys_sync + .long sys_kill + .long sys_rename + .long sys_mkdir + .long sys_rmdir /* 40 */ + .long sys_dup + .long sys_pipe + .long sys_times + .long sys_ni_syscall /* old prof syscall holder */ + .long sys_brk /* 45 */ + .long sys_setgid16 + .long sys_getgid16 + .long sys_signal + .long sys_geteuid16 + .long sys_getegid16 /* 50 */ + .long sys_acct + .long sys_umount /* recycled never used phys() */ + .long sys_ni_syscall /* old lock syscall holder */ + .long sys_ioctl + .long sys_fcntl /* 55 */ + .long sys_ni_syscall /* old mpx syscall holder */ + .long sys_setpgid + .long sys_ni_syscall /* old ulimit syscall holder */ + .long sys_olduname + .long sys_umask /* 60 */ + .long sys_chroot + .long sys_ustat + .long sys_dup2 + .long sys_getppid + .long sys_getpgrp /* 65 */ + .long sys_setsid + .long sys_sigaction + .long sys_sgetmask + .long sys_ssetmask + .long sys_setreuid16 /* 70 */ + .long sys_setregid16 + .long sys_sigsuspend + .long sys_sigpending + .long sys_sethostname + .long sys_setrlimit /* 75 */ + .long sys_old_getrlimit + .long sys_getrusage + .long sys_gettimeofday + .long sys_settimeofday + .long sys_getgroups16 /* 80 */ + .long sys_setgroups16 + .long old_select + .long sys_symlink + .long sys_lstat + .long sys_readlink /* 85 */ + .long sys_uselib + .long sys_swapon + .long sys_reboot + .long old_readdir + .long old_mmap /* 90 */ + .long sys_munmap + .long sys_truncate + .long sys_ftruncate + .long sys_fchmod + .long sys_fchown16 /* 
95 */ + .long sys_getpriority + .long sys_setpriority + .long sys_ni_syscall /* old profil syscall holder */ + .long sys_statfs + .long sys_fstatfs /* 100 */ + .long sys_ioperm + .long sys_socketcall + .long sys_syslog + .long sys_setitimer + .long sys_getitimer /* 105 */ + .long sys_newstat + .long sys_newlstat + .long sys_newfstat + .long sys_uname + .long sys_iopl /* 110 */ + .long sys_vhangup + .long sys_ni_syscall /* old "idle" system call */ + .long sys_vm86old + .long sys_wait4 + .long sys_swapoff /* 115 */ + .long sys_sysinfo + .long sys_ipc + .long sys_fsync + .long sys_sigreturn + .long sys_clone /* 120 */ + .long sys_setdomainname + .long sys_newuname + .long sys_modify_ldt + .long sys_adjtimex + .long sys_mprotect /* 125 */ + .long sys_sigprocmask + .long sys_ni_syscall /* old "create_module" */ + .long sys_init_module + .long sys_delete_module + .long sys_ni_syscall /* 130: old "get_kernel_syms" */ + .long sys_quotactl + .long sys_getpgid + .long sys_fchdir + .long sys_bdflush + .long sys_sysfs /* 135 */ + .long sys_personality + .long sys_ni_syscall /* reserved for afs_syscall */ + .long sys_setfsuid16 + .long sys_setfsgid16 + .long sys_llseek /* 140 */ + .long sys_getdents + .long sys_select + .long sys_flock + .long sys_msync + .long sys_readv /* 145 */ + .long sys_writev + .long sys_getsid + .long sys_fdatasync + .long sys_sysctl + .long sys_mlock /* 150 */ + .long sys_munlock + .long sys_mlockall + .long sys_munlockall + .long sys_sched_setparam + .long sys_sched_getparam /* 155 */ + .long sys_sched_setscheduler + .long sys_sched_getscheduler + .long sys_sched_yield + .long sys_sched_get_priority_max + .long sys_sched_get_priority_min /* 160 */ + .long sys_sched_rr_get_interval + .long sys_nanosleep + .long sys_mremap + .long sys_setresuid16 + .long sys_getresuid16 /* 165 */ + .long sys_vm86 + .long sys_ni_syscall /* Old sys_query_module */ + .long sys_poll + .long sys_nfsservctl + .long sys_setresgid16 /* 170 */ + .long sys_getresgid16 + .long sys_prctl + .long sys_rt_sigreturn + .long sys_rt_sigaction + .long sys_rt_sigprocmask /* 175 */ + .long sys_rt_sigpending + .long sys_rt_sigtimedwait + .long sys_rt_sigqueueinfo + .long sys_rt_sigsuspend + .long sys_pread64 /* 180 */ + .long sys_pwrite64 + .long sys_chown16 + .long sys_getcwd + .long sys_capget + .long sys_capset /* 185 */ + .long sys_sigaltstack + .long sys_sendfile + .long sys_ni_syscall /* reserved for streams1 */ + .long sys_ni_syscall /* reserved for streams2 */ + .long sys_vfork /* 190 */ + .long sys_getrlimit + .long sys_mmap2 + .long sys_truncate64 + .long sys_ftruncate64 + .long sys_stat64 /* 195 */ + .long sys_lstat64 + .long sys_fstat64 + .long sys_lchown + .long sys_getuid + .long sys_getgid /* 200 */ + .long sys_geteuid + .long sys_getegid + .long sys_setreuid + .long sys_setregid + .long sys_getgroups /* 205 */ + .long sys_setgroups + .long sys_fchown + .long sys_setresuid + .long sys_getresuid + .long sys_setresgid /* 210 */ + .long sys_getresgid + .long sys_chown + .long sys_setuid + .long sys_setgid + .long sys_setfsuid /* 215 */ + .long sys_setfsgid + .long sys_pivot_root + .long sys_mincore + .long sys_madvise + .long sys_getdents64 /* 220 */ + .long sys_fcntl64 + .long sys_ni_syscall /* reserved for TUX */ + .long sys_ni_syscall + .long sys_gettid + .long sys_readahead /* 225 */ + .long sys_setxattr + .long sys_lsetxattr + .long sys_fsetxattr + .long sys_getxattr + .long sys_lgetxattr /* 230 */ + .long sys_fgetxattr + .long sys_listxattr + .long sys_llistxattr + .long sys_flistxattr + .long 
sys_removexattr /* 235 */ + .long sys_lremovexattr + .long sys_fremovexattr + .long sys_tkill + .long sys_sendfile64 + .long sys_futex /* 240 */ + .long sys_sched_setaffinity + .long sys_sched_getaffinity + .long sys_set_thread_area + .long sys_get_thread_area + .long sys_io_setup /* 245 */ + .long sys_io_destroy + .long sys_io_getevents + .long sys_io_submit + .long sys_io_cancel + .long sys_fadvise64 /* 250 */ + .long sys_ni_syscall + .long sys_exit_group + .long sys_lookup_dcookie + .long sys_epoll_create + .long sys_epoll_ctl /* 255 */ + .long sys_epoll_wait + .long sys_remap_file_pages + .long sys_set_tid_address + .long sys_timer_create + .long sys_timer_settime /* 260 */ + .long sys_timer_gettime + .long sys_timer_getoverrun + .long sys_timer_delete + .long sys_clock_settime + .long sys_clock_gettime /* 265 */ + .long sys_clock_getres + .long sys_clock_nanosleep + .long sys_statfs64 + .long sys_fstatfs64 + .long sys_tgkill /* 270 */ + .long sys_utimes + .long sys_fadvise64_64 + .long sys_ni_syscall /* sys_vserver */ + .long sys_mbind + .long sys_get_mempolicy + .long sys_set_mempolicy + .long sys_mq_open + .long sys_mq_unlink + .long sys_mq_timedsend + .long sys_mq_timedreceive /* 280 */ + .long sys_mq_notify + .long sys_mq_getsetattr + .long sys_ni_syscall /* reserved for kexec */ + .long sys_waitid + .long sys_ni_syscall /* 285 */ /* available */ + .long sys_add_key + .long sys_request_key + .long sys_keyctl Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/time.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/time.c @@ -0,0 +1,929 @@ +/* + * linux/arch/i386/kernel/time.c + * + * Copyright (C) 1991, 1992, 1995 Linus Torvalds + * + * This file contains the PC-specific time handling details: + * reading the RTC at bootup, etc.. + * 1994-07-02 Alan Modra + * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime + * 1995-03-26 Markus Kuhn + * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 + * precision CMOS clock update + * 1996-05-03 Ingo Molnar + * fixed time warps in do_[slow|fast]_gettimeoffset() + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * 1998-09-05 (Various) + * More robust do_fast_gettimeoffset() algorithm implemented + * (works with APM, Cyrix 6x86MX and Centaur C6), + * monotonic gettimeofday() with fast_get_timeoffset(), + * drift-proof precision TSC calibration on boot + * (C. Scott Ananian <cananian@xxxxxxxxxxxxxxxxxxxx>, Andrew D. + * Balsa <andrebalsa@xxxxxxxxxx>, Philip Gladstone <philip@xxxxxxxxxx>; + * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@xxxxxxxxxxxxx>). + * 1998-12-16 Andrea Arcangeli + * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy + * because was not accounting lost_ticks. + * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli + * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to + * serialize accesses to xtime/lost_ticks). 
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/time.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/sysdev.h>
+#include <linux/bcd.h>
+#include <linux/efi.h>
+#include <linux/mca.h>
+#include <linux/sysctl.h>
+#include <linux/percpu.h>
+
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/irq.h>
+#include <asm/msr.h>
+#include <asm/delay.h>
+#include <asm/mpspec.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/timer.h>
+
+#include "mach_time.h"
+
+#include <linux/timex.h>
+#include <linux/config.h>
+
+#include <asm/hpet.h>
+
+#include <asm/arch_hooks.h>
+
+#include "io_ports.h"
+
+extern spinlock_t i8259A_lock;
+int pit_latch_buggy; /* extern */
+
+u64 jiffies_64 = INITIAL_JIFFIES;
+
+EXPORT_SYMBOL(jiffies_64);
+
+#if defined(__x86_64__)
+unsigned long vxtime_hz = PIT_TICK_RATE;
+struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
+volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
+unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
+struct timespec __xtime __section_xtime;
+struct timezone __sys_tz __section_sys_tz;
+#endif
+
+#if defined(__x86_64__)
+unsigned int cpu_khz; /* Detected as we calibrate the TSC */
+#else
+unsigned long cpu_khz; /* Detected as we calibrate the TSC */
+#endif
+
+extern unsigned long wall_jiffies;
+
+DEFINE_SPINLOCK(rtc_lock);
+
+DEFINE_SPINLOCK(i8253_lock);
+EXPORT_SYMBOL(i8253_lock);
+
+extern struct init_timer_opts timer_tsc_init;
+extern struct timer_opts timer_tsc;
+struct timer_opts *cur_timer = &timer_tsc;
+
+/* These are periodically updated in shared_info, and then copied here. */
+struct shadow_time_info {
+ u64 tsc_timestamp; /* TSC at last update of time vals. */
+ u64 system_timestamp; /* Time, in nanosecs, since boot. */
+ u32 tsc_to_nsec_mul;
+ u32 tsc_to_usec_mul;
+ int tsc_shift;
+ u32 version;
+};
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+static struct timeval shadow_tv;
+
+/* Keep track of last time we did processing/updating of jiffies and xtime. */
+static u64 processed_system_time; /* System time (ns) at last processing. */
+static DEFINE_PER_CPU(u64, processed_system_time);
+
+#define NS_PER_TICK (1000000000ULL/HZ)
+
+#define HANDLE_USEC_UNDERFLOW(_tv) do { \
+ while ((_tv).tv_usec < 0) { \
+ (_tv).tv_usec += USEC_PER_SEC; \
+ (_tv).tv_sec--; \
+ } \
+} while (0)
+#define HANDLE_USEC_OVERFLOW(_tv) do { \
+ while ((_tv).tv_usec >= USEC_PER_SEC) { \
+ (_tv).tv_usec -= USEC_PER_SEC; \
+ (_tv).tv_sec++; \
+ } \
+} while (0)
+static inline void __normalize_time(time_t *sec, s64 *nsec)
+{
+ while (*nsec >= NSEC_PER_SEC) {
+ (*nsec) -= NSEC_PER_SEC;
+ (*sec)++;
+ }
+ while (*nsec < 0) {
+ (*nsec) += NSEC_PER_SEC;
+ (*sec)--;
+ }
+}
+
+/* Does this guest OS track Xen time, or set its wall clock independently? 
*/ +static int independent_wallclock = 0; +static int __init __independent_wallclock(char *str) +{ + independent_wallclock = 1; + return 1; +} +__setup("independent_wallclock", __independent_wallclock); +#define INDEPENDENT_WALLCLOCK() \ + (independent_wallclock || (xen_start_info.flags & SIF_INITDOMAIN)) + +int tsc_disable __initdata = 0; + +static void delay_tsc(unsigned long loops) +{ + unsigned long bclock, now; + + rdtscl(bclock); + do + { + rep_nop(); + rdtscl(now); + } while ((now-bclock) < loops); +} + +struct timer_opts timer_tsc = { + .name = "tsc", + .delay = delay_tsc, +}; + +static inline u32 down_shift(u64 time, int shift) +{ + if ( shift < 0 ) + return (u32)(time >> -shift); + return (u32)((u32)time << shift); +} + +/* + * 32-bit multiplication of integer multiplicand and fractional multiplier + * yielding 32-bit integer product. + */ +static inline u32 mul_frac(u32 multiplicand, u32 multiplier) +{ + u32 product_int, product_frac; + __asm__ ( + "mul %3" + : "=a" (product_frac), "=d" (product_int) + : "0" (multiplicand), "r" (multiplier) ); + return product_int; +} + +void init_cpu_khz(void) +{ + u64 __cpu_khz = 1000000ULL << 32; + struct vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_time[0]; + do_div(__cpu_khz, info->tsc_to_system_mul); + cpu_khz = down_shift(__cpu_khz, -info->tsc_shift); + printk(KERN_INFO "Xen reported: %lu.%03lu MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); +} + +static u64 get_nsec_offset(struct shadow_time_info *shadow) +{ + u64 now; + u32 delta; + rdtscll(now); + delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift); + return mul_frac(delta, shadow->tsc_to_nsec_mul); +} + +static unsigned long get_usec_offset(struct shadow_time_info *shadow) +{ + u64 now; + u32 delta; + rdtscll(now); + delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift); + return mul_frac(delta, shadow->tsc_to_usec_mul); +} + +static void update_wallclock(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + long wtm_nsec, xtime_nsec; + time_t wtm_sec, xtime_sec; + u64 tmp, usec; + + shadow_tv.tv_sec = s->wc_sec; + shadow_tv.tv_usec = s->wc_usec; + + if (INDEPENDENT_WALLCLOCK()) + return; + + if ((time_status & STA_UNSYNC) != 0) + return; + + /* Adjust wall-clock time base based on wall_jiffies ticks. */ + usec = processed_system_time; + do_div(usec, 1000); + usec += (u64)shadow_tv.tv_sec * 1000000ULL; + usec += (u64)shadow_tv.tv_usec; + usec -= (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ); + + /* Split wallclock base into seconds and nanoseconds. */ + tmp = usec; + xtime_nsec = do_div(tmp, 1000000) * 1000ULL; + xtime_sec = (time_t)tmp; + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec); + + set_normalized_timespec(&xtime, xtime_sec, xtime_nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); +} + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. Must be called with the xtime_lock held for writing. 
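
mul_frac() above is a 32x32->64 multiply that keeps only the high word, i.e.
the multiplier is treated as a 0.32 fixed-point fraction; together with
down_shift() it turns a TSC delta into nanoseconds without a 64-bit divide.
A portable C model of the same arithmetic (illustration, not the patch code):

    #include <stdint.h>

    static uint32_t mul_frac32(uint32_t x, uint32_t frac)
    {
        return (uint32_t)(((uint64_t)x * frac) >> 32);  /* high word */
    }

    /* mul/shift come from Xen's vcpu_time_info for this CPU. */
    static uint32_t tsc_delta_to_ns(uint64_t delta, int shift, uint32_t mul)
    {
        uint32_t scaled = (shift < 0) ? (uint32_t)(delta >> -shift)
                                      : (uint32_t)delta << shift;
        return mul_frac32(scaled, mul);
    }
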
+ */ +static void __get_time_values_from_xen(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + src = &s->vcpu_time[smp_processor_id()]; + dst = &per_cpu(shadow_time, smp_processor_id()); + + do { + dst->version = src->time_version2; + rmb(); + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); + } + while (dst->version != src->time_version1); + + dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000; + + if ((shadow_tv.tv_sec != s->wc_sec) || + (shadow_tv.tv_usec != s->wc_usec)) + update_wallclock(); +} + +static inline int time_values_up_to_date(int cpu) +{ + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + src = &HYPERVISOR_shared_info->vcpu_time[smp_processor_id()]; + dst = &per_cpu(shadow_time, smp_processor_id()); + + return (dst->version == src->time_version2); +} + +#define TIME_VALUES_UP_TO_DATE \ + ({ rmb(); (shadow_time_version == HYPERVISOR_shared_info->time_version2); }) + +/* + * This is a special lock that is owned by the CPU and holds the index + * register we are working with. It is required for NMI access to the + * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. + */ +volatile unsigned long cmos_lock = 0; +EXPORT_SYMBOL(cmos_lock); + +/* Routines for accessing the CMOS RAM/RTC. */ +unsigned char rtc_cmos_read(unsigned char addr) +{ + unsigned char val; + lock_cmos_prefix(addr); + outb_p(addr, RTC_PORT(0)); + val = inb_p(RTC_PORT(1)); + lock_cmos_suffix(addr); + return val; +} +EXPORT_SYMBOL(rtc_cmos_read); + +void rtc_cmos_write(unsigned char val, unsigned char addr) +{ + lock_cmos_prefix(addr); + outb_p(addr, RTC_PORT(0)); + outb_p(val, RTC_PORT(1)); + lock_cmos_suffix(addr); +} +EXPORT_SYMBOL(rtc_cmos_write); + +/* + * This version of gettimeofday has microsecond resolution + * and better than microsecond precision on fast x86 machines with TSC. + */ +void do_gettimeofday(struct timeval *tv) +{ + unsigned long seq; + unsigned long usec, sec; + unsigned long max_ntp_tick; + unsigned long flags; + s64 nsec; + unsigned int cpu; + struct shadow_time_info *shadow; + + cpu = get_cpu(); + shadow = &per_cpu(shadow_time, cpu); + + do { + unsigned long lost; + + seq = read_seqbegin(&xtime_lock); + + usec = get_usec_offset(shadow); + lost = jiffies - wall_jiffies; + + /* + * If time_adjust is negative then NTP is slowing the clock + * so make sure not to go into next possible interval. + * Better to lose some accuracy than have time go backwards.. + */ + if (unlikely(time_adjust < 0)) { + max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj; + usec = min(usec, max_ntp_tick); + + if (lost) + usec += lost * max_ntp_tick; + } + else if (unlikely(lost)) + usec += lost * (USEC_PER_SEC / HZ); + + sec = xtime.tv_sec; + usec += (xtime.tv_nsec / NSEC_PER_USEC); + + nsec = shadow->system_timestamp - processed_system_time; + __normalize_time(&sec, &nsec); + usec += (long)nsec / NSEC_PER_USEC; + + if (unlikely(!time_values_up_to_date(cpu))) { + /* + * We may have blocked for a long time, + * rendering our calculations invalid + * (e.g. the time delta may have + * overflowed). Detect that and recalculate + * with fresh values. 
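
On __get_time_values_from_xen() at the top of this hunk: the version fields
implement a lock-free snapshot. Xen bumps one version counter before updating
the record and the other after, so a reader retries until its two version
reads bracket no update. Reader side, schematically; __sync_synchronize() is
a stand-in for rmb(), and the struct is hypothetical:

    struct snap { volatile unsigned v1, v2; unsigned data; };

    static unsigned read_consistent(const struct snap *s)
    {
        unsigned v, d;

        do {
            v = s->v2;            /* published after the update */
            __sync_synchronize();
            d = s->data;
            __sync_synchronize();
        } while (v != s->v1);     /* racing update? go round again */
        return d;
    }
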
+ */ + write_seqlock_irqsave(&xtime_lock, flags); + __get_time_values_from_xen(); + write_sequnlock_irqrestore(&xtime_lock, flags); + continue; + } + } while (read_seqretry(&xtime_lock, seq)); + + put_cpu(); + + while (usec >= USEC_PER_SEC) { + usec -= USEC_PER_SEC; + sec++; + } + + tv->tv_sec = sec; + tv->tv_usec = usec; +} + +EXPORT_SYMBOL(do_gettimeofday); + +int do_settimeofday(struct timespec *tv) +{ + time_t wtm_sec, sec = tv->tv_sec; + long wtm_nsec; + s64 nsec; + struct timespec xentime; + unsigned int cpu; + struct shadow_time_info *shadow; + + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + if (!INDEPENDENT_WALLCLOCK()) + return 0; /* Silent failure? */ + + cpu = get_cpu(); + shadow = &per_cpu(shadow_time, cpu); + + write_seqlock_irq(&xtime_lock); + + /* + * Ensure we don't get blocked for a long time so that our time delta + * overflows. If that were to happen then our shadow time values would + * be stale, so we can retry with fresh ones. + */ + again: + nsec = (s64)tv->tv_nsec - (s64)get_nsec_offset(shadow); + if (unlikely(!time_values_up_to_date(cpu))) { + __get_time_values_from_xen(); + goto again; + } + + __normalize_time(&sec, &nsec); + set_normalized_timespec(&xentime, sec, nsec); + + /* + * This is revolting. We need to set "xtime" correctly. However, the + * value in this location is the value at the most recent update of + * wall time. Discover what correction gettimeofday() would have + * made, and then undo it! + */ + nsec -= (jiffies - wall_jiffies) * TICK_NSEC; + + nsec -= (shadow->system_timestamp - processed_system_time); + + __normalize_time(&sec, &nsec); + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + + set_normalized_timespec(&xtime, sec, nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + if (xen_start_info.flags & SIF_INITDOMAIN) { + dom0_op_t op; + op.cmd = DOM0_SETTIME; + op.u.settime.secs = xentime.tv_sec; + op.u.settime.usecs = xentime.tv_nsec / NSEC_PER_USEC; + op.u.settime.system_time = shadow->system_timestamp; + write_sequnlock_irq(&xtime_lock); + HYPERVISOR_dom0_op(&op); + } else +#endif + write_sequnlock_irq(&xtime_lock); + + put_cpu(); + + clock_was_set(); + return 0; +} + +EXPORT_SYMBOL(do_settimeofday); + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +static int set_rtc_mmss(unsigned long nowtime) +{ + int retval; + + WARN_ON(irqs_disabled()); + + /* gets recalled with irq locally disabled */ + spin_lock_irq(&rtc_lock); + if (efi_enabled) + retval = efi_set_rtc_mmss(nowtime); + else + retval = mach_set_rtc_mmss(nowtime); + spin_unlock_irq(&rtc_lock); + + return retval; +} +#else +static int set_rtc_mmss(unsigned long nowtime) +{ + return 0; +} +#endif + +/* monotonic_clock(): returns # of nanoseconds passed since time_init() + * Note: This function is required to return accurate + * time even in the absence of multiple timer ticks. 
+ */
+unsigned long long monotonic_clock(void)
+{
+ int cpu = get_cpu();
+ struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+ s64 off;
+ unsigned long flags;
+
+ for ( ; ; ) {
+ off = get_nsec_offset(shadow);
+ if (time_values_up_to_date(cpu))
+ break;
+ write_seqlock_irqsave(&xtime_lock, flags);
+ __get_time_values_from_xen();
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+ }
+
+ put_cpu();
+
+ return shadow->system_timestamp + off;
+}
+EXPORT_SYMBOL(monotonic_clock);
+
+unsigned long long sched_clock(void)
+{
+ return monotonic_clock();
+}
+
+#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
+unsigned long profile_pc(struct pt_regs *regs)
+{
+ unsigned long pc = instruction_pointer(regs);
+
+ if (in_lock_functions(pc))
+ return *(unsigned long *)(regs->ebp + 4);
+
+ return pc;
+}
+EXPORT_SYMBOL(profile_pc);
+#endif
+
+/*
+ * timer_interrupt() needs to keep up the real-time clock,
+ * as well as call the "do_timer()" routine every clocktick
+ */
+static inline void do_timer_interrupt(int irq, void *dev_id,
+ struct pt_regs *regs)
+{
+ s64 delta, delta_cpu;
+ int cpu = smp_processor_id();
+ struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+
+ do {
+ __get_time_values_from_xen();
+
+ delta = delta_cpu =
+ shadow->system_timestamp + get_nsec_offset(shadow);
+ delta -= processed_system_time;
+ delta_cpu -= per_cpu(processed_system_time, cpu);
+ }
+ while (!time_values_up_to_date(cpu));
+
+ if (unlikely(delta < 0) || unlikely(delta_cpu < 0)) {
+ printk("Timer ISR/%d: Time went backwards: "
+ "delta=%lld cpu_delta=%lld shadow=%lld "
+ "off=%lld processed=%lld cpu_processed=%lld\n",
+ cpu, delta, delta_cpu, shadow->system_timestamp,
+ (s64)get_nsec_offset(shadow),
+ processed_system_time,
+ per_cpu(processed_system_time, cpu));
+ for (cpu = 0; cpu < num_online_cpus(); cpu++)
+ printk(" %d: %lld\n", cpu,
+ per_cpu(processed_system_time, cpu));
+ return;
+ }
+
+ /* System-wide jiffy work. */
+ while (delta >= NS_PER_TICK) {
+ delta -= NS_PER_TICK;
+ processed_system_time += NS_PER_TICK;
+ do_timer(regs);
+ }
+
+ /* Local CPU jiffy work. */
+ while (delta_cpu >= NS_PER_TICK) {
+ delta_cpu -= NS_PER_TICK;
+ per_cpu(processed_system_time, cpu) += NS_PER_TICK;
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING, regs);
+ }
+}
+
+/*
+ * This is the same as the above, except we _also_ save the current
+ * Time Stamp Counter value at the time of the timer interrupt, so that
+ * we later on can estimate the time of day more exactly.
+ */
+irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+ /*
+ * Here we are in the timer irq handler. We just have irqs locally
+ * disabled but we don't know if the timer_bh is running on the other
+ * CPU. We need to avoid an SMP race with it. NOTE: we don't need
+ * the irq version of write_lock because as just said we have irq
+ * locally disabled.
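
The delta loops in do_timer_interrupt() above are what make lost timer events
harmless: elapsed Xen system time is consumed in whole NS_PER_TICK steps, so
if several ticks' worth accumulated, do_timer()/update_process_times() simply
run that many times. The pattern in isolation (hypothetical HZ value):

    #define NSEC_PER_TICK (1000000000ULL / 100)   /* HZ == 100 here */

    static unsigned int consume_ticks(unsigned long long *delta_ns)
    {
        unsigned int ticks = 0;

        while (*delta_ns >= NSEC_PER_TICK) {
            *delta_ns -= NSEC_PER_TICK;
            ticks++;              /* one do_timer() call per tick */
        }
        return ticks;
    }
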
-arca + */ + write_seqlock(&xtime_lock); + do_timer_interrupt(irq, NULL, regs); + write_sequnlock(&xtime_lock); + return IRQ_HANDLED; +} + +/* not static: needed by APM */ +unsigned long get_cmos_time(void) +{ + unsigned long retval; + + spin_lock(&rtc_lock); + + if (efi_enabled) + retval = efi_get_time(); + else + retval = mach_get_cmos_time(); + + spin_unlock(&rtc_lock); + + return retval; +} +static void sync_cmos_clock(unsigned long dummy); + +static struct timer_list sync_cmos_timer = + TIMER_INITIALIZER(sync_cmos_clock, 0, 0); + +static void sync_cmos_clock(unsigned long dummy) +{ + struct timeval now, next; + int fail = 1; + + /* + * If we have an externally synchronized Linux clock, then update + * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be + * called as close as possible to 500 ms before the new second starts. + * This code is run on a timer. If the clock is set, that timer + * may not expire at the correct time. Thus, we adjust... + */ + if ((time_status & STA_UNSYNC) != 0) + /* + * Not synced, exit, do not restart a timer (if one is + * running, let it run out). + */ + return; + + do_gettimeofday(&now); + if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && + now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) + fail = set_rtc_mmss(now.tv_sec); + + next.tv_usec = USEC_AFTER - now.tv_usec; + if (next.tv_usec <= 0) + next.tv_usec += USEC_PER_SEC; + + if (!fail) + next.tv_sec = 659; + else + next.tv_sec = 0; + + if (next.tv_usec >= USEC_PER_SEC) { + next.tv_sec++; + next.tv_usec -= USEC_PER_SEC; + } + mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); +} + +void notify_arch_cmos_timer(void) +{ + mod_timer(&sync_cmos_timer, jiffies + 1); +} + +static long clock_cmos_diff, sleep_start; + +static int timer_suspend(struct sys_device *dev, pm_message_t state) +{ + /* + * Estimate time zone so that set_time can update the clock + */ + clock_cmos_diff = -get_cmos_time(); + clock_cmos_diff += get_seconds(); + sleep_start = get_cmos_time(); + return 0; +} + +static int timer_resume(struct sys_device *dev) +{ + unsigned long flags; + unsigned long sec; + unsigned long sleep_length; + +#ifdef CONFIG_HPET_TIMER + if (is_hpet_enabled()) + hpet_reenable(); +#endif + sec = get_cmos_time() + clock_cmos_diff; + sleep_length = (get_cmos_time() - sleep_start) * HZ; + write_seqlock_irqsave(&xtime_lock, flags); + xtime.tv_sec = sec; + xtime.tv_nsec = 0; + write_sequnlock_irqrestore(&xtime_lock, flags); + jiffies += sleep_length; + wall_jiffies += sleep_length; + return 0; +} + +static struct sysdev_class timer_sysclass = { + .resume = timer_resume, + .suspend = timer_suspend, + set_kset_name("timer"), +}; + + +/* XXX this driverfs stuff should probably go elsewhere later -john */ +static struct sys_device device_timer = { + .id = 0, + .cls = &timer_sysclass, +}; + +static int time_init_device(void) +{ + int error = sysdev_class_register(&timer_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(time_init_device); + +#ifdef CONFIG_HPET_TIMER +extern void (*late_time_init)(void); +/* Duplicate of time_init() below, with hpet_enable part added */ +static void __init hpet_time_init(void) +{ + xtime.tv_sec = get_cmos_time(); + xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + if ((hpet_enable() >= 0) && hpet_use_timer) { + printk("Using HPET for base-timer\n"); + } + + cur_timer = select_timer(); + 
printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); + + time_init_hook(); +} +#endif + +/* Dynamically-mapped IRQ. */ +static DEFINE_PER_CPU(int, timer_irq); + +static struct irqaction irq_timer = { + timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer0", + NULL, NULL +}; + +void __init time_init(void) +{ +#ifdef CONFIG_HPET_TIMER + if (is_hpet_capable()) { + /* + * HPET initialization needs to do memory-mapped io. So, let + * us do a late initialization after mem_init(). + */ + late_time_init = hpet_time_init; + return; + } +#endif + __get_time_values_from_xen(); + xtime.tv_sec = shadow_tv.tv_sec; + xtime.tv_nsec = shadow_tv.tv_usec * NSEC_PER_USEC; + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + processed_system_time = per_cpu(shadow_time, 0).system_timestamp; + per_cpu(processed_system_time, 0) = processed_system_time; + + init_cpu_khz(); + +#if defined(__x86_64__) + vxtime.mode = VXTIME_TSC; + vxtime.quot = (1000000L << 32) / vxtime_hz; + vxtime.tsc_quot = (1000L << 32) / cpu_khz; + vxtime.hz = vxtime_hz; + sync_core(); + rdtscll(vxtime.last_tsc); +#endif + + per_cpu(timer_irq, 0) = bind_virq_to_irq(VIRQ_TIMER); + (void)setup_irq(per_cpu(timer_irq, 0), &irq_timer); +} + +/* Convert jiffies to system time. */ +static inline u64 jiffies_to_st(unsigned long j) +{ + unsigned long seq; + long delta; + u64 st; + + do { + seq = read_seqbegin(&xtime_lock); + delta = j - jiffies; + /* NB. The next check can trigger in some wrap-around cases, + * but that's ok: we'll just end up with a shorter timeout. */ + if (delta < 1) + delta = 1; + st = processed_system_time + (delta * NS_PER_TICK); + } while (read_seqretry(&xtime_lock, seq)); + + return st; +} + +/* + * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu + * These functions are based on implementations from arch/s390/kernel/time.c + */ +void stop_hz_timer(void) +{ + unsigned int cpu = smp_processor_id(); + unsigned long j; + + /* s390 does this /before/ checking rcu_pending(). We copy them. */ + cpu_set(cpu, nohz_cpu_mask); + + /* Leave ourselves in 'tick mode' if rcu or softirq pending. */ + if (rcu_pending(cpu) || local_softirq_pending()) { + cpu_clear(cpu, nohz_cpu_mask); + j = jiffies + 1; + } else { + j = next_timer_interrupt(); + } + + BUG_ON(HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0); +} + +void start_hz_timer(void) +{ + cpu_clear(smp_processor_id(), nohz_cpu_mask); +} + +void time_suspend(void) +{ + /* nothing */ +} + +/* No locking required. We are only CPU running, and interrupts are off. */ +void time_resume(void) +{ + init_cpu_khz(); + + /* Get timebases for new environment. */ + __get_time_values_from_xen(); + + /* Reset our own concept of passage of system time. */ + processed_system_time = + per_cpu(shadow_time, smp_processor_id()).system_timestamp; + per_cpu(processed_system_time, 0) = processed_system_time; +} + +#ifdef CONFIG_SMP +static char timer_name[NR_CPUS][15]; +void local_setup_timer(void) +{ + int seq, cpu = smp_processor_id(); + + do { + seq = read_seqbegin(&xtime_lock); + per_cpu(processed_system_time, cpu) = + per_cpu(shadow_time, cpu).system_timestamp; + } while (read_seqretry(&xtime_lock, seq)); + + per_cpu(timer_irq, cpu) = bind_virq_to_irq(VIRQ_TIMER); + sprintf(timer_name[cpu], "timer%d", cpu); + BUG_ON(request_irq(per_cpu(timer_irq, cpu), timer_interrupt, + SA_INTERRUPT, timer_name[cpu], NULL)); +} +#endif + +/* + * /proc/sys/xen: This really belongs in another file. It can stay here for + * now however. 
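
Back up a little: stop_hz_timer()/jiffies_to_st() above are the heart of
tickless idle here. Instead of taking VIRQ_TIMER every 1/HZ, the idle guest
programs a one-shot Xen timer for the next pending jiffy; the conversion is
just "remaining ticks on top of the last processed system time". Compressed:

    /* Target jiffy -> Xen system time (ns since boot); sketch only. */
    static unsigned long long jiffy_deadline(unsigned long target_jiffy,
                                             unsigned long jiffies_now,
                                             unsigned long long processed_ns)
    {
        long delta = (long)(target_jiffy - jiffies_now);

        if (delta < 1)
            delta = 1;    /* wrap-around only shortens the timeout */
        return processed_ns + (unsigned long long)delta * NS_PER_TICK;
    }
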
+ */ +static ctl_table xen_subtable[] = { + {1, "independent_wallclock", &independent_wallclock, + sizeof(independent_wallclock), 0644, NULL, proc_dointvec}, + {0} +}; +static ctl_table xen_table[] = { + {123, "xen", NULL, 0, 0555, xen_subtable}, + {0} +}; +static int __init xen_sysctl_init(void) +{ + (void)register_sysctl_table(xen_table, 0); + return 0; +} +__initcall(xen_sysctl_init); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/traps.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/traps.c @@ -0,0 +1,1026 @@ +/* + * linux/arch/i386/traps.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@xxxxxxxxxxx>, May 2000 + */ + +/* + * 'Traps.c' handles hardware traps and faults after we have saved some + * state in 'asm.s'. + */ +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <linux/highmem.h> +#include <linux/kallsyms.h> +#include <linux/ptrace.h> +#include <linux/utsname.h> +#include <linux/kprobes.h> + +#ifdef CONFIG_EISA +#include <linux/ioport.h> +#include <linux/eisa.h> +#endif + +#ifdef CONFIG_MCA +#include <linux/mca.h> +#endif + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/atomic.h> +#include <asm/debugreg.h> +#include <asm/desc.h> +#include <asm/i387.h> +#include <asm/nmi.h> + +#include <asm/smp.h> +#include <asm/arch_hooks.h> +#include <asm/kdebug.h> + +#include <linux/irq.h> +#include <linux/module.h> + +#include "mach_traps.h" + +asmlinkage int system_call(void); + +/* Do we ignore FPU interrupts ? */ +char ignore_fpu_irq = 0; + +/* + * The IDT has to be page-aligned to simplify the Pentium + * F0 0F bug workaround.. We have a special link segment + * for this. 
+ */
+struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
+
+asmlinkage void divide_error(void);
+asmlinkage void debug(void);
+asmlinkage void nmi(void);
+asmlinkage void int3(void);
+asmlinkage void overflow(void);
+asmlinkage void bounds(void);
+asmlinkage void invalid_op(void);
+asmlinkage void device_not_available(void);
+asmlinkage void coprocessor_segment_overrun(void);
+asmlinkage void invalid_TSS(void);
+asmlinkage void segment_not_present(void);
+asmlinkage void stack_segment(void);
+asmlinkage void general_protection(void);
+asmlinkage void page_fault(void);
+asmlinkage void coprocessor_error(void);
+asmlinkage void simd_coprocessor_error(void);
+asmlinkage void alignment_check(void);
+asmlinkage void fixup_4gb_segment(void);
+asmlinkage void machine_check(void);
+
+static int kstack_depth_to_print = 24;
+struct notifier_block *i386die_chain;
+static DEFINE_SPINLOCK(die_notifier_lock);
+
+int register_die_notifier(struct notifier_block *nb)
+{
+	int err = 0;
+	unsigned long flags;
+	spin_lock_irqsave(&die_notifier_lock, flags);
+	err = notifier_chain_register(&i386die_chain, nb);
+	spin_unlock_irqrestore(&die_notifier_lock, flags);
+	return err;
+}
+
+static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
+{
+	return p > (void *)tinfo &&
+	       p < (void *)tinfo + THREAD_SIZE - 3;
+}
+
+static inline unsigned long print_context_stack(struct thread_info *tinfo,
+				unsigned long *stack, unsigned long ebp)
+{
+	unsigned long addr;
+
+#ifdef CONFIG_FRAME_POINTER
+	while (valid_stack_ptr(tinfo, (void *)ebp)) {
+		addr = *(unsigned long *)(ebp + 4);
+		printk(" [<%08lx>] ", addr);
+		print_symbol("%s", addr);
+		printk("\n");
+		ebp = *(unsigned long *)ebp;
+	}
+#else
+	while (valid_stack_ptr(tinfo, stack)) {
+		addr = *stack++;
+		if (__kernel_text_address(addr)) {
+			printk(" [<%08lx>]", addr);
+			print_symbol(" %s", addr);
+			printk("\n");
+		}
+	}
+#endif
+	return ebp;
+}
+
+void show_trace(struct task_struct *task, unsigned long * stack)
+{
+	unsigned long ebp;
+
+	if (!task)
+		task = current;
+
+	if (task == current) {
+		/* Grab ebp right from our regs */
+		asm ("movl %%ebp, %0" : "=r" (ebp) : );
+	} else {
+		/* ebp is the last reg pushed by switch_to */
+		ebp = *(unsigned long *) task->thread.esp;
+	}
+
+	while (1) {
+		struct thread_info *context;
+		context = (struct thread_info *)
+			((unsigned long)stack & (~(THREAD_SIZE - 1)));
+		ebp = print_context_stack(context, stack, ebp);
+		stack = (unsigned long*)context->previous_esp;
+		if (!stack)
+			break;
+		printk(" =======================\n");
+	}
+}
+
+void show_stack(struct task_struct *task, unsigned long *esp)
+{
+	unsigned long *stack;
+	int i;
+
+	if (esp == NULL) {
+		if (task)
+			esp = (unsigned long*)task->thread.esp;
+		else
+			esp = (unsigned long *)&esp;
+	}
+
+	stack = esp;
+	for(i = 0; i < kstack_depth_to_print; i++) {
+		if (kstack_end(stack))
+			break;
+		if (i && ((i % 8) == 0))
+			printk("\n ");
+		printk("%08lx ", *stack++);
+	}
+	printk("\nCall Trace:\n");
+	show_trace(task, esp);
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+	unsigned long stack;
+
+	show_trace(current, &stack);
+}
+
+EXPORT_SYMBOL(dump_stack);
+
+void show_registers(struct pt_regs *regs)
+{
+	int i;
+	int in_kernel = 1;
+	unsigned long esp;
+	unsigned short ss;
+
+	esp = (unsigned long) (&regs->esp);
+	ss = __KERNEL_DS;
+	if (regs->xcs & 2) {
+		in_kernel = 0;
+		esp = regs->esp;
+		ss = regs->xss & 0xffff;
+	}
+	print_modules();
+	printk("CPU: %d\nEIP: %04x:[<%08lx>] %s
VLI\nEFLAGS: %08lx" + " (%s) \n", + smp_processor_id(), 0xffff & regs->xcs, regs->eip, + print_tainted(), regs->eflags, system_utsname.release); + print_symbol("EIP is at %s\n", regs->eip); + printk("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->eax, regs->ebx, regs->ecx, regs->edx); + printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->esi, regs->edi, regs->ebp, esp); + printk("ds: %04x es: %04x ss: %04x\n", + regs->xds & 0xffff, regs->xes & 0xffff, ss); + printk("Process %s (pid: %d, threadinfo=%p task=%p)", + current->comm, current->pid, current_thread_info(), current); + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. + */ + if (in_kernel) { + u8 *eip; + + printk("\nStack: "); + show_stack(NULL, (unsigned long*)esp); + + printk("Code: "); + + eip = (u8 *)regs->eip - 43; + for (i = 0; i < 64; i++, eip++) { + unsigned char c; + + if (eip < (u8 *)PAGE_OFFSET || __get_user(c, eip)) { + printk(" Bad EIP value."); + break; + } + if (eip == (u8 *)regs->eip) + printk("<%02x> ", c); + else + printk("%02x ", c); + } + } + printk("\n"); +} + +static void handle_BUG(struct pt_regs *regs) +{ + unsigned short ud2; + unsigned short line; + char *file; + char c; + unsigned long eip; + + if (regs->xcs & 2) + goto no_bug; /* Not in kernel */ + + eip = regs->eip; + + if (eip < PAGE_OFFSET) + goto no_bug; + if (__get_user(ud2, (unsigned short *)eip)) + goto no_bug; + if (ud2 != 0x0b0f) + goto no_bug; + if (__get_user(line, (unsigned short *)(eip + 2))) + goto bug; + if (__get_user(file, (char **)(eip + 4)) || + (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) + file = "<bad filename>"; + + printk("------------[ cut here ]------------\n"); + printk(KERN_ALERT "kernel BUG at %s:%d!\n", file, line); + +no_bug: + return; + + /* Here we know it was a BUG but file-n-line is unavailable */ +bug: + printk("Kernel BUG\n"); +} + +void die(const char * str, struct pt_regs * regs, long err) +{ + static struct { + spinlock_t lock; + u32 lock_owner; + int lock_owner_depth; + } die = { + .lock = SPIN_LOCK_UNLOCKED, + .lock_owner = -1, + .lock_owner_depth = 0 + }; + static int die_counter; + + if (die.lock_owner != _smp_processor_id()) { + console_verbose(); + spin_lock_irq(&die.lock); + die.lock_owner = smp_processor_id(); + die.lock_owner_depth = 0; + bust_spinlocks(1); + } + + if (++die.lock_owner_depth < 3) { + int nl = 0; + handle_BUG(regs); + printk(KERN_ALERT "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); + nl = 1; +#endif +#ifdef CONFIG_SMP + printk("SMP "); + nl = 1; +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); + nl = 1; +#endif + if (nl) + printk("\n"); + notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); + show_registers(regs); + } else + printk(KERN_ERR "Recursive die() failure, output suppressed\n"); + + bust_spinlocks(0); + die.lock_owner = -1; + spin_unlock_irq(&die.lock); + if (in_interrupt()) + panic("Fatal exception in interrupt"); + + if (panic_on_oops) { + printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n"); + ssleep(5); + panic("Fatal exception"); + } + do_exit(SIGSEGV); +} + +static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) +{ + if (!(regs->eflags & VM_MASK) && !(2 & regs->xcs)) + die(str, regs, err); +} + +static void do_trap(int trapnr, int signr, char *str, int vm86, + struct pt_regs * regs, long error_code, siginfo_t *info) +{ + if (regs->eflags & VM_MASK) { + if (vm86) + goto vm86_trap; + goto 
trap_signal; + } + + if (!(regs->xcs & 2)) + goto kernel_trap; + + trap_signal: { + struct task_struct *tsk = current; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + if (info) + force_sig_info(signr, info, tsk); + else + force_sig(signr, tsk); + return; + } + + kernel_trap: { + if (!fixup_exception(regs)) + die(str, regs, error_code); + return; + } + + vm86_trap: { + int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr); + if (ret) goto trap_signal; + return; + } +} + +#define DO_ERROR(trapnr, signr, str, name) \ +fastcall void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ +} + +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +fastcall void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ +} + +#define DO_VM86_ERROR(trapnr, signr, str, name) \ +fastcall void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ +} + +#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +fastcall void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ +} + +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) +#ifndef CONFIG_KPROBES +DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) +#endif +DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) +DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip) +DO_VM86_ERROR( 7, SIGSEGV, "device not available", device_not_available) +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) +DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0) +#ifdef CONFIG_X86_MCE +DO_ERROR(18, SIGBUS, "machine check", machine_check) +#endif + +fastcall void do_general_protection(struct pt_regs * regs, long error_code) +{ + /* + * If we trapped on an LDT access then ensure that the default_ldt is + * loaded, if nothing else. We load default_ldt lazily because LDT + * switching costs time and many applications don't need it. 
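+	 * A GP error code with bit 2 set and bit 1 clear, i.e.
+	 * (error_code & 6) == 4, means the faulting selector referenced
+	 * the LDT; "sldt" below then tells us whether an LDT is
+	 * installed at all before we fall back to default_ldt.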
+ */ + if (unlikely((error_code & 6) == 4)) { + unsigned long ldt; + __asm__ __volatile__ ("sldt %0" : "=r" (ldt)); + if (ldt == 0) { + xen_set_ldt((unsigned long)&default_ldt[0], 5); + return; + } + } + + if (regs->eflags & VM_MASK) + goto gp_in_vm86; + + if (!(regs->xcs & 2)) + goto gp_in_kernel; + + current->thread.error_code = error_code; + current->thread.trap_no = 13; + force_sig(SIGSEGV, current); + return; + +gp_in_vm86: + local_irq_enable(); + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + return; + +gp_in_kernel: + if (!fixup_exception(regs)) { + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; + die("general protection fault", regs, error_code); + } +} + +static void mem_parity_error(unsigned char reason, struct pt_regs * regs) +{ + printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); + printk("You probably have a hardware problem with your RAM chips\n"); + + /* Clear and disable the memory parity error line. */ + clear_mem_error(reason); +} + +static void io_check_error(unsigned char reason, struct pt_regs * regs) +{ + unsigned long i; + + printk("NMI: IOCK error (debug interrupt?)\n"); + show_registers(regs); + + /* Re-enable the IOCK line, wait for a few seconds */ + reason = (reason & 0xf) | 8; + outb(reason, 0x61); + i = 2000; + while (--i) udelay(1000); + reason &= ~8; + outb(reason, 0x61); +} + +static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) +{ +#ifdef CONFIG_MCA + /* Might actually be able to figure out what the guilty party + * is. */ + if( MCA_bus ) { + mca_handle_nmi(); + return; + } +#endif + printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + printk("Dazed and confused, but trying to continue\n"); + printk("Do you have a strange power saving mode enabled?\n"); +} + +static DEFINE_SPINLOCK(nmi_print_lock); + +void die_nmi (struct pt_regs *regs, const char *msg) +{ + spin_lock(&nmi_print_lock); + /* + * We are in trouble anyway, lets at least try + * to get a message out. + */ + bust_spinlocks(1); + printk(msg); + printk(" on CPU%d, eip %08lx, registers:\n", + smp_processor_id(), regs->eip); + show_registers(regs); + printk("console shuts up ...\n"); + console_silent(); + spin_unlock(&nmi_print_lock); + bust_spinlocks(0); + do_exit(SIGSEGV); +} + +static void default_do_nmi(struct pt_regs * regs) +{ + unsigned char reason = 0; + + /* Only the BSP gets external NMIs from the system. */ + if (!smp_processor_id()) + reason = get_nmi_reason(); + + if (!(reason & 0xc0)) { + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) + == NOTIFY_STOP) + return; +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Ok, so this is none of the documented NMI sources, + * so it must be the NMI watchdog. + */ + if (nmi_watchdog) { + nmi_watchdog_tick(regs); + return; + } +#endif + unknown_nmi_error(reason, regs); + return; + } + if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP) + return; + if (reason & 0x80) + mem_parity_error(reason, regs); + if (reason & 0x40) + io_check_error(reason, regs); + /* + * Reassert NMI in case it became active meanwhile + * as it's edge-triggered. 
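+	 * (Bits 7 and 6 of the reason port drove the handlers above:
+	 * 0x80 is a memory parity error, 0x40 an I/O check error;
+	 * anything else went to unknown_nmi_error().)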
+ */ + reassert_nmi(); +} + +static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +{ + return 0; +} + +static nmi_callback_t nmi_callback = dummy_nmi_callback; + +fastcall void do_nmi(struct pt_regs * regs, long error_code) +{ + int cpu; + + nmi_enter(); + + cpu = smp_processor_id(); + +#ifdef CONFIG_HOTPLUG_CPU + if (!cpu_online(cpu)) { + nmi_exit(); + return; + } +#endif + + ++nmi_count(cpu); + + if (!nmi_callback(regs, cpu)) + default_do_nmi(regs); + + nmi_exit(); +} + +void set_nmi_callback(nmi_callback_t callback) +{ + nmi_callback = callback; +} + +void unset_nmi_callback(void) +{ + nmi_callback = dummy_nmi_callback; +} + +#ifdef CONFIG_KPROBES +fastcall void do_int3(struct pt_regs *regs, long error_code) +{ + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; + /* This is an interrupt gate, because kprobes wants interrupts + disabled. Normal trap handlers don't. */ + restore_interrupts(regs); + do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); +} +#endif + +/* + * Our handling of the processor debug registers is non-trivial. + * We do not clear them on entry and exit from the kernel. Therefore + * it is possible to get a watchpoint trap here from inside the kernel. + * However, the code in ./ptrace.c has ensured that the user can + * only set watchpoints on userspace addresses. Therefore the in-kernel + * watchpoint trap can only occur in code which is reading/writing + * from user space. Such code must not hold kernel locks (since it + * can equally take a page fault), therefore it is safe to call + * force_sig_info even though that claims and releases locks. + * + * Code in ./signal.c ensures that the debug control register + * is restored before we deliver any signal, and therefore that + * user code runs with the correct debug control register even though + * we clear it here. + * + * Being careful here means that we don't have to be as careful in a + * lot of more complicated places (task switching can be a bit lazy + * about restoring all the debug state, and ptrace doesn't have to + * find every occurrence of the TF bit that could be saved away even + * by user code) + */ +fastcall void do_debug(struct pt_regs * regs, long error_code) +{ + unsigned int condition; + struct task_struct *tsk = current; + + condition = HYPERVISOR_get_debugreg(6); + + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, + SIGTRAP) == NOTIFY_STOP) + return; +#if 0 + /* It's safe to allow irq's after DR6 has been saved */ + if (regs->eflags & X86_EFLAGS_IF) + local_irq_enable(); +#endif + + /* Mask out spurious debug traps due to lazy DR7 setting */ + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { + if (!tsk->thread.debugreg[7]) + goto clear_dr7; + } + + if (regs->eflags & VM_MASK) + goto debug_vm86; + + /* Save debug status register where ptrace can see it */ + tsk->thread.debugreg[6] = condition; + + /* + * Single-stepping through TF: make sure we ignore any events in + * kernel space (but re-enable TF when returning to user mode). + */ + if (condition & DR_STEP) { + /* + * We already checked v86 mode above, so we can + * check for kernel mode by just checking the CPL + * of CS. + */ + if ((regs->xcs & 2) == 0) + goto clear_TF_reenable; + } + + /* Ok, finally something we can handle */ + send_sigtrap(tsk, regs, error_code); + + /* Disable additional traps. They'll be re-enabled when + * the signal is delivered. 
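+	 *
+	 * As a Xen guest we cannot touch %db7 directly: the DR6 read
+	 * above and the DR7 clear below both go through the
+	 * HYPERVISOR_get_debugreg()/HYPERVISOR_set_debugreg() hypercalls.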
+	 */
+clear_dr7:
+	HYPERVISOR_set_debugreg(7, 0);
+	return;
+
+debug_vm86:
+	handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
+	return;
+
+clear_TF_reenable:
+	set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+	regs->eflags &= ~TF_MASK;
+	return;
+}
+
+/*
+ * Note that we play around with the 'TS' bit in an attempt to get
+ * the correct behaviour even in the presence of the asynchronous
+ * IRQ13 behaviour
+ */
+void math_error(void __user *eip)
+{
+	struct task_struct * task;
+	siginfo_t info;
+	unsigned short cwd, swd;
+
+	/*
+	 * Save the info for the exception handler and clear the error.
+	 */
+	task = current;
+	save_init_fpu(task);
+	task->thread.trap_no = 16;
+	task->thread.error_code = 0;
+	info.si_signo = SIGFPE;
+	info.si_errno = 0;
+	info.si_code = __SI_FAULT;
+	info.si_addr = eip;
+	/*
+	 * (~cwd & swd) will mask out exceptions that are not set to unmasked
+	 * status. 0x3f is the exception bits in these regs, 0x200 is the
+	 * C1 reg you need in case of a stack fault, 0x040 is the stack
+	 * fault bit. We should only be taking one exception at a time,
+	 * so if this combination doesn't produce any single exception,
+	 * then we have a bad program that isn't synchronizing its FPU usage
+	 * and it will suffer the consequences since we won't be able to
+	 * fully reproduce the context of the exception
+	 */
+	cwd = get_fpu_cwd(task);
+	swd = get_fpu_swd(task);
+	switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) {
+		case 0x000:
+		default:
+			break;
+		case 0x001: /* Invalid Op */
+		case 0x041: /* Stack Fault */
+		case 0x241: /* Stack Fault | Direction */
+			info.si_code = FPE_FLTINV;
+			/* Should we clear the SF or let user space do it ???? */
+			break;
+		case 0x002: /* Denormalize */
+		case 0x010: /* Underflow */
+			info.si_code = FPE_FLTUND;
+			break;
+		case 0x004: /* Zero Divide */
+			info.si_code = FPE_FLTDIV;
+			break;
+		case 0x008: /* Overflow */
+			info.si_code = FPE_FLTOVF;
+			break;
+		case 0x020: /* Precision */
+			info.si_code = FPE_FLTRES;
+			break;
+	}
+	force_sig_info(SIGFPE, &info, task);
+}
+
+fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
+{
+	ignore_fpu_irq = 1;
+	math_error((void __user *)regs->eip);
+}
+
+static void simd_math_error(void __user *eip)
+{
+	struct task_struct * task;
+	siginfo_t info;
+	unsigned short mxcsr;
+
+	/*
+	 * Save the info for the exception handler and clear the error.
+	 */
+	task = current;
+	save_init_fpu(task);
+	task->thread.trap_no = 19;
+	task->thread.error_code = 0;
+	info.si_signo = SIGFPE;
+	info.si_errno = 0;
+	info.si_code = __SI_FAULT;
+	info.si_addr = eip;
+	/*
+	 * The SIMD FPU exceptions are handled a little differently, as there
+	 * is only a single status/control register. Thus, to determine which
+	 * unmasked exception was caught we must mask the exception mask bits
+	 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
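+	 * For example, an unmasked divide-by-zero leaves ZE (bit 2, 0x004)
+	 * set in the flag bits while ZM (bit 9) is clear in the mask bits,
+	 * so ~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f) evaluates to 0x004
+	 * and FPE_FLTDIV is reported below.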
+	 */
+	mxcsr = get_fpu_mxcsr(task);
+	switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
+		case 0x000:
+		default:
+			break;
+		case 0x001: /* Invalid Op */
+			info.si_code = FPE_FLTINV;
+			break;
+		case 0x002: /* Denormalize */
+		case 0x010: /* Underflow */
+			info.si_code = FPE_FLTUND;
+			break;
+		case 0x004: /* Zero Divide */
+			info.si_code = FPE_FLTDIV;
+			break;
+		case 0x008: /* Overflow */
+			info.si_code = FPE_FLTOVF;
+			break;
+		case 0x020: /* Precision */
+			info.si_code = FPE_FLTRES;
+			break;
+	}
+	force_sig_info(SIGFPE, &info, task);
+}
+
+fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
+					  long error_code)
+{
+	if (cpu_has_xmm) {
+		/* Handle SIMD FPU exceptions on PIII+ processors. */
+		ignore_fpu_irq = 1;
+		simd_math_error((void __user *)regs->eip);
+	} else {
+		/*
+		 * Handle strange cache flush from user space exception
+		 * in all other cases. This is undocumented behaviour.
+		 */
+		if (regs->eflags & VM_MASK) {
+			handle_vm86_fault((struct kernel_vm86_regs *)regs,
+					  error_code);
+			return;
+		}
+		die_if_kernel("cache flush denied", regs, error_code);
+		current->thread.trap_no = 19;
+		current->thread.error_code = error_code;
+		force_sig(SIGSEGV, current);
+	}
+}
+
+fastcall void setup_x86_bogus_stack(unsigned char * stk)
+{
+	unsigned long *switch16_ptr, *switch32_ptr;
+	struct pt_regs *regs;
+	unsigned long stack_top, stack_bot;
+	unsigned short iret_frame16_off;
+	int cpu = smp_processor_id();
+	/* reserve the space on 32bit stack for the magic switch16 pointer */
+	memmove(stk, stk + 8, sizeof(struct pt_regs));
+	switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
+	regs = (struct pt_regs *)stk;
+	/* now the switch32 on 16bit stack */
+	stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
+	stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
+	switch32_ptr = (unsigned long *)(stack_top - 8);
+	iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
+	/* copy iret frame on 16bit stack */
+	memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
+	/* fill in the switch pointers */
+	switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
+	switch16_ptr[1] = __ESPFIX_SS;
+	switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
+		8 - CPU_16BIT_STACK_SIZE;
+	switch32_ptr[1] = __KERNEL_DS;
+}
+
+fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
+{
+	unsigned long *switch32_ptr;
+	unsigned char *stack16, *stack32;
+	unsigned long stack_top, stack_bot;
+	int len;
+	int cpu = smp_processor_id();
+	stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
+	stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
+	switch32_ptr = (unsigned long *)(stack_top - 8);
+	/* copy the data from 16bit stack to 32bit stack */
+	len = CPU_16BIT_STACK_SIZE - 8 - sp;
+	stack16 = (unsigned char *)(stack_bot + sp);
+	stack32 = (unsigned char *)
+		(switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
+	memcpy(stack32, stack16, len);
+	return stack32;
+}
+
+/*
+ * 'math_state_restore()' saves the current math information in the
+ * old math state array, and gets the new ones from the current task
+ *
+ * Careful.. There are problems with IBM-designed IRQ13 behaviour.
+ * Don't touch unless you *really* know how it works.
+ *
+ * Must be called with kernel preemption disabled (in this case,
+ * local interrupts are disabled at the call-site in entry.S).
+ */
+asmlinkage void math_state_restore(struct pt_regs regs)
+{
+	struct thread_info *thread = current_thread_info();
+	struct task_struct *tsk = thread->task;
+
+	/* NB.
'clts' is done for us by Xen during virtual trap. */ + if (!tsk_used_math(tsk)) + init_fpu(tsk); + restore_fpu(tsk); + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ +} + +#ifndef CONFIG_MATH_EMULATION + +asmlinkage void math_emulate(long arg) +{ + printk("math-emulation not enabled and no coprocessor found.\n"); + printk("killing %s.\n",current->comm); + force_sig(SIGFPE,current); + schedule(); +} + +#endif /* CONFIG_MATH_EMULATION */ + +#ifdef CONFIG_X86_F00F_BUG +void __init trap_init_f00f_bug(void) +{ + __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + + /* + * Update the IDT descriptor and reload the IDT so that + * it uses the read-only mapped virtual address. + */ + idt_descr.address = fix_to_virt(FIX_F00F_IDT); + __asm__ __volatile__("lidt %0" : : "m" (idt_descr)); +} +#endif + + +/* NB. All these are "trap gates" (i.e. events_mask isn't cleared). */ +static trap_info_t trap_table[] = { + { 0, 0, __KERNEL_CS, (unsigned long)divide_error }, + { 1, 0, __KERNEL_CS, (unsigned long)debug }, + { 3, 3, __KERNEL_CS, (unsigned long)int3 }, + { 4, 3, __KERNEL_CS, (unsigned long)overflow }, + { 5, 3, __KERNEL_CS, (unsigned long)bounds }, + { 6, 0, __KERNEL_CS, (unsigned long)invalid_op }, + { 7, 0, __KERNEL_CS, (unsigned long)device_not_available }, + { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun }, + { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS }, + { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present }, + { 12, 0, __KERNEL_CS, (unsigned long)stack_segment }, + { 13, 0, __KERNEL_CS, (unsigned long)general_protection }, + { 14, 0, __KERNEL_CS, (unsigned long)page_fault }, + { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment }, + { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error }, + { 17, 0, __KERNEL_CS, (unsigned long)alignment_check }, +#ifdef CONFIG_X86_MCE + { 18, 0, __KERNEL_CS, (unsigned long)machine_check }, +#endif + { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error }, + { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call }, + { 0, 0, 0, 0 } +}; + +void __init trap_init(void) +{ + HYPERVISOR_set_trap_table(trap_table); + + /* + * default LDT is a single-entry callgate to lcall7 for iBCS + * and a callgate to lcall27 for Solaris/x86 binaries + */ + make_lowmem_page_readonly(&default_ldt[0]); + + /* + * Should be a barrier for any external CPU state. 
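+	 * Note that, unlike native i386, no lidt is needed here: the
+	 * trap_table above has already been handed to the hypervisor
+	 * via HYPERVISOR_set_trap_table().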
+ */ + cpu_init(); +} + +void smp_trap_init(trap_info_t *trap_ctxt) +{ + trap_info_t *t = trap_table; + + for (t = trap_table; t->address; t++) { + trap_ctxt[t->vector].flags = t->flags; + trap_ctxt[t->vector].cs = t->cs; + trap_ctxt[t->vector].address = t->address; + } +} + +static int __init kstack_setup(char *s) +{ + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 0; +} +__setup("kstack=", kstack_setup); Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/vsyscall.S =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/vsyscall.S @@ -0,0 +1,15 @@ +#include <linux/init.h> + +__INITDATA + + .globl vsyscall_int80_start, vsyscall_int80_end +vsyscall_int80_start: + .incbin "arch/i386/mach-xen/kernel/vsyscall-int80.so" +vsyscall_int80_end: + + .globl vsyscall_sysenter_start, vsyscall_sysenter_end +vsyscall_sysenter_start: + .incbin "arch/i386/mach-xen/kernel/vsyscall-sysenter.so" +vsyscall_sysenter_end: + +__FINIT Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/Makefile =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/Makefile @@ -0,0 +1 @@ +obj-y := kernel/ mm/ Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/fault.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/fault.c @@ -0,0 +1,561 @@ +/* + * linux/arch/i386/mm/fault.c + * + * Copyright (C) 1995 Linus Torvalds + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/tty.h> +#include <linux/vt_kern.h> /* For unblank_screen() */ +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/percpu.h> + +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/desc.h> +#include <asm/kdebug.h> + +extern void die(const char *,struct pt_regs *,long); + +DEFINE_PER_CPU(pgd_t *, cur_pgd); + +/* + * Unlock any spinlocks which will prevent us from getting the + * message out + */ +void bust_spinlocks(int yes) +{ + int loglevel_save = console_loglevel; + + if (yes) { + oops_in_progress = 1; + return; + } +#ifdef CONFIG_VT + unblank_screen(); +#endif + oops_in_progress = 0; + /* + * OK, the message is on the console. Now we call printk() + * without oops_in_progress set so that printk will give klogd + * a poke. Hold onto your hats... + */ + console_loglevel = 15; /* NMI oopser may have shut the console up */ + printk(" "); + console_loglevel = loglevel_save; +} + +/* + * Return EIP plus the CS segment base. The segment limit is also + * adjusted, clamped to the kernel/user address space (whichever is + * appropriate), and returned in *eip_limit. + * + * The segment is checked, because it might have been changed by another + * task between the original faulting instruction and here. + * + * If CS is no longer a valid code segment, or if EIP is beyond the + * limit, or if it is a kernel address when CS is not a kernel segment, + * then the returned value will be greater than *eip_limit. + * + * This is slow, but is very rarely executed. 
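+ * The common cases never touch the descriptor tables: vm86 mode
+ * returns eip + (seg << 4), and the flat __USER_CS / __KERNEL_CS
+ * selectors return eip unchanged.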
+ */
+static inline unsigned long get_segment_eip(struct pt_regs *regs,
+					    unsigned long *eip_limit)
+{
+	unsigned long eip = regs->eip;
+	unsigned seg = regs->xcs & 0xffff;
+	u32 seg_ar, seg_limit, base, *desc;
+
+	/* The standard kernel/user address space limit. */
+	*eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
+
+	/* Unlikely, but must come before segment checks. */
+	if (unlikely((regs->eflags & VM_MASK) != 0))
+		return eip + (seg << 4);
+
+	/* By far the most common cases. */
+	if (likely(seg == __USER_CS || seg == __KERNEL_CS))
+		return eip;
+
+	/* Check the segment exists, is within the current LDT/GDT size,
+	   that kernel/user (ring 0..3) has the appropriate privilege,
+	   that it's a code segment, and get the limit. */
+	__asm__ ("larl %3,%0; lsll %3,%1"
+		 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
+	if ((~seg_ar & 0x9800) || eip > seg_limit) {
+		*eip_limit = 0;
+		return 1; /* So that returned eip > *eip_limit. */
+	}
+
+	/* Get the GDT/LDT descriptor base.
+	   When you look for races in this code remember that
+	   LDT and other horrors are only used in user space. */
+	if (seg & (1<<2)) {
+		/* Must lock the LDT while reading it. */
+		down(&current->mm->context.sem);
+		desc = current->mm->context.ldt;
+		desc = (void *)desc + (seg & ~7);
+	} else {
+		/* Must disable preemption while reading the GDT. */
+		desc = (u32 *)get_cpu_gdt_table(get_cpu());
+		desc = (void *)desc + (seg & ~7);
+	}
+
+	/* Decode the code segment base from the descriptor */
+	base = get_desc_base((unsigned long *)desc);
+
+	if (seg & (1<<2)) {
+		up(&current->mm->context.sem);
+	} else
+		put_cpu();
+
+	/* Adjust EIP and segment limit, and clamp at the kernel limit.
+	   It's legitimate for segments to wrap at 0xffffffff. */
+	seg_limit += base;
+	if (seg_limit < *eip_limit && seg_limit >= base)
+		*eip_limit = seg_limit;
+	return eip + base;
+}
+
+/*
+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ */
+static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
+{
+	unsigned long limit;
+	unsigned long instr = get_segment_eip (regs, &limit);
+	int scan_more = 1;
+	int prefetch = 0;
+	int i;
+
+	for (i = 0; scan_more && i < 15; i++) {
+		unsigned char opcode;
+		unsigned char instr_hi;
+		unsigned char instr_lo;
+
+		if (instr > limit)
+			break;
+		if (__get_user(opcode, (unsigned char *) instr))
+			break;
+
+		instr_hi = opcode & 0xf0;
+		instr_lo = opcode & 0x0f;
+		instr++;
+
+		switch (instr_hi) {
+		case 0x20:
+		case 0x30:
+			/* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
+			scan_more = ((instr_lo & 7) == 0x6);
+			break;
+
+		case 0x60:
+			/* 0x64 thru 0x67 are valid prefixes in all modes. */
+			scan_more = (instr_lo & 0xC) == 0x4;
+			break;
+		case 0xF0:
+			/* 0xF0, 0xF2, and 0xF3 are valid prefixes */
+			scan_more = !instr_lo || (instr_lo>>1) == 1;
+			break;
+		case 0x00:
+			/* Prefetch instruction is 0x0F0D or 0x0F18 */
+			scan_more = 0;
+			if (instr > limit)
+				break;
+			if (__get_user(opcode, (unsigned char *) instr))
+				break;
+			prefetch = (instr_lo == 0xF) &&
+				(opcode == 0x0D || opcode == 0x18);
+			break;
+		default:
+			scan_more = 0;
+			break;
+		}
+	}
+	return prefetch;
+}
+
+static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
+			      unsigned long error_code)
+{
+	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+		     boot_cpu_data.x86 >= 6)) {
+		/* Catch an obscure case of prefetch inside an NX page.
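+		 * error_code bit 4 (0x10) is the instruction-fetch bit:
+		 * with NX enabled such a fault is real and must not be
+		 * dismissed as a prefetch.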
+		 */
+		if (nx_enabled && (error_code & 16))
+			return 0;
+		return __is_prefetch(regs, addr);
+	}
+	return 0;
+}
+
+fastcall void do_invalid_op(struct pt_regs *, unsigned long);
+
+/*
+ * This routine handles page faults. It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ *
+ * error_code:
+ *	bit 0 == 0 means no page found, 1 means protection fault
+ *	bit 1 == 0 means read, 1 means write
+ *	bit 2 == 0 means kernel, 1 means user-mode
+ */
+fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code,
+			      unsigned long address)
+{
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	struct vm_area_struct * vma;
+	unsigned long page;
+	int write;
+	siginfo_t info;
+
+	/* Set the "privileged fault" bit to something sane. */
+	error_code &= 3;
+	error_code |= (regs->xcs & 2) << 1;
+	if (regs->eflags & X86_EFLAGS_VM)
+		error_code |= 4;
+
+	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+					SIGSEGV) == NOTIFY_STOP)
+		return;
+#if 0
+	/* It's safe to allow irq's after cr2 has been saved */
+	if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
+		local_irq_enable();
+#endif
+
+	tsk = current;
+
+	info.si_code = SEGV_MAPERR;
+
+	/*
+	 * We fault-in kernel-space virtual memory on-demand. The
+	 * 'reference' page table is init_mm.pgd.
+	 *
+	 * NOTE! We MUST NOT take any locks for this case. We may
+	 * be in an interrupt or a critical region, and should
+	 * only copy the information from the master page table,
+	 * nothing more.
+	 *
+	 * This verifies that the fault happens in kernel space
+	 * (error_code & 4) == 0, and that the fault was not a
+	 * protection error (error_code & 1) == 0.
+	 */
+	if (unlikely(address >= TASK_SIZE)) {
+		if (!(error_code & 5))
+			goto vmalloc_fault;
+		/*
+		 * Don't take the mm semaphore here. If we fixup a prefetch
+		 * fault we could otherwise deadlock.
+		 */
+		goto bad_area_nosemaphore;
+	}
+
+	mm = tsk->mm;
+
+	/*
+	 * If we're in an interrupt, have no user context or are running in an
+	 * atomic region then we must not take the fault..
+	 */
+	if (in_atomic() || !mm)
+		goto bad_area_nosemaphore;
+
+	/* When running in the kernel we expect faults to occur only to
+	 * addresses in user space. All other faults represent errors in the
+	 * kernel and should generate an OOPS. Unfortunately, in the case of an
+	 * erroneous fault occurring in a code path which already holds mmap_sem
+	 * we will deadlock attempting to validate the fault against the
+	 * address space. Luckily the kernel only validly references user
+	 * space from well-defined areas of code, which are listed in the
+	 * exceptions table.
+	 *
+	 * As the vast majority of faults will be valid we will only perform
+	 * the source reference check when there is a possibility of a deadlock.
+	 * Attempt to lock the address space, if we cannot we then validate the
+	 * source. If this is invalid we can skip the address space check,
+	 * thus avoiding the deadlock.
+	 */
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		if ((error_code & 4) == 0 &&
+		    !search_exception_tables(regs->eip))
+			goto bad_area_nosemaphore;
+		down_read(&mm->mmap_sem);
+	}
+
+	vma = find_vma(mm, address);
+	if (!vma)
+		goto bad_area;
+	if (vma->vm_start <= address)
+		goto good_area;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		goto bad_area;
+	if (error_code & 4) {
+		/*
+		 * accessing the stack below %esp is always a bug.
+		 * The "+ 32" is there due to some instructions (like
+		 * pusha) doing post-decrement on the stack and that
+		 * doesn't show up until later..
+ */ + if (address + 32 < regs->esp) + goto bad_area; + } + if (expand_stack(vma, address)) + goto bad_area; +/* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +good_area: + info.si_code = SEGV_ACCERR; + write = 0; + switch (error_code & 3) { + default: /* 3: write, present */ +#ifdef TEST_VERIFY_AREA + if (regs->cs == KERNEL_CS) + printk("WP fault at %08lx\n", regs->eip); +#endif + /* fall through */ + case 2: /* write, not present */ + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + write++; + break; + case 1: /* read, present */ + goto bad_area; + case 0: /* read, not present */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto bad_area; + } + + survive: + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + switch (handle_mm_fault(mm, vma, address, write)) { + case VM_FAULT_MINOR: + tsk->min_flt++; + break; + case VM_FAULT_MAJOR: + tsk->maj_flt++; + break; + case VM_FAULT_SIGBUS: + goto do_sigbus; + case VM_FAULT_OOM: + goto out_of_memory; + default: + BUG(); + } + + /* + * Did it hit the DOS screen memory VA from vm86 mode? + */ + if (regs->eflags & VM_MASK) { + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; + } + up_read(&mm->mmap_sem); + return; + +/* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ +bad_area: + up_read(&mm->mmap_sem); + +bad_area_nosemaphore: + /* User mode accesses just cause a SIGSEGV */ + if (error_code & 4) { + /* + * Valid to do another page fault here because this one came + * from user space. + */ + if (is_prefetch(regs, address, error_code)) + return; + + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + info.si_signo = SIGSEGV; + info.si_errno = 0; + /* info.si_code has been set above */ + info.si_addr = (void __user *)address; + force_sig_info(SIGSEGV, &info, tsk); + return; + } + +#ifdef CONFIG_X86_F00F_BUG + /* + * Pentium F0 0F C7 C8 bug workaround. + */ + if (boot_cpu_data.f00f_bug) { + unsigned long nr; + + nr = (address - idt_descr.address) >> 3; + + if (nr == 6) { + do_invalid_op(regs, 0); + return; + } + } +#endif + +no_context: + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + */ + if (is_prefetch(regs, address, error_code)) + return; + +/* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ + + bust_spinlocks(1); + +#ifdef CONFIG_X86_PAE + if (error_code & 16) { + pte_t *pte = lookup_address(address); + + if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) + printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? 
(uid: %d)\n", current->uid); + } +#endif + if (address < PAGE_SIZE) + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); + else + printk(KERN_ALERT "Unable to handle kernel paging request"); + printk(" at virtual address %08lx\n",address); + printk(KERN_ALERT " printing eip:\n"); + printk("%08lx\n", regs->eip); + page = ((unsigned long *) per_cpu(cur_pgd, smp_processor_id())) + [address >> 22]; + printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page, + machine_to_phys(page)); + /* + * We must not directly access the pte in the highpte + * case, the page table might be allocated in highmem. + * And lets rather not kmap-atomic the pte, just in case + * it's allocated already. + */ +#ifndef CONFIG_HIGHPTE + if (page & 1) { + page &= PAGE_MASK; + address &= 0x003ff000; + page = machine_to_phys(page); + page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; + printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page, + machine_to_phys(page)); + } +#endif + die("Oops", regs, error_code); + bust_spinlocks(0); + do_exit(SIGKILL); + +/* + * We ran out of memory, or some other thing happened to us that made + * us unable to handle the page fault gracefully. + */ +out_of_memory: + up_read(&mm->mmap_sem); + if (tsk->pid == 1) { + yield(); + down_read(&mm->mmap_sem); + goto survive; + } + printk("VM: killing process %s\n", tsk->comm); + if (error_code & 4) + do_exit(SIGKILL); + goto no_context; + +do_sigbus: + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die */ + if (!(error_code & 4)) + goto no_context; + + /* User space => ok to do another page fault */ + if (is_prefetch(regs, address, error_code)) + return; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRERR; + info.si_addr = (void __user *)address; + force_sig_info(SIGBUS, &info, tsk); + return; + +vmalloc_fault: + { + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "tsk" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + int index = pgd_index(address); + pgd_t *pgd, *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + pte_t *pte_k; + + pgd = index + per_cpu(cur_pgd, smp_processor_id()); + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + goto no_context; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. 
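+		 * Only the pmd entry needs copying below: with !PAE the pud
+		 * and pmd levels are folded into the pgd, so the set_pmd()
+		 * already writes the top-level entry.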
+		 */
+
+	pud = pud_offset(pgd, address);
+	pud_k = pud_offset(pgd_k, address);
+	if (!pud_present(*pud_k))
+		goto no_context;
+
+	pmd = pmd_offset(pud, address);
+	pmd_k = pmd_offset(pud_k, address);
+	if (!pmd_present(*pmd_k))
+		goto no_context;
+	set_pmd(pmd, *pmd_k);
+
+	pte_k = pte_offset_kernel(pmd_k, address);
+	if (!pte_present(*pte_k))
+		goto no_context;
+	return;
+	}
+}
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/highmem.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/highmem.c
@@ -0,0 +1,94 @@
+#include <linux/highmem.h>
+
+void *kmap(struct page *page)
+{
+	might_sleep();
+	if (!PageHighMem(page))
+		return page_address(page);
+	return kmap_high(page);
+}
+
+void kunmap(struct page *page)
+{
+	if (in_interrupt())
+		BUG();
+	if (!PageHighMem(page))
+		return;
+	kunmap_high(page);
+}
+
+/*
+ * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
+ * no global lock is needed and because the kmap code must perform a global TLB
+ * invalidation when the kmap pool wraps.
+ *
+ * However when holding an atomic kmap it is not legal to sleep, so atomic
+ * kmaps are appropriate for short, tight code paths only.
+ */
+void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
+{
+	enum fixed_addresses idx;
+	unsigned long vaddr;
+
+	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	inc_preempt_count();
+	if (!PageHighMem(page))
+		return page_address(page);
+
+	idx = type + KM_TYPE_NR*smp_processor_id();
+	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+#ifdef CONFIG_DEBUG_HIGHMEM
+	if (!pte_none(*(kmap_pte-idx)))
+		BUG();
+#endif
+	set_pte(kmap_pte-idx, mk_pte(page, prot));
+	__flush_tlb_one(vaddr);
+
+	return (void*) vaddr;
+}
+
+void *kmap_atomic(struct page *page, enum km_type type)
+{
+	return __kmap_atomic(page, type, kmap_prot);
+}
+
+void kunmap_atomic(void *kvaddr, enum km_type type)
+{
+#ifdef CONFIG_DEBUG_HIGHMEM
+	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
+
+	if (vaddr < FIXADDR_START) { // FIXME
+		dec_preempt_count();
+		preempt_check_resched();
+		return;
+	}
+
+	if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
+		BUG();
+
+	/*
+	 * force other mappings to Oops if they'll try to access
+	 * this pte without first remapping it
+	 */
+	pte_clear(&init_mm, vaddr, kmap_pte-idx);
+	__flush_tlb_one(vaddr);
+#endif
+
+	dec_preempt_count();
+	preempt_check_resched();
+}
+
+struct page *kmap_atomic_to_page(void *ptr)
+{
+	unsigned long idx, vaddr = (unsigned long)ptr;
+	pte_t *pte;
+
+	if (vaddr < FIXADDR_START)
+		return virt_to_page(ptr);
+
+	idx = virt_to_fix(vaddr);
+	pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
+	return pte_page(*pte);
+}
+
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/hypervisor.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/hypervisor.c
@@ -0,0 +1,363 @@
+/******************************************************************************
+ * mm/hypervisor.c
+ *
+ * Update page tables via the hypervisor.
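+ * Every helper below builds a single mmu_update_t or mmuext_op request
+ * and issues it synchronously through HYPERVISOR_mmu_update() or
+ * HYPERVISOR_mmuext_op(), BUG()ing if the hypercall fails.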
+ * + * Copyright (c) 2002-2004, K A Fraser + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm-xen/hypervisor.h> +#include <asm-xen/balloon.h> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +#include <linux/percpu.h> +#include <asm/tlbflush.h> +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) +#define pte_offset_kernel pte_offset +#define pud_t pgd_t +#define pud_offset(d, va) d +#elif defined(CONFIG_X86_64) +#define pmd_val_ma(v) (v).pmd +#else +#ifdef CONFIG_X86_PAE +# define pmd_val_ma(v) ((v).pmd) +# define pud_val_ma(v) ((v).pgd.pgd) +#else +# define pmd_val_ma(v) ((v).pud.pgd.pgd) +#endif +#endif + +#ifndef CONFIG_XEN_SHADOW_MODE +void xen_l1_entry_update(pte_t *ptr, pte_t val) +{ + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = pte_val_ma(val); + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} + +void xen_l2_entry_update(pmd_t *ptr, pmd_t val) +{ + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = pmd_val_ma(val); + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} + +#ifdef CONFIG_X86_PAE +void xen_l3_entry_update(pud_t *ptr, pud_t val) +{ + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = pud_val_ma(val); + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} +#endif + +#ifdef CONFIG_X86_64 +void xen_l3_entry_update(pud_t *ptr, pud_t val) +{ + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = val.pud; + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} + +void xen_l4_entry_update(pgd_t *ptr, pgd_t val) +{ + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = val.pgd; + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_XEN_SHADOW_MODE */ + +void xen_machphys_update(unsigned long mfn, unsigned long pfn) +{ + mmu_update_t u; + u.ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; + u.val = pfn; + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} + +void xen_pt_switch(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_NEW_BASEPTR; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_new_user_pt(unsigned long ptr) +{ + struct 
mmuext_op op; + op.cmd = MMUEXT_NEW_USER_BASEPTR; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_tlb_flush(void) +{ + struct mmuext_op op; + op.cmd = MMUEXT_TLB_FLUSH_LOCAL; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_invlpg(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_INVLPG_LOCAL; + op.linear_addr = ptr & PAGE_MASK; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +#ifdef CONFIG_SMP + +void xen_tlb_flush_all(void) +{ + struct mmuext_op op; + op.cmd = MMUEXT_TLB_FLUSH_ALL; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_tlb_flush_mask(cpumask_t *mask) +{ + struct mmuext_op op; + if ( cpus_empty(*mask) ) + return; + op.cmd = MMUEXT_TLB_FLUSH_MULTI; + op.vcpumask = mask->bits; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_invlpg_all(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_INVLPG_ALL; + op.linear_addr = ptr & PAGE_MASK; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr) +{ + struct mmuext_op op; + if ( cpus_empty(*mask) ) + return; + op.cmd = MMUEXT_INVLPG_MULTI; + op.vcpumask = mask->bits; + op.linear_addr = ptr & PAGE_MASK; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +#endif /* CONFIG_SMP */ + +#ifndef CONFIG_XEN_SHADOW_MODE +void xen_pgd_pin(unsigned long ptr) +{ + struct mmuext_op op; +#ifdef CONFIG_X86_64 + op.cmd = MMUEXT_PIN_L4_TABLE; +#elif defined(CONFIG_X86_PAE) + op.cmd = MMUEXT_PIN_L3_TABLE; +#else + op.cmd = MMUEXT_PIN_L2_TABLE; +#endif + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_pgd_unpin(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_pte_pin(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L1_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_pte_unpin(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +#ifdef CONFIG_X86_64 +void xen_pud_pin(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L3_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_pud_unpin(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_pmd_pin(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L2_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_pmd_unpin(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_XEN_SHADOW_MODE */ + +void xen_set_ldt(unsigned long ptr, unsigned long len) +{ + struct mmuext_op op; + op.cmd = MMUEXT_SET_LDT; + op.linear_addr = ptr; + op.nr_ents = len; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void 
xen_contig_memory(unsigned long vstart, unsigned int order) +{ + /* + * Ensure multi-page extents are contiguous in machine memory. This code + * could be cleaned up some, and the number of hypercalls reduced. + */ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long mfn, i, flags; + + scrub_pages(vstart, 1 << order); + + balloon_lock(flags); + + /* 1. Zap current PTEs, giving away the underlying pages. */ + for (i = 0; i < (1<<order); i++) { + pgd = pgd_offset_k(vstart + (i*PAGE_SIZE)); + pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE))); + pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE))); + pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); + mfn = pte_mfn(*pte); + HYPERVISOR_update_va_mapping( + vstart + (i*PAGE_SIZE), __pte_ma(0), 0); + phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = + INVALID_P2M_ENTRY; + BUG_ON(HYPERVISOR_dom_mem_op( + MEMOP_decrease_reservation, &mfn, 1, 0) != 1); + } + + /* 2. Get a new contiguous memory extent. */ + BUG_ON(HYPERVISOR_dom_mem_op( + MEMOP_increase_reservation, &mfn, 1, order) != 1); + + /* 3. Map the new extent in place of old pages. */ + for (i = 0; i < (1<<order); i++) { + HYPERVISOR_update_va_mapping( + vstart + (i*PAGE_SIZE), + __pte_ma(((mfn+i)<<PAGE_SHIFT)|__PAGE_KERNEL), 0); + xen_machphys_update(mfn+i, (__pa(vstart)>>PAGE_SHIFT)+i); + phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = mfn+i; + } + + flush_tlb_all(); + + balloon_unlock(flags); +} + +#ifdef CONFIG_XEN_PHYSDEV_ACCESS + +unsigned long allocate_empty_lowmem_region(unsigned long pages) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long *pfn_array; + unsigned long vstart; + unsigned long i; + unsigned int order = get_order(pages*PAGE_SIZE); + + vstart = __get_free_pages(GFP_KERNEL, order); + if ( vstart == 0 ) + return 0UL; + + scrub_pages(vstart, 1 << order); + + pfn_array = vmalloc((1<<order) * sizeof(*pfn_array)); + if ( pfn_array == NULL ) + BUG(); + + for ( i = 0; i < (1<<order); i++ ) + { + pgd = pgd_offset_k( (vstart + (i*PAGE_SIZE))); + pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE))); + pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE))); + pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); + pfn_array[i] = pte_mfn(*pte); +#ifdef CONFIG_X86_64 + xen_l1_entry_update(pte, __pte(0)); +#else + HYPERVISOR_update_va_mapping(vstart + (i*PAGE_SIZE), __pte_ma(0), 0); +#endif + phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = + INVALID_P2M_ENTRY; + } + + flush_tlb_all(); + + balloon_put_pages(pfn_array, 1 << order); + + vfree(pfn_array); + + return vstart; +} + +#endif /* CONFIG_XEN_PHYSDEV_ACCESS */ Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/init.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/init.c @@ -0,0 +1,801 @@ +/* + * linux/arch/i386/mm/init.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/bootmem.h> +#include <linux/slab.h> +#include <linux/proc_fs.h> +#include <linux/efi.h> + +#include 
<asm/processor.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/dma.h> +#include <asm/fixmap.h> +#include <asm/e820.h> +#include <asm/apic.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/sections.h> +#include <asm-xen/hypervisor.h> + +unsigned int __VMALLOC_RESERVE = 128 << 20; + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +unsigned long highstart_pfn, highend_pfn; + +static int noinline do_test_wp_bit(void); + +/* + * Creates a middle page table and puts a pointer to it in the + * given global directory entry. This only returns the gd entry + * in non-PAE compilation mode, since the middle layer is folded. + */ +static pmd_t * __init one_md_table_init(pgd_t *pgd) +{ + pud_t *pud; + pmd_t *pmd_table; + +#ifdef CONFIG_X86_PAE + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + make_page_readonly(pmd_table); + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + pud = pud_offset(pgd, 0); + if (pmd_table != pmd_offset(pud, 0)) + BUG(); +#else + pud = pud_offset(pgd, 0); + pmd_table = pmd_offset(pud, 0); +#endif + + return pmd_table; +} + +/* + * Create a page table and place a pointer to it in a middle page + * directory entry. + */ +static pte_t * __init one_page_table_init(pmd_t *pmd) +{ + if (pmd_none(*pmd)) { + pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + make_page_readonly(page_table); + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); + if (page_table != pte_offset_kernel(pmd, 0)) + BUG(); + + return page_table; + } + + return pte_offset_kernel(pmd, 0); +} + +/* + * This function initializes a certain range of kernel virtual memory + * with new bootmem page tables, everywhere page tables are missing in + * the given range. + */ + +/* + * NOTE: The pagetables are allocated contiguous on the physical space + * so we can cache the place of the first one and move around without + * checking the pgd every time. + */ +static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + int pgd_idx, pmd_idx; + unsigned long vaddr; + + vaddr = start; + pgd_idx = pgd_index(vaddr); + pmd_idx = pmd_index(vaddr); + pgd = pgd_base + pgd_idx; + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { + if (pgd_none(*pgd)) + one_md_table_init(pgd); + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { + if (vaddr < HYPERVISOR_VIRT_START && pmd_none(*pmd)) + one_page_table_init(pmd); + + vaddr += PMD_SIZE; + } + pmd_idx = 0; + } +} + +static inline int is_kernel_text(unsigned long addr) +{ + if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) + return 1; + return 0; +} + +/* + * This maps the physical memory to kernel virtual address space, a total + * of max_low_pfn pages, by creating page tables starting from address + * PAGE_OFFSET. + */ +static void __init kernel_physical_mapping_init(pgd_t *pgd_base) +{ + unsigned long pfn; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + int pgd_idx, pmd_idx, pte_ofs; + + unsigned long max_ram_pfn = xen_start_info.nr_pages; + if (max_ram_pfn > max_low_pfn) + max_ram_pfn = max_low_pfn; + + pgd_idx = pgd_index(PAGE_OFFSET); + pgd = pgd_base + pgd_idx; + pfn = 0; + pmd_idx = pmd_index(PAGE_OFFSET); + pte_ofs = pte_index(PAGE_OFFSET); + + for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { +#ifdef CONFIG_XEN + /* + * Native linux hasn't PAE-paging enabled yet at this + * point. 
When running as a Xen domain we are already in PAE + * mode, thus we can't simply hook an empty + * pmd. That would kill the mappings we are currently + * using ... + */ + pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET); +#else + pmd = one_md_table_init(pgd); +#endif + if (pfn >= max_low_pfn) + continue; + pmd += pmd_idx; + for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { + unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; + if (address >= HYPERVISOR_VIRT_START) + continue; + + /* Map with big pages if possible, otherwise create normal page tables. */ + if (cpu_has_pse) { + unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; + + if (is_kernel_text(address) || is_kernel_text(address2)) + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); + else + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); + pfn += PTRS_PER_PTE; + } else { + pte = one_page_table_init(pmd); + + pte += pte_ofs; + for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { + /* XEN: Only map initial RAM allocation. */ + if ((pfn >= max_ram_pfn) || pte_present(*pte)) + continue; + if (is_kernel_text(address)) + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); + else + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); + } + pte_ofs = 0; + } + } + pmd_idx = 0; + } +} + +static inline int page_kills_ppro(unsigned long pagenr) +{ + if (pagenr >= 0x70000 && pagenr <= 0x7003F) + return 1; + return 0; +} + +extern int is_available_memory(efi_memory_desc_t *); + +static inline int page_is_ram(unsigned long pagenr) +{ + int i; + unsigned long addr, end; + + if (efi_enabled) { + efi_memory_desc_t *md; + + for (i = 0; i < memmap.nr_map; i++) { + md = &memmap.map[i]; + if (!is_available_memory(md)) + continue; + addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; + end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; + + if ((pagenr >= addr) && (pagenr < end)) + return 1; + } + return 0; + } + + for (i = 0; i < e820.nr_map; i++) { + + if (e820.map[i].type != E820_RAM) /* not usable memory */ + continue; + /* + * !!!FIXME!!! Some BIOSen report areas as RAM that + * are not. Notably the 640->1Mb area. We need a sanity + * check here. 
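
As an aside, the e820 walk here converts each byte range to page-frame bounds before comparing, and the rounding direction matters: the base is rounded up and the end down, so a partial page at either edge never counts as RAM. A minimal standalone sketch of the same arithmetic (names and types are illustrative, not the kernel's):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE (1UL << PAGE_SHIFT)

    /* Does page frame 'pfn' lie wholly inside the RAM range [base, base+size)?
     * Round the start up and the end down, as page_is_ram() does. */
    static int pfn_is_ram(uint64_t pfn, uint64_t base, uint64_t size)
    {
        uint64_t first = (base + PAGE_SIZE - 1) >> PAGE_SHIFT; /* round up   */
        uint64_t end = (base + size) >> PAGE_SHIFT;            /* round down */
        return pfn >= first && pfn < end;
    }

    int main(void)
    {
        /* A range covering only half of frame 0x9f contributes nothing;
         * a fully covered frame does. */
        printf("%d %d\n", pfn_is_ram(0x9f, 0x9f800, 0x800),
                          pfn_is_ram(0x9e, 0x9e000, 0x2000)); /* prints: 0 1 */
        return 0;
    }
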
+ */ + addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; + end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; + if ((pagenr >= addr) && (pagenr < end)) + return 1; + } + return 0; +} + +#ifdef CONFIG_HIGHMEM +pte_t *kmap_pte; +pgprot_t kmap_prot; + +#define kmap_get_fixmap_pte(vaddr) \ + pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) + +static void __init kmap_init(void) +{ + unsigned long kmap_vstart; + + /* cache the first kmap pte */ + kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); + kmap_pte = kmap_get_fixmap_pte(kmap_vstart); + + kmap_prot = PAGE_KERNEL; +} + +static void __init permanent_kmaps_init(pgd_t *pgd_base) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long vaddr; + + vaddr = PKMAP_BASE; + page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); + + pgd = swapper_pg_dir + pgd_index(vaddr); + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + pkmap_page_table = pte; +} + +void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) +{ + if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { + ClearPageReserved(page); + set_bit(PG_highmem, &page->flags); + set_page_count(page, 1); + if (pfn < xen_start_info.nr_pages) + __free_page(page); + totalhigh_pages++; + } else + SetPageReserved(page); +} + +#ifndef CONFIG_DISCONTIGMEM +static void __init set_highmem_pages_init(int bad_ppro) +{ + int pfn; + for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) + one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); + totalram_pages += totalhigh_pages; +} +#else +extern void set_highmem_pages_init(int); +#endif /* !CONFIG_DISCONTIGMEM */ + +#else +#define kmap_init() do { } while (0) +#define permanent_kmaps_init(pgd_base) do { } while (0) +#define set_highmem_pages_init(bad_ppro) do { } while (0) +#endif /* CONFIG_HIGHMEM */ + +unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; +unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; + +#ifndef CONFIG_DISCONTIGMEM +#define remap_numa_kva() do {} while (0) +#else +extern void __init remap_numa_kva(void); +#endif + +static void __init pagetable_init (void) +{ + unsigned long vaddr; + pgd_t *pgd_base = swapper_pg_dir; + pgd_t *old_pgd = (pgd_t *)xen_start_info.pt_base; + +#ifdef CONFIG_X86_PAE + int i; + /* Init entries of the first-level page table to the zero page */ + for (i = 0; i < PTRS_PER_PGD; i++) + set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); +#endif + + /* Enable PSE if available */ + if (cpu_has_pse) { + set_in_cr4(X86_CR4_PSE); + } + + /* Enable PGE if available */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); + __PAGE_KERNEL |= _PAGE_GLOBAL; + __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; + } + + /* + * Switch to proper mm_init page directory. Initialise from the current + * page directory, write-protect the new page directory, then switch to + * it. We clean up by write-enabling and then freeing the old page dir. 
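
Ordering is the whole point of the hand-over that follows: Xen will only pin a page directory whose frames are read-only, and the old directory may only be write-enabled again once it is unpinned. A sketch of that sequence, with stub functions standing in for the patch's make_page_*, xen_pgd_* and load_cr3 helpers (this is only an ordering illustration, not runnable against Xen):

    #include <stdio.h>

    static void make_page_readonly(const char *p) { printf("r/o   %s\n", p); }
    static void make_page_writable(const char *p) { printf("r/w   %s\n", p); }
    static void xen_pgd_pin(const char *p)        { printf("pin   %s\n", p); }
    static void xen_pgd_unpin(const char *p)      { printf("unpin %s\n", p); }
    static void load_cr3(const char *p)           { printf("cr3 <- %s\n", p); }

    int main(void)
    {
        /* The new directory must be read-only before the hypervisor will
         * pin it, and pinned before it may be loaded into cr3. */
        make_page_readonly("swapper_pg_dir");
        xen_pgd_pin("swapper_pg_dir");
        load_cr3("swapper_pg_dir");
        /* Only now may the boot-time directory be released and reused. */
        xen_pgd_unpin("old_pgd");
        make_page_writable("old_pgd");
        return 0;
    }
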
+ */ +#ifndef CONFIG_X86_PAE + memcpy(pgd_base, old_pgd, PTRS_PER_PGD_NO_HV*sizeof(pgd_t)); + make_page_readonly(pgd_base); + xen_pgd_pin(__pa(pgd_base)); + load_cr3(pgd_base); + xen_pgd_unpin(__pa(old_pgd)); + make_page_writable(old_pgd); + __flush_tlb_all(); + free_bootmem(__pa(old_pgd), PAGE_SIZE); +#else + { + pud_t *old_pud = pud_offset(old_pgd+3, PAGE_OFFSET); + pmd_t *old_pmd = pmd_offset(old_pud, PAGE_OFFSET); + pmd_t *new_pmd = alloc_bootmem_low_pages(PAGE_SIZE); + + memcpy(new_pmd, old_pmd, PAGE_SIZE); + memcpy(pgd_base, old_pgd, PTRS_PER_PGD_NO_HV*sizeof(pgd_t)); + set_pgd(&pgd_base[3], __pgd(__pa(new_pmd) | _PAGE_PRESENT)); + + make_page_readonly(new_pmd); + make_page_readonly(pgd_base); + xen_pgd_pin(__pa(pgd_base)); + load_cr3(pgd_base); + xen_pgd_unpin(__pa(old_pgd)); + make_page_writable(old_pgd); + make_page_writable(old_pmd); + __flush_tlb_all(); + + free_bootmem(__pa(old_pgd), PAGE_SIZE); + free_bootmem(__pa(old_pmd), PAGE_SIZE); + } +#endif + + init_mm.context.pinned = 1; + kernel_physical_mapping_init(pgd_base); + remap_numa_kva(); + + /* + * Fixed mappings, only the page table structure has to be + * created - mappings will be set by set_fixmap(): + */ + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; + page_table_range_init(vaddr, 0, pgd_base); + + permanent_kmaps_init(pgd_base); + +#if 0 /* def CONFIG_X86_PAE */ + /* + * Add low memory identity-mappings - SMP needs it when + * starting up on an AP from real-mode. In the non-PAE + * case we already have these mappings through head.S. + * All user-space mappings are explicitly cleared after + * SMP startup. + */ + set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]); +#endif +} + +#if defined(CONFIG_PM_DISK) || defined(CONFIG_SOFTWARE_SUSPEND) +/* + * Swap suspend & friends need this for resume because things like the intel-agp + * driver might have split up a kernel 4MB mapping. + */ +char __nosavedata swsusp_pg_dir[PAGE_SIZE] + __attribute__ ((aligned (PAGE_SIZE))); + +static inline void save_pg_dir(void) +{ + memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); +} +#else +static inline void save_pg_dir(void) +{ +} +#endif + +void zap_low_mappings (void) +{ + int i; + + save_pg_dir(); + + /* + * Zap initial low-memory mappings. + * + * Note that "pgd_clear()" doesn't do it for + * us, because pgd_clear() is a no-op on i386. + */ + for (i = 0; i < USER_PTRS_PER_PGD; i++) +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) + set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); +#else + set_pgd(swapper_pg_dir+i, __pgd(0)); +#endif + flush_tlb_all(); +} + +static int disable_nx __initdata = 0; +u64 __supported_pte_mask = ~_PAGE_NX; + +/* + * noexec = on|off + * + * Control non executable mappings. + * + * on Enable + * off Disable + */ +void __init noexec_setup(const char *str) +{ + if (!strncmp(str, "on",2) && cpu_has_nx) { + __supported_pte_mask |= _PAGE_NX; + disable_nx = 0; + } else if (!strncmp(str,"off",3)) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } +} + +int nx_enabled = 0; +#ifdef CONFIG_X86_PAE + +static void __init set_nx(void) +{ + unsigned int v[4], l, h; + + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + if ((v[3] & (1 << 20)) && !disable_nx) { + rdmsr(MSR_EFER, l, h); + l |= EFER_NX; + wrmsr(MSR_EFER, l, h); + nx_enabled = 1; + __supported_pte_mask |= _PAGE_NX; + } + } +} + +/* + * Enables/disables executability of a given kernel page and + * returns the previous setting. 
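
On PAE the NX bit is bit 63 of the 64-bit entry, i.e. bit 31 of pte_high, which is why set_kernel_exec() below touches only the high word. The bit manipulation in isolation, on a hypothetical two-word PTE rather than the kernel's type:

    #include <assert.h>
    #include <stdint.h>

    #define _PAGE_BIT_NX 63

    struct pae_pte { uint32_t pte_low, pte_high; };

    /* Returns the previous executability, like set_kernel_exec(). */
    static int set_exec(struct pae_pte *pte, int enable)
    {
        uint32_t nx = 1u << (_PAGE_BIT_NX - 32);
        int was_exec = !(pte->pte_high & nx);
        if (enable)
            pte->pte_high &= ~nx;  /* clear NX: page becomes executable */
        else
            pte->pte_high |= nx;   /* set NX: page becomes no-execute  */
        return was_exec;
    }

    int main(void)
    {
        struct pae_pte pte = { 0, 0 };  /* executable initially */
        assert(set_exec(&pte, 0) == 1); /* was executable        */
        assert(set_exec(&pte, 1) == 0); /* was no-execute        */
        return 0;
    }
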
+ */ +int __init set_kernel_exec(unsigned long vaddr, int enable) +{ + pte_t *pte; + int ret = 1; + + if (!nx_enabled) + goto out; + + pte = lookup_address(vaddr); + BUG_ON(!pte); + + if (!pte_exec_kernel(*pte)) + ret = 0; + + if (enable) + pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); + else + pte->pte_high |= 1 << (_PAGE_BIT_NX - 32); + __flush_tlb_all(); +out: + return ret; +} + +#endif + +/* + * paging_init() sets up the page tables - note that the first 8MB are + * already mapped by head.S. + * + * This routine also unmaps the page at virtual kernel address 0, so + * that we can trap those pesky NULL-reference errors in the kernel. + */ +void __init paging_init(void) +{ +#ifdef CONFIG_XEN_PHYSDEV_ACCESS + int i; +#endif + +#ifdef CONFIG_X86_PAE + set_nx(); + if (nx_enabled) + printk("NX (Execute Disable) protection: active\n"); +#endif + + pagetable_init(); + +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) + /* + * We will bail out later - printk doesn't work right now so + * the user would just see a hanging kernel. + * When running as a Xen domain we are already in PAE mode at + * this point. + */ + if (cpu_has_pae) + set_in_cr4(X86_CR4_PAE); +#endif + __flush_tlb_all(); + + kmap_init(); + + /* Switch to the real shared_info page, and clear the dummy page. */ + set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info); + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); + memset(empty_zero_page, 0, sizeof(empty_zero_page)); + +#ifdef CONFIG_XEN_PHYSDEV_ACCESS + /* Setup mapping of lower 1st MB */ + for (i = 0; i < NR_FIX_ISAMAPS; i++) + if (xen_start_info.flags & SIF_PRIVILEGED) + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); + else + __set_fixmap(FIX_ISAMAP_BEGIN - i, + virt_to_machine(empty_zero_page), + PAGE_KERNEL_RO); +#endif +} + +/* + * Test if the WP bit works in supervisor mode. It isn't supported on 386's + * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This + * used to involve black magic jumps to work around some nasty CPU bugs, + * but fortunately the switch to using exceptions got rid of all that. + */ + +static void __init test_wp_bit(void) +{ + printk("Checking if this processor honours the WP bit even in supervisor mode... "); + + /* Any page-aligned address will do, the test is non-destructive */ + __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); + boot_cpu_data.wp_works_ok = do_test_wp_bit(); + clear_fixmap(FIX_WP_TEST); + + if (!boot_cpu_data.wp_works_ok) { + printk("No.\n"); +#ifdef CONFIG_X86_WP_WORKS_OK + panic("This kernel doesn't support CPU's with broken WP. 
Recompile it for a 386!"); +#endif + } else { + printk("Ok.\n"); + } +} + +static void __init set_max_mapnr_init(void) +{ +#ifdef CONFIG_HIGHMEM + num_physpages = highend_pfn; +#else + num_physpages = max_low_pfn; +#endif +#ifndef CONFIG_DISCONTIGMEM + max_mapnr = num_physpages; +#endif +} + +static struct kcore_list kcore_mem, kcore_vmalloc; + +void __init mem_init(void) +{ + extern int ppro_with_ram_bug(void); + int codesize, reservedpages, datasize, initsize; + int tmp; + int bad_ppro; + unsigned long pfn; + +#ifndef CONFIG_DISCONTIGMEM + if (!mem_map) + BUG(); +#endif + + bad_ppro = ppro_with_ram_bug(); + +#ifdef CONFIG_HIGHMEM + /* check that fixmap and pkmap do not overlap */ + if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { + printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); + printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); + BUG(); + } +#endif + + set_max_mapnr_init(); + +#ifdef CONFIG_HIGHMEM + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif + printk("vmalloc area: %lx-%lx, maxmem %lx\n", + VMALLOC_START,VMALLOC_END,MAXMEM); + BUG_ON(VMALLOC_START > VMALLOC_END); + + /* this will put all low memory onto the freelists */ + totalram_pages += free_all_bootmem(); + /* XEN: init and count low-mem pages outside initial allocation. */ + for (pfn = xen_start_info.nr_pages; pfn < max_low_pfn; pfn++) { + ClearPageReserved(&mem_map[pfn]); + set_page_count(&mem_map[pfn], 1); + totalram_pages++; + } + + reservedpages = 0; + for (tmp = 0; tmp < max_low_pfn; tmp++) + /* + * Only count reserved RAM pages + */ + if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) + reservedpages++; + + set_highmem_pages_init(bad_ppro); + + codesize = (unsigned long) &_etext - (unsigned long) &_text; + datasize = (unsigned long) &_edata - (unsigned long) &_etext; + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; + + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + VMALLOC_END-VMALLOC_START); + + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + num_physpages << (PAGE_SHIFT-10), + codesize >> 10, + reservedpages << (PAGE_SHIFT-10), + datasize >> 10, + initsize >> 10, + (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) + ); + +#ifdef CONFIG_X86_PAE + if (!cpu_has_pae) + panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); +#endif + if (boot_cpu_data.wp_works_ok < 0) + test_wp_bit(); + + /* + * Subtle. SMP is doing its boot stuff late (because it has to + * fork idle threads) - but it also needs low mappings for the + * protected-mode entry to work. We zap these entries only after + * the WP-bit has been tested. + */ +#ifndef CONFIG_SMP + zap_low_mappings(); +#endif +} + +kmem_cache_t *pgd_cache; +kmem_cache_t *pmd_cache; + +void __init pgtable_cache_init(void) +{ + if (PTRS_PER_PMD > 1) { + pmd_cache = kmem_cache_create("pmd", + PTRS_PER_PMD*sizeof(pmd_t), + PTRS_PER_PMD*sizeof(pmd_t), + 0, + pmd_ctor, + NULL); + if (!pmd_cache) + panic("pgtable_cache_init(): cannot create pmd cache"); + } + pgd_cache = kmem_cache_create("pgd", +#if 0 /* How the heck _this_ works in native linux ??? 
*/ + PTRS_PER_PGD*sizeof(pgd_t), + PTRS_PER_PGD*sizeof(pgd_t), +#else + PAGE_SIZE, + PAGE_SIZE, +#endif + 0, + pgd_ctor, + pgd_dtor); + if (!pgd_cache) + panic("pgtable_cache_init(): Cannot create pgd cache"); +} + +/* + * This function cannot be __init, since exceptions don't work in that + * section. Put this after the callers, so that it cannot be inlined. + */ +static int noinline do_test_wp_bit(void) +{ + char tmp_reg; + int flag; + + __asm__ __volatile__( + " movb %0,%1 \n" + "1: movb %1,%0 \n" + " xorl %2,%2 \n" + "2: \n" + ".section __ex_table,\"a\"\n" + " .align 4 \n" + " .long 1b,2b \n" + ".previous \n" + :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), + "=q" (tmp_reg), + "=r" (flag) + :"2" (1) + :"memory"); + + return flag; +} + +void free_initmem(void) +{ + unsigned long addr; + + addr = (unsigned long)(&__init_begin); + for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + set_page_count(virt_to_page(addr), 1); + memset((void *)addr, 0xcc, PAGE_SIZE); + free_page(addr); + totalram_pages++; + } + printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10); +} + +#ifdef CONFIG_BLK_DEV_INITRD +void free_initrd_mem(unsigned long start, unsigned long end) +{ + if (start < end) + printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); + for (; start < end; start += PAGE_SIZE) { + ClearPageReserved(virt_to_page(start)); + set_page_count(virt_to_page(start), 1); + free_page(start); + totalram_pages++; + } +} +#endif Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/ioremap.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/ioremap.c @@ -0,0 +1,442 @@ +/* + * arch/i386/mm/ioremap.c + * + * Re-map IO memory to kernel address space so that we can access it. + * This is needed for high PCI addresses that aren't mapped in the + * 640k-1MB IO memory area on PC's + * + * (C) Copyright 1995 1996 Linus Torvalds + */ + +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <asm/io.h> +#include <asm/fixmap.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> + +#ifndef CONFIG_XEN_PHYSDEV_ACCESS + +void * __ioremap(unsigned long phys_addr, unsigned long size, + unsigned long flags) +{ + return NULL; +} + +void *ioremap_nocache (unsigned long phys_addr, unsigned long size) +{ + return NULL; +} + +void iounmap(volatile void __iomem *addr) +{ +} + +void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) +{ + return NULL; +} + +void __init bt_iounmap(void *addr, unsigned long size) +{ +} + +#else + +/* + * Does @address reside within a non-highmem page that is local to this virtual + * machine (i.e., not an I/O page, nor a memory page belonging to another VM). + * See the comment that accompanies pte_pfn() in pgtable-2level.h to understand + * why this works. + */ +static inline int is_local_lowmem(unsigned long address) +{ + extern unsigned long max_low_pfn; + unsigned long mfn = address >> PAGE_SHIFT; + unsigned long pfn = mfn_to_pfn(mfn); + return ((pfn < max_low_pfn) && (pfn_to_mfn(pfn) == mfn)); +} + +/* + * Generic mapping function (not visible outside): + */ + +/* + * Remap an arbitrary physical address space into the kernel virtual + * address space. Needed when the kernel wants to access high addresses + * directly. + * + * NOTE! 
We need to allow non-page-aligned mappings too: we will obviously + * have to convert them into an offset in a page-aligned mapping, but the + * caller shouldn't need to know that small detail. + */ +void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) +{ + void __iomem * addr; + struct vm_struct * area; + unsigned long offset, last_addr; + domid_t domid = DOMID_IO; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + /* + * Don't remap the low PCI/ISA area, it's always mapped.. + */ + if (phys_addr >= 0x0 && last_addr < 0x100000) + return isa_bus_to_virt(phys_addr); +#endif + + /* + * Don't allow anybody to remap normal RAM that we're using.. + */ + if (is_local_lowmem(phys_addr)) { + char *t_addr, *t_end; + struct page *page; + + t_addr = bus_to_virt(phys_addr); + t_end = t_addr + (size - 1); + + for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) + if(!PageReserved(page)) + return NULL; + + domid = DOMID_SELF; + } + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr+1) - phys_addr; + + /* + * Ok, go for it.. + */ + area = get_vm_area(size, VM_IOREMAP | (flags << 20)); + if (!area) + return NULL; + area->phys_addr = phys_addr; + addr = (void __iomem *) area->addr; + if (direct_remap_area_pages(&init_mm, (unsigned long) addr, phys_addr, + size, __pgprot(_PAGE_PRESENT | _PAGE_RW | + _PAGE_DIRTY | _PAGE_ACCESSED + | flags), domid)) { + vunmap((void __force *) addr); + return NULL; + } + return (void __iomem *) (offset + (char __iomem *)addr); +} + + +/** + * ioremap_nocache - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap_nocache performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked uncachable + * on the CPU as well as honouring existing caching rules from things like + * the PCI bus. Note that there are other caches and buffers on many + * busses. In particular driver authors should read up on PCI writes + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ + +void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) +{ + unsigned long last_addr; + void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD); + if (!p) + return p; + + /* Guaranteed to be > phys_addr, as per __ioremap() */ + last_addr = phys_addr + size - 1; + + if (is_local_lowmem(last_addr)) { + struct page *ppage = virt_to_page(bus_to_virt(phys_addr)); + unsigned long npages; + + phys_addr &= PAGE_MASK; + + /* This might overflow and become zero.. */ + last_addr = PAGE_ALIGN(last_addr); + + /* .. but that's ok, because modulo-2**n arithmetic will make + * the page-aligned "last - first" come out right. 
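
The wraparound this comment alludes to is easy to demonstrate: aligning the last byte of a 32-bit address space up to a page boundary yields 0, yet the unsigned subtraction still produces the correct page count.

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE ((uint32_t)1 << PAGE_SHIFT)
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    int main(void)
    {
        uint32_t phys_addr = 0xfffff000u;  /* page-aligned base      */
        uint32_t last_addr = 0xffffffffu;  /* last byte of the map   */
        /* PAGE_ALIGN(last_addr) overflows and becomes zero ...      */
        uint32_t aligned = (last_addr + PAGE_SIZE - 1) & PAGE_MASK;
        /* ... but modulo-2**32 arithmetic makes the count come out right. */
        uint32_t npages = (aligned - phys_addr) >> PAGE_SHIFT;
        printf("aligned=0x%08x npages=%u\n", aligned, npages); /* 0, 1 */
        return 0;
    }
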
+ */ + npages = (last_addr - phys_addr) >> PAGE_SHIFT; + + if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { + iounmap(p); + p = NULL; + } + global_flush_tlb(); + } + + return p; +} + +void iounmap(volatile void __iomem *addr) +{ + struct vm_struct *p; + if ((void __force *) addr <= high_memory) + return; +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) + return; +#endif + p = remove_vm_area((void *) (PAGE_MASK & (unsigned long __force) addr)); + if (!p) { + printk("__iounmap: bad address %p\n", addr); + return; + } + + if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) { + /* p->size includes the guard page, but cpa doesn't like that */ + change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)), + (p->size - PAGE_SIZE) >> PAGE_SHIFT, + PAGE_KERNEL); + global_flush_tlb(); + } + kfree(p); +} + +void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) +{ + unsigned long offset, last_addr; + unsigned int nrpages; + enum fixed_addresses idx; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + /* + * Don't remap the low PCI/ISA area, it's always mapped.. + */ + if (phys_addr >= 0x0 && last_addr < 0x100000) + return isa_bus_to_virt(phys_addr); +#endif + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr) - phys_addr; + + /* + * Mappings have to fit in the FIX_BTMAP area. + */ + nrpages = size >> PAGE_SHIFT; + if (nrpages > NR_FIX_BTMAPS) + return NULL; + + /* + * Ok, go for it.. + */ + idx = FIX_BTMAP_BEGIN; + while (nrpages > 0) { + set_fixmap(idx, phys_addr); + phys_addr += PAGE_SIZE; + --idx; + --nrpages; + } + return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); +} + +void __init bt_iounmap(void *addr, unsigned long size) +{ + unsigned long virt_addr; + unsigned long offset; + unsigned int nrpages; + enum fixed_addresses idx; + + virt_addr = (unsigned long)addr; + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) + return; +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) + return; +#endif + offset = virt_addr & ~PAGE_MASK; + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; + + idx = FIX_BTMAP_BEGIN; + while (nrpages > 0) { + clear_fixmap(idx); + --idx; + --nrpages; + } +} + +#endif /* CONFIG_XEN_PHYSDEV_ACCESS */ + +/* These hacky macros avoid phys->machine translations. */ +#define __direct_pte(x) ((pte_t) { (x) } ) +#define __direct_mk_pte(page_nr,pgprot) \ + __direct_pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot)) +#define direct_mk_pte_phys(physpage, pgprot) \ + __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot) + +static inline void direct_remap_area_pte(pte_t *pte, + unsigned long address, + unsigned long size, + mmu_update_t **v) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + if (address >= end) + BUG(); + + do { + (*v)->ptr = virt_to_machine(pte); + (*v)++; + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +static inline int direct_remap_area_pmd(struct mm_struct *mm, + pmd_t *pmd, + unsigned long address, + unsigned long size, + mmu_update_t **v) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + if (address >= end) + BUG(); + do { + pte_t *pte = (mm == &init_mm) ? 
+ pte_alloc_kernel(mm, pmd, address) : + pte_alloc_map(mm, pmd, address); + if (!pte) + return -ENOMEM; + direct_remap_area_pte(pte, address, end - address, v); + pte_unmap(pte); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return 0; +} + +int __direct_remap_area_pages(struct mm_struct *mm, + unsigned long address, + unsigned long size, + mmu_update_t *v) +{ + pgd_t * dir; + unsigned long end = address + size; + int error; + + dir = pgd_offset(mm, address); + if (address >= end) + BUG(); + spin_lock(&mm->page_table_lock); + do { + pud_t *pud; + pmd_t *pmd; + + error = -ENOMEM; + pud = pud_alloc(mm, dir, address); + if (!pud) + break; + pmd = pmd_alloc(mm, pud, address); + if (!pmd) + break; + error = 0; + direct_remap_area_pmd(mm, pmd, address, end - address, &v); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + + } while (address && (address < end)); + spin_unlock(&mm->page_table_lock); + return error; +} + + +int direct_remap_area_pages(struct mm_struct *mm, + unsigned long address, + unsigned long machine_addr, + unsigned long size, + pgprot_t prot, + domid_t domid) +{ + int i; + unsigned long start_address; +#define MAX_DIRECTMAP_MMU_QUEUE 130 + mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v = u; + + start_address = address; + + flush_cache_all(); + + for (i = 0; i < size; i += PAGE_SIZE) { + if ((v - u) == MAX_DIRECTMAP_MMU_QUEUE) { + /* Fill in the PTE pointers. */ + __direct_remap_area_pages(mm, + start_address, + address-start_address, + u); + + if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0) + return -EFAULT; + v = u; + start_address = address; + } + + /* + * Fill in the machine address: PTE ptr is done later by + * __direct_remap_area_pages(). + */ + v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot); + + machine_addr += PAGE_SIZE; + address += PAGE_SIZE; + v++; + } + + if (v != u) { + /* get the ptep's filled in */ + __direct_remap_area_pages(mm, + start_address, + address-start_address, + u); + if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)) + return -EFAULT; + } + + flush_tlb_all(); + + return 0; +} + +EXPORT_SYMBOL(direct_remap_area_pages); Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/Makefile =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/Makefile @@ -0,0 +1,24 @@ +# +# Makefile for the linux i386-specific parts of the memory manager. 
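
Stepping back to direct_remap_area_pages() above: it batches up to MAX_DIRECTMAP_MMU_QUEUE updates per hypercall, flushing when the array fills and once more for the remainder. The pattern in isolation, where flush() is a hypothetical stand-in for the HYPERVISOR_mmu_update call:

    #include <stdio.h>

    #define BATCH 130  /* MAX_DIRECTMAP_MMU_QUEUE in the patch */

    struct req { unsigned long ptr, val; };

    static void flush(const struct req *q, long n)
    {
        (void)q;
        printf("flushing %ld requests\n", n);
    }

    int main(void)
    {
        struct req u[BATCH], *v = u;
        long i;

        for (i = 0; i < 1000; i++) {
            if (v - u == BATCH) {  /* queue full: issue one hypercall */
                flush(u, v - u);
                v = u;
            }
            v->ptr = i;
            v->val = i;
            v++;
        }
        if (v != u)                /* don't forget the tail */
            flush(u, v - u);
        return 0;
    }
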
+# + +XENARCH := $(subst ",,$(CONFIG_XENARCH)) + +CFLAGS += -Iarch/$(XENARCH)/mm + +obj-y := init.o pgtable.o fault.o ioremap.o hypervisor.o +c-obj-y := extable.o mmap.o pageattr.o + +c-obj-$(CONFIG_DISCONTIGMEM) += discontig.o +c-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_HIGHMEM) += highmem.o +c-obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o + +c-link := + +$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): + @ln -fsn $(srctree)/arch/i386/mm/$(notdir $@) $@ + +obj-y += $(c-obj-y) + +clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/pgtable.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/pgtable.c @@ -0,0 +1,551 @@ +/* + * linux/arch/i386/mm/pgtable.c + */ + +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/highmem.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/spinlock.h> + +#include <asm/system.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/fixmap.h> +#include <asm/e820.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/io.h> +#include <asm/mmu_context.h> + +#include <asm-xen/foreign_page.h> + +void show_mem(void) +{ + int total = 0, reserved = 0; + int shared = 0, cached = 0; + int highmem = 0; + struct page *page; + pg_data_t *pgdat; + unsigned long i; + + printk("Mem-info:\n"); + show_free_areas(); + printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + for_each_pgdat(pgdat) { + for (i = 0; i < pgdat->node_spanned_pages; ++i) { + page = pgdat->node_mem_map + i; + total++; + if (PageHighMem(page)) + highmem++; + if (PageReserved(page)) + reserved++; + else if (PageSwapCache(page)) + cached++; + else if (page_count(page)) + shared += page_count(page) - 1; + } + } + printk("%d pages of RAM\n", total); + printk("%d pages of HIGHMEM\n",highmem); + printk("%d reserved pages\n",reserved); + printk("%d pages shared\n",shared); + printk("%d pages swap cached\n",cached); +} + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. + */ +static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + /* <pfn,flags> stored as-is, to permit clearing entries */ + set_pte(pte, pfn_pte(pfn, flags)); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. 
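
The _ma variant coming up below differs from set_pte_pfn() only in skipping the pfn-to-mfn translation, because the caller already holds a machine frame number. The translation being skipped is a pair of inverse tables, the same round trip that is_local_lowmem() in ioremap.c verifies with pfn_to_mfn(mfn_to_pfn(mfn)) == mfn. A toy model, with small arrays standing in for phys_to_machine_mapping[] and the hypervisor's machine-to-phys table:

    #include <assert.h>

    #define NPAGES 8

    /* Any permutation will do for illustration. */
    static unsigned long p2m[NPAGES] = { 5, 2, 7, 0, 1, 3, 4, 6 };
    static unsigned long m2p[NPAGES];

    int main(void)
    {
        unsigned long pfn;
        for (pfn = 0; pfn < NPAGES; pfn++)
            m2p[p2m[pfn]] = pfn;          /* build the inverse mapping  */
        for (pfn = 0; pfn < NPAGES; pfn++)
            assert(m2p[p2m[pfn]] == pfn); /* round trip is the identity */
        return 0;
    }
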
+ */ +static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn, + pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + /* <pfn,flags> stored as-is, to permit clearing entries */ + set_pte(pte, pfn_pte_ma(pfn, flags)); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +/* + * Associate a large virtual page frame with a given physical page frame + * and protection flags for that frame. pfn is for the base of the page, + * vaddr is what the page gets mapped to - both must be properly aligned. + * The pmd must already be instantiated. Assumes PAE mode. + */ +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ + printk ("set_pmd_pfn: vaddr misaligned\n"); + return; /* BUG(); */ + } + if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ + printk ("set_pmd_pfn: pfn misaligned\n"); + return; /* BUG(); */ + } + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + printk ("set_pmd_pfn: pgd_none\n"); + return; /* BUG(); */ + } + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + set_pmd(pmd, pfn_pmd(pfn, flags)); + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) +{ + unsigned long address = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + switch (idx) { + case FIX_WP_TEST: + case FIX_VSYSCALL: +#ifdef CONFIG_X86_F00F_BUG + case FIX_F00F_IDT: +#endif + set_pte_pfn(address, phys >> PAGE_SHIFT, flags); + break; + default: + set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags); + break; + } +} + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + if (pte) + make_page_readonly(pte); + return pte; +} + +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pte; + +#ifdef CONFIG_HIGHPTE + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); +#else + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + if (pte) { + SetPageForeign(pte, _pte_free); + set_page_count(pte, 1); + } +#endif + + return pte; +} + +void _pte_free(struct page *pte) +{ + unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT); + + if (!pte_write(*virt_to_ptep(va))) + HYPERVISOR_update_va_mapping( + va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0); + + ClearPageForeign(pte); + set_page_count(pte, 1); + + __free_page(pte); +} + +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) +{ + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); +} + +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. 
This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * The locking scheme was chosen on the basis of manfred's + * recommendations and having no core impact whatsoever. + * -- wli + */ +DEFINE_SPINLOCK(pgd_lock); +struct page *pgd_list; + +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + page->index = (unsigned long)pgd_list; + if (pgd_list) + pgd_list->private = (unsigned long)&page->index; + pgd_list = page; + page->private = (unsigned long)&pgd_list; +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *next, **pprev, *page = virt_to_page(pgd); + next = (struct page *)page->index; + pprev = (struct page **)page->private; + *pprev = next; + if (next) + next->private = (unsigned long)pprev; +} + +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) +{ + unsigned long flags; + + if (!HAVE_SHARED_KERNEL_PMD) + spin_lock_irqsave(&pgd_lock, flags); + + memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + + if (HAVE_SHARED_KERNEL_PMD) + return; + + pgd_list_add(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); +} + +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) +{ + unsigned long flags; /* can be called from interrupt context */ + + if (HAVE_SHARED_KERNEL_PMD) + return; + + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + int i = 0; + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); + + if (PTRS_PER_PMD == 1 || !pgd) + return pgd; + + if (!HAVE_SHARED_KERNEL_PMD) { + /* alloc and copy kernel pmd */ + unsigned long flags; + pgd_t *copy_pgd = pgd_offset_k(PAGE_OFFSET); + pud_t *copy_pud = pud_offset(copy_pgd, PAGE_OFFSET); + pmd_t *copy_pmd = pmd_offset(copy_pud, PAGE_OFFSET); + pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + if (0 == pmd) + goto out_oom; + + spin_lock_irqsave(&pgd_lock, flags); + memcpy(pmd, copy_pmd, PAGE_SIZE); + spin_unlock_irqrestore(&pgd_lock, flags); + make_page_readonly(pmd); + set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd))); + } + + /* alloc user pmds */ + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + if (!pmd) + goto out_oom; + set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); + } + return pgd; + +out_oom: + for (i--; i >= 0; i--) + kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + kmem_cache_free(pgd_cache, pgd); + return NULL; +} + +void pgd_free(pgd_t *pgd) +{ + int i; + pte_t *ptep = virt_to_ptep(pgd); + + if (!pte_write(*ptep)) { + xen_pgd_unpin(__pa(pgd)); + HYPERVISOR_update_va_mapping( + (unsigned long)pgd, + pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL), + 0); + } + + /* in the PAE case user pgd entries are overwritten before usage */ + if (PTRS_PER_PMD > 1) { + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); + kmem_cache_free(pmd_cache, pmd); + } + if (!HAVE_SHARED_KERNEL_PMD) { + pmd_t *pmd = (void *)__va(pgd_val(pgd[USER_PTRS_PER_PGD])-1); + make_page_writable(pmd); + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); + kmem_cache_free(pmd_cache, pmd); + } + } + /* in the non-PAE case, free_pgtables() clears user pgd 
entries */ + kmem_cache_free(pgd_cache, pgd); +} + +#ifndef CONFIG_XEN_SHADOW_MODE +void make_lowmem_page_readonly(void *va) +{ + pte_t *pte = virt_to_ptep(va); + set_pte(pte, pte_wrprotect(*pte)); +} + +void make_lowmem_page_writable(void *va) +{ + pte_t *pte = virt_to_ptep(va); + set_pte(pte, pte_mkwrite(*pte)); +} + +void make_page_readonly(void *va) +{ + pte_t *pte = virt_to_ptep(va); + set_pte(pte, pte_wrprotect(*pte)); + if ( (unsigned long)va >= (unsigned long)high_memory ) + { + unsigned long phys; + phys = machine_to_phys(*(unsigned long *)pte & PAGE_MASK); +#ifdef CONFIG_HIGHMEM + if ( (phys >> PAGE_SHIFT) < highstart_pfn ) +#endif + make_lowmem_page_readonly(phys_to_virt(phys)); + } +} + +void make_page_writable(void *va) +{ + pte_t *pte = virt_to_ptep(va); + set_pte(pte, pte_mkwrite(*pte)); + if ( (unsigned long)va >= (unsigned long)high_memory ) + { + unsigned long phys; + phys = machine_to_phys(*(unsigned long *)pte & PAGE_MASK); +#ifdef CONFIG_HIGHMEM + if ( (phys >> PAGE_SHIFT) < highstart_pfn ) +#endif + make_lowmem_page_writable(phys_to_virt(phys)); + } +} + +void make_pages_readonly(void *va, unsigned int nr) +{ + while ( nr-- != 0 ) + { + make_page_readonly(va); + va = (void *)((unsigned long)va + PAGE_SIZE); + } +} + +void make_pages_writable(void *va, unsigned int nr) +{ + while ( nr-- != 0 ) + { + make_page_writable(va); + va = (void *)((unsigned long)va + PAGE_SIZE); + } +} +#endif /* CONFIG_XEN_SHADOW_MODE */ + +LIST_HEAD(mm_unpinned); +DEFINE_SPINLOCK(mm_unpinned_lock); + +static inline void mm_walk_set_prot(void *pt, pgprot_t flags) +{ + struct page *page = virt_to_page(pt); + unsigned long pfn = page_to_pfn(page); + + if (PageHighMem(page)) + return; + HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, flags), 0); +} + +static void mm_walk(struct mm_struct *mm, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int g,u,m; + + pgd = mm->pgd; + for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + if (PTRS_PER_PUD > 1) /* not folded */ + mm_walk_set_prot(pud,flags); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + if (PTRS_PER_PMD > 1) /* not folded */ + mm_walk_set_prot(pmd,flags); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + if (pmd_none(*pmd)) + continue; + pte = pte_offset_kernel(pmd,0); + mm_walk_set_prot(pte,flags); + } + } + } +} + +void mm_pin(struct mm_struct *mm) +{ + spin_lock(&mm->page_table_lock); + + mm_walk(mm, PAGE_KERNEL_RO); + HYPERVISOR_update_va_mapping( + (unsigned long)mm->pgd, + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO), + UVMF_TLB_FLUSH); + xen_pgd_pin(__pa(mm->pgd)); + mm->context.pinned = 1; + spin_lock(&mm_unpinned_lock); + list_del(&mm->context.unpinned); + spin_unlock(&mm_unpinned_lock); + + spin_unlock(&mm->page_table_lock); +} + +void mm_unpin(struct mm_struct *mm) +{ + spin_lock(&mm->page_table_lock); + + xen_pgd_unpin(__pa(mm->pgd)); + HYPERVISOR_update_va_mapping( + (unsigned long)mm->pgd, + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0); + mm_walk(mm, PAGE_KERNEL); + xen_tlb_flush(); + mm->context.pinned = 0; + spin_lock(&mm_unpinned_lock); + list_add(&mm->context.unpinned, &mm_unpinned); + spin_unlock(&mm_unpinned_lock); + + spin_unlock(&mm->page_table_lock); +} + +void mm_pin_all(void) +{ + while (!list_empty(&mm_unpinned)) + mm_pin(list_entry(mm_unpinned.next, struct mm_struct, + context.unpinned)); +} + +void 
_arch_exit_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + + /* + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() + * *much* faster this way, as no tlb flushes means bigger wrpt batches. + */ + if ( tsk->active_mm == mm ) + { + tsk->active_mm = &init_mm; + atomic_inc(&init_mm.mm_count); + + switch_mm(mm, &init_mm, tsk); + + atomic_dec(&mm->mm_count); + BUG_ON(atomic_read(&mm->mm_count) == 0); + } + + task_unlock(tsk); + + if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) ) + mm_unpin(mm); +} Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/pci/irq.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/pci/irq.c @@ -0,0 +1,1120 @@ +/* + * Low-Level PCI Support for PC -- Routing of Interrupts + * + * (c) 1999--2000 Martin Mares <mj@xxxxxx> + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/dmi.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/io_apic.h> +#include <asm/hw_irq.h> +#include <linux/acpi.h> + +#include "pci.h" + +#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) +#define PIRQ_VERSION 0x0100 + +static int broken_hp_bios_irq9; +static int acer_tm360_irqrouting; + +static struct irq_routing_table *pirq_table; + +static int pirq_enable_irq(struct pci_dev *dev); + +/* + * Never use: 0, 1, 2 (timer, keyboard, and cascade) + * Avoid using: 13, 14 and 15 (FP error and IDE). + * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse) + */ +unsigned int pcibios_irq_mask = 0xfff8; + +static int pirq_penalty[16] = { + 1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000, + 0, 0, 0, 0, 1000, 100000, 100000, 100000 +}; + +struct irq_router { + char *name; + u16 vendor, device; + int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq); + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new); +}; + +struct irq_router_handler { + u16 vendor; + int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device); +}; + +int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL; + +/* + * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table. + */ + +static struct irq_routing_table * __init pirq_find_routing_table(void) +{ + u8 *addr; + struct irq_routing_table *rt; + int i; + u8 sum; + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) { + rt = (struct irq_routing_table *) addr; + if (rt->signature != PIRQ_SIGNATURE || + rt->version != PIRQ_VERSION || + rt->size % 16 || + rt->size < sizeof(struct irq_routing_table)) + continue; + sum = 0; + for(i=0; i<rt->size; i++) + sum += addr[i]; + if (!sum) { + DBG("PCI: Interrupt Routing Table found at 0x%p\n", rt); + return rt; + } + } +#endif + + return NULL; +} + +/* + * If we have a IRQ routing table, use it to search for peer host + * bridges. It's a gross hack, but since there are no other known + * ways how to get a list of buses, we have to go this way. 
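
For reference, pirq_find_routing_table() above accepts a candidate table only when the signature matches, the size is a non-zero multiple of 16 at least as large as the header, and all bytes sum to zero modulo 256. That validation on its own (32 stands in for sizeof(struct irq_routing_table); the memcpy assumes a little-endian host, as on x86):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))

    static int pir_table_ok(const uint8_t *tbl, uint16_t size)
    {
        uint32_t sig;
        uint8_t sum = 0;
        uint16_t i;

        memcpy(&sig, tbl, 4);           /* bytes 0-3 hold "$PIR" */
        if (sig != PIRQ_SIGNATURE || size % 16 || size < 32)
            return 0;
        for (i = 0; i < size; i++)      /* checksum byte makes this zero */
            sum += tbl[i];
        return sum == 0;
    }

    int main(void)
    {
        uint8_t tbl[32] = { '$', 'P', 'I', 'R' };
        tbl[31] = (uint8_t)-('$' + 'P' + 'I' + 'R'); /* fix up the checksum */
        printf("%d\n", pir_table_ok(tbl, sizeof(tbl))); /* prints: 1 */
        return 0;
    }
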
+ */ + +static void __init pirq_peer_trick(void) +{ + struct irq_routing_table *rt = pirq_table; + u8 busmap[256]; + int i; + struct irq_info *e; + + memset(busmap, 0, sizeof(busmap)); + for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) { + e = &rt->slots[i]; +#ifdef DEBUG + { + int j; + DBG("%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot); + for(j=0; j<4; j++) + DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap); + DBG("\n"); + } +#endif + busmap[e->bus] = 1; + } + for(i = 1; i < 256; i++) { + if (!busmap[i] || pci_find_bus(0, i)) + continue; + if (pci_scan_bus(i, &pci_root_ops, NULL)) + printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i); + } + pcibios_last_bus = -1; +} + +/* + * Code for querying and setting of IRQ routes on various interrupt routers. + */ + +void eisa_set_level_irq(unsigned int irq) +{ + unsigned char mask = 1 << (irq & 7); + unsigned int port = 0x4d0 + (irq >> 3); + unsigned char val; + static u16 eisa_irq_mask; + + if (irq >= 16 || (1 << irq) & eisa_irq_mask) + return; + + eisa_irq_mask |= (1 << irq); + printk("PCI: setting IRQ %u as level-triggered\n", irq); + val = inb(port); + if (!(val & mask)) { + DBG(" -> edge"); + outb(val | mask, port); + } +} + +/* + * Common IRQ routing practice: nybbles in config space, + * offset by some magic constant. + */ +static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr) +{ + u8 x; + unsigned reg = offset + (nr >> 1); + + pci_read_config_byte(router, reg, &x); + return (nr & 1) ? (x >> 4) : (x & 0xf); +} + +static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val) +{ + u8 x; + unsigned reg = offset + (nr >> 1); + + pci_read_config_byte(router, reg, &x); + x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val); + pci_write_config_byte(router, reg, x); +} + +/* + * ALI pirq entries are damn ugly, and completely undocumented. + * This has been figured out from pirq tables, and it's not a pretty + * picture. + */ +static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + static unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 }; + + return irqmap[read_config_nybble(router, 0x48, pirq-1)]; +} + +static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + static unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 }; + unsigned int val = irqmap[irq]; + + if (val) { + write_config_nybble(router, 0x48, pirq-1, val); + return 1; + } + return 0; +} + +/* + * The Intel PIIX4 pirq rules are fairly simple: "pirq" is + * just a pointer to the config space. + */ +static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + u8 x; + + pci_read_config_byte(router, pirq, &x); + return (x < 16) ? x : 0; +} + +static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + pci_write_config_byte(router, pirq, irq); + return 1; +} + +/* + * The VIA pirq rules are nibble-based, like ALI, + * but without the ugly irq number munging. + * However, PIRQD is in the upper instead of lower 4 bits. + */ +static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq); +} + +static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + write_config_nybble(router, 0x55, pirq == 4 ? 
5 : pirq, irq); + return 1; +} + +/* + * ITE 8330G pirq rules are nibble-based + * FIXME: pirqmap may be { 1, 0, 3, 2 }, + * 2+3 are both mapped to irq 9 on my system + */ +static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + static unsigned char pirqmap[4] = { 1, 0, 2, 3 }; + return read_config_nybble(router,0x43, pirqmap[pirq-1]); +} + +static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + static unsigned char pirqmap[4] = { 1, 0, 2, 3 }; + write_config_nybble(router, 0x43, pirqmap[pirq-1], irq); + return 1; +} + +/* + * OPTI: high four bits are nibble pointer.. + * I wonder what the low bits do? + */ +static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + return read_config_nybble(router, 0xb8, pirq >> 4); +} + +static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + write_config_nybble(router, 0xb8, pirq >> 4, irq); + return 1; +} + +/* + * Cyrix: nibble offset 0x5C + * 0x5C bits 7:4 is INTB bits 3:0 is INTA + * 0x5D bits 7:4 is INTD bits 3:0 is INTC + */ +static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + return read_config_nybble(router, 0x5C, (pirq-1)^1); +} + +static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + write_config_nybble(router, 0x5C, (pirq-1)^1, irq); + return 1; +} + +/* + * PIRQ routing for SiS 85C503 router used in several SiS chipsets. + * We have to deal with the following issues here: + * - vendors have different ideas about the meaning of link values + * - some onboard devices (integrated in the chipset) have special + * links and are thus routed differently (i.e. not via PCI INTA-INTD) + * - different revision of the router have a different layout for + * the routing registers, particularly for the onchip devices + * + * For all routing registers the common thing is we have one byte + * per routeable link which is defined as: + * bit 7 IRQ mapping enabled (0) or disabled (1) + * bits [6:4] reserved (sometimes used for onchip devices) + * bits [3:0] IRQ to map to + * allowed: 3-7, 9-12, 14-15 + * reserved: 0, 1, 2, 8, 13 + * + * The config-space registers located at 0x41/0x42/0x43/0x44 are + * always used to route the normal PCI INT A/B/C/D respectively. + * Apparently there are systems implementing PCI routing table using + * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D. + * We try our best to handle both link mappings. + * + * Currently (2003-05-21) it appears most SiS chipsets follow the + * definition of routing registers from the SiS-5595 southbridge. + * According to the SiS 5595 datasheets the revision id's of the + * router (ISA-bridge) should be 0x01 or 0xb0. + * + * Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1. + * Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets. + * They seem to work with the current routing code. However there is + * some concern because of the two USB-OHCI HCs (original SiS 5595 + * had only one). YMMV. + * + * Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1: + * + * 0x61: IDEIRQ: + * bits [6:5] must be written 01 + * bit 4 channel-select primary (0), secondary (1) + * + * 0x62: USBIRQ: + * bit 6 OHCI function disabled (0), enabled (1) + * + * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved + * + * 0x7e: Data Acq. 
Module IRQ - bits 4-6 reserved + * + * We support USBIRQ (in addition to INTA-INTD) and keep the + * IDE, ACPI and DAQ routing untouched as set by the BIOS. + * + * Currently the only reported exception is the new SiS 65x chipset + * which includes the SiS 69x southbridge. Here we have the 85C503 + * router revision 0x04 and there are changes in the register layout + * mostly related to the different USB HCs with USB 2.0 support. + * + * Onchip routing for router rev-id 0x04 (try-and-error observation) + * + * 0x60/0x61/0x62/0x63: 1xEHCI and 3xOHCI (companion) USB-HCs + * bit 6-4 are probably unused, not like 5595 + */ + +#define PIRQ_SIS_IRQ_MASK 0x0f +#define PIRQ_SIS_IRQ_DISABLE 0x80 +#define PIRQ_SIS_USB_ENABLE 0x40 + +static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + u8 x; + int reg; + + reg = pirq; + if (reg >= 0x01 && reg <= 0x04) + reg += 0x40; + pci_read_config_byte(router, reg, &x); + return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK); +} + +static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + u8 x; + int reg; + + reg = pirq; + if (reg >= 0x01 && reg <= 0x04) + reg += 0x40; + pci_read_config_byte(router, reg, &x); + x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE); + x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE; + pci_write_config_byte(router, reg, x); + return 1; +} + + +/* + * VLSI: nibble offset 0x74 - educated guess due to routing table and + * config space of VLSI 82C534 PCI-bridge/router (1004:0102) + * Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard + * devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6 + * for the busbridge to the docking station. + */ + +static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + if (pirq > 8) { + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); + return 0; + } + return read_config_nybble(router, 0x74, pirq-1); +} + +static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + if (pirq > 8) { + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); + return 0; + } + write_config_nybble(router, 0x74, pirq-1, irq); + return 1; +} + +/* + * ServerWorks: PCI interrupts mapped to system IRQ lines through Index + * and Redirect I/O registers (0x0c00 and 0x0c01). The Index register + * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a. The Redirect + * register is a straight binary coding of desired PIC IRQ (low nibble). + * + * The 'link' value in the PIRQ table is already in the correct format + * for the Index register. There are some special index values: + * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1, + * and 0x03 for SMBus. + */ +static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + outb_p(pirq, 0xc00); + return inb(0xc01) & 0xf; +} + +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + outb_p(pirq, 0xc00); + outb_p(irq, 0xc01); + return 1; +} + +/* Support for AMD756 PCI IRQ Routing + * Jhon H. Caicedo <jhcaiced@xxxxxxxxxxx> + * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... 
(jhcaiced) + * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced) + * The AMD756 pirq rules are nibble-based + * offset 0x56 0-3 PIRQA 4-7 PIRQB + * offset 0x57 0-3 PIRQC 4-7 PIRQD + */ +static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + u8 irq; + irq = 0; + if (pirq <= 4) + { + irq = read_config_nybble(router, 0x56, pirq - 1); + } + printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n", + dev->vendor, dev->device, pirq, irq); + return irq; +} + +static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", + dev->vendor, dev->device, pirq, irq); + if (pirq <= 4) + { + write_config_nybble(router, 0x56, pirq - 1, irq); + } + return 1; +} + +#ifdef CONFIG_PCI_BIOS + +static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + struct pci_dev *bridge; + int pin = pci_get_interrupt_pin(dev, &bridge); + return pcibios_set_irq_routing(bridge, pin, irq); +} + +#endif + +static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + static struct pci_device_id pirq_440gx[] = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) }, + { }, + }; + + /* 440GX has a proprietary PIRQ router -- don't use it */ + if (pci_dev_present(pirq_440gx)) + return 0; + + switch(device) + { + case PCI_DEVICE_ID_INTEL_82371FB_0: + case PCI_DEVICE_ID_INTEL_82371SB_0: + case PCI_DEVICE_ID_INTEL_82371AB_0: + case PCI_DEVICE_ID_INTEL_82371MX: + case PCI_DEVICE_ID_INTEL_82443MX_0: + case PCI_DEVICE_ID_INTEL_82801AA_0: + case PCI_DEVICE_ID_INTEL_82801AB_0: + case PCI_DEVICE_ID_INTEL_82801BA_0: + case PCI_DEVICE_ID_INTEL_82801BA_10: + case PCI_DEVICE_ID_INTEL_82801CA_0: + case PCI_DEVICE_ID_INTEL_82801CA_12: + case PCI_DEVICE_ID_INTEL_82801DB_0: + case PCI_DEVICE_ID_INTEL_82801E_0: + case PCI_DEVICE_ID_INTEL_82801EB_0: + case PCI_DEVICE_ID_INTEL_ESB_1: + case PCI_DEVICE_ID_INTEL_ICH6_0: + case PCI_DEVICE_ID_INTEL_ICH6_1: + case PCI_DEVICE_ID_INTEL_ICH7_0: + case PCI_DEVICE_ID_INTEL_ICH7_1: + case PCI_DEVICE_ID_INTEL_ICH7_30: + case PCI_DEVICE_ID_INTEL_ICH7_31: + case PCI_DEVICE_ID_INTEL_ESB2_0: + r->name = "PIIX/ICH"; + r->get = pirq_piix_get; + r->set = pirq_piix_set; + return 1; + } + return 0; +} + +static __init int via_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + /* FIXME: We should move some of the quirk fixup stuff here */ + switch(device) + { + case PCI_DEVICE_ID_VIA_82C586_0: + case PCI_DEVICE_ID_VIA_82C596: + case PCI_DEVICE_ID_VIA_82C686: + case PCI_DEVICE_ID_VIA_8231: + /* FIXME: add new ones for 8233/5 */ + r->name = "VIA"; + r->get = pirq_via_get; + r->set = pirq_via_set; + return 1; + } + return 0; +} + +static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + switch(device) + { + case PCI_DEVICE_ID_VLSI_82C534: + r->name = "VLSI 82C534"; + r->get = pirq_vlsi_get; + r->set = pirq_vlsi_set; + return 1; + } + return 0; +} + + +static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + switch(device) + { + case PCI_DEVICE_ID_SERVERWORKS_OSB4: + case PCI_DEVICE_ID_SERVERWORKS_CSB5: + r->name = "ServerWorks"; + r->get = pirq_serverworks_get; + r->set = pirq_serverworks_set; + return 1; + } + return 0; +} + +static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, 
u16 device) +{ + if (device != PCI_DEVICE_ID_SI_503) + return 0; + + r->name = "SIS"; + r->get = pirq_sis_get; + r->set = pirq_sis_set; + return 1; +} + +static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + switch(device) + { + case PCI_DEVICE_ID_CYRIX_5520: + r->name = "NatSemi"; + r->get = pirq_cyrix_get; + r->set = pirq_cyrix_set; + return 1; + } + return 0; +} + +static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + switch(device) + { + case PCI_DEVICE_ID_OPTI_82C700: + r->name = "OPTI"; + r->get = pirq_opti_get; + r->set = pirq_opti_set; + return 1; + } + return 0; +} + +static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + switch(device) + { + case PCI_DEVICE_ID_ITE_IT8330G_0: + r->name = "ITE"; + r->get = pirq_ite_get; + r->set = pirq_ite_set; + return 1; + } + return 0; +} + +static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + switch(device) + { + case PCI_DEVICE_ID_AL_M1533: + case PCI_DEVICE_ID_AL_M1563: + printk("PCI: Using ALI IRQ Router\n"); + r->name = "ALI"; + r->get = pirq_ali_get; + r->set = pirq_ali_set; + return 1; + } + return 0; +} + +static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + switch(device) + { + case PCI_DEVICE_ID_AMD_VIPER_740B: + r->name = "AMD756"; + break; + case PCI_DEVICE_ID_AMD_VIPER_7413: + r->name = "AMD766"; + break; + case PCI_DEVICE_ID_AMD_VIPER_7443: + r->name = "AMD768"; + break; + default: + return 0; + } + r->get = pirq_amd756_get; + r->set = pirq_amd756_set; + return 1; +} + +static __initdata struct irq_router_handler pirq_routers[] = { + { PCI_VENDOR_ID_INTEL, intel_router_probe }, + { PCI_VENDOR_ID_AL, ali_router_probe }, + { PCI_VENDOR_ID_ITE, ite_router_probe }, + { PCI_VENDOR_ID_VIA, via_router_probe }, + { PCI_VENDOR_ID_OPTI, opti_router_probe }, + { PCI_VENDOR_ID_SI, sis_router_probe }, + { PCI_VENDOR_ID_CYRIX, cyrix_router_probe }, + { PCI_VENDOR_ID_VLSI, vlsi_router_probe }, + { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe }, + { PCI_VENDOR_ID_AMD, amd_router_probe }, + /* Someone with docs needs to add the ATI Radeon IGP */ + { 0, NULL } +}; +static struct irq_router pirq_router; +static struct pci_dev *pirq_router_dev; + + +/* + * FIXME: should we have an option to say "generic for + * chipset" ? 
+ */ + +static void __init pirq_find_router(struct irq_router *r) +{ + struct irq_routing_table *rt = pirq_table; + struct irq_router_handler *h; + +#ifdef CONFIG_PCI_BIOS + if (!rt->signature) { + printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n"); + r->set = pirq_bios_set; + r->name = "BIOS"; + return; + } +#endif + + /* Default unless a driver reloads it */ + r->name = "default"; + r->get = NULL; + r->set = NULL; + + DBG("PCI: Attempting to find IRQ router for %04x:%04x\n", + rt->rtr_vendor, rt->rtr_device); + + pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn); + if (!pirq_router_dev) { + DBG("PCI: Interrupt router not found at %02x:%02x\n", rt->rtr_bus, rt->rtr_devfn); + return; + } + + for( h = pirq_routers; h->vendor; h++) { + /* First look for a router match */ + if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device)) + break; + /* Fall back to a device match */ + if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device)) + break; + } + printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", + pirq_router.name, + pirq_router_dev->vendor, + pirq_router_dev->device, + pci_name(pirq_router_dev)); +} + +static struct irq_info *pirq_get_info(struct pci_dev *dev) +{ + struct irq_routing_table *rt = pirq_table; + int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); + struct irq_info *info; + + for (info = rt->slots; entries--; info++) + if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) + return info; + return NULL; +} + +static int pcibios_lookup_irq(struct pci_dev *dev, int assign) +{ + u8 pin; + struct irq_info *info; + int i, pirq, newirq; + int irq = 0; + u32 mask; + struct irq_router *r = &pirq_router; + struct pci_dev *dev2 = NULL; + char *msg = NULL; + + /* Find IRQ pin */ + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (!pin) { + DBG(" -> no interrupt pin\n"); + return 0; + } + pin = pin - 1; + + /* Find IRQ routing entry */ + + if (!pirq_table) + return 0; + + DBG("IRQ for %s[%c]", pci_name(dev), 'A' + pin); + info = pirq_get_info(dev); + if (!info) { + DBG(" -> not found in routing table\n"); + return 0; + } + pirq = info->irq[pin].link; + mask = info->irq[pin].bitmap; + if (!pirq) { + DBG(" -> not routed\n"); + return 0; + } + DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs); + mask &= pcibios_irq_mask; + + /* Work around broken HP Pavilion Notebooks which assign USB to + IRQ 9 even though it is actually wired to IRQ 11 */ + + if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) { + dev->irq = 11; + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11); + r->set(pirq_router_dev, dev, pirq, 11); + } + + /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */ + if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) { + pirq = 0x68; + mask = 0x400; + dev->irq = r->get(pirq_router_dev, dev, pirq); + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq); + } + + /* + * Find the best IRQ to assign: use the one + * reported by the device if possible. 
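+ * (dev->irq is whatever the BIOS wrote to PCI_INTERRUPT_LINE. If it
+ * falls outside the PIRQ bitmap we discard it when pci=usepirqmask
+ * is in effect, and otherwise just warn and keep it -- see the test
+ * on PCI_USE_PIRQ_MASK below.)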
+ */ + newirq = dev->irq; + if (!((1 << newirq) & mask)) { + if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0; + else printk(KERN_WARNING "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n", newirq, pci_name(dev)); + } + if (!newirq && assign) { + for (i = 0; i < 16; i++) { + if (!(mask & (1 << i))) + continue; + if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, SA_SHIRQ)) + newirq = i; + } + } + DBG(" -> newirq=%d", newirq); + + /* Check if it is hardcoded */ + if ((pirq & 0xf0) == 0xf0) { + irq = pirq & 0xf; + DBG(" -> hardcoded IRQ %d\n", irq); + msg = "Hardcoded"; + } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) { + DBG(" -> got IRQ %d\n", irq); + msg = "Found"; + } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { + DBG(" -> assigning IRQ %d", newirq); + if (r->set(pirq_router_dev, dev, pirq, newirq)) { + eisa_set_level_irq(newirq); + DBG(" ... OK\n"); + msg = "Assigned"; + irq = newirq; + } + } + + if (!irq) { + DBG(" ... failed\n"); + if (newirq && mask == (1 << newirq)) { + msg = "Guessed"; + irq = newirq; + } else + return 0; + } + printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev)); + + /* Update IRQ for all devices with the same pirq value */ + while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { + pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); + if (!pin) + continue; + pin--; + info = pirq_get_info(dev2); + if (!info) + continue; + if (info->irq[pin].link == pirq) { + /* We refuse to override the dev->irq information. Give a warning! */ + if ( dev2->irq && dev2->irq != irq && \ + (!(pci_probe & PCI_USE_PIRQ_MASK) || \ + ((1 << dev2->irq) & mask)) ) { +#ifndef CONFIG_PCI_MSI + printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", + pci_name(dev2), dev2->irq, irq); +#endif + continue; + } + dev2->irq = irq; + pirq_penalty[irq]++; + if (dev != dev2) + printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2)); + } + } + return 1; +} + +static void __init pcibios_fixup_irqs(void) +{ + struct pci_dev *dev = NULL; + u8 pin; + + DBG("PCI: IRQ fixup\n"); + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + /* + * If the BIOS has set an out of range IRQ number, just ignore it. + * Also keep track of which IRQ's are already in use. + */ + if (dev->irq >= 16) { + DBG("%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq); + dev->irq = 0; + } + /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */ + if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000) + pirq_penalty[dev->irq] = 0; + pirq_penalty[dev->irq]++; + } + + dev = NULL; + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); +#ifdef CONFIG_X86_IO_APIC + /* + * Recalculate IRQ numbers if we use the I/O APIC. + */ + if (io_apic_assign_pci_irqs) + { + int irq; + + if (pin) { + pin--; /* interrupt pins are numbered starting from 1 */ + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); + /* + * Busses behind bridges are typically not listed in the MP-table. + * In this case we have to look up the IRQ based on the parent bus, + * parent slot, and pin number. The SMP code detects such bridged + * busses itself so we should get into this branch reliably. 
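+ * (This is the standard PCI-PCI bridge swizzle: a device in slot S
+ * raising pin P appears at the bridge as pin (P + S) % 4, which is
+ * the transformation applied below.)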
+ */ + if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ + struct pci_dev * bridge = dev->bus->self; + + pin = (pin + PCI_SLOT(dev->devfn)) % 4; + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, + PCI_SLOT(bridge->devfn), pin); + if (irq >= 0) + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", + pci_name(bridge), 'A' + pin, irq); + } + if (irq >= 0) { + if (use_pci_vector() && + !platform_legacy_irq(irq)) + irq = IO_APIC_VECTOR(irq); + + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", + pci_name(dev), 'A' + pin, irq); + dev->irq = irq; + } + } + } +#endif + /* + * Still no IRQ? Try to lookup one... + */ + if (pin && !dev->irq) + pcibios_lookup_irq(dev, 0); + } +} + +/* + * Work around broken HP Pavilion Notebooks which assign USB to + * IRQ 9 even though it is actually wired to IRQ 11 + */ +static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d) +{ + if (!broken_hp_bios_irq9) { + broken_hp_bios_irq9 = 1; + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); + } + return 0; +} + +/* + * Work around broken Acer TravelMate 360 Notebooks which assign + * Cardbus to IRQ 11 even though it is actually wired to IRQ 10 + */ +static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d) +{ + if (!acer_tm360_irqrouting) { + acer_tm360_irqrouting = 1; + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); + } + return 0; +} + +static struct dmi_system_id __initdata pciirq_dmi_table[] = { + { + .callback = fix_broken_hp_bios_irq9, + .ident = "HP Pavilion N5400 Series Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"), + DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"), + DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"), + }, + }, + { + .callback = fix_acer_tm360_irqrouting, + .ident = "Acer TravelMate 36x Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), + }, + }, + { } +}; + +static int __init pcibios_irq_init(void) +{ + DBG("PCI: IRQ init\n"); + + if (pcibios_enable_irq || raw_pci_ops == NULL) + return 0; + + dmi_check_system(pciirq_dmi_table); + + pirq_table = pirq_find_routing_table(); + +#ifdef CONFIG_PCI_BIOS + if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN)) + pirq_table = pcibios_get_irq_routing_table(); +#endif + if (pirq_table) { + pirq_peer_trick(); + pirq_find_router(&pirq_router); + if (pirq_table->exclusive_irqs) { + int i; + for (i=0; i<16; i++) + if (!(pirq_table->exclusive_irqs & (1 << i))) + pirq_penalty[i] += 100; + } + /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */ + if (io_apic_assign_pci_irqs) + pirq_table = NULL; + } + + pcibios_enable_irq = pirq_enable_irq; + + pcibios_fixup_irqs(); + return 0; +} + +subsys_initcall(pcibios_irq_init); + + +static void pirq_penalize_isa_irq(int irq) +{ + /* + * If any ISAPnP device reports an IRQ in its list of possible + * IRQ's, we try to avoid assigning it to PCI devices. 
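+ * E.g. an ISAPnP card listing IRQ 5 as usable adds 100 to
+ * pirq_penalty[5], so pcibios_lookup_irq() will then prefer any
+ * other line permitted by the PIRQ bitmap when assigning a free IRQ.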
+ */ + if (irq < 16) + pirq_penalty[irq] += 100; +} + +void pcibios_penalize_isa_irq(int irq) +{ +#ifdef CONFIG_ACPI_PCI + if (!acpi_noirq) + acpi_penalize_isa_irq(irq); + else +#endif + pirq_penalize_isa_irq(irq); +} + +static int pirq_enable_irq(struct pci_dev *dev) +{ + u8 pin; + struct pci_dev *temp_dev; + + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { + char *msg = ""; + + pin--; /* interrupt pins are numbered starting from 1 */ + + if (io_apic_assign_pci_irqs) { + int irq; + + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); + /* + * Busses behind bridges are typically not listed in the MP-table. + * In this case we have to look up the IRQ based on the parent bus, + * parent slot, and pin number. The SMP code detects such bridged + * busses itself so we should get into this branch reliably. + */ + temp_dev = dev; + while (irq < 0 && dev->bus->parent) { /* go back to the bridge */ + struct pci_dev * bridge = dev->bus->self; + + pin = (pin + PCI_SLOT(dev->devfn)) % 4; + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, + PCI_SLOT(bridge->devfn), pin); + if (irq >= 0) + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", + pci_name(bridge), 'A' + pin, irq); + dev = bridge; + } + dev = temp_dev; + if (irq >= 0) { +#ifdef CONFIG_PCI_MSI + if (!platform_legacy_irq(irq)) + irq = IO_APIC_VECTOR(irq); +#endif + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", + pci_name(dev), 'A' + pin, irq); + dev->irq = irq; + return 0; + } else + msg = " Probably buggy MP table."; + } else if (pci_probe & PCI_BIOS_IRQ_SCAN) + msg = ""; + else + msg = " Please try using pci=biosirq."; + + /* With IDE legacy devices the IRQ lookup failure is not a problem.. 
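+ (a controller in compatibility mode -- prog-if bits 0 and 2 clear,
+ which is what the class test below checks -- uses the legacy
+ IRQ 14/15 lines, so no PCI routing is needed)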
*/ + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5)) + return 0; + + printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", + 'A' + pin, pci_name(dev), msg); + } + return 0; +} + +int pci_vector_resources(int last, int nr_released) +{ + int count = nr_released; + + int next = last; + int offset = (last % 8); + + while (next < FIRST_SYSTEM_VECTOR) { + next += 8; +#ifdef CONFIG_X86_64 + if (next == IA32_SYSCALL_VECTOR) + continue; +#else + if (next == SYSCALL_VECTOR) + continue; +#endif + count++; + if (next >= FIRST_SYSTEM_VECTOR) { + if (offset%8) { + next = FIRST_DEVICE_VECTOR + offset; + offset++; + continue; + } + count--; + } + } + + return count; +} Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/pci/Makefile =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/pci/Makefile @@ -0,0 +1,30 @@ +CFLAGS += -Iarch/i386/pci + +c-obj-y := i386.o + +c-obj-$(CONFIG_PCI_BIOS) += pcbios.o +c-obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o +c-obj-$(CONFIG_PCI_DIRECT) += direct.o + +c-pci-y := fixup.o +c-pci-$(CONFIG_ACPI_PCI) += acpi.o +c-pci-y += legacy.o +# Make sure irq.o gets linked in after legacy.o +l-pci-y += irq.o + +c-pci-$(CONFIG_X86_VISWS) := visws.o fixup.o +pci-$(CONFIG_X86_VISWS) := +c-pci-$(CONFIG_X86_NUMAQ) := numa.o +pci-$(CONFIG_X86_NUMAQ) := irq.o + +obj-y += $(pci-y) +c-obj-y += $(c-pci-y) common.o + +c-link := + +$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): + @ln -fsn $(srctree)/arch/i386/pci/$(notdir $@) $@ + +obj-y += $(c-obj-y) $(l-pci-y) + +clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) Index: linux-2.6.12-xen0-arch/arch/i386/Makefile =================================================================== --- linux-2.6.12-xen0-arch.orig/arch/i386/Makefile +++ linux-2.6.12-xen0-arch/arch/i386/Makefile @@ -71,6 +71,10 @@ CFLAGS += $(cflags-y) # Default subarch .c files mcore-y := mach-default +head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o +boot := arch/i386/boot +pci := arch/i386/pci/ + # Voyager subarch support mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-i386/mach-voyager mcore-$(CONFIG_X86_VOYAGER) := mach-voyager @@ -91,6 +95,16 @@ mcore-$(CONFIG_X86_BIGSMP) := mach-defau mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit mcore-$(CONFIG_X86_SUMMIT) := mach-default +#Xen subarch support +mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-i386/mach-xen +mcore-$(CONFIG_X86_XEN) := mach-xen +core-$(CONFIG_X86_XEN) += arch/i386/mach-xen/ +head-$(CONFIG_X86_XEN) := arch/i386/mach-xen/kernel/head.o arch/i386/mach-xen/kernel/init_task.o +ifdef CONFIG_X86_XEN +boot := arch/i386/mach-xen/boot +pci := arch/i386/mach-xen/pci/ +endif + # generic subarchitecture mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic mcore-$(CONFIG_X86_GENERICARCH) := mach-default @@ -104,15 +118,15 @@ core-$(CONFIG_X86_ES7000) := arch/i386/m # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default -head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o - libs-y += arch/i386/lib/ +ifndef CONFIG_X86_XEN core-y += arch/i386/kernel/ \ arch/i386/mm/ \ arch/i386/$(mcore-y)/ \ arch/i386/crypto/ drivers-$(CONFIG_MATH_EMULATION) += arch/i386/math-emu/ -drivers-$(CONFIG_PCI) += arch/i386/pci/ +endif +drivers-$(CONFIG_PCI) += $(pci) # must be linked after kernel/ drivers-$(CONFIG_OPROFILE) += arch/i386/oprofile/ drivers-$(CONFIG_PM) += arch/i386/power/ @@ -120,11 +134,16 @@ drivers-$(CONFIG_PM) += arch/i386/powe CFLAGS += 
$(mflags-y) AFLAGS += $(mflags-y) -boot := arch/i386/boot - .PHONY: zImage bzImage compressed zlilo bzlilo \ zdisk bzdisk fdimage fdimage144 fdimage288 install kernel_install +ifdef CONFIG_X86_XEN +.PHONY: vmlinuz + +vmlinuz: vmlinux + $(Q)$(MAKE) $(build)=$(boot) vmlinuz +endif + all: bzImage # KBUILD_IMAGE specify target image being built @@ -171,4 +190,7 @@ define archhelp echo ' fdimage - Create a boot floppy image' endef +debug: + @echo $(core-y) + CLEAN_FILES += arch/$(ARCH)/boot/fdimage arch/$(ARCH)/boot/mtools.conf Index: linux-2.6.12-xen0-arch/drivers/xen/core/ctrl_if.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/drivers/xen/core/ctrl_if.c @@ -0,0 +1,569 @@ +/****************************************************************************** + * ctrl_if.c + * + * Management functions for special interface to the domain controller. + * + * Copyright (c) 2004, K A Fraser + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <asm-xen/ctrl_if.h> +#include <asm-xen/evtchn.h> + +#if 0 +#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) +#else +#define DPRINTK(_f, _a...) ((void)0) +#endif + +/* + * Extra ring macros to sync a consumer index up to the public producer index. + * Generally UNSAFE, but we use it for recovery and shutdown in some cases. + */ +#define RING_DROP_PENDING_REQUESTS(_r) \ + do { \ + (_r)->req_cons = (_r)->sring->req_prod; \ + } while (0) +#define RING_DROP_PENDING_RESPONSES(_r) \ + do { \ + (_r)->rsp_cons = (_r)->sring->rsp_prod; \ + } while (0) + +/* + * Only used by initial domain which must create its own control-interface + * event channel. This value is picked up by the user-space domain controller + * via an ioctl. + */ +int initdom_ctrlif_domcontroller_port = -1; + +static int ctrl_if_evtchn; +static int ctrl_if_irq; +static spinlock_t ctrl_if_lock; + +static struct irqaction ctrl_if_irq_action; + +static ctrl_front_ring_t ctrl_if_tx_ring; +static ctrl_back_ring_t ctrl_if_rx_ring; + +/* Incoming message requests. 
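+ * (One handler per 8-bit message type, installed via
+ * ctrl_if_register_receiver(); unclaimed types fall through to
+ * ctrl_if_rxmsg_default_handler.)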
*/ + /* Primary message type -> message handler. */ +static ctrl_msg_handler_t ctrl_if_rxmsg_handler[256]; + /* Primary message type -> callback in process context? */ +static unsigned long ctrl_if_rxmsg_blocking_context[256/sizeof(unsigned long)]; + /* Is it late enough during bootstrap to use schedule_task()? */ +static int safe_to_schedule_task; + /* Queue up messages to be handled in process context. */ +static ctrl_msg_t ctrl_if_rxmsg_deferred[CONTROL_RING_SIZE]; +static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_prod; +static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_cons; + +/* Incoming message responses: message identifier -> message handler/id. */ +static struct { + ctrl_msg_handler_t fn; + unsigned long id; +} ctrl_if_txmsg_id_mapping[CONTROL_RING_SIZE]; + +/* For received messages that must be deferred to process context. */ +static void __ctrl_if_rxmsg_deferred(void *unused); +static DECLARE_WORK(ctrl_if_rxmsg_deferred_work, + __ctrl_if_rxmsg_deferred, + NULL); + +/* Deferred callbacks for people waiting for space in the transmit ring. */ +static DECLARE_TASK_QUEUE(ctrl_if_tx_tq); + +static DECLARE_WAIT_QUEUE_HEAD(ctrl_if_tx_wait); +static void __ctrl_if_tx_tasklet(unsigned long data); +static DECLARE_TASKLET(ctrl_if_tx_tasklet, __ctrl_if_tx_tasklet, 0); + +static void __ctrl_if_rx_tasklet(unsigned long data); +static DECLARE_TASKLET(ctrl_if_rx_tasklet, __ctrl_if_rx_tasklet, 0); + +#define get_ctrl_if() ((control_if_t *)((char *)HYPERVISOR_shared_info + 2048)) + +static void ctrl_if_notify_controller(void) +{ + notify_via_evtchn(ctrl_if_evtchn); +} + +static void ctrl_if_rxmsg_default_handler(ctrl_msg_t *msg, unsigned long id) +{ + msg->length = 0; + ctrl_if_send_response(msg); +} + +static void __ctrl_if_tx_tasklet(unsigned long data) +{ + ctrl_msg_t *msg; + int was_full = RING_FULL(&ctrl_if_tx_ring); + RING_IDX i, rp; + + i = ctrl_if_tx_ring.rsp_cons; + rp = ctrl_if_tx_ring.sring->rsp_prod; + rmb(); /* Ensure we see all requests up to 'rp'. */ + + for ( ; i != rp; i++ ) + { + msg = RING_GET_RESPONSE(&ctrl_if_tx_ring, i); + + DPRINTK("Rx-Rsp %u/%u :: %d/%d\n", i-1, + ctrl_if_tx_ring.sring->rsp_prod, + msg->type, msg->subtype); + + /* Execute the callback handler, if one was specified. */ + if ( msg->id != 0xFF ) + { + (*ctrl_if_txmsg_id_mapping[msg->id].fn)( + msg, ctrl_if_txmsg_id_mapping[msg->id].id); + smp_mb(); /* Execute, /then/ free. */ + ctrl_if_txmsg_id_mapping[msg->id].fn = NULL; + } + } + + /* + * Step over messages in the ring /after/ finishing reading them. As soon + * as the index is updated then the message may get blown away. + */ + smp_mb(); + ctrl_if_tx_ring.rsp_cons = i; + + if ( was_full && !RING_FULL(&ctrl_if_tx_ring) ) + { + wake_up(&ctrl_if_tx_wait); + run_task_queue(&ctrl_if_tx_tq); + } +} + +static void __ctrl_if_rxmsg_deferred(void *unused) +{ + ctrl_msg_t *msg; + CONTROL_RING_IDX dp; + + dp = ctrl_if_rxmsg_deferred_prod; + rmb(); /* Ensure we see all deferred requests up to 'dp'. */ + + while ( ctrl_if_rxmsg_deferred_cons != dp ) + { + msg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX( + ctrl_if_rxmsg_deferred_cons++)]; + (*ctrl_if_rxmsg_handler[msg->type])(msg, 0); + } +} + +static void __ctrl_if_rx_tasklet(unsigned long data) +{ + ctrl_msg_t msg, *pmsg; + CONTROL_RING_IDX dp; + RING_IDX rp, i; + + i = ctrl_if_rx_ring.req_cons; + rp = ctrl_if_rx_ring.sring->req_prod; + dp = ctrl_if_rxmsg_deferred_prod; + rmb(); /* Ensure we see all requests up to 'rp'. 
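+ The barrier pairs with the write barrier the producer executes
+ before publishing req_prod (cf. RING_PUSH_REQUESTS() above), so
+ the message bodies are visible before the updated index.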
*/ + + for ( ; i != rp; i++) + { + pmsg = RING_GET_REQUEST(&ctrl_if_rx_ring, i); + memcpy(&msg, pmsg, offsetof(ctrl_msg_t, msg)); + + DPRINTK("Rx-Req %u/%u :: %d/%d\n", i-1, + ctrl_if_rx_ring.sring->req_prod, + msg.type, msg.subtype); + + if ( msg.length > sizeof(msg.msg) ) + msg.length = sizeof(msg.msg); + + if ( msg.length != 0 ) + memcpy(msg.msg, pmsg->msg, msg.length); + + if ( test_bit(msg.type, + (unsigned long *)&ctrl_if_rxmsg_blocking_context) ) + memcpy(&ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(dp++)], + &msg, offsetof(ctrl_msg_t, msg) + msg.length); + else + (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0); + } + + ctrl_if_rx_ring.req_cons = i; + + if ( dp != ctrl_if_rxmsg_deferred_prod ) + { + wmb(); + ctrl_if_rxmsg_deferred_prod = dp; + schedule_work(&ctrl_if_rxmsg_deferred_work); + } +} + +static irqreturn_t ctrl_if_interrupt(int irq, void *dev_id, + struct pt_regs *regs) +{ + if ( RING_HAS_UNCONSUMED_RESPONSES(&ctrl_if_tx_ring) ) + tasklet_schedule(&ctrl_if_tx_tasklet); + + if ( RING_HAS_UNCONSUMED_REQUESTS(&ctrl_if_rx_ring) ) + tasklet_schedule(&ctrl_if_rx_tasklet); + + return IRQ_HANDLED; +} + +int +ctrl_if_send_message_noblock( + ctrl_msg_t *msg, + ctrl_msg_handler_t hnd, + unsigned long id) +{ + unsigned long flags; + ctrl_msg_t *dmsg; + int i; + + spin_lock_irqsave(&ctrl_if_lock, flags); + + if ( RING_FULL(&ctrl_if_tx_ring) ) + { + spin_unlock_irqrestore(&ctrl_if_lock, flags); + return -EAGAIN; + } + + msg->id = 0xFF; + if ( hnd != NULL ) + { + for ( i = 0; ctrl_if_txmsg_id_mapping[i].fn != NULL; i++ ) + continue; + ctrl_if_txmsg_id_mapping[i].fn = hnd; + ctrl_if_txmsg_id_mapping[i].id = id; + msg->id = i; + } + + DPRINTK("Tx-Req %u/%u :: %d/%d\n", + ctrl_if_tx_ring.req_prod_pvt, + ctrl_if_tx_ring.rsp_cons, + msg->type, msg->subtype); + + dmsg = RING_GET_REQUEST(&ctrl_if_tx_ring, + ctrl_if_tx_ring.req_prod_pvt); + memcpy(dmsg, msg, sizeof(*msg)); + ctrl_if_tx_ring.req_prod_pvt++; + RING_PUSH_REQUESTS(&ctrl_if_tx_ring); + + spin_unlock_irqrestore(&ctrl_if_lock, flags); + + ctrl_if_notify_controller(); + + return 0; +} + +int +ctrl_if_send_message_block( + ctrl_msg_t *msg, + ctrl_msg_handler_t hnd, + unsigned long id, + long wait_state) +{ + DECLARE_WAITQUEUE(wait, current); + int rc; + + /* Fast path. */ + if ( (rc = ctrl_if_send_message_noblock(msg, hnd, id)) != -EAGAIN ) + return rc; + + add_wait_queue(&ctrl_if_tx_wait, &wait); + + for ( ; ; ) + { + set_current_state(wait_state); + + if ( (rc = ctrl_if_send_message_noblock(msg, hnd, id)) != -EAGAIN ) + break; + + rc = -ERESTARTSYS; + if ( signal_pending(current) && (wait_state == TASK_INTERRUPTIBLE) ) + break; + + schedule(); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&ctrl_if_tx_wait, &wait); + + return rc; +} + +/* Allow a response-callback handler to find context of a blocked requester. */ +struct rsp_wait { + ctrl_msg_t *msg; /* Buffer for the response message. */ + struct task_struct *task; /* The task that is blocked on the response. */ + int done; /* Indicate to 'task' that response is rcv'ed. 
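+ Typical use (see ctrl_if_send_message_and_get_response below):
+ the requester passes (unsigned long)&wait as the handler id,
+ then sleeps in TASK_UNINTERRUPTIBLE until the callback sets
+ 'done' and wakes it.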
*/ +}; + +static void __ctrl_if_get_response(ctrl_msg_t *msg, unsigned long id) +{ + struct rsp_wait *wait = (struct rsp_wait *)id; + struct task_struct *task = wait->task; + + memcpy(wait->msg, msg, sizeof(*msg)); + wmb(); + wait->done = 1; + + wake_up_process(task); +} + +int +ctrl_if_send_message_and_get_response( + ctrl_msg_t *msg, + ctrl_msg_t *rmsg, + long wait_state) +{ + struct rsp_wait wait; + int rc; + + wait.msg = rmsg; + wait.done = 0; + wait.task = current; + + if ( (rc = ctrl_if_send_message_block(msg, __ctrl_if_get_response, + (unsigned long)&wait, + wait_state)) != 0 ) + return rc; + + for ( ; ; ) + { + /* NB. Can't easily support TASK_INTERRUPTIBLE here. */ + set_current_state(TASK_UNINTERRUPTIBLE); + if ( wait.done ) + break; + schedule(); + } + + set_current_state(TASK_RUNNING); + return 0; +} + +int +ctrl_if_enqueue_space_callback( + struct tq_struct *task) +{ + /* Fast path. */ + if ( !RING_FULL(&ctrl_if_tx_ring) ) + return 0; + + (void)queue_task(task, &ctrl_if_tx_tq); + + /* + * We may race execution of the task queue, so return re-checked status. If + * the task is not executed despite the ring being non-full then we will + * certainly return 'not full'. + */ + smp_mb(); + return RING_FULL(&ctrl_if_tx_ring); +} + +void +ctrl_if_send_response( + ctrl_msg_t *msg) +{ + unsigned long flags; + ctrl_msg_t *dmsg; + + /* + * NB. The response may be the original request message, modified in-place. + * In this situation we may have src==dst, so no copying is required. + */ + spin_lock_irqsave(&ctrl_if_lock, flags); + + DPRINTK("Tx-Rsp %u :: %d/%d\n", + ctrl_if_rx_ring.rsp_prod_pvt, + msg->type, msg->subtype); + + dmsg = RING_GET_RESPONSE(&ctrl_if_rx_ring, + ctrl_if_rx_ring.rsp_prod_pvt); + if ( dmsg != msg ) + memcpy(dmsg, msg, sizeof(*msg)); + + ctrl_if_rx_ring.rsp_prod_pvt++; + RING_PUSH_RESPONSES(&ctrl_if_rx_ring); + + spin_unlock_irqrestore(&ctrl_if_lock, flags); + + ctrl_if_notify_controller(); +} + +int +ctrl_if_register_receiver( + u8 type, + ctrl_msg_handler_t hnd, + unsigned int flags) +{ + unsigned long _flags; + int inuse; + + spin_lock_irqsave(&ctrl_if_lock, _flags); + + inuse = (ctrl_if_rxmsg_handler[type] != ctrl_if_rxmsg_default_handler); + + if ( inuse ) + { + printk(KERN_INFO "Receiver %p already established for control " + "messages of type %d.\n", ctrl_if_rxmsg_handler[type], type); + } + else + { + ctrl_if_rxmsg_handler[type] = hnd; + clear_bit(type, (unsigned long *)&ctrl_if_rxmsg_blocking_context); + if ( flags == CALLBACK_IN_BLOCKING_CONTEXT ) + { + set_bit(type, (unsigned long *)&ctrl_if_rxmsg_blocking_context); + if ( !safe_to_schedule_task ) + BUG(); + } + } + + spin_unlock_irqrestore(&ctrl_if_lock, _flags); + + return !inuse; +} + +void +ctrl_if_unregister_receiver( + u8 type, + ctrl_msg_handler_t hnd) +{ + unsigned long flags; + + spin_lock_irqsave(&ctrl_if_lock, flags); + + if ( ctrl_if_rxmsg_handler[type] != hnd ) + printk(KERN_INFO "Receiver %p is not registered for control " + "messages of type %d.\n", hnd, type); + else + ctrl_if_rxmsg_handler[type] = ctrl_if_rxmsg_default_handler; + + spin_unlock_irqrestore(&ctrl_if_lock, flags); + + /* Ensure that @hnd will not be executed after this function returns. 
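+ * (tasklet_unlock_wait() spins until any in-flight run of the rx
+ * tasklet completes; together with restoring the default handler
+ * above, this guarantees @hnd is quiesced.)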
*/ + tasklet_unlock_wait(&ctrl_if_rx_tasklet); +} + +void ctrl_if_suspend(void) +{ + teardown_irq(ctrl_if_irq, &ctrl_if_irq_action); + unbind_evtchn_from_irq(ctrl_if_evtchn); +} + +void ctrl_if_resume(void) +{ + control_if_t *ctrl_if = get_ctrl_if(); + + if ( xen_start_info.flags & SIF_INITDOMAIN ) + { + /* + * The initial domain must create its own domain-controller link. + * The controller is probably not running at this point, but will + * pick up its end of the event channel from + * initdom_ctrlif_domcontroller_port, which it reads via an ioctl + * (see the comment near the top of this file). + */ + evtchn_op_t op; + extern void bind_evtchn_to_cpu(unsigned port, unsigned cpu); + + op.cmd = EVTCHNOP_bind_interdomain; + op.u.bind_interdomain.dom1 = DOMID_SELF; + op.u.bind_interdomain.dom2 = DOMID_SELF; + op.u.bind_interdomain.port1 = 0; + op.u.bind_interdomain.port2 = 0; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + BUG(); + xen_start_info.domain_controller_evtchn = op.u.bind_interdomain.port1; + initdom_ctrlif_domcontroller_port = op.u.bind_interdomain.port2; + bind_evtchn_to_cpu(op.u.bind_interdomain.port1, 0); + } + + /* Sync up with shared indexes. */ + FRONT_RING_ATTACH(&ctrl_if_tx_ring, &ctrl_if->tx_ring, CONTROL_RING_MEM); + BACK_RING_ATTACH(&ctrl_if_rx_ring, &ctrl_if->rx_ring, CONTROL_RING_MEM); + + ctrl_if_evtchn = xen_start_info.domain_controller_evtchn; + ctrl_if_irq = bind_evtchn_to_irq(ctrl_if_evtchn); + + memset(&ctrl_if_irq_action, 0, sizeof(ctrl_if_irq_action)); + ctrl_if_irq_action.handler = ctrl_if_interrupt; + ctrl_if_irq_action.name = "ctrl-if"; + (void)setup_irq(ctrl_if_irq, &ctrl_if_irq_action); +} + +void __init ctrl_if_init(void) +{ + control_if_t *ctrl_if = get_ctrl_if(); + int i; + + for ( i = 0; i < 256; i++ ) + ctrl_if_rxmsg_handler[i] = ctrl_if_rxmsg_default_handler; + + FRONT_RING_ATTACH(&ctrl_if_tx_ring, &ctrl_if->tx_ring, CONTROL_RING_MEM); + BACK_RING_ATTACH(&ctrl_if_rx_ring, &ctrl_if->rx_ring, CONTROL_RING_MEM); + + spin_lock_init(&ctrl_if_lock); + + ctrl_if_resume(); +} + + +/* This is called after it is safe to call schedule_task(). */ +static int __init ctrl_if_late_setup(void) +{ + safe_to_schedule_task = 1; + return 0; +} +__initcall(ctrl_if_late_setup); + + +/* + * !! The following are DANGEROUS FUNCTIONS !! + * Use with care [for example, see xencons_force_flush()]. + */ + +int ctrl_if_transmitter_empty(void) +{ + return (ctrl_if_tx_ring.sring->req_prod == ctrl_if_tx_ring.rsp_cons); + +} + +void ctrl_if_discard_responses(void) +{ + RING_DROP_PENDING_RESPONSES(&ctrl_if_tx_ring); +} + +EXPORT_SYMBOL(ctrl_if_send_message_noblock); +EXPORT_SYMBOL(ctrl_if_send_message_block); +EXPORT_SYMBOL(ctrl_if_send_message_and_get_response); +EXPORT_SYMBOL(ctrl_if_enqueue_space_callback); +EXPORT_SYMBOL(ctrl_if_send_response); +EXPORT_SYMBOL(ctrl_if_register_receiver); +EXPORT_SYMBOL(ctrl_if_unregister_receiver); Index: linux-2.6.12-xen0-arch/drivers/xen/core/devmem.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/drivers/xen/core/devmem.c @@ -0,0 +1,158 @@ +/* + * Originally from linux/drivers/char/mem.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Added devfs support. + * Jan-11-1998, C. 
Scott Ananian <cananian@xxxxxxxxxxxxxxxxxxxx> + * Shared /dev/zero mmapping support, Feb 2000, Kanoj Sarcar <kanoj@xxxxxxx> + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/miscdevice.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/mman.h> +#include <linux/random.h> +#include <linux/init.h> +#include <linux/raw.h> +#include <linux/tty.h> +#include <linux/capability.h> +#include <linux/smp_lock.h> +#include <linux/devfs_fs_kernel.h> +#include <linux/ptrace.h> +#include <linux/device.h> +#include <asm/pgalloc.h> +#include <asm/uaccess.h> +#include <asm/io.h> + +static inline int uncached_access(struct file *file, unsigned long addr) +{ + if (file->f_flags & O_SYNC) + return 1; + /* Xen sets correct MTRR type on non-RAM for us. */ + return 0; +} + +/* + * This function reads the *physical* memory. The f_pos points directly to the + * memory location. + */ +static ssize_t read_mem(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + unsigned long i, p = *ppos; + ssize_t read = -EFAULT; + void *v; + + if ((v = ioremap(p, count)) == NULL) { + /* + * Some programs (e.g., dmidecode) groove off into weird RAM + * areas where no tables can possibly exist (because Xen will + * have stomped on them!). These programs get rather upset if + * we let them know that Xen failed their access, so we fake + * out a read of all zeroes. :-) + */ + for (i = 0; i < count; i++) + if (put_user(0, buf+i)) + return -EFAULT; + return count; + } + if (copy_to_user(buf, v, count)) + goto out; + + read = count; + *ppos += read; +out: + iounmap(v); + return read; +} + +static ssize_t write_mem(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + ssize_t written = -EFAULT; + void *v; + + if ((v = ioremap(p, count)) == NULL) + return -EFAULT; + if (copy_from_user(v, buf, count)) + goto out; + + written = count; + *ppos += written; +out: + iounmap(v); + return written; +} + +static int mmap_mem(struct file * file, struct vm_area_struct * vma) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + int uncached; + + uncached = uncached_access(file, offset); + if (uncached) + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + /* Don't try to swap out physical pages.. */ + vma->vm_flags |= VM_RESERVED; + + /* + * Don't dump addresses that are not real memory to a core file. + */ + if (uncached) + vma->vm_flags |= VM_IO; + + if (io_remap_page_range(vma, vma->vm_start, offset, + vma->vm_end-vma->vm_start, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +/* + * The memory devices use the full 32/64 bits of the offset, and so we cannot + * check against negative addresses: they are ok. The return value is weird, + * though, in that case (0). + * + * also note that seeking relative to the "end of file" isn't supported: + * it has no meaning, so it returns -EINVAL. + */ +static loff_t memory_lseek(struct file * file, loff_t offset, int orig) +{ + loff_t ret; + + down(&file->f_dentry->d_inode->i_sem); + switch (orig) { + case 0: + file->f_pos = offset; + ret = file->f_pos; + force_successful_syscall_return(); + break; + case 1: + file->f_pos += offset; + ret = file->f_pos; + force_successful_syscall_return(); + break; + default: + ret = -EINVAL; + } + up(&file->f_dentry->d_inode->i_sem); + return ret; +} + +static int open_mem(struct inode * inode, struct file * filp) +{ + return capable(CAP_SYS_RAWIO) ? 
0 : -EPERM; +} + +struct file_operations mem_fops = { + .llseek = memory_lseek, + .read = read_mem, + .write = write_mem, + .mmap = mmap_mem, + .open = open_mem, +}; Index: linux-2.6.12-xen0-arch/drivers/xen/core/evtchn.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/drivers/xen/core/evtchn.c @@ -0,0 +1,698 @@ +/****************************************************************************** + * evtchn.c + * + * Communication via Xen event channels. + * + * Copyright (c) 2002-2004, K A Fraser + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/kernel_stat.h> +#include <linux/version.h> +#include <asm/atomic.h> +#include <asm/system.h> +#include <asm/ptrace.h> +#include <asm-xen/synch_bitops.h> +#include <asm-xen/xen-public/event_channel.h> +#include <asm-xen/xen-public/physdev.h> +#include <asm-xen/ctrl_if.h> +#include <asm-xen/hypervisor.h> +#include <asm-xen/evtchn.h> + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +EXPORT_SYMBOL(force_evtchn_callback); +EXPORT_SYMBOL(evtchn_do_upcall); +EXPORT_SYMBOL(bind_evtchn_to_irq); +EXPORT_SYMBOL(unbind_evtchn_from_irq); +#endif + +/* + * This lock protects updates to the following mapping and reference-count + * arrays. The lock does not need to be acquired to read the mapping tables. + */ +static spinlock_t irq_mapping_update_lock; + +/* IRQ <-> event-channel mappings. */ +static int evtchn_to_irq[NR_EVENT_CHANNELS]; +static int irq_to_evtchn[NR_IRQS]; + +/* IRQ <-> VIRQ mapping. */ +DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]); + +/* evtchn <-> IPI mapping. */ +#ifndef NR_IPIS +#define NR_IPIS 1 +#endif +DEFINE_PER_CPU(int, ipi_to_evtchn[NR_IPIS]); + +/* Reference counts for bindings to IRQs. */ +static int irq_bindcount[NR_IRQS]; + +/* Bitmap indicating which PIRQs require Xen to be notified on unmask. 
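+ * (Filled in by pirq_query_unmask() from the PHYSDEVOP_IRQ_STATUS_QUERY
+ * result, and tested by pirq_unmask_notify() at end-of-interrupt.)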
*/ +static unsigned long pirq_needs_unmask_notify[NR_PIRQS/sizeof(unsigned long)]; + +#ifdef CONFIG_SMP + +static u8 cpu_evtchn[NR_EVENT_CHANNELS]; +static u32 cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/32]; + +#define active_evtchns(cpu,sh,idx) \ + ((sh)->evtchn_pending[idx] & \ + cpu_evtchn_mask[cpu][idx] & \ + ~(sh)->evtchn_mask[idx]) + +void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) +{ + clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]); + set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]); + cpu_evtchn[chn] = cpu; +} + +#else + +#define active_evtchns(cpu,sh,idx) \ + ((sh)->evtchn_pending[idx] & \ + ~(sh)->evtchn_mask[idx]) + +void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) +{ +} +#endif + +/* Upcall to generic IRQ layer. */ +#ifdef CONFIG_X86 +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,9) +extern fastcall unsigned int do_IRQ(struct pt_regs *regs); +#else +extern asmlinkage unsigned int do_IRQ(struct pt_regs *regs); +#endif +#if defined (__i386__) +#define IRQ_REG orig_eax +#elif defined (__x86_64__) +#define IRQ_REG orig_rax +#endif +#define do_IRQ(irq, regs) do { \ + (regs)->IRQ_REG = (irq); \ + do_IRQ((regs)); \ +} while (0) +#endif + +#define VALID_EVTCHN(_chn) ((_chn) >= 0) + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. + */ +void force_evtchn_callback(void) +{ + (void)HYPERVISOR_xen_version(0); +} + +/* NB. Interrupts are disabled on entry. */ +asmlinkage void evtchn_do_upcall(struct pt_regs *regs) +{ + u32 l1, l2; + unsigned int l1i, l2i, port; + int irq, cpu = smp_processor_id(); + shared_info_t *s = HYPERVISOR_shared_info; + vcpu_info_t *vcpu_info = &s->vcpu_data[cpu]; + + vcpu_info->evtchn_upcall_pending = 0; + + /* NB. No need for a barrier here -- XCHG is a barrier on x86. 
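+ Worked example of the two-level decode below: selector bit 3 set,
+ and bit 7 set in word 3 of evtchn_pending[], yields port
+ (3 << 5) + 7 = 103.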
*/ + l1 = xchg(&vcpu_info->evtchn_pending_sel, 0); + while ( l1 != 0 ) + { + l1i = __ffs(l1); + l1 &= ~(1 << l1i); + + while ( (l2 = active_evtchns(cpu, s, l1i)) != 0 ) + { + l2i = __ffs(l2); + l2 &= ~(1 << l2i); + + port = (l1i << 5) + l2i; + if ( (irq = evtchn_to_irq[port]) != -1 ) + do_IRQ(irq, regs); + else + evtchn_device_upcall(port); + } + } +} + +static int find_unbound_irq(void) +{ + int irq; + + for ( irq = 0; irq < NR_IRQS; irq++ ) + if ( irq_bindcount[irq] == 0 ) + break; + + if ( irq == NR_IRQS ) + panic("No available IRQ to bind to: increase NR_IRQS!\n"); + + return irq; +} + +int bind_virq_to_irq(int virq) +{ + evtchn_op_t op; + int evtchn, irq; + int cpu = smp_processor_id(); + + spin_lock(&irq_mapping_update_lock); + + if ( (irq = per_cpu(virq_to_irq, cpu)[virq]) == -1 ) + { + op.cmd = EVTCHNOP_bind_virq; + op.u.bind_virq.virq = virq; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + panic("Failed to bind virtual IRQ %d\n", virq); + evtchn = op.u.bind_virq.port; + + irq = find_unbound_irq(); + evtchn_to_irq[evtchn] = irq; + irq_to_evtchn[irq] = evtchn; + + per_cpu(virq_to_irq, cpu)[virq] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} + +void unbind_virq_from_irq(int virq) +{ + evtchn_op_t op; + int cpu = smp_processor_id(); + int irq = per_cpu(virq_to_irq, cpu)[virq]; + int evtchn = irq_to_evtchn[irq]; + + spin_lock(&irq_mapping_update_lock); + + if ( --irq_bindcount[irq] == 0 ) + { + op.cmd = EVTCHNOP_close; + op.u.close.dom = DOMID_SELF; + op.u.close.port = evtchn; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + panic("Failed to unbind virtual IRQ %d\n", virq); + + /* This is a slight hack. Interdomain ports can be allocated + directly by userspace, and at that point they get bound by + Xen to vcpu 0. We therefore need to make sure that if we + get an event on an event channel we don't know about vcpu 0 + handles it. Binding channels to vcpu 0 when closing them + achieves this. 
*/ + bind_evtchn_to_cpu(evtchn, 0); + evtchn_to_irq[evtchn] = -1; + irq_to_evtchn[irq] = -1; + per_cpu(virq_to_irq, cpu)[virq] = -1; + } + + spin_unlock(&irq_mapping_update_lock); +} + +int bind_ipi_on_cpu_to_irq(int ipi) +{ + evtchn_op_t op; + int evtchn, irq; + int cpu = smp_processor_id(); + + spin_lock(&irq_mapping_update_lock); + + if ( (evtchn = per_cpu(ipi_to_evtchn, cpu)[ipi]) == 0 ) + { + op.cmd = EVTCHNOP_bind_ipi; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + panic("Failed to bind virtual IPI %d on cpu %d\n", ipi, cpu); + evtchn = op.u.bind_ipi.port; + + irq = find_unbound_irq(); + evtchn_to_irq[evtchn] = irq; + irq_to_evtchn[irq] = evtchn; + + per_cpu(ipi_to_evtchn, cpu)[ipi] = evtchn; + + bind_evtchn_to_cpu(evtchn, cpu); + } + else + { + irq = evtchn_to_irq[evtchn]; + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} + +void unbind_ipi_from_irq(int ipi) +{ + evtchn_op_t op; + int cpu = smp_processor_id(); + int evtchn = per_cpu(ipi_to_evtchn, cpu)[ipi]; + int irq = evtchn_to_irq[evtchn]; + + spin_lock(&irq_mapping_update_lock); + + if ( --irq_bindcount[irq] == 0 ) + { + op.cmd = EVTCHNOP_close; + op.u.close.dom = DOMID_SELF; + op.u.close.port = evtchn; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + panic("Failed to unbind virtual IPI %d on cpu %d\n", ipi, cpu); + + /* See comments in unbind_virq_from_irq */ + bind_evtchn_to_cpu(evtchn, 0); + evtchn_to_irq[evtchn] = -1; + irq_to_evtchn[irq] = -1; + per_cpu(ipi_to_evtchn, cpu)[ipi] = 0; + } + + spin_unlock(&irq_mapping_update_lock); +} + +int bind_evtchn_to_irq(int evtchn) +{ + int irq; + + spin_lock(&irq_mapping_update_lock); + + if ( (irq = evtchn_to_irq[evtchn]) == -1 ) + { + irq = find_unbound_irq(); + evtchn_to_irq[evtchn] = irq; + irq_to_evtchn[irq] = evtchn; + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} + +void unbind_evtchn_from_irq(int evtchn) +{ + int irq = evtchn_to_irq[evtchn]; + + spin_lock(&irq_mapping_update_lock); + + if ( --irq_bindcount[irq] == 0 ) + { + evtchn_to_irq[evtchn] = -1; + irq_to_evtchn[irq] = -1; + } + + spin_unlock(&irq_mapping_update_lock); +} + +static void do_nothing_function(void *ign) +{ +} + +/* Rebind an evtchn so that it gets delivered to a specific cpu */ +static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) +{ + evtchn_op_t op; + int evtchn; + + spin_lock(&irq_mapping_update_lock); + evtchn = irq_to_evtchn[irq]; + if (!VALID_EVTCHN(evtchn)) { + spin_unlock(&irq_mapping_update_lock); + return; + } + + /* Tell Xen to send future instances of this interrupt to the + other vcpu */ + op.cmd = EVTCHNOP_bind_vcpu; + op.u.bind_vcpu.port = evtchn; + op.u.bind_vcpu.vcpu = tcpu; + + /* If this fails, it usually just indicates that we're dealing + with a virq or IPI channel, which don't actually need to be + rebound. Ignore it, but don't do the xenlinux-level rebind + in that case. */ + if (HYPERVISOR_event_channel_op(&op) >= 0) + bind_evtchn_to_cpu(evtchn, tcpu); + + spin_unlock(&irq_mapping_update_lock); + + /* Now send the new target processor a NOP IPI. When this + returns, it will check for any pending interrupts, and so + service any that got delivered to the wrong processor by + mistake. */ + /* XXX: The only time this is called with interrupts disabled is + from the hotplug/hotunplug path. In that case, all cpus are + stopped with interrupts disabled, and the missed interrupts + will be picked up when they start again. This is kind of a + hack. 
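+ Note also that smp_call_function() may not be called with
+ interrupts disabled (it can deadlock waiting for the other
+ CPUs), hence the irqs_disabled() check below.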
+ */ + if (!irqs_disabled()) { + smp_call_function(do_nothing_function, NULL, 0, 0); + } +} + + +static void set_affinity_irq(unsigned irq, cpumask_t dest) +{ + unsigned tcpu = first_cpu(dest); + rebind_irq_to_cpu(irq, tcpu); +} + +/* + * Interface to generic handling in irq.c + */ + +static unsigned int startup_dynirq(unsigned int irq) +{ + int evtchn = irq_to_evtchn[irq]; + + if ( !VALID_EVTCHN(evtchn) ) + return 0; + unmask_evtchn(evtchn); + return 0; +} + +static void shutdown_dynirq(unsigned int irq) +{ + int evtchn = irq_to_evtchn[irq]; + + if ( !VALID_EVTCHN(evtchn) ) + return; + mask_evtchn(evtchn); +} + +static void enable_dynirq(unsigned int irq) +{ + int evtchn = irq_to_evtchn[irq]; + + unmask_evtchn(evtchn); +} + +static void disable_dynirq(unsigned int irq) +{ + int evtchn = irq_to_evtchn[irq]; + + mask_evtchn(evtchn); +} + +static void ack_dynirq(unsigned int irq) +{ + int evtchn = irq_to_evtchn[irq]; + + mask_evtchn(evtchn); + clear_evtchn(evtchn); +} + +static void end_dynirq(unsigned int irq) +{ + int evtchn = irq_to_evtchn[irq]; + + if ( !(irq_desc[irq].status & IRQ_DISABLED) ) + unmask_evtchn(evtchn); +} + +static struct hw_interrupt_type dynirq_type = { + "Dynamic-irq", + startup_dynirq, + shutdown_dynirq, + enable_dynirq, + disable_dynirq, + ack_dynirq, + end_dynirq, + set_affinity_irq +}; + +static inline void pirq_unmask_notify(int pirq) +{ + physdev_op_t op; + if ( unlikely(test_bit(pirq, &pirq_needs_unmask_notify[0])) ) + { + op.cmd = PHYSDEVOP_IRQ_UNMASK_NOTIFY; + (void)HYPERVISOR_physdev_op(&op); + } +} + +static inline void pirq_query_unmask(int pirq) +{ + physdev_op_t op; + op.cmd = PHYSDEVOP_IRQ_STATUS_QUERY; + op.u.irq_status_query.irq = pirq; + (void)HYPERVISOR_physdev_op(&op); + clear_bit(pirq, &pirq_needs_unmask_notify[0]); + if ( op.u.irq_status_query.flags & PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY ) + set_bit(pirq, &pirq_needs_unmask_notify[0]); +} + +/* + * On startup, if there is no action associated with the IRQ then we are + * probing. In this case we should not share with others as it will confuse us. + */ +#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL) + +static unsigned int startup_pirq(unsigned int irq) +{ + evtchn_op_t op; + int evtchn; + + op.cmd = EVTCHNOP_bind_pirq; + op.u.bind_pirq.pirq = irq; + /* NB. We are happy to share unless we are probing. */ + op.u.bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + { + if ( !probing_irq(irq) ) /* Some failures are expected when probing. 
*/ + printk(KERN_INFO "Failed to obtain physical IRQ %d\n", irq); + return 0; + } + evtchn = op.u.bind_pirq.port; + + pirq_query_unmask(irq_to_pirq(irq)); + + bind_evtchn_to_cpu(evtchn, 0); + evtchn_to_irq[evtchn] = irq; + irq_to_evtchn[irq] = evtchn; + + unmask_evtchn(evtchn); + pirq_unmask_notify(irq_to_pirq(irq)); + + return 0; +} + +static void shutdown_pirq(unsigned int irq) +{ + evtchn_op_t op; + int evtchn = irq_to_evtchn[irq]; + + if ( !VALID_EVTCHN(evtchn) ) + return; + + mask_evtchn(evtchn); + + op.cmd = EVTCHNOP_close; + op.u.close.dom = DOMID_SELF; + op.u.close.port = evtchn; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + panic("Failed to unbind physical IRQ %d\n", irq); + + bind_evtchn_to_cpu(evtchn, 0); + evtchn_to_irq[evtchn] = -1; + irq_to_evtchn[irq] = -1; +} + +static void enable_pirq(unsigned int irq) +{ + int evtchn = irq_to_evtchn[irq]; + if ( !VALID_EVTCHN(evtchn) ) + return; + unmask_evtchn(evtchn); + pirq_unmask_notify(irq_to_pirq(irq)); +} + +static void disable_pirq(unsigned int irq) +{ + int evtchn = irq_to_evtchn[irq]; + if ( !VALID_EVTCHN(evtchn) ) + return; + mask_evtchn(evtchn); +} + +static void ack_pirq(unsigned int irq) +{ + int evtchn = irq_to_evtchn[irq]; + if ( !VALID_EVTCHN(evtchn) ) + return; + mask_evtchn(evtchn); + clear_evtchn(evtchn); +} + +static void end_pirq(unsigned int irq) +{ + int evtchn = irq_to_evtchn[irq]; + if ( !VALID_EVTCHN(evtchn) ) + return; + if ( !(irq_desc[irq].status & IRQ_DISABLED) ) + { + unmask_evtchn(evtchn); + pirq_unmask_notify(irq_to_pirq(irq)); + } +} + +static struct hw_interrupt_type pirq_type = { + "Phys-irq", + startup_pirq, + shutdown_pirq, + enable_pirq, + disable_pirq, + ack_pirq, + end_pirq, + set_affinity_irq +}; + +void irq_suspend(void) +{ + int pirq, virq, irq, evtchn; + int cpu = smp_processor_id(); /* XXX */ + + /* Unbind VIRQs from event channels. */ + for ( virq = 0; virq < NR_VIRQS; virq++ ) + { + if ( (irq = per_cpu(virq_to_irq, cpu)[virq]) == -1 ) + continue; + evtchn = irq_to_evtchn[irq]; + + /* Mark the event channel as unused in our table. */ + evtchn_to_irq[evtchn] = -1; + irq_to_evtchn[irq] = -1; + } + + /* Check that no PIRQs are still bound. */ + for ( pirq = 0; pirq < NR_PIRQS; pirq++ ) + if ( (evtchn = irq_to_evtchn[pirq_to_irq(pirq)]) != -1 ) + panic("Suspend attempted while PIRQ %d bound to evtchn %d.\n", + pirq, evtchn); +} + +void irq_resume(void) +{ + evtchn_op_t op; + int virq, irq, evtchn; + int cpu = smp_processor_id(); /* XXX */ + + for ( evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++ ) + mask_evtchn(evtchn); /* New event-channel space is not 'live' yet. */ + + for ( virq = 0; virq < NR_VIRQS; virq++ ) + { + if ( (irq = per_cpu(virq_to_irq, cpu)[virq]) == -1 ) + continue; + + /* Get a new binding from Xen. */ + op.cmd = EVTCHNOP_bind_virq; + op.u.bind_virq.virq = virq; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + panic("Failed to bind virtual IRQ %d\n", virq); + evtchn = op.u.bind_virq.port; + + /* Record the new mapping. */ + bind_evtchn_to_cpu(evtchn, 0); + evtchn_to_irq[evtchn] = irq; + irq_to_evtchn[irq] = evtchn; + + /* Ready for use. */ + unmask_evtchn(evtchn); + } +} + +void __init init_IRQ(void) +{ + int i; + int cpu; + + irq_ctx_init(0); + + spin_lock_init(&irq_mapping_update_lock); + +#ifdef CONFIG_SMP + /* By default all event channels notify CPU#0. */ + memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); +#endif + + for ( cpu = 0; cpu < NR_CPUS; cpu++ ) { + /* No VIRQ -> IRQ mappings. 
*/ + for ( i = 0; i < NR_VIRQS; i++ ) + per_cpu(virq_to_irq, cpu)[i] = -1; + } + + /* No event-channel -> IRQ mappings. */ + for ( i = 0; i < NR_EVENT_CHANNELS; i++ ) + { + evtchn_to_irq[i] = -1; + mask_evtchn(i); /* No event channels are 'live' right now. */ + } + + /* No IRQ -> event-channel mappings. */ + for ( i = 0; i < NR_IRQS; i++ ) + irq_to_evtchn[i] = -1; + + for ( i = 0; i < NR_DYNIRQS; i++ ) + { + /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ + irq_bindcount[dynirq_to_irq(i)] = 0; + + irq_desc[dynirq_to_irq(i)].status = IRQ_DISABLED; + irq_desc[dynirq_to_irq(i)].action = 0; + irq_desc[dynirq_to_irq(i)].depth = 1; + irq_desc[dynirq_to_irq(i)].handler = &dynirq_type; + } + + for ( i = 0; i < NR_PIRQS; i++ ) + { + /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */ + irq_bindcount[pirq_to_irq(i)] = 1; + + irq_desc[pirq_to_irq(i)].status = IRQ_DISABLED; + irq_desc[pirq_to_irq(i)].action = 0; + irq_desc[pirq_to_irq(i)].depth = 1; + irq_desc[pirq_to_irq(i)].handler = &pirq_type; + } + + /* This needs to be done early, but after the IRQ subsystem is alive. */ + ctrl_if_init(); +} Index: linux-2.6.12-xen0-arch/drivers/xen/core/fixup.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/drivers/xen/core/fixup.c @@ -0,0 +1,87 @@ +/****************************************************************************** + * fixup.c + * + * Binary-rewriting of certain IA32 instructions, on notification by Xen. + * Used to avoid repeated slow emulation of common instructions used by the + * user-space TLS (Thread-Local Storage) libraries. + * + * **** NOTE **** + * Issues with the binary rewriting have caused it to be removed. Instead + * we rely on Xen's emulator to boot the kernel, and then print a banner + * message recommending that the user disables /lib/tls. + * + * Copyright (c) 2004, K A Fraser + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/config.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/delay.h> +#include <linux/version.h> + +#define DP(_f, _args...) 
printk(KERN_ALERT " " _f "\n" , ## _args ) + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +#define __LINKAGE fastcall +#else +#define __LINKAGE asmlinkage +#endif + +__LINKAGE void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) +{ + static unsigned long printed = 0; + char info[100]; + int i; + + if ( !test_and_set_bit(0, &printed) ) + { + HYPERVISOR_vm_assist(VMASST_CMD_disable, + VMASST_TYPE_4gb_segments_notify); + + sprintf(info, "%s (pid=%d)", current->comm, current->tgid); + + DP(""); + DP("***************************************************************"); + DP("***************************************************************"); + DP("** WARNING: Currently emulating unsupported memory accesses **"); + DP("** in /lib/tls libraries. The emulation is very **"); + DP("** slow. To ensure full performance you should **"); + DP("** execute the following as root: **"); + DP("** mv /lib/tls /lib/tls.disabled **"); + DP("** Offending process: %-38.38s **", info); + DP("***************************************************************"); + DP("***************************************************************"); + DP(""); + + for ( i = 5; i > 0; i-- ) + { + printk("Pausing... %d", i); + mdelay(1000); + printk("\b\b\b\b\b\b\b\b\b\b\b\b"); + } + printk("Continuing...\n\n"); + } +} + +static int __init fixup_init(void) +{ + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify); + return 0; +} +__initcall(fixup_init); Index: linux-2.6.12-xen0-arch/drivers/xen/core/gnttab.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/drivers/xen/core/gnttab.c @@ -0,0 +1,396 @@ +/****************************************************************************** + * gnttab.c + * + * Two sets of functionality: + * 1. Granting foreign access to our memory reservation. + * 2. Accessing others' memory reservations via grant references. + * (i.e., mechanisms for both sender and recipient of grant references) + * + * Copyright (c) 2005, Christopher Clark + * Copyright (c) 2004, K A Fraser + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <asm/pgtable.h> +#include <asm/fixmap.h> +#include <asm/uaccess.h> +#include <asm-xen/xen_proc.h> +#include <asm-xen/linux-public/privcmd.h> +#include <asm-xen/gnttab.h> +#include <asm-xen/synch_bitops.h> + +#if 1 +#define ASSERT(_p) \ + if ( !(_p) ) { printk(KERN_ALERT"Assertion '%s': line %d, file %s\n", \ + #_p , __LINE__, __FILE__); *(int*)0=0; } +#else +#define ASSERT(_p) ((void)0) +#endif + +#define WPRINTK(fmt, args...) 
\ + printk(KERN_WARNING "xen_grant: " fmt, ##args) + + +EXPORT_SYMBOL(gnttab_grant_foreign_access); +EXPORT_SYMBOL(gnttab_end_foreign_access); +EXPORT_SYMBOL(gnttab_query_foreign_access); +EXPORT_SYMBOL(gnttab_grant_foreign_transfer); +EXPORT_SYMBOL(gnttab_end_foreign_transfer); +EXPORT_SYMBOL(gnttab_alloc_grant_references); +EXPORT_SYMBOL(gnttab_free_grant_references); +EXPORT_SYMBOL(gnttab_claim_grant_reference); +EXPORT_SYMBOL(gnttab_release_grant_reference); +EXPORT_SYMBOL(gnttab_grant_foreign_access_ref); +EXPORT_SYMBOL(gnttab_grant_foreign_transfer_ref); + +static grant_ref_t gnttab_free_list[NR_GRANT_ENTRIES]; +static grant_ref_t gnttab_free_head; + +static grant_entry_t *shared; + +/* + * Lock-free grant-entry allocator + */ + +static inline int +get_free_entry( + void) +{ + grant_ref_t fh, nfh = gnttab_free_head; + do { if ( unlikely((fh = nfh) == NR_GRANT_ENTRIES) ) return -1; } + while ( unlikely((nfh = cmpxchg(&gnttab_free_head, fh, + gnttab_free_list[fh])) != fh) ); + return fh; +} + +static inline void +put_free_entry( + grant_ref_t ref) +{ + grant_ref_t fh, nfh = gnttab_free_head; + do { gnttab_free_list[ref] = fh = nfh; wmb(); } + while ( unlikely((nfh = cmpxchg(&gnttab_free_head, fh, ref)) != fh) ); +} + +/* + * Public grant-issuing interface functions + */ + +int +gnttab_grant_foreign_access( + domid_t domid, unsigned long frame, int readonly) +{ + int ref; + + if ( unlikely((ref = get_free_entry()) == -1) ) + return -ENOSPC; + + shared[ref].frame = frame; + shared[ref].domid = domid; + wmb(); + shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0); + + return ref; +} + +void +gnttab_grant_foreign_access_ref( + grant_ref_t ref, domid_t domid, unsigned long frame, int readonly) +{ + shared[ref].frame = frame; + shared[ref].domid = domid; + wmb(); + shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0); +} + + +int +gnttab_query_foreign_access( grant_ref_t ref ) +{ + u16 nflags; + + nflags = shared[ref].flags; + + return ( nflags & (GTF_reading|GTF_writing) ); +} + +void +gnttab_end_foreign_access( grant_ref_t ref, int readonly ) +{ + u16 flags, nflags; + + nflags = shared[ref].flags; + do { + if ( (flags = nflags) & (GTF_reading|GTF_writing) ) + printk(KERN_ALERT "WARNING: g.e. still in use!\n"); + } + while ( (nflags = synch_cmpxchg(&shared[ref].flags, flags, 0)) != flags ); + + put_free_entry(ref); +} + +int +gnttab_grant_foreign_transfer( + domid_t domid, unsigned long pfn ) +{ + int ref; + + if ( unlikely((ref = get_free_entry()) == -1) ) + return -ENOSPC; + + shared[ref].frame = pfn; + shared[ref].domid = domid; + wmb(); + shared[ref].flags = GTF_accept_transfer; + + return ref; +} + +void +gnttab_grant_foreign_transfer_ref( + grant_ref_t ref, domid_t domid, unsigned long pfn ) +{ + shared[ref].frame = pfn; + shared[ref].domid = domid; + wmb(); + shared[ref].flags = GTF_accept_transfer; +} + +unsigned long +gnttab_end_foreign_transfer( + grant_ref_t ref) +{ + unsigned long frame = 0; + u16 flags; + + flags = shared[ref].flags; +#ifdef CONFIG_XEN_NETDEV_GRANT_RX + /* + * But can't flags == (GTF_accept_transfer | GTF_transfer_completed) + * if gnttab_donate executes without interruption??? + */ +#else + ASSERT(flags == (GTF_accept_transfer | GTF_transfer_committed)); +#endif + /* + * If a transfer is committed then wait for the frame address to appear. + * Otherwise invalidate the grant entry against future use. 
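+ * In outline (a restatement of the logic below, not new behaviour):
+ *     if flags == GTF_accept_transfer and the synch_cmpxchg() of the
+ *     flags back to 0 succeeds, no transfer ever started and the entry
+ *     is simply reclaimed; otherwise a transfer was committed, so we
+ *     spin until the remote domain publishes a non-zero
+ *     shared[ref].frame.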
+ */
+    if ( likely(flags != GTF_accept_transfer) ||
+         (synch_cmpxchg(&shared[ref].flags, flags, 0) != GTF_accept_transfer) )
+        while ( unlikely((frame = shared[ref].frame) == 0) )
+            cpu_relax();
+
+    put_free_entry(ref);
+
+    return frame;
+}
+
+void
+gnttab_free_grant_references( u16 count, grant_ref_t head )
+{
+    /* TODO: O(N)...? */
+    grant_ref_t to_die = 0, next = head;
+    int i;
+
+    for ( i = 0; i < count; i++ )
+    {
+        to_die = next;
+        next = gnttab_free_list[next];
+        put_free_entry( to_die );
+    }
+}
+
+int
+gnttab_alloc_grant_references( u16 count,
+                               grant_ref_t *head,
+                               grant_ref_t *terminal )
+{
+    int i;
+    grant_ref_t h = gnttab_free_head;
+
+    for ( i = 0; i < count; i++ )
+        if ( unlikely(get_free_entry() == -1) )
+            goto not_enough_refs;
+
+    *head = h;
+    *terminal = gnttab_free_head;
+
+    return 0;
+
+not_enough_refs:
+    gnttab_free_head = h;
+    return -ENOSPC;
+}
+
+int
+gnttab_claim_grant_reference( grant_ref_t *private_head,
+                              grant_ref_t terminal )
+{
+    grant_ref_t g;
+    if ( unlikely((g = *private_head) == terminal) )
+        return -ENOSPC;
+    *private_head = gnttab_free_list[g];
+    return g;
+}
+
+void
+gnttab_release_grant_reference( grant_ref_t *private_head,
+                                grant_ref_t release )
+{
+    gnttab_free_list[release] = *private_head;
+    *private_head = release;
+}
+
+/*
+ * ProcFS operations
+ */
+
+#ifdef CONFIG_PROC_FS
+
+static struct proc_dir_entry *grant_pde;
+
+static int grant_ioctl(struct inode *inode, struct file *file,
+                       unsigned int cmd, unsigned long data)
+{
+    int ret;
+    privcmd_hypercall_t hypercall;
+
+    /* XXX Need safety checks here if using for anything other
+     * than debugging */
+    return -ENOSYS;
+
+    if ( cmd != IOCTL_PRIVCMD_HYPERCALL )
+        return -ENOSYS;
+
+    if ( copy_from_user(&hypercall, (void *)data, sizeof(hypercall)) )
+        return -EFAULT;
+
+    if ( hypercall.op != __HYPERVISOR_grant_table_op )
+        return -ENOSYS;
+
+    /* hypercall-invoking asm taken from privcmd.c */
+    __asm__ __volatile__ (
+        "pushl %%ebx; pushl %%ecx; pushl %%edx; pushl %%esi; pushl %%edi; "
+        "movl  4(%%eax),%%ebx ;"
+        "movl  8(%%eax),%%ecx ;"
+        "movl 12(%%eax),%%edx ;"
+        "movl 16(%%eax),%%esi ;"
+        "movl 20(%%eax),%%edi ;"
+        "movl   (%%eax),%%eax ;"
+        TRAP_INSTR "; "
+        "popl %%edi; popl %%esi; popl %%edx; popl %%ecx; popl %%ebx"
+        : "=a" (ret) : "0" (&hypercall) : "memory" );
+
+    return ret;
+}
+
+static struct file_operations grant_file_ops = {
+    ioctl:  grant_ioctl,
+};
+
+static int grant_read(char *page, char **start, off_t off,
+                      int count, int *eof, void *data)
+{
+    int            len;
+    unsigned int   i;
+    grant_entry_t *gt;
+
+    gt = (grant_entry_t *)shared;
+    len = 0;
+
+    for ( i = 0; i < NR_GRANT_ENTRIES; i++ )
+    {
+        /* TODO: safety catch here until this can handle >PAGE_SIZE output */
+        if ( len > (PAGE_SIZE - 200) )
+        {
+            len += sprintf( page + len, "Truncated.\n");
+            break;
+        }
+
+        if ( gt[i].flags )
+            len += sprintf( page + len,
+                    "Grant: ref (0x%x) flags (0x%hx) dom (0x%hx) frame (0x%x)\n",
+                    i,
+                    gt[i].flags,
+                    gt[i].domid,
+                    gt[i].frame );
+    }
+
+    *eof = 1;
+    return len;
+}
+
+static int grant_write(struct file *file, const char __user *buffer,
+                       unsigned long count, void *data)
+{
+    /* TODO: implement this */
+    return -ENOSYS;
+}
+
+#endif /* CONFIG_PROC_FS */
+
+int gnttab_resume(void)
+{
+    gnttab_setup_table_t setup;
+    unsigned long        frames[NR_GRANT_FRAMES];
+    int                  i;
+
+    setup.dom        = DOMID_SELF;
+    setup.nr_frames  = NR_GRANT_FRAMES;
+    setup.frame_list = frames;
+
+    BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1) != 0);
+    BUG_ON(setup.status != 0);
+
+    for ( i = 0; i < NR_GRANT_FRAMES; i++ )
set_fixmap(FIX_GNTTAB_END - i, frames[i] << PAGE_SHIFT); + + return 0; +} + +int gnttab_suspend(void) +{ + int i; + + for ( i = 0; i < NR_GRANT_FRAMES; i++ ) + clear_fixmap(FIX_GNTTAB_END - i); + + return 0; +} + +static int __init gnttab_init(void) +{ + int i; + + BUG_ON(gnttab_resume()); + + shared = (grant_entry_t *)fix_to_virt(FIX_GNTTAB_END); + + for ( i = 0; i < NR_GRANT_ENTRIES; i++ ) + gnttab_free_list[i] = i + 1; + +#ifdef CONFIG_PROC_FS + /* + * /proc/xen/grant : used by libxc to access grant tables + */ + if ( (grant_pde = create_xen_proc_entry("grant", 0600)) == NULL ) + { + WPRINTK("Unable to create grant xen proc entry\n"); + return -1; + } + + grant_file_ops.read = grant_pde->proc_fops->read; + grant_file_ops.write = grant_pde->proc_fops->write; + + grant_pde->proc_fops = &grant_file_ops; + + grant_pde->read_proc = &grant_read; + grant_pde->write_proc = &grant_write; +#endif + + printk("Grant table initialized\n"); + return 0; +} + +__initcall(gnttab_init); Index: linux-2.6.12-xen0-arch/drivers/xen/core/Makefile =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/drivers/xen/core/Makefile @@ -0,0 +1,5 @@ +obj-y := ctrl_if.o evtchn.o fixup.o reboot.o gnttab.o devmem.o + +obj-$(CONFIG_PROC_FS) += xen_proc.o +obj-$(CONFIG_NET) += skbuff.o +obj-$(CONFIG_SMP) += smp.o Index: linux-2.6.12-xen0-arch/drivers/xen/core/reboot.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/drivers/xen/core/reboot.c @@ -0,0 +1,269 @@ + +#define __KERNEL_SYSCALLS__ +static int errno; +#include <linux/errno.h> +#include <linux/version.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/unistd.h> +#include <linux/module.h> +#include <linux/reboot.h> +#include <linux/sysrq.h> +#include <asm/irq.h> +#include <asm/mmu_context.h> +#include <asm-xen/ctrl_if.h> +#include <asm-xen/hypervisor.h> +#include <asm-xen/xen-public/dom0_ops.h> +#include <asm-xen/linux-public/suspend.h> +#include <asm-xen/queues.h> + +void machine_restart(char * __unused) +{ + /* We really want to get pending console data out before we die. */ + extern void xencons_force_flush(void); + xencons_force_flush(); + HYPERVISOR_reboot(); +} + +void machine_halt(void) +{ + machine_power_off(); +} + +void machine_power_off(void) +{ + /* We really want to get pending console data out before we die. */ + extern void xencons_force_flush(void); + xencons_force_flush(); + HYPERVISOR_shutdown(); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +int reboot_thru_bios = 0; /* for dmi_scan.c */ +EXPORT_SYMBOL(machine_restart); +EXPORT_SYMBOL(machine_halt); +EXPORT_SYMBOL(machine_power_off); +#endif + + +/****************************************************************************** + * Stop/pickle callback handling. + */ + +/* Ignore multiple shutdown requests. */ +static int shutting_down = -1; + +static void __do_suspend(void) +{ + int i, j; + suspend_record_t *suspend_record; + + /* Hmmm... a cleaner interface to suspend/resume blkdevs would be nice. 
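+ * Note the ordering below: the frontends are quiesced first, then the
+ * transports they depend on (time, ctrl_if, irqs, grant tables); resume
+ * walks the same steps in reverse order.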
*/ + /* XXX SMH: yes it would :-( */ +#ifdef CONFIG_XEN_BLKDEV_FRONTEND + extern void blkdev_suspend(void); + extern void blkdev_resume(void); +#else +#define blkdev_suspend() do{}while(0) +#define blkdev_resume() do{}while(0) +#endif + +#ifdef CONFIG_XEN_NETDEV_FRONTEND + extern void netif_suspend(void); + extern void netif_resume(void); +#else +#define netif_suspend() do{}while(0) +#define netif_resume() do{}while(0) +#endif + +#ifdef CONFIG_XEN_USB_FRONTEND + extern void usbif_resume(); +#else +#define usbif_resume() do{}while(0) +#endif + +#ifdef CONFIG_XEN_BLKDEV_GRANT + extern int gnttab_suspend(void); + extern int gnttab_resume(void); +#else +#define gnttab_suspend() do{}while(0) +#define gnttab_resume() do{}while(0) +#endif + + extern void time_suspend(void); + extern void time_resume(void); + extern unsigned long max_pfn; + extern unsigned int *pfn_to_mfn_frame_list; + + suspend_record = (suspend_record_t *)__get_free_page(GFP_KERNEL); + if ( suspend_record == NULL ) + goto out; + + suspend_record->nr_pfns = max_pfn; /* final number of pfns */ + + __cli(); + +#ifdef __i386__ + mm_pin_all(); + kmem_cache_shrink(pgd_cache); +#endif + + netif_suspend(); + + blkdev_suspend(); + + time_suspend(); + + ctrl_if_suspend(); + + irq_suspend(); + + gnttab_suspend(); + + HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; + clear_fixmap(FIX_SHARED_INFO); + + memcpy(&suspend_record->resume_info, &xen_start_info, + sizeof(xen_start_info)); + + HYPERVISOR_suspend(virt_to_machine(suspend_record) >> PAGE_SHIFT); + + shutting_down = -1; + + memcpy(&xen_start_info, &suspend_record->resume_info, + sizeof(xen_start_info)); + + set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info); + + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); + + memset(empty_zero_page, 0, PAGE_SIZE); + + for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ ) + { + pfn_to_mfn_frame_list[j] = + virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT; + } + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list = + virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT; + + gnttab_resume(); + + irq_resume(); + + ctrl_if_resume(); + + time_resume(); + + blkdev_resume(); + + netif_resume(); + + usbif_resume(); + + __sti(); + + out: + if ( suspend_record != NULL ) + free_page((unsigned long)suspend_record); +} + +static int shutdown_process(void *__unused) +{ + static char *envp[] = { "HOME=/", "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; + static char *restart_argv[] = { "/sbin/reboot", NULL }; + static char *poweroff_argv[] = { "/sbin/poweroff", NULL }; + + extern asmlinkage long sys_reboot(int magic1, int magic2, + unsigned int cmd, void *arg); + + daemonize( +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + "shutdown" +#endif + ); + + switch ( shutting_down ) + { + case CMSG_SHUTDOWN_POWEROFF: + if ( execve("/sbin/poweroff", poweroff_argv, envp) < 0 ) + { + sys_reboot(LINUX_REBOOT_MAGIC1, + LINUX_REBOOT_MAGIC2, + LINUX_REBOOT_CMD_POWER_OFF, + NULL); + } + break; + + case CMSG_SHUTDOWN_REBOOT: + if ( execve("/sbin/reboot", restart_argv, envp) < 0 ) + { + sys_reboot(LINUX_REBOOT_MAGIC1, + LINUX_REBOOT_MAGIC2, + LINUX_REBOOT_CMD_RESTART, + NULL); + } + break; + } + + shutting_down = -1; /* could try again */ + + return 0; +} + +static void __shutdown_handler(void *unused) +{ + int err; + + if ( shutting_down != CMSG_SHUTDOWN_SUSPEND ) + { + err = kernel_thread(shutdown_process, NULL, CLONE_FS | CLONE_FILES); + if ( err < 0 ) + printk(KERN_ALERT "Error creating shutdown 
process!\n"); + } + else + { + __do_suspend(); + } +} + +static void shutdown_handler(ctrl_msg_t *msg, unsigned long id) +{ + static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL); + + if ( msg->subtype == CMSG_SHUTDOWN_SYSRQ ) + { + int sysrq = ((shutdown_sysrq_t *)&msg->msg[0])->key; + +#ifdef CONFIG_MAGIC_SYSRQ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + handle_sysrq(sysrq, NULL, NULL); +#else + handle_sysrq(sysrq, NULL, NULL, NULL); +#endif +#endif + } + else if ( (shutting_down == -1) && + ((msg->subtype == CMSG_SHUTDOWN_POWEROFF) || + (msg->subtype == CMSG_SHUTDOWN_REBOOT) || + (msg->subtype == CMSG_SHUTDOWN_SUSPEND)) ) + { + shutting_down = msg->subtype; + schedule_work(&shutdown_work); + } + else + { + printk("Ignore spurious shutdown request\n"); + } + + ctrl_if_send_response(msg); +} + +static int __init setup_shutdown_event(void) +{ + ctrl_if_register_receiver(CMSG_SHUTDOWN, shutdown_handler, 0); + return 0; +} + +__initcall(setup_shutdown_event); Index: linux-2.6.12-xen0-arch/drivers/xen/core/skbuff.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/drivers/xen/core/skbuff.c @@ -0,0 +1,47 @@ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/version.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <asm/io.h> +#include <asm/page.h> + +EXPORT_SYMBOL(__dev_alloc_skb); + +/* Referenced in netback.c. */ +/*static*/ kmem_cache_t *skbuff_cachep; + +/* Size must be cacheline-aligned (alloc_skb uses SKB_DATA_ALIGN). */ +#define XEN_SKB_SIZE \ + ((PAGE_SIZE - sizeof(struct skb_shared_info)) & ~(SMP_CACHE_BYTES - 1)) + +struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask) +{ + struct sk_buff *skb; + skb = alloc_skb_from_cache(skbuff_cachep, length + 16, gfp_mask); + if ( likely(skb != NULL) ) + skb_reserve(skb, 16); + return skb; +} + +static void skbuff_ctor(void *buf, kmem_cache_t *cachep, unsigned long unused) +{ + scrub_pages(buf, 1); +} + +static int __init skbuff_init(void) +{ + skbuff_cachep = kmem_cache_create( + "xen-skb", PAGE_SIZE, PAGE_SIZE, 0, skbuff_ctor, NULL); + return 0; +} +__initcall(skbuff_init); Index: linux-2.6.12-xen0-arch/drivers/xen/core/smp.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/drivers/xen/core/smp.c @@ -0,0 +1,16 @@ +/* Copyright (C) 2004, Christian Limpach */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/threads.h> + +/* + * the frequency of the profiling timer can be changed + * by writing a multiplier value into /proc/profile. 
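+ * (The stub below accepts such a request but ignores the multiplier:
+ * it just logs that it was called and reports success.)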
+ */ +int setup_profiling_timer(unsigned int multiplier) +{ + printk("setup_profiling_timer\n"); + + return 0; +} Index: linux-2.6.12-xen0-arch/drivers/xen/core/xen_proc.c =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/drivers/xen/core/xen_proc.c @@ -0,0 +1,18 @@ + +#include <linux/config.h> +#include <linux/proc_fs.h> + +static struct proc_dir_entry *xen_base; + +struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode) +{ + if ( xen_base == NULL ) + if ( (xen_base = proc_mkdir("xen", &proc_root)) == NULL ) + panic("Couldn't create /proc/xen"); + return create_proc_entry(name, mode, xen_base); +} + +void remove_xen_proc_entry(const char *name) +{ + remove_proc_entry(name, xen_base); +} Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/ctrl_if.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/ctrl_if.h @@ -0,0 +1,160 @@ +/****************************************************************************** + * ctrl_if.h + * + * Management functions for special interface to the domain controller. + * + * Copyright (c) 2004, K A Fraser + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __ASM_XEN__CTRL_IF_H__ +#define __ASM_XEN__CTRL_IF_H__ + +#include <hypervisor.h> +#include <queues.h> + +typedef control_msg_t ctrl_msg_t; + +/* + * Callback function type. Called for asynchronous processing of received + * request messages, and responses to previously-transmitted request messages. + * The parameters are (@msg, @id). + * @msg: Original request/response message (not a copy). The message can be + * modified in-place by the handler (e.g., a response callback can + * turn a request message into a response message in place). The message + * is no longer accessible after the callback handler returns -- if the + * message is required to persist for longer then it must be copied. + * @id: (Response callbacks only) The 'id' that was specified when the + * original request message was queued for transmission. + */ +typedef void (*ctrl_msg_handler_t)(ctrl_msg_t *, unsigned long); + +/* + * Send @msg to the domain controller. Execute @hnd when a response is + * received, passing the response message and the specified @id. 
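+ * An illustrative (hypothetical) call, with 'msg' already built and
+ * 'my_handler' the caller's own ctrl_msg_handler_t:
+ *     rc = ctrl_if_send_message_noblock(&msg, my_handler, 0);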
+ * This operation will not block: it will return -EAGAIN if there is no space.
+ * Notes:
+ *  1. The @msg is copied if it is transmitted and so can be freed after this
+ *     function returns.
+ *  2. If @hnd is NULL then no callback is executed.
+ */
+int
+ctrl_if_send_message_noblock(
+    ctrl_msg_t *msg,
+    ctrl_msg_handler_t hnd,
+    unsigned long id);
+
+/*
+ * Send @msg to the domain controller. Execute @hnd when a response is
+ * received, passing the response message and the specified @id. This
+ * operation will block until the message is sent, or a signal is received
+ * for the calling process (unless @wait_state is TASK_UNINTERRUPTIBLE).
+ * Notes:
+ *  1. The @msg is copied if it is transmitted and so can be freed after this
+ *     function returns.
+ *  2. If @hnd is NULL then no callback is executed.
+ */
+int
+ctrl_if_send_message_block(
+    ctrl_msg_t *msg,
+    ctrl_msg_handler_t hnd,
+    unsigned long id,
+    long wait_state);
+
+/*
+ * Send @msg to the domain controller. Block until the response is received,
+ * and then copy it into the provided buffer, @rmsg.
+ */
+int
+ctrl_if_send_message_and_get_response(
+    ctrl_msg_t *msg,
+    ctrl_msg_t *rmsg,
+    long wait_state);
+
+/*
+ * Request a callback when there is /possibly/ space to immediately send a
+ * message to the domain controller. This function returns 0 if there is
+ * already space to transmit a message --- in this case the callback task /may/
+ * still be executed. If this function returns 1 then the callback /will/ be
+ * executed when space becomes available.
+ */
+int
+ctrl_if_enqueue_space_callback(
+    struct tq_struct *task);
+
+/*
+ * Send a response (@msg) to a message from the domain controller. This will
+ * never block.
+ * Notes:
+ *  1. The @msg is copied and so can be freed after this function returns.
+ *  2. The @msg may be the original request message, modified in-place.
+ */
+void
+ctrl_if_send_response(
+    ctrl_msg_t *msg);
+
+/*
+ * Register a receiver for typed messages from the domain controller. The
+ * handler (@hnd) is called for every received message of specified @type.
+ * Returns TRUE (non-zero) if the handler was successfully registered.
+ * If CALLBACK_IN_BLOCKING_CONTEXT is specified in @flags then callbacks will
+ * occur in a context in which it is safe to yield (i.e., process context).
+ */
+#define CALLBACK_IN_BLOCKING_CONTEXT 1
+int ctrl_if_register_receiver(
+    u8 type,
+    ctrl_msg_handler_t hnd,
+    unsigned int flags);
+
+/*
+ * Unregister a receiver for typed messages from the domain controller. The
+ * handler (@hnd) will not be executed after this function returns.
+ */
+void
+ctrl_if_unregister_receiver(
+    u8 type, ctrl_msg_handler_t hnd);
+
+/* Suspend/resume notifications. */
+void ctrl_if_suspend(void);
+void ctrl_if_resume(void);
+
+/* Start-of-day setup. */
+void ctrl_if_init(void);
+
+/*
+ * Returns TRUE if there are no outstanding message requests at the domain
+ * controller. This can be used to ensure that messages have really flushed
+ * through when it is not possible to use the response-callback interface.
+ * WARNING: If other subsystems are using the control interface then this
+ * function might never return TRUE!
+ */
+int ctrl_if_transmitter_empty(void);  /* !! DANGEROUS FUNCTION !! */
+
+/*
+ * Manually discard response messages from the domain controller.
+ * WARNING: This is usually done automatically -- this function should only
+ * be called when normal interrupt mechanisms are disabled!
+ */
+void ctrl_if_discard_responses(void); /* !! DANGEROUS FUNCTION !!
*/ + +#endif /* __ASM_XEN__CONTROL_IF_H__ */ Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/hypercall.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/hypercall.h @@ -0,0 +1,564 @@ +/****************************************************************************** + * hypercall.h + * + * Linux-specific hypervisor handling. + * + * Copyright (c) 2002-2004, K A Fraser + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __HYPERCALL_H__ +#define __HYPERCALL_H__ +#include <asm-xen/xen-public/xen.h> + +/* + * Assembler stubs for hyper-calls. 
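+ *
+ * Calling convention (32-bit, as used by every stub below): the hypercall
+ * number goes in %eax and up to five arguments in %ebx, %ecx, %edx, %esi,
+ * %edi; TRAP_INSTR ("int $0x82") traps into Xen and the result comes back
+ * in %eax. The argument registers are listed as outputs as well so that
+ * the compiler treats them as clobbered. A one-argument stub therefore
+ * looks like:
+ *
+ *     __asm__ __volatile__ (
+ *         TRAP_INSTR
+ *         : "=a" (ret), "=b" (ign)
+ *         : "0" (__HYPERVISOR_xen_version), "1" (cmd)
+ *         : "memory" );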
+ */ + +static inline int +HYPERVISOR_set_trap_table( + trap_info_t *table) +{ + int ret; + unsigned long ignore; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ignore) + : "0" (__HYPERVISOR_set_trap_table), "1" (table) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_mmu_update( + mmu_update_t *req, int count, int *success_count, domid_t domid) +{ + int ret; + unsigned long ign1, ign2, ign3, ign4; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) + : "0" (__HYPERVISOR_mmu_update), "1" (req), "2" (count), + "3" (success_count), "4" (domid) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_mmuext_op( + struct mmuext_op *op, int count, int *success_count, domid_t domid) +{ + int ret; + unsigned long ign1, ign2, ign3, ign4; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) + : "0" (__HYPERVISOR_mmuext_op), "1" (op), "2" (count), + "3" (success_count), "4" (domid) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_set_gdt( + unsigned long *frame_list, int entries) +{ + int ret; + unsigned long ign1, ign2; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2) + : "0" (__HYPERVISOR_set_gdt), "1" (frame_list), "2" (entries) + : "memory" ); + + + return ret; +} + +static inline int +HYPERVISOR_stack_switch( + unsigned long ss, unsigned long esp) +{ + int ret; + unsigned long ign1, ign2; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2) + : "0" (__HYPERVISOR_stack_switch), "1" (ss), "2" (esp) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_set_callbacks( + unsigned long event_selector, unsigned long event_address, + unsigned long failsafe_selector, unsigned long failsafe_address) +{ + int ret; + unsigned long ign1, ign2, ign3, ign4; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) + : "0" (__HYPERVISOR_set_callbacks), "1" (event_selector), + "2" (event_address), "3" (failsafe_selector), "4" (failsafe_address) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_fpu_taskswitch( + int set) +{ + int ret; + unsigned long ign; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign) + : "0" (__HYPERVISOR_fpu_taskswitch), "1" (set) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_yield( + void) +{ + int ret; + unsigned long ign; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign) + : "0" (__HYPERVISOR_sched_op), "1" (SCHEDOP_yield) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_block( + void) +{ + int ret; + unsigned long ign1; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1) + : "0" (__HYPERVISOR_sched_op), "1" (SCHEDOP_block) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_shutdown( + void) +{ + int ret; + unsigned long ign1; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1) + : "0" (__HYPERVISOR_sched_op), + "1" (SCHEDOP_shutdown | (SHUTDOWN_poweroff << SCHEDOP_reasonshift)) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_reboot( + void) +{ + int ret; + unsigned long ign1; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1) + : "0" (__HYPERVISOR_sched_op), + "1" (SCHEDOP_shutdown | (SHUTDOWN_reboot << SCHEDOP_reasonshift)) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_suspend( + unsigned long srec) +{ + 
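+    /* 'srec' is the machine frame number of the suspend-record page:
+     * callers pass virt_to_machine(record) >> PAGE_SHIFT (see reboot.c). */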
int ret; + unsigned long ign1, ign2; + + /* NB. On suspend, control software expects a suspend record in %esi. */ + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=S" (ign2) + : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_shutdown | (SHUTDOWN_suspend << SCHEDOP_reasonshift)), + "S" (srec) : "memory"); + + return ret; +} + +static inline int +HYPERVISOR_crash( + void) +{ + int ret; + unsigned long ign1; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1) + : "0" (__HYPERVISOR_sched_op), + "1" (SCHEDOP_shutdown | (SHUTDOWN_crash << SCHEDOP_reasonshift)) + : "memory" ); + + return ret; +} + +static inline long +HYPERVISOR_set_timer_op( + u64 timeout) +{ + int ret; + unsigned long timeout_hi = (unsigned long)(timeout>>32); + unsigned long timeout_lo = (unsigned long)timeout; + unsigned long ign1, ign2; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2) + : "0" (__HYPERVISOR_set_timer_op), "b" (timeout_lo), "c" (timeout_hi) + : "memory"); + + return ret; +} + +static inline int +HYPERVISOR_dom0_op( + dom0_op_t *dom0_op) +{ + int ret; + unsigned long ign1; + + dom0_op->interface_version = DOM0_INTERFACE_VERSION; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1) + : "0" (__HYPERVISOR_dom0_op), "1" (dom0_op) + : "memory"); + + return ret; +} + +static inline int +HYPERVISOR_set_debugreg( + int reg, unsigned long value) +{ + int ret; + unsigned long ign1, ign2; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2) + : "0" (__HYPERVISOR_set_debugreg), "1" (reg), "2" (value) + : "memory" ); + + return ret; +} + +static inline unsigned long +HYPERVISOR_get_debugreg( + int reg) +{ + unsigned long ret; + unsigned long ign; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign) + : "0" (__HYPERVISOR_get_debugreg), "1" (reg) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_update_descriptor( + unsigned long ma, unsigned long word1, unsigned long word2) +{ + int ret; + unsigned long ign1, ign2, ign3; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3) + : "0" (__HYPERVISOR_update_descriptor), "1" (ma), "2" (word1), + "3" (word2) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_dom_mem_op( + unsigned int op, unsigned long *extent_list, + unsigned long nr_extents, unsigned int extent_order) +{ + int ret; + unsigned long ign1, ign2, ign3, ign4, ign5; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4), + "=D" (ign5) + : "0" (__HYPERVISOR_dom_mem_op), "1" (op), "2" (extent_list), + "3" (nr_extents), "4" (extent_order), "5" (DOMID_SELF) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_multicall( + void *call_list, int nr_calls) +{ + int ret; + unsigned long ign1, ign2; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2) + : "0" (__HYPERVISOR_multicall), "1" (call_list), "2" (nr_calls) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_update_va_mapping( + unsigned long va, pte_t new_val, unsigned long flags) +{ + int ret; + unsigned long ign1, ign2, ign3, ign4; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) + : "0" (__HYPERVISOR_update_va_mapping), + "1" (va), "2" ((new_val).pte_low), +#ifdef CONFIG_X86_PAE + "3" ((new_val).pte_high), +#else + "3" (0), +#endif + "4" (flags) + : "memory" ); + + if ( unlikely(ret < 0) ) + { + printk(KERN_ALERT 
"Failed update VA mapping: %08lx, %08lx, %08lx\n", + va, (new_val).pte_low, flags); + BUG(); + } + + return ret; +} + +static inline int +HYPERVISOR_event_channel_op( + void *op) +{ + int ret; + unsigned long ignore; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ignore) + : "0" (__HYPERVISOR_event_channel_op), "1" (op) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_xen_version( + int cmd) +{ + int ret; + unsigned long ignore; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ignore) + : "0" (__HYPERVISOR_xen_version), "1" (cmd) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_console_io( + int cmd, int count, char *str) +{ + int ret; + unsigned long ign1, ign2, ign3; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3) + : "0" (__HYPERVISOR_console_io), "1" (cmd), "2" (count), "3" (str) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_physdev_op( + void *physdev_op) +{ + int ret; + unsigned long ign; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign) + : "0" (__HYPERVISOR_physdev_op), "1" (physdev_op) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_grant_table_op( + unsigned int cmd, void *uop, unsigned int count) +{ + int ret; + unsigned long ign1, ign2, ign3; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3) + : "0" (__HYPERVISOR_grant_table_op), "1" (cmd), "2" (uop), "3" (count) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_update_va_mapping_otherdomain( + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) +{ + int ret; + unsigned long ign1, ign2, ign3, ign4, ign5; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), + "=S" (ign4), "=D" (ign5) + : "0" (__HYPERVISOR_update_va_mapping_otherdomain), + "1" (va), "2" ((new_val).pte_low), +#ifdef CONFIG_X86_PAE + "3" ((new_val).pte_high), +#else + "3" (0), +#endif + "4" (flags), "5" (domid) : + "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_vm_assist( + unsigned int cmd, unsigned int type) +{ + int ret; + unsigned long ign1, ign2; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2) + : "0" (__HYPERVISOR_vm_assist), "1" (cmd), "2" (type) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_boot_vcpu( + unsigned long vcpu, vcpu_guest_context_t *ctxt) +{ + int ret; + unsigned long ign1, ign2; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2) + : "0" (__HYPERVISOR_boot_vcpu), "1" (vcpu), "2" (ctxt) + : "memory"); + + return ret; +} + +static inline int +HYPERVISOR_vcpu_down( + int vcpu) +{ + int ret; + unsigned long ign1; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1) + : "0" (__HYPERVISOR_sched_op), + "1" (SCHEDOP_vcpu_down | (vcpu << SCHEDOP_vcpushift)) + : "memory" ); + + return ret; +} + +static inline int +HYPERVISOR_vcpu_up( + int vcpu) +{ + int ret; + unsigned long ign1; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1) + : "0" (__HYPERVISOR_sched_op), + "1" (SCHEDOP_vcpu_up | (vcpu << SCHEDOP_vcpushift)) + : "memory" ); + + return ret; +} +#endif /* __HYPERCALL_H__ */ Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/irq_vectors.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/irq_vectors.h @@ -0,0 +1,140 @@ +/* + * This file should contain 
#defines for all of the interrupt vector
+ * numbers used by this architecture.
+ *
+ * In addition, there are some standard defines:
+ *
+ * FIRST_EXTERNAL_VECTOR:
+ *	The first free place for external interrupts
+ *
+ * SYSCALL_VECTOR:
+ *	The vector used by a syscall to make the user-to-kernel
+ *	transition.
+ *
+ * TIMER_IRQ:
+ *	The IRQ number the timer interrupt comes in at.
+ *
+ * NR_IRQS:
+ *	The total number of interrupt vectors (including all the
+ *	architecture specific interrupts) needed.
+ *
+ */
+#ifndef _ASM_IRQ_VECTORS_H
+#define _ASM_IRQ_VECTORS_H
+
+/*
+ * IDT vectors usable for external interrupt sources start
+ * at 0x20:
+ */
+#define FIRST_EXTERNAL_VECTOR	0x20
+
+#define SYSCALL_VECTOR		0x80
+
+/*
+ * Vectors 0x20-0x2f are used for ISA interrupts.
+ */
+
+#if 0
+/*
+ * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
+ *
+ * some of the following vectors are 'rare', they are merged
+ * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
+ * TLB, reschedule and local APIC vectors are performance-critical.
+ *
+ * Vectors 0xf0-0xfa are free (reserved for future Linux use).
+ */
+#define SPURIOUS_APIC_VECTOR	0xff
+#define ERROR_APIC_VECTOR	0xfe
+#define INVALIDATE_TLB_VECTOR	0xfd
+#define RESCHEDULE_VECTOR	0xfc
+#define CALL_FUNCTION_VECTOR	0xfb
+
+#define THERMAL_APIC_VECTOR	0xf0
+/*
+ * Local APIC timer IRQ vector is on a different priority level,
+ * to work around the 'lost local interrupt if more than 2 IRQ
+ * sources per level' errata.
+ */
+#define LOCAL_TIMER_VECTOR	0xef
+#endif
+
+#define SPURIOUS_APIC_VECTOR	0xff
+#define ERROR_APIC_VECTOR	0xfe
+
+/*
+ * First APIC vector available to drivers: (vectors 0x30-0xee)
+ * we start at 0x31 to spread out vectors evenly between priority
+ * levels. (0x80 is the syscall vector)
+ */
+#define FIRST_DEVICE_VECTOR	0x31
+#define FIRST_SYSTEM_VECTOR	0xef
+
+/*
+ * 16 8259A IRQ's, 208 potential APIC interrupt sources.
+ * Right now the APIC is mostly only used for SMP.
+ * 256 vectors is an architectural limit. (we can have
+ * more than 256 devices theoretically, but they will
+ * have to use shared interrupts)
+ * Since vectors 0x00-0x1f are used/reserved for the CPU,
+ * the usable vector space is 0x20-0xff (224 vectors)
+ */
+
+#define NR_IPIS 8
+
+#define RESCHEDULE_VECTOR	1
+#define INVALIDATE_TLB_VECTOR	2
+#define CALL_FUNCTION_VECTOR	3
+
+/*
+ * The maximum number of vectors supported by i386 processors
+ * is limited to 256. For processors other than i386, NR_VECTORS
+ * should be changed accordingly.
+ */
+#define NR_VECTORS 256
+
+#define FPU_IRQ			13
+
+#define	FIRST_VM86_IRQ		3
+#define LAST_VM86_IRQ		15
+#define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
+
+/*
+ * The flat IRQ space is divided into two regions:
+ *  1. A one-to-one mapping of real physical IRQs. This space is only used
+ *     if we have physical device-access privilege. This region is at the
+ *     start of the IRQ space so that existing device drivers do not need
+ *     to be modified to translate physical IRQ numbers into our IRQ space.
+ *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
+ *     are bound using the provided bind/unbind functions.
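+ *
+ * For example, with the values below (PIRQ_BASE 0, NR_PIRQS 256, hence
+ * DYNIRQ_BASE 256), pirq_to_irq(5) == 5 while dynirq_to_irq(5) == 261,
+ * and irq_to_dynirq() simply inverts the latter mapping.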
+ */ + +#define PIRQ_BASE 0 +#define NR_PIRQS 256 + +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS) +#define NR_DYNIRQS 256 + +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS) +#define NR_IRQ_VECTORS NR_IRQS + +#define pirq_to_irq(_x) ((_x) + PIRQ_BASE) +#define irq_to_pirq(_x) ((_x) - PIRQ_BASE) + +#define dynirq_to_irq(_x) ((_x) + DYNIRQ_BASE) +#define irq_to_dynirq(_x) ((_x) - DYNIRQ_BASE) + +#ifndef __ASSEMBLY__ +/* Dynamic binding of event channels and VIRQ sources to Linux IRQ space. */ +extern int bind_virq_to_irq(int virq); +extern void unbind_virq_from_irq(int virq); +extern int bind_ipi_to_irq(int ipi); +extern void unbind_ipi_from_irq(int ipi); +extern int bind_evtchn_to_irq(int evtchn); +extern void unbind_evtchn_from_irq(int evtchn); + +extern void irq_suspend(void); +extern void irq_resume(void); +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_IRQ_VECTORS_H */ Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/queues.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/queues.h @@ -0,0 +1,81 @@ + +/* + * Oh dear. Task queues were removed from Linux 2.6 and replaced by work + * queues. Unfortunately the semantics is not the same. With task queues we + * can defer work until a particular event occurs -- this is not + * straightforwardly done with work queues (queued work is performed asap, or + * after some fixed timeout). Conversely, work queues are a (slightly) neater + * way of deferring work to a process context than using task queues in 2.4. + * + * This is a bit of a needless reimplementation -- should have just pulled + * the code from 2.4, but I tried leveraging work queues to simplify things. + * They didn't help. :-( + */ + +#ifndef __QUEUES_H__ +#define __QUEUES_H__ + +#include <linux/version.h> +#include <linux/list.h> +#include <linux/workqueue.h> + +struct tq_struct { + void (*fn)(void *); + void *arg; + struct list_head list; + unsigned long pending; +}; +#define INIT_TQUEUE(_name, _fn, _arg) \ + do { \ + INIT_LIST_HEAD(&(_name)->list); \ + (_name)->pending = 0; \ + (_name)->fn = (_fn); (_name)->arg = (_arg); \ + } while ( 0 ) +#define DECLARE_TQUEUE(_name, _fn, _arg) \ + struct tq_struct _name = { (_fn), (_arg), LIST_HEAD_INIT((_name).list), 0 } + +typedef struct { + struct list_head list; + spinlock_t lock; +} task_queue; +#define DECLARE_TASK_QUEUE(_name) \ + task_queue _name = { LIST_HEAD_INIT((_name).list), SPIN_LOCK_UNLOCKED } + +static inline int queue_task(struct tq_struct *tqe, task_queue *tql) +{ + unsigned long flags; + if ( test_and_set_bit(0, &tqe->pending) ) + return 0; + spin_lock_irqsave(&tql->lock, flags); + list_add_tail(&tqe->list, &tql->list); + spin_unlock_irqrestore(&tql->lock, flags); + return 1; +} + +static inline void run_task_queue(task_queue *tql) +{ + struct list_head head, *ent; + struct tq_struct *tqe; + unsigned long flags; + void (*fn)(void *); + void *arg; + + spin_lock_irqsave(&tql->lock, flags); + list_add(&head, &tql->list); + list_del_init(&tql->list); + spin_unlock_irqrestore(&tql->lock, flags); + + while ( !list_empty(&head) ) + { + ent = head.next; + list_del_init(ent); + tqe = list_entry(ent, struct tq_struct, list); + fn = tqe->fn; + arg = tqe->arg; + wmb(); + tqe->pending = 0; + fn(arg); + } +} + +#endif /* __QUEUES_H__ */ Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen_foreign_page.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen_foreign_page.h 
@@ -0,0 +1,30 @@ +/****************************************************************************** + * foreign_page.h + * + * Provide a "foreign" page type, that is owned by a foreign allocator and + * not the normal buddy allocator in page_alloc.c + * + * Copyright (c) 2004, K A Fraser + */ + +#ifndef __ASM_XEN_FOREIGN_PAGE_H__ +#define __ASM_XEN_FOREIGN_PAGE_H__ + +#define PG_foreign PG_arch_1 + +#define PageForeign(page) test_bit(PG_foreign, &(page)->flags) + +#define SetPageForeign(page, dtor) do { \ + set_bit(PG_foreign, &(page)->flags); \ + (page)->mapping = (void *)dtor; \ +} while (0) + +#define ClearPageForeign(page) do { \ + clear_bit(PG_foreign, &(page)->flags); \ + (page)->mapping = NULL; \ +} while (0) + +#define PageForeignDestructor(page) \ + ( (void (*) (struct page *)) (page)->mapping ) + +#endif /* __ASM_XEN_FOREIGN_PAGE_H__ */ Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen_hypervisor.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen_hypervisor.h @@ -0,0 +1,197 @@ +/****************************************************************************** + * hypervisor.h + * + * Linux-specific hypervisor handling. + * + * Copyright (c) 2002-2004, K A Fraser + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __HYPERVISOR_H__ +#define __HYPERVISOR_H__ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/version.h> +#include <xen-public/xen.h> +#include <xen-public/dom0_ops.h> +#include <xen-public/io/domain_controller.h> +#include <asm/ptrace.h> +#include <asm/page.h> +#if defined(__i386__) +# if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +# ifdef CONFIG_X86_PAE +# include <asm-generic/pgtable-nopud.h> +# else +# include <asm-generic/pgtable-nopmd.h> +# endif +# endif +#endif + +/* arch/xen/i386/kernel/setup.c */ +union xen_start_info_union +{ + start_info_t xen_start_info; + char padding[2048]; +}; +extern union xen_start_info_union xen_start_info_union; +#define xen_start_info (xen_start_info_union.xen_start_info) + +/* arch/xen/kernel/evtchn.c */ +/* Force a proper event-channel callback from Xen. 
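+ * (Typically needed after re-enabling event delivery, so that any events
+ * that arrived while delivery was masked get handled.)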
*/ +void force_evtchn_callback(void); + +/* arch/xen/kernel/process.c */ +void xen_cpu_idle (void); + +/* arch/xen/i386/kernel/hypervisor.c */ +void do_hypervisor_callback(struct pt_regs *regs); + +/* arch/xen/i386/kernel/head.S */ +void lgdt_finish(void); + +/* arch/xen/i386/mm/hypervisor.c */ +/* + * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should be already + * be MACHINE addresses. + */ + +void xen_pt_switch(unsigned long ptr); +void xen_new_user_pt(unsigned long ptr); /* x86_64 only */ +void xen_load_gs(unsigned int selector); /* x86_64 only */ +void xen_tlb_flush(void); +void xen_invlpg(unsigned long ptr); + +#ifndef CONFIG_XEN_SHADOW_MODE +void xen_l1_entry_update(pte_t *ptr, pte_t val); +void xen_l2_entry_update(pmd_t *ptr, pmd_t val); +void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */ +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */ +void xen_pgd_pin(unsigned long ptr); +void xen_pgd_unpin(unsigned long ptr); +void xen_pud_pin(unsigned long ptr); /* x86_64 only */ +void xen_pud_unpin(unsigned long ptr); /* x86_64 only */ +void xen_pmd_pin(unsigned long ptr); /* x86_64 only */ +void xen_pmd_unpin(unsigned long ptr); /* x86_64 only */ +void xen_pte_pin(unsigned long ptr); +void xen_pte_unpin(unsigned long ptr); +#else +#define xen_l1_entry_update(_p, _v) set_pte((_p), (pte_t){(_v)}) +#define xen_l2_entry_update(_p, _v) set_pgd((_p), (pgd_t){(_v)}) +#define xen_pgd_pin(_p) ((void)0) +#define xen_pgd_unpin(_p) ((void)0) +#define xen_pte_pin(_p) ((void)0) +#define xen_pte_unpin(_p) ((void)0) +#endif + +void xen_set_ldt(unsigned long ptr, unsigned long bytes); +void xen_machphys_update(unsigned long mfn, unsigned long pfn); + +#ifdef CONFIG_SMP +#include <linux/cpumask.h> +void xen_tlb_flush_all(void); +void xen_invlpg_all(unsigned long ptr); +void xen_tlb_flush_mask(cpumask_t *mask); +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr); +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) +/* +** XXX SMH: 2.4 doesn't have percpu.h (or support SMP guests) so just +** include sufficient #defines to allow the below to build. +*/ +#define DEFINE_PER_CPU(type, name) \ + __typeof__(type) per_cpu__##name + +#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) +#define __get_cpu_var(var) per_cpu__##var +#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name + +#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) +#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) +#endif /* linux < 2.6.0 */ + +void xen_contig_memory(unsigned long vstart, unsigned int order); + +#ifdef CONFIG_XEN_PHYSDEV_ACCESS +/* Allocate a contiguous empty region of low memory. Return virtual start. 
*/ +unsigned long allocate_empty_lowmem_region(unsigned long pages); +#endif + +#include <hypercall.h> + +#if defined(CONFIG_X86_64) +#define MULTI_UVMFLAGS_INDEX 2 +#define MULTI_UVMDOMID_INDEX 3 +#else +#define MULTI_UVMFLAGS_INDEX 3 +#define MULTI_UVMDOMID_INDEX 4 +#endif + +static inline void +MULTI_update_va_mapping( + multicall_entry_t *mcl, unsigned long va, + pte_t new_val, unsigned long flags) +{ + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = va; +#if defined(CONFIG_X86_64) + mcl->args[1] = new_val.pte; + mcl->args[2] = flags; +#elif defined(CONFIG_X86_PAE) + mcl->args[1] = new_val.pte_low; + mcl->args[2] = new_val.pte_high; + mcl->args[3] = flags; +#else + mcl->args[1] = new_val.pte_low; + mcl->args[2] = 0; + mcl->args[3] = flags; +#endif +} + +static inline void +MULTI_update_va_mapping_otherdomain( + multicall_entry_t *mcl, unsigned long va, + pte_t new_val, unsigned long flags, domid_t domid) +{ + mcl->op = __HYPERVISOR_update_va_mapping_otherdomain; + mcl->args[0] = va; +#if defined(CONFIG_X86_64) + mcl->args[1] = new_val.pte; + mcl->args[2] = flags; + mcl->args[3] = domid; +#elif defined(CONFIG_X86_PAE) + mcl->args[1] = new_val.pte_low; + mcl->args[2] = new_val.pte_high; + mcl->args[3] = flags; + mcl->args[4] = domid; +#else + mcl->args[1] = new_val.pte_low; + mcl->args[2] = 0; + mcl->args[3] = flags; + mcl->args[4] = domid; +#endif +} + +#endif /* __HYPERVISOR_H__ */ Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/acm.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/acm.h @@ -0,0 +1,157 @@ +/**************************************************************** + * acm.h + * + * Copyright (C) 2005 IBM Corporation + * + * Author: + * Reiner Sailer <sailer@xxxxxxxxxxxxxx> + * + * Contributors: + * Stefan Berger <stefanb@xxxxxxxxxxxxxx> + * added network byte order support for binary policies + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * sHype general access control module header file. + * here are all definitions that are shared between + * xen-core, guest-kernels, and applications. + * + * todo: move from static policy choice to compile option. + */ + +#ifndef _XEN_PUBLIC_ACM_H +#define _XEN_PUBLIC_ACM_H + +#include "xen-public/xen.h" +#include "xen-public/sched_ctl.h" + +/* if ACM_DEBUG defined, all hooks should + * print a short trace message (comment it out + * when not in testing mode ) + */ +/* #define ACM_DEBUG */ + +#ifdef ACM_DEBUG +# define printkd(fmt, args...) printk(fmt,## args) +#else +# define printkd(fmt, args...) +#endif + +/* default ssid reference value if not supplied */ +#define ACM_DEFAULT_SSID 0x0 +#define ACM_DEFAULT_LOCAL_SSID 0x0 + +/* Internal ACM ERROR types */ +#define ACM_OK 0 +#define ACM_UNDEF -1 +#define ACM_INIT_SSID_ERROR -2 +#define ACM_INIT_SOID_ERROR -3 +#define ACM_ERROR -4 + +/* External ACCESS DECISIONS */ +#define ACM_ACCESS_PERMITTED 0 +#define ACM_ACCESS_DENIED -111 +#define ACM_NULL_POINTER_ERROR -200 + +#define ACM_MAX_POLICY 3 + +#define ACM_NULL_POLICY 0 +#define ACM_CHINESE_WALL_POLICY 1 +#define ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY 2 +#define ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY 3 + +/* policy: */ +#define ACM_POLICY_NAME(X) \ + (X == ACM_NULL_POLICY) ? "NULL policy" : \ + (X == ACM_CHINESE_WALL_POLICY) ? 
"CHINESE WALL policy" : \ + (X == ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY) ? "SIMPLE TYPE ENFORCEMENT policy" : \ + (X == ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY) ? "CHINESE WALL AND SIMPLE TYPE ENFORCEMENT policy" : \ + "UNDEFINED policy" + +/* defines a ssid reference used by xen */ +typedef u32 ssidref_t; + +/* -------security policy relevant type definitions-------- */ + +/* type identifier; compares to "equal" or "not equal" */ +typedef u16 domaintype_t; + +/* CHINESE WALL POLICY DATA STRUCTURES + * + * current accumulated conflict type set: + * When a domain is started and has a type that is in + * a conflict set, the conflicting types are incremented in + * the aggregate set. When a domain is destroyed, the + * conflicting types to its type are decremented. + * If a domain has multiple types, this procedure works over + * all those types. + * + * conflict_aggregate_set[i] holds the number of + * running domains that have a conflict with type i. + * + * running_types[i] holds the number of running domains + * that include type i in their ssidref-referenced type set + * + * conflict_sets[i][j] is "0" if type j has no conflict + * with type i and is "1" otherwise. + */ +/* high-16 = version, low-16 = check magic */ +#define ACM_MAGIC 0x0001debc + +/* each offset in bytes from start of the struct they + * the are part of */ +/* each buffer consists of all policy information for + * the respective policy given in the policy code + */ +struct acm_policy_buffer { + u32 magic; + u32 policyversion; + u32 len; + u16 primary_policy_code; + u16 primary_buffer_offset; + u16 secondary_policy_code; + u16 secondary_buffer_offset; +}; + +struct acm_chwall_policy_buffer { + u16 policy_code; + u16 chwall_max_types; + u16 chwall_max_ssidrefs; + u16 chwall_max_conflictsets; + u16 chwall_ssid_offset; + u16 chwall_conflict_sets_offset; + u16 chwall_running_types_offset; + u16 chwall_conflict_aggregate_offset; +}; + +struct acm_ste_policy_buffer { + u16 policy_code; + u16 ste_max_types; + u16 ste_max_ssidrefs; + u16 ste_ssid_offset; +}; + +struct acm_stats_buffer { + u32 magic; + u32 policyversion; + u32 len; + u16 primary_policy_code; + u16 primary_stats_offset; + u16 secondary_policy_code; + u16 secondary_stats_offset; +}; + +struct acm_ste_stats_buffer { + u32 ec_eval_count; + u32 gt_eval_count; + u32 ec_denied_count; + u32 gt_denied_count; + u32 ec_cachehit_count; + u32 gt_cachehit_count; +}; + + +#endif Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/arch-ia64.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/arch-ia64.h @@ -0,0 +1,219 @@ +/****************************************************************************** + * arch-ia64/hypervisor-if.h + * + * Guest OS interface to IA64 Xen. + */ + +#ifndef __HYPERVISOR_IF_IA64_H__ +#define __HYPERVISOR_IF_IA64_H__ + +/* Maximum number of virtual CPUs in multi-processor guests. */ +/* WARNING: before changing this, check that shared_info fits on a page */ +#define MAX_VIRT_CPUS 1 + +#ifndef __ASSEMBLY__ + +/* NB. Both the following are 64 bits each. */ +typedef unsigned long memory_t; /* Full-sized pointer/address/memory-size. 
*/ + +#define MAX_NR_SECTION 32 // at most 32 memory holes +typedef struct { + unsigned long start; /* start of memory hole */ + unsigned long end; /* end of memory hole */ +} mm_section_t; + +typedef struct { + unsigned long mfn : 56; + unsigned long type: 8; +} pmt_entry_t; + +#define GPFN_MEM (0UL << 56) /* Guest pfn is normal mem */ +#define GPFN_FRAME_BUFFER (1UL << 56) /* VGA framebuffer */ +#define GPFN_LOW_MMIO (2UL << 56) /* Low MMIO range */ +#define GPFN_PIB (3UL << 56) /* PIB base */ +#define GPFN_IOSAPIC (4UL << 56) /* IOSAPIC base */ +#define GPFN_LEGACY_IO (5UL << 56) /* Legacy I/O base */ +#define GPFN_GFW (6UL << 56) /* Guest Firmware */ +#define GPFN_HIGH_MMIO (7UL << 56) /* High MMIO range */ + +#define GPFN_IO_MASK (7UL << 56) /* Guest pfn is I/O type */ +#define GPFN_INV_MASK (31UL << 59) /* Guest pfn is invalid */ + +#define INVALID_MFN (~0UL) + +/* + * NB. This may become a 64-bit count with no shift. If this happens then the + * structure size will still be 8 bytes, so no other alignments will change. + */ +typedef struct { + unsigned int tsc_bits; /* 0: 32 bits read from the CPU's TSC. */ + unsigned int tsc_bitshift; /* 4: 'tsc_bits' uses N:N+31 of TSC. */ +} tsc_timestamp_t; /* 8 bytes */ + +struct pt_fpreg { + union { + unsigned long bits[2]; + long double __dummy; /* force 16-byte alignment */ + } u; +}; + +struct pt_regs { + /* The following registers are saved by SAVE_MIN: */ + unsigned long b6; /* scratch */ + unsigned long b7; /* scratch */ + + unsigned long ar_csd; /* used by cmp8xchg16 (scratch) */ + unsigned long ar_ssd; /* reserved for future use (scratch) */ + + unsigned long r8; /* scratch (return value register 0) */ + unsigned long r9; /* scratch (return value register 1) */ + unsigned long r10; /* scratch (return value register 2) */ + unsigned long r11; /* scratch (return value register 3) */ + + unsigned long cr_ipsr; /* interrupted task's psr */ + unsigned long cr_iip; /* interrupted task's instruction pointer */ + unsigned long cr_ifs; /* interrupted task's function state */ + + unsigned long ar_unat; /* interrupted task's NaT register (preserved) */ + unsigned long ar_pfs; /* prev function state */ + unsigned long ar_rsc; /* RSE configuration */ + /* The following two are valid only if cr_ipsr.cpl > 0: */ + unsigned long ar_rnat; /* RSE NaT */ + unsigned long ar_bspstore; /* RSE bspstore */ + + unsigned long pr; /* 64 predicate registers (1 bit each) */ + unsigned long b0; /* return pointer (bp) */ + unsigned long loadrs; /* size of dirty partition << 16 */ + + unsigned long r1; /* the gp pointer */ + unsigned long r12; /* interrupted task's memory stack pointer */ + unsigned long r13; /* thread pointer */ + + unsigned long ar_fpsr; /* floating point status (preserved) */ + unsigned long r15; /* scratch */ + + /* The remaining registers are NOT saved for system calls. 
*/
+
+ unsigned long r14; /* scratch */
+ unsigned long r2; /* scratch */
+ unsigned long r3; /* scratch */
+
+#ifdef CONFIG_VTI
+ unsigned long r4; /* preserved */
+ unsigned long r5; /* preserved */
+ unsigned long r6; /* preserved */
+ unsigned long r7; /* preserved */
+ unsigned long cr_iipa; /* for emulation */
+ unsigned long cr_isr; /* for emulation */
+ unsigned long eml_unat; /* used for emulating instruction */
+ unsigned long rfi_pfs; /* used for emulating rfi */
+#endif
+
+ /* The following registers are saved by SAVE_REST: */
+ unsigned long r16; /* scratch */
+ unsigned long r17; /* scratch */
+ unsigned long r18; /* scratch */
+ unsigned long r19; /* scratch */
+ unsigned long r20; /* scratch */
+ unsigned long r21; /* scratch */
+ unsigned long r22; /* scratch */
+ unsigned long r23; /* scratch */
+ unsigned long r24; /* scratch */
+ unsigned long r25; /* scratch */
+ unsigned long r26; /* scratch */
+ unsigned long r27; /* scratch */
+ unsigned long r28; /* scratch */
+ unsigned long r29; /* scratch */
+ unsigned long r30; /* scratch */
+ unsigned long r31; /* scratch */
+
+ unsigned long ar_ccv; /* compare/exchange value (scratch) */
+
+ /*
+ * Floating point registers that the kernel considers scratch:
+ */
+ struct pt_fpreg f6; /* scratch */
+ struct pt_fpreg f7; /* scratch */
+ struct pt_fpreg f8; /* scratch */
+ struct pt_fpreg f9; /* scratch */
+ struct pt_fpreg f10; /* scratch */
+ struct pt_fpreg f11; /* scratch */
+};
+
+typedef struct {
+ unsigned long ipsr;
+ unsigned long iip;
+ unsigned long ifs;
+ unsigned long precover_ifs;
+ unsigned long isr;
+ unsigned long ifa;
+ unsigned long iipa;
+ unsigned long iim;
+ unsigned long unat; // not sure if this is needed until NaT arch is done
+ unsigned long tpr;
+ unsigned long iha;
+ unsigned long itir;
+ unsigned long itv;
+ unsigned long pmv;
+ unsigned long cmcv;
+ unsigned long pta;
+ int interrupt_collection_enabled; // virtual psr.ic
+ int interrupt_delivery_enabled; // virtual psr.i
+ int pending_interruption;
+ int incomplete_regframe; // see SDM vol2 6.8
+ unsigned long delivery_mask[4];
+ int metaphysical_mode; // 1 = use metaphys mapping, 0 = use virtual
+ int banknum; // 0 or 1, which virtual register bank is active
+ unsigned long bank0_regs[16]; // bank0 regs (r16-r31) when bank1 active
+ unsigned long bank1_regs[16]; // bank1 regs (r16-r31) when bank0 active
+ unsigned long rrs[8]; // region registers
+ unsigned long krs[8]; // kernel registers
+ unsigned long pkrs[8]; // protection key registers
+ unsigned long tmp[8]; // temp registers (e.g.
for hyperprivops) + int evtchn_vector; +} arch_vcpu_info_t; +#define __ARCH_HAS_VCPU_INFO + +typedef struct { + int domain_controller_evtchn; + unsigned int flags; +//} arch_shared_info_t; +} arch_shared_info_t; // DON'T PACK + +typedef struct vcpu_guest_context { +#define VGCF_FPU_VALID (1<<0) +#define VGCF_VMX_GUEST (1<<1) +#define VGCF_IN_KERNEL (1<<2) + unsigned long flags; /* VGCF_* flags */ + unsigned long pt_base; /* PMT table base */ + unsigned long pt_max_pfn; /* Max pfn including holes */ + unsigned long share_io_pg; /* Shared page for I/O emulation */ + unsigned long vm_assist; /* VMASST_TYPE_* bitmap, now none on IPF */ + unsigned long guest_iip; /* Guest entry point */ + + struct pt_regs regs; + arch_vcpu_info_t vcpu; + arch_shared_info_t shared; +} vcpu_guest_context_t; + +#endif /* !__ASSEMBLY__ */ + +#define XEN_HYPER_RFI 0x1 +#define XEN_HYPER_RSM_DT 0x2 +#define XEN_HYPER_SSM_DT 0x3 +#define XEN_HYPER_COVER 0x4 +#define XEN_HYPER_ITC_D 0x5 +#define XEN_HYPER_ITC_I 0x6 +#define XEN_HYPER_SSM_I 0x7 +#define XEN_HYPER_GET_IVR 0x8 +#define XEN_HYPER_GET_TPR 0x9 +#define XEN_HYPER_SET_TPR 0xa +#define XEN_HYPER_EOI 0xb +#define XEN_HYPER_SET_ITM 0xc +#define XEN_HYPER_THASH 0xd +#define XEN_HYPER_PTC_GA 0xe +#define XEN_HYPER_ITR_D 0xf +#define XEN_HYPER_GET_RR 0x10 +#define XEN_HYPER_SET_RR 0x11 + +#endif /* __HYPERVISOR_IF_IA64_H__ */ Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/arch-x86_32.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/arch-x86_32.h @@ -0,0 +1,142 @@ +/****************************************************************************** + * arch-x86_32.h + * + * Guest OS interface to x86 32-bit Xen. + * + * Copyright (c) 2004, K A Fraser + */ + +#ifndef __XEN_PUBLIC_ARCH_X86_32_H__ +#define __XEN_PUBLIC_ARCH_X86_32_H__ + +#include <asm/types.h> + +/* + * SEGMENT DESCRIPTOR TABLES + */ +/* + * A number of GDT entries are reserved by Xen. These are not situated at the + * start of the GDT because some stupid OSes export hard-coded selector values + * in their ABI. These hard-coded values are always near the start of the GDT, + * so Xen places itself out of the way, at the far end of the GDT. + */ +#define FIRST_RESERVED_GDT_PAGE 14 +#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) +#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) + +/* + * These flat segments are in the Xen-private section of every GDT. Since these + * are also present in the initial GDT, many OSes will be able to avoid + * installing their own GDT. + */ +#define FLAT_RING1_CS 0xe019 /* GDT index 259 */ +#define FLAT_RING1_DS 0xe021 /* GDT index 260 */ +#define FLAT_RING1_SS 0xe021 /* GDT index 260 */ +#define FLAT_RING3_CS 0xe02b /* GDT index 261 */ +#define FLAT_RING3_DS 0xe033 /* GDT index 262 */ +#define FLAT_RING3_SS 0xe033 /* GDT index 262 */ + +#define FLAT_KERNEL_CS FLAT_RING1_CS +#define FLAT_KERNEL_DS FLAT_RING1_DS +#define FLAT_KERNEL_SS FLAT_RING1_SS +#define FLAT_USER_CS FLAT_RING3_CS +#define FLAT_USER_DS FLAT_RING3_DS +#define FLAT_USER_SS FLAT_RING3_SS + +/* And the trap vector is... */ +#define TRAP_INSTR "int $0x82" + + +/* + * Virtual addresses beyond this are not modifiable by guest OSes. The + * machine->physical mapping table starts at this address, read-only. 
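 *
 * [Editorial note] A minimal sketch of the machine-to-physical lookup
 * this table provides (machine_to_phys_mapping is defined just below;
 * 'mfn' is an assumed machine frame number):
 *
 *   unsigned long mfn_to_pfn_sketch(unsigned long mfn)
 *   {
 *       return machine_to_phys_mapping[mfn];  // one u32 entry per MFN
 *   }
 *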
+ */ +#ifdef CONFIG_X86_PAE +# define HYPERVISOR_VIRT_START (0xF5800000UL) +#else +# define HYPERVISOR_VIRT_START (0xFC000000UL) +#endif +#ifndef machine_to_phys_mapping +#define machine_to_phys_mapping ((u32 *)HYPERVISOR_VIRT_START) +#endif + +/* Maximum number of virtual CPUs in multi-processor guests. */ +#define MAX_VIRT_CPUS 32 + +#ifndef __ASSEMBLY__ + +/* NB. Both the following are 32 bits each. */ +typedef unsigned long memory_t; /* Full-sized pointer/address/memory-size. */ + +/* + * Send an array of these to HYPERVISOR_set_trap_table() + */ +#define TI_GET_DPL(_ti) ((_ti)->flags & 3) +#define TI_GET_IF(_ti) ((_ti)->flags & 4) +#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl)) +#define TI_SET_IF(_ti,_if) ((_ti)->flags |= ((!!(_if))<<2)) +typedef struct trap_info { + u8 vector; /* exception vector */ + u8 flags; /* 0-3: privilege level; 4: clear event enable? */ + u16 cs; /* code selector */ + memory_t address; /* code address */ +} trap_info_t; + +typedef struct cpu_user_regs { + u32 ebx; + u32 ecx; + u32 edx; + u32 esi; + u32 edi; + u32 ebp; + u32 eax; + u16 error_code; /* private */ + u16 entry_vector; /* private */ + u32 eip; + u16 cs; + u8 saved_upcall_mask; + u8 _pad0; + u32 eflags; + u32 esp; + u16 ss, _pad1; + u16 es, _pad2; + u16 ds, _pad3; + u16 fs, _pad4; + u16 gs, _pad5; +} cpu_user_regs_t; + +typedef u64 tsc_timestamp_t; /* RDTSC timestamp */ + +/* + * The following is all CPU context. Note that the fpu_ctxt block is filled + * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. + */ +typedef struct vcpu_guest_context { + /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ + struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */ +#define VGCF_I387_VALID (1<<0) +#define VGCF_VMX_GUEST (1<<1) +#define VGCF_IN_KERNEL (1<<2) + unsigned long flags; /* VGCF_* flags */ + cpu_user_regs_t user_regs; /* User-level CPU registers */ + trap_info_t trap_ctxt[256]; /* Virtual IDT */ + unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */ + unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */ + unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */ + unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */ + unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */ + unsigned long event_callback_cs; /* CS:EIP of event callback */ + unsigned long event_callback_eip; + unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */ + unsigned long failsafe_callback_eip; + unsigned long vm_assist; /* VMASST_TYPE_* bitmap */ +} vcpu_guest_context_t; + +typedef struct arch_shared_info { + /* MFN of a table of MFNs that make up p2m table */ + u64 pfn_to_mfn_frame_list; +} arch_shared_info_t; + +#endif + +#endif Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/arch-x86_64.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/arch-x86_64.h @@ -0,0 +1,198 @@ +/****************************************************************************** + * arch-x86_64.h + * + * Guest OS interface to x86 64-bit Xen. + * + * Copyright (c) 2004, K A Fraser + */ + +#ifndef __XEN_PUBLIC_ARCH_X86_64_H__ +#define __XEN_PUBLIC_ARCH_X86_64_H__ + +/* + * SEGMENT DESCRIPTOR TABLES + */ +/* + * A number of GDT entries are reserved by Xen. These are not situated at the + * start of the GDT because some stupid OSes export hard-coded selector values + * in their ABI. 
These hard-coded values are always near the start of the GDT, + * so Xen places itself out of the way, at the far end of the GDT. + */ +#define FIRST_RESERVED_GDT_PAGE 14 +#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) +#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) + +/* + * 64-bit segment selectors + * These flat segments are in the Xen-private section of every GDT. Since these + * are also present in the initial GDT, many OSes will be able to avoid + * installing their own GDT. + */ + +#define FLAT_RING3_CS32 0xe023 /* GDT index 260 */ +#define FLAT_RING3_CS64 0xe033 /* GDT index 261 */ +#define FLAT_RING3_DS32 0xe02b /* GDT index 262 */ +#define FLAT_RING3_DS64 0x0000 /* NULL selector */ +#define FLAT_RING3_SS32 0xe02b /* GDT index 262 */ +#define FLAT_RING3_SS64 0xe02b /* GDT index 262 */ + +#define FLAT_KERNEL_DS64 FLAT_RING3_DS64 +#define FLAT_KERNEL_DS32 FLAT_RING3_DS32 +#define FLAT_KERNEL_DS FLAT_KERNEL_DS64 +#define FLAT_KERNEL_CS64 FLAT_RING3_CS64 +#define FLAT_KERNEL_CS32 FLAT_RING3_CS32 +#define FLAT_KERNEL_CS FLAT_KERNEL_CS64 +#define FLAT_KERNEL_SS64 FLAT_RING3_SS64 +#define FLAT_KERNEL_SS32 FLAT_RING3_SS32 +#define FLAT_KERNEL_SS FLAT_KERNEL_SS64 + +#define FLAT_USER_DS64 FLAT_RING3_DS64 +#define FLAT_USER_DS32 FLAT_RING3_DS32 +#define FLAT_USER_DS FLAT_USER_DS64 +#define FLAT_USER_CS64 FLAT_RING3_CS64 +#define FLAT_USER_CS32 FLAT_RING3_CS32 +#define FLAT_USER_CS FLAT_USER_CS64 +#define FLAT_USER_SS64 FLAT_RING3_SS64 +#define FLAT_USER_SS32 FLAT_RING3_SS32 +#define FLAT_USER_SS FLAT_USER_SS64 + +/* And the trap vector is... */ +#define TRAP_INSTR "syscall" + +#ifndef HYPERVISOR_VIRT_START +#define HYPERVISOR_VIRT_START (0xFFFF800000000000UL) +#define HYPERVISOR_VIRT_END (0xFFFF880000000000UL) +#endif + +/* Maximum number of virtual CPUs in multi-processor guests. */ +#define MAX_VIRT_CPUS 32 + +#ifndef __ASSEMBLY__ + +/* The machine->physical mapping table starts at this address, read-only. */ +#ifndef machine_to_phys_mapping +#define machine_to_phys_mapping ((u32 *)HYPERVISOR_VIRT_START) +#endif + +/* + * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base) + * @which == SEGBASE_* ; @base == 64-bit base address + * Returns 0 on success. + */ +#define SEGBASE_FS 0 +#define SEGBASE_GS_USER 1 +#define SEGBASE_GS_KERNEL 2 +#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */ + +/* + * int HYPERVISOR_switch_to_user(void) + * All arguments are on the kernel stack, in the following format. + * Never returns if successful. Current kernel context is lost. + * If flags contains VGCF_IN_SYSCALL: + * Restore RAX, RIP, RFLAGS, RSP. + * Discard R11, RCX, CS, SS. + * Otherwise: + * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP. + * All other registers are saved on hypercall entry and restored to user. + */ +/* Guest exited in SYSCALL context? Return to guest with SYSRET? */ +#define VGCF_IN_SYSCALL (1<<8) +struct switch_to_user { + /* Top of stack (%rsp at point of hypercall). */ + u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss; + /* Bottom of switch_to_user stack frame. */ +}; + +/* NB. Both the following are 64 bits each. */ +typedef unsigned long memory_t; /* Full-sized pointer/address/memory-size. */ + +/* + * Send an array of these to HYPERVISOR_set_trap_table(). + * N.B. As in x86/32 mode, the privilege level specifies which modes may enter + * a trap via a software interrupt. 
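 *
 * [Editorial note] For instance, a guest exposing a software-interrupt
 * system call to user space would install its handler with DPL 3, e.g.
 * (a sketch; trap_info_t and TI_SET_DPL are defined just below, and
 * the vector and entry symbol here are illustrative only):
 *
 *   trap_info_t ti;
 *   ti.vector = 0x80;                        // assumed guest vector
 *   ti.flags = 0;
 *   ti.cs = FLAT_KERNEL_CS;
 *   ti.address = (memory_t)syscall_entry;    // hypothetical symbol
 *   TI_SET_DPL(&ti, 3);                      // user mode may enter
 *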
 * Since rings 1 and 2 are unavailable, we
+ * allocate privilege levels as follows:
+ * Level == 0: No one may enter
+ * Level == 1: Kernel may enter
+ * Level == 2: Kernel may enter
+ * Level == 3: Everyone may enter
+ */
+#define TI_GET_DPL(_ti) ((_ti)->flags & 3)
+#define TI_GET_IF(_ti) ((_ti)->flags & 4)
+#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
+#define TI_SET_IF(_ti,_if) ((_ti)->flags |= ((!!(_if))<<2))
+typedef struct trap_info {
+ u8 vector; /* exception vector */
+ u8 flags; /* 0-3: privilege level; 4: clear event enable? */
+ u16 cs; /* code selector */
+ memory_t address; /* code address */
+} trap_info_t;
+
+typedef struct cpu_user_regs {
+ u64 r15;
+ u64 r14;
+ u64 r13;
+ u64 r12;
+ union { u64 rbp, ebp; };
+ union { u64 rbx, ebx; };
+ u64 r11;
+ u64 r10;
+ u64 r9;
+ u64 r8;
+ union { u64 rax, eax; };
+ union { u64 rcx, ecx; };
+ union { u64 rdx, edx; };
+ union { u64 rsi, esi; };
+ union { u64 rdi, edi; };
+ u32 error_code; /* private */
+ u32 entry_vector; /* private */
+ union { u64 rip, eip; };
+ u16 cs, _pad0[1];
+ u8 saved_upcall_mask;
+ u8 _pad1[3];
+ union { u64 rflags, eflags; };
+ union { u64 rsp, esp; };
+ u16 ss, _pad2[3];
+ u16 es, _pad3[3];
+ u16 ds, _pad4[3];
+ u16 fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
+ u16 gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_user. */
+} cpu_user_regs_t;
+
+typedef u64 tsc_timestamp_t; /* RDTSC timestamp */
+
+/*
+ * The following is all CPU context. Note that the fpu_ctxt block is filled
+ * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ */
+typedef struct vcpu_guest_context {
+ /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
+ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
+#define VGCF_I387_VALID (1<<0)
+#define VGCF_VMX_GUEST (1<<1)
+#define VGCF_IN_KERNEL (1<<2)
+ unsigned long flags; /* VGCF_* flags */
+ cpu_user_regs_t user_regs; /* User-level CPU registers */
+ trap_info_t trap_ctxt[256]; /* Virtual IDT */
+ unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
+ unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
+ unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
+ unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
+ unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
+ unsigned long event_callback_eip;
+ unsigned long failsafe_callback_eip;
+ unsigned long syscall_callback_eip;
+ unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
+ /* Segment base addresses. */
+ u64 fs_base;
+ u64 gs_base_kernel;
+ u64 gs_base_user;
+} vcpu_guest_context_t;
+
+typedef struct arch_shared_info {
+ /* MFN of a table of MFNs that make up p2m table */
+ u64 pfn_to_mfn_frame_list;
+} arch_shared_info_t;
+
+#endif /* !__ASSEMBLY__ */
+
+#endif
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/dom0_ops.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/dom0_ops.h
@@ -0,0 +1,414 @@
+/******************************************************************************
+ * dom0_ops.h
+ *
+ * Process command requests from domain-0 guest OS.
+ *
+ * Copyright (c) 2002-2003, B Dragovic
+ * Copyright (c) 2002-2004, K Fraser
+ */
+
+
+#ifndef __XEN_PUBLIC_DOM0_OPS_H__
+#define __XEN_PUBLIC_DOM0_OPS_H__
+
+#include "xen-public/xen.h"
+#include "xen-public/sched_ctl.h"
+
+/*
+ * Make sure you increment the interface version whenever you modify this file!
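 *
 * [Editorial note] Every request carries this version, e.g. (a sketch;
 * dom0_op_t and DOM0_INTERFACE_VERSION are defined later in this file,
 * and the hypercall wrapper itself lives outside this header):
 *
 *   dom0_op_t op;
 *   op.cmd = DOM0_GETDOMAININFO;
 *   op.interface_version = DOM0_INTERFACE_VERSION;
 *   op.u.getdomaininfo.domain = 0;           // query domain 0
 *   // ...issue the dom0_op hypercall; a stale version is refused...
 *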
+ * This makes sure that old versions of dom0 tools will stop working in a + * well-defined way (rather than crashing the machine, for instance). + */ +#define DOM0_INTERFACE_VERSION 0xAAAA100E + +/************************************************************************/ + +#define DOM0_GETMEMLIST 2 +typedef struct { + /* IN variables. */ + domid_t domain; + memory_t max_pfns; + void *buffer; + /* OUT variables. */ + memory_t num_pfns; +} dom0_getmemlist_t; + +#define DOM0_SCHEDCTL 6 + /* struct sched_ctl_cmd is from sched-ctl.h */ +typedef struct sched_ctl_cmd dom0_schedctl_t; + +#define DOM0_ADJUSTDOM 7 +/* struct sched_adjdom_cmd is from sched-ctl.h */ +typedef struct sched_adjdom_cmd dom0_adjustdom_t; + +#define DOM0_CREATEDOMAIN 8 +typedef struct { + /* IN parameters */ + u32 ssidref; + /* IN/OUT parameters. */ + /* Identifier for new domain (auto-allocate if zero is specified). */ + domid_t domain; +} dom0_createdomain_t; + +#define DOM0_DESTROYDOMAIN 9 +typedef struct { + /* IN variables. */ + domid_t domain; +} dom0_destroydomain_t; + +#define DOM0_PAUSEDOMAIN 10 +typedef struct { + /* IN parameters. */ + domid_t domain; +} dom0_pausedomain_t; + +#define DOM0_UNPAUSEDOMAIN 11 +typedef struct { + /* IN parameters. */ + domid_t domain; +} dom0_unpausedomain_t; + +#define DOM0_GETDOMAININFO 12 +typedef struct { + /* IN variables. */ + domid_t domain; /* NB. IN/OUT variable. */ + /* OUT variables. */ +#define DOMFLAGS_DYING (1<<0) /* Domain is scheduled to die. */ +#define DOMFLAGS_SHUTDOWN (1<<2) /* The guest OS has shut down. */ +#define DOMFLAGS_PAUSED (1<<3) /* Currently paused by control software. */ +#define DOMFLAGS_BLOCKED (1<<4) /* Currently blocked pending an event. */ +#define DOMFLAGS_RUNNING (1<<5) /* Domain is currently running. */ +#define DOMFLAGS_CPUMASK 255 /* CPU to which this domain is bound. */ +#define DOMFLAGS_CPUSHIFT 8 +#define DOMFLAGS_SHUTDOWNMASK 255 /* DOMFLAGS_SHUTDOWN guest-supplied code. */ +#define DOMFLAGS_SHUTDOWNSHIFT 16 + u32 flags; + memory_t tot_pages; + memory_t max_pages; + memory_t shared_info_frame; /* MFN of shared_info struct */ + u64 cpu_time; + u32 n_vcpu; + s32 vcpu_to_cpu[MAX_VIRT_CPUS]; /* current mapping */ + cpumap_t cpumap[MAX_VIRT_CPUS]; /* allowable mapping */ + u32 ssidref; +} dom0_getdomaininfo_t; + +#define DOM0_SETDOMAININFO 13 +typedef struct { + /* IN variables. */ + domid_t domain; + u16 vcpu; + /* IN/OUT parameters */ + vcpu_guest_context_t *ctxt; +} dom0_setdomaininfo_t; + +#define DOM0_MSR 15 +typedef struct { + /* IN variables. */ + u32 write; + u32 cpu_mask; + u32 msr; + u32 in1; + u32 in2; + /* OUT variables. */ + u32 out1; + u32 out2; +} dom0_msr_t; + +#define DOM0_DEBUG 16 +typedef struct { + /* IN variables. */ + domid_t domain; + u8 opcode; + u32 in1; + u32 in2; + u32 in3; + u32 in4; + /* OUT variables. */ + u32 status; + u32 out1; + u32 out2; +} dom0_debug_t; + +/* + * Set clock such that it would read <secs,usecs> after 00:00:00 UTC, + * 1 January, 1970 if the current system time was <system_time>. + */ +#define DOM0_SETTIME 17 +typedef struct { + /* IN variables. */ + u32 secs; + u32 usecs; + u64 system_time; +} dom0_settime_t; + +#define DOM0_GETPAGEFRAMEINFO 18 +#define NOTAB 0 /* normal page */ +#define L1TAB (1<<28) +#define L2TAB (2<<28) +#define L3TAB (3<<28) +#define L4TAB (4<<28) +#define LPINTAB (1<<31) +#define XTAB (0xf<<28) /* invalid page */ +#define LTAB_MASK XTAB +#define LTABTYPE_MASK (0x7<<28) + +typedef struct { + /* IN variables. */ + memory_t pfn; /* Machine page frame number to query. 
*/ + domid_t domain; /* To which domain does the frame belong? */ + /* OUT variables. */ + /* Is the page PINNED to a type? */ + u32 type; /* see above type defs */ +} dom0_getpageframeinfo_t; + +/* + * Read console content from Xen buffer ring. + */ +#define DOM0_READCONSOLE 19 +typedef struct { + /* IN variables. */ + u32 clear; /* Non-zero -> clear after reading. */ + /* IN/OUT variables. */ + char *buffer; /* In: Buffer start; Out: Used buffer start */ + u32 count; /* In: Buffer size; Out: Used buffer size */ +} dom0_readconsole_t; + +/* + * Set which physical cpus a vcpu can execute on. + */ +#define DOM0_PINCPUDOMAIN 20 +typedef struct { + /* IN variables. */ + domid_t domain; + u16 vcpu; + cpumap_t *cpumap; +} dom0_pincpudomain_t; + +/* Get trace buffers machine base address */ +#define DOM0_TBUFCONTROL 21 +typedef struct { + /* IN variables */ +#define DOM0_TBUF_GET_INFO 0 +#define DOM0_TBUF_SET_CPU_MASK 1 +#define DOM0_TBUF_SET_EVT_MASK 2 + u8 op; + /* IN/OUT variables */ + unsigned long cpu_mask; + u32 evt_mask; + /* OUT variables */ + memory_t mach_addr; + u32 size; +} dom0_tbufcontrol_t; + +/* + * Get physical information about the host machine + */ +#define DOM0_PHYSINFO 22 +typedef struct { + u32 threads_per_core; + u32 cores_per_socket; + u32 sockets_per_node; + u32 nr_nodes; + u32 cpu_khz; + memory_t total_pages; + memory_t free_pages; +} dom0_physinfo_t; + +/* + * Get the ID of the current scheduler. + */ +#define DOM0_SCHED_ID 24 +typedef struct { + /* OUT variable */ + u32 sched_id; +} dom0_sched_id_t; + +/* + * Control shadow pagetables operation + */ +#define DOM0_SHADOW_CONTROL 25 + +#define DOM0_SHADOW_CONTROL_OP_OFF 0 +#define DOM0_SHADOW_CONTROL_OP_ENABLE_TEST 1 +#define DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY 2 +#define DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE 3 + +#define DOM0_SHADOW_CONTROL_OP_FLUSH 10 /* table ops */ +#define DOM0_SHADOW_CONTROL_OP_CLEAN 11 +#define DOM0_SHADOW_CONTROL_OP_PEEK 12 + +typedef struct dom0_shadow_control +{ + u32 fault_count; + u32 dirty_count; + u32 dirty_net_count; + u32 dirty_block_count; +} dom0_shadow_control_stats_t; + +typedef struct { + /* IN variables. */ + domid_t domain; + u32 op; + unsigned long *dirty_bitmap; /* pointer to locked buffer */ + /* IN/OUT variables. */ + memory_t pages; /* size of buffer, updated with actual size */ + /* OUT variables. */ + dom0_shadow_control_stats_t stats; +} dom0_shadow_control_t; + +#define DOM0_SETDOMAINMAXMEM 28 +typedef struct { + /* IN variables. */ + domid_t domain; + memory_t max_memkb; +} dom0_setdomainmaxmem_t; + +#define DOM0_GETPAGEFRAMEINFO2 29 /* batched interface */ +typedef struct { + /* IN variables. */ + domid_t domain; + memory_t num; + /* IN/OUT variables. */ + unsigned long *array; +} dom0_getpageframeinfo2_t; + +/* + * Request memory range (@pfn, @pfn+@nr_pfns-1) to have type @type. + * On x86, @type is an architecture-defined MTRR memory type. + * On success, returns the MTRR that was used (@reg) and a handle that can + * be passed to DOM0_DEL_MEMTYPE to accurately tear down the new setting. + * (x86-specific). + */ +#define DOM0_ADD_MEMTYPE 31 +typedef struct { + /* IN variables. */ + memory_t pfn; + memory_t nr_pfns; + u32 type; + /* OUT variables. */ + u32 handle; + u32 reg; +} dom0_add_memtype_t; + +/* + * Tear down an existing memory-range type. If @handle is remembered then it + * should be passed in to accurately tear down the correct setting (in case + * of overlapping memory regions with differing types). 
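 *
 * [Editorial note] Typical pairing with DOM0_ADD_MEMTYPE above, as a
 * sketch ('op' is a dom0_op_t as defined at the end of this file; the
 * saved handle/reg values come back from the add call):
 *
 *   // after a successful DOM0_ADD_MEMTYPE:
 *   handle = op.u.add_memtype.handle;
 *   reg    = op.u.add_memtype.reg;
 *   // ...later, tear down exactly the range that was added:
 *   op.cmd = DOM0_DEL_MEMTYPE;
 *   op.u.del_memtype.handle = handle;
 *   op.u.del_memtype.reg    = reg;
 *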
 * If it is not known
+ * then @handle should be set to zero. In all cases @reg must be set.
+ * (x86-specific).
+ */
+#define DOM0_DEL_MEMTYPE 32
+typedef struct {
+ /* IN variables. */
+ u32 handle;
+ u32 reg;
+} dom0_del_memtype_t;
+
+/* Read current type of an MTRR (x86-specific). */
+#define DOM0_READ_MEMTYPE 33
+typedef struct {
+ /* IN variables. */
+ u32 reg;
+ /* OUT variables. */
+ memory_t pfn;
+ memory_t nr_pfns;
+ u32 type;
+} dom0_read_memtype_t;
+
+/* Interface for controlling Xen software performance counters. */
+#define DOM0_PERFCCONTROL 34
+/* Sub-operations: */
+#define DOM0_PERFCCONTROL_OP_RESET 1 /* Reset all counters to zero. */
+#define DOM0_PERFCCONTROL_OP_QUERY 2 /* Get perfctr information. */
+typedef struct {
+ u8 name[80]; /* name of perf counter */
+ u32 nr_vals; /* number of values for this counter */
+ u32 vals[64]; /* array of values */
+} dom0_perfc_desc_t;
+typedef struct {
+ /* IN variables. */
+ u32 op; /* DOM0_PERFCCONTROL_OP_??? */
+ /* OUT variables. */
+ u32 nr_counters; /* number of counters */
+ dom0_perfc_desc_t *desc; /* counter information (or NULL) */
+} dom0_perfccontrol_t;
+
+#define DOM0_MICROCODE 35
+typedef struct {
+ /* IN variables. */
+ void *data; /* Pointer to microcode data */
+ u32 length; /* Length of microcode data. */
+} dom0_microcode_t;
+
+#define DOM0_IOPORT_PERMISSION 36
+typedef struct {
+ domid_t domain; /* domain to be affected */
+ u16 first_port; /* first port in range */
+ u16 nr_ports; /* size of port range */
+ u16 allow_access; /* allow or deny access to range? */
+} dom0_ioport_permission_t;
+
+#define DOM0_GETVCPUCONTEXT 37
+typedef struct {
+ domid_t domain; /* domain to be affected */
+ u16 vcpu; /* vcpu # */
+ vcpu_guest_context_t *ctxt; /* NB. IN/OUT variable. */
+ u64 cpu_time;
+} dom0_getvcpucontext_t;
+
+#define DOM0_GETDOMAININFOLIST 38
+typedef struct {
+ /* IN variables. */
+ domid_t first_domain;
+ memory_t max_domains;
+ dom0_getdomaininfo_t *buffer;
+ /* OUT variables. */
+ memory_t num_domains;
+} dom0_getdomaininfolist_t;
+
+#define DOM0_PLATFORM_QUIRK 39
+#define QUIRK_NOIRQBALANCING 1
+typedef struct {
+ /* IN variables.
*/ + int quirk_id; +} dom0_platform_quirk_t; + +typedef struct { + u32 cmd; + u32 interface_version; /* DOM0_INTERFACE_VERSION */ + union { + dom0_createdomain_t createdomain; + dom0_pausedomain_t pausedomain; + dom0_unpausedomain_t unpausedomain; + dom0_destroydomain_t destroydomain; + dom0_getmemlist_t getmemlist; + dom0_schedctl_t schedctl; + dom0_adjustdom_t adjustdom; + dom0_setdomaininfo_t setdomaininfo; + dom0_getdomaininfo_t getdomaininfo; + dom0_getpageframeinfo_t getpageframeinfo; + dom0_msr_t msr; + dom0_debug_t debug; + dom0_settime_t settime; + dom0_readconsole_t readconsole; + dom0_pincpudomain_t pincpudomain; + dom0_tbufcontrol_t tbufcontrol; + dom0_physinfo_t physinfo; + dom0_sched_id_t sched_id; + dom0_shadow_control_t shadow_control; + dom0_setdomainmaxmem_t setdomainmaxmem; + dom0_getpageframeinfo2_t getpageframeinfo2; + dom0_add_memtype_t add_memtype; + dom0_del_memtype_t del_memtype; + dom0_read_memtype_t read_memtype; + dom0_perfccontrol_t perfccontrol; + dom0_microcode_t microcode; + dom0_ioport_permission_t ioport_permission; + dom0_getvcpucontext_t getvcpucontext; + dom0_getdomaininfolist_t getdomaininfolist; + dom0_platform_quirk_t platform_quirk; + } u; +} dom0_op_t; + +#endif /* __XEN_PUBLIC_DOM0_OPS_H__ */ Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/event_channel.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/event_channel.h @@ -0,0 +1,191 @@ +/****************************************************************************** + * event_channel.h + * + * Event channels between domains. + * + * Copyright (c) 2003-2004, K A Fraser. + */ + +#ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__ +#define __XEN_PUBLIC_EVENT_CHANNEL_H__ + +/* + * EVTCHNOP_alloc_unbound: Prepare a local port for binding to <dom>. + * <port> may be wildcarded by setting to zero, in which case a fresh port + * will be allocated, and the field filled in on return. + */ +#define EVTCHNOP_alloc_unbound 6 +typedef struct evtchn_alloc_unbound { + /* IN parameters */ + domid_t dom; + /* IN/OUT parameters */ + u32 port; +} evtchn_alloc_unbound_t; + +/* + * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between + * <dom1> and <dom2>. Either <port1> or <port2> may be wildcarded by setting to + * zero. On successful return both <port1> and <port2> are filled in and + * <dom1,port1> is fully bound to <dom2,port2>. + * + * NOTES: + * 1. A wildcarded port is allocated from the relevant domain's free list + * (i.e., some port that was previously EVTCHNSTAT_closed). However, if the + * remote port pair is already fully bound then a port is not allocated, + * and instead the existing local port is returned to the caller. + * 2. If the caller is unprivileged then <dom1> must be DOMID_SELF. + * 3. If the caller is unprivileged and <dom2,port2> is EVTCHNSTAT_closed + * then <dom2> must be DOMID_SELF. + * 4. If either port is already bound then it must be bound to the other + * specified domain and port (if not wildcarded). + * 5. If either port is awaiting binding (EVTCHNSTAT_unbound) then it must + * be awaiting binding to the other domain, and the other port pair must + * be closed or unbound. + */ +#define EVTCHNOP_bind_interdomain 0 +typedef struct evtchn_bind_interdomain { + /* IN parameters. */ + domid_t dom1, dom2; + /* IN/OUT parameters. */ + u32 port1, port2; +} evtchn_bind_interdomain_t; + +/* + * EVTCHNOP_bind_virq: Bind a local event channel to IRQ <irq> on calling vcpu. 
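 *
 * [Editorial note] For example, binding the timer VIRQ (a sketch;
 * VIRQ_* numbers come from xen.h, and the event-channel hypercall
 * wrapper is outside this header):
 *
 *   evtchn_op_t op;                          // defined at end of file
 *   op.cmd = EVTCHNOP_bind_virq;
 *   op.u.bind_virq.virq = VIRQ_TIMER;        // assumed VIRQ number
 *   // ...issue the hypercall; op.u.bind_virq.port is then valid...
 *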
+ * NOTES: + * 1. A virtual IRQ may be bound to at most one event channel per vcpu. + * 2. The allocated event channel is bound to the calling vcpu. The binding + * may not be changed. + */ +#define EVTCHNOP_bind_virq 1 +typedef struct evtchn_bind_virq { + /* IN parameters. */ + u32 virq; + /* OUT parameters. */ + u32 port; +} evtchn_bind_virq_t; + +/* + * EVTCHNOP_bind_pirq: Bind a local event channel to IRQ <irq>. + * NOTES: + * 1. A physical IRQ may be bound to at most one event channel per domain. + * 2. Only a sufficiently-privileged domain may bind to a physical IRQ. + */ +#define EVTCHNOP_bind_pirq 2 +typedef struct evtchn_bind_pirq { + /* IN parameters. */ + u32 pirq; +#define BIND_PIRQ__WILL_SHARE 1 + u32 flags; /* BIND_PIRQ__* */ + /* OUT parameters. */ + u32 port; +} evtchn_bind_pirq_t; + +/* + * EVTCHNOP_bind_ipi: Bind a local event channel to receive events. + * NOTES: + * 1. The allocated event channel is bound to the calling vcpu. The binding + * may not be changed. + */ +#define EVTCHNOP_bind_ipi 7 +typedef struct evtchn_bind_ipi { + /* OUT parameters. */ + u32 port; +} evtchn_bind_ipi_t; + +/* + * EVTCHNOP_close: Close the communication channel which has an endpoint at + * <dom, port>. If the channel is interdomain then the remote end is placed in + * the unbound state (EVTCHNSTAT_unbound), awaiting a new connection. + * NOTES: + * 1. <dom> may be specified as DOMID_SELF. + * 2. Only a sufficiently-privileged domain may close an event channel + * for which <dom> is not DOMID_SELF. + */ +#define EVTCHNOP_close 3 +typedef struct evtchn_close { + /* IN parameters. */ + domid_t dom; + u32 port; + /* No OUT parameters. */ +} evtchn_close_t; + +/* + * EVTCHNOP_send: Send an event to the remote end of the channel whose local + * endpoint is <DOMID_SELF, local_port>. + */ +#define EVTCHNOP_send 4 +typedef struct evtchn_send { + /* IN parameters. */ + u32 local_port; + /* No OUT parameters. */ +} evtchn_send_t; + +/* + * EVTCHNOP_status: Get the current status of the communication channel which + * has an endpoint at <dom, port>. + * NOTES: + * 1. <dom> may be specified as DOMID_SELF. + * 2. Only a sufficiently-privileged domain may obtain the status of an event + * channel for which <dom> is not DOMID_SELF. + */ +#define EVTCHNOP_status 5 +typedef struct evtchn_status { + /* IN parameters */ + domid_t dom; + u32 port; + /* OUT parameters */ +#define EVTCHNSTAT_closed 0 /* Channel is not in use. */ +#define EVTCHNSTAT_unbound 1 /* Channel is waiting interdom connection.*/ +#define EVTCHNSTAT_interdomain 2 /* Channel is connected to remote domain. */ +#define EVTCHNSTAT_pirq 3 /* Channel is bound to a phys IRQ line. */ +#define EVTCHNSTAT_virq 4 /* Channel is bound to a virtual IRQ line */ +#define EVTCHNSTAT_ipi 5 /* Channel is bound to a virtual IPI line */ + u32 status; + u32 vcpu; /* VCPU to which this channel is bound. */ + union { + struct { + domid_t dom; + } unbound; /* EVTCHNSTAT_unbound */ + struct { + domid_t dom; + u32 port; + } interdomain; /* EVTCHNSTAT_interdomain */ + u32 pirq; /* EVTCHNSTAT_pirq */ + u32 virq; /* EVTCHNSTAT_virq */ + } u; +} evtchn_status_t; + +/* + * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an + * event is pending. + * NOTES: + * 1. IPI- and VIRQ-bound channels always notify the vcpu that initialised + * the binding. This binding cannot be changed. + * 2. All other channels notify vcpu0 by default. 
This default is set when + * the channel is allocated (a port that is freed and subsequently reused + * has its binding reset to vcpu0). + */ +#define EVTCHNOP_bind_vcpu 8 +typedef struct evtchn_bind_vcpu { + /* IN parameters. */ + u32 port; + u32 vcpu; +} evtchn_bind_vcpu_t; + +typedef struct evtchn_op { + u32 cmd; /* EVTCHNOP_* */ + union { + evtchn_alloc_unbound_t alloc_unbound; + evtchn_bind_interdomain_t bind_interdomain; + evtchn_bind_virq_t bind_virq; + evtchn_bind_pirq_t bind_pirq; + evtchn_bind_ipi_t bind_ipi; + evtchn_close_t close; + evtchn_send_t send; + evtchn_status_t status; + evtchn_bind_vcpu_t bind_vcpu; + } u; +} evtchn_op_t; + +#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */ Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/grant_table.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/grant_table.h @@ -0,0 +1,275 @@ +/****************************************************************************** + * grant_table.h + * + * Interface for granting foreign access to page frames, and receiving + * page-ownership transfers. + * + * Copyright (c) 2004, K A Fraser + */ + +#ifndef __XEN_PUBLIC_GRANT_TABLE_H__ +#define __XEN_PUBLIC_GRANT_TABLE_H__ + + +/*********************************** + * GRANT TABLE REPRESENTATION + */ + +/* Some rough guidelines on accessing and updating grant-table entries + * in a concurrency-safe manner. For more information, Linux contains a + * reference implementation for guest OSes (arch/xen/kernel/grant_table.c). + * + * NB. WMB is a no-op on current-generation x86 processors. However, a + * compiler barrier will still be required. + * + * Introducing a valid entry into the grant table: + * 1. Write ent->domid. + * 2. Write ent->frame: + * GTF_permit_access: Frame to which access is permitted. + * GTF_accept_transfer: Pseudo-phys frame slot being filled by new + * frame, or zero if none. + * 3. Write memory barrier (WMB). + * 4. Write ent->flags, inc. valid type. + * + * Invalidating an unused GTF_permit_access entry: + * 1. flags = ent->flags. + * 2. Observe that !(flags & (GTF_reading|GTF_writing)). + * 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0). + * NB. No need for WMB as reuse of entry is control-dependent on success of + * step 3, and all architectures guarantee ordering of ctrl-dep writes. + * + * Invalidating an in-use GTF_permit_access entry: + * This cannot be done directly. Request assistance from the domain controller + * which can set a timeout on the use of a grant entry and take necessary + * action. (NB. This is not yet implemented!). + * + * Invalidating an unused GTF_accept_transfer entry: + * 1. flags = ent->flags. + * 2. Observe that !(flags & GTF_transfer_committed). [*] + * 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0). + * NB. No need for WMB as reuse of entry is control-dependent on success of + * step 3, and all architectures guarantee ordering of ctrl-dep writes. + * [*] If GTF_transfer_committed is set then the grant entry is 'committed'. + * The guest must /not/ modify the grant entry until the address of the + * transferred frame is written. It is safe for the guest to spin waiting + * for this to occur (detect by observing GTF_transfer_completed in + * ent->flags). + * + * Invalidating a committed GTF_accept_transfer entry: + * 1. Wait for (ent->flags & GTF_transfer_completed). 
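 *
 * [Editorial note] Steps 1-4 of "Introducing a valid entry" above, as
 * a C sketch (grant_entry_t and the GTF_* values are defined below;
 * wmb() is the usual write barrier):
 *
 *   ent->domid = remote_domid;               // 1. who may map it
 *   ent->frame = mfn;                        // 2. which frame
 *   wmb();                                   // 3. order before flags
 *   ent->flags = GTF_permit_access;          // 4. entry goes live
 *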
+ * + * Changing a GTF_permit_access from writable to read-only: + * Use SMP-safe CMPXCHG to set GTF_readonly, while checking !GTF_writing. + * + * Changing a GTF_permit_access from read-only to writable: + * Use SMP-safe bit-setting instruction. + */ + +/* + * A grant table comprises a packed array of grant entries in one or more + * page frames shared between Xen and a guest. + * [XEN]: This field is written by Xen and read by the sharing guest. + * [GST]: This field is written by the guest and read by Xen. + */ +typedef struct grant_entry { + /* GTF_xxx: various type and flag information. [XEN,GST] */ + u16 flags; + /* The domain being granted foreign privileges. [GST] */ + domid_t domid; + /* + * GTF_permit_access: Frame that @domid is allowed to map and access. [GST] + * GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN] + */ + u32 frame; +} grant_entry_t; + +/* + * Type of grant entry. + * GTF_invalid: This grant entry grants no privileges. + * GTF_permit_access: Allow @domid to map/access @frame. + * GTF_accept_transfer: Allow @domid to transfer ownership of one page frame + * to this guest. Xen writes the page number to @frame. + */ +#define GTF_invalid (0U<<0) +#define GTF_permit_access (1U<<0) +#define GTF_accept_transfer (2U<<0) +#define GTF_type_mask (3U<<0) + +/* + * Subflags for GTF_permit_access. + * GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST] + * GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN] + * GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN] + */ +#define _GTF_readonly (2) +#define GTF_readonly (1U<<_GTF_readonly) +#define _GTF_reading (3) +#define GTF_reading (1U<<_GTF_reading) +#define _GTF_writing (4) +#define GTF_writing (1U<<_GTF_writing) + +/* + * Subflags for GTF_accept_transfer: + * GTF_transfer_committed: Xen sets this flag to indicate that it is committed + * to transferring ownership of a page frame. When a guest sees this flag + * it must /not/ modify the grant entry until GTF_transfer_completed is + * set by Xen. + * GTF_transfer_completed: It is safe for the guest to spin-wait on this flag + * after reading GTF_transfer_committed. Xen will always write the frame + * address, followed by ORing this flag, in a timely manner. + */ +#define _GTF_transfer_committed (2) +#define GTF_transfer_committed (1U<<_GTF_transfer_committed) +#define _GTF_transfer_completed (3) +#define GTF_transfer_completed (1U<<_GTF_transfer_completed) + + +/*********************************** + * GRANT TABLE QUERIES AND USES + */ + +/* + * Reference to a grant entry in a specified domain's grant table. + */ +typedef u16 grant_ref_t; + +/* + * GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access + * by devices and/or host CPUs. If successful, <handle> is a tracking number + * that must be presented later to destroy the mapping(s). On error, <handle> + * is a negative status code. + * NOTES: + * 1. If GNTPIN_map_for_dev is specified then <dev_bus_addr> is the address + * via which I/O devices may access the granted frame. + * 2. If GNTPIN_map_for_host is specified then a mapping will be added at + * virtual address <host_virt_addr> in the current address space. + * 3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a + * host mapping is destroyed by other means then it is *NOT* guaranteed + * to be accounted to the correct grant reference! + */ +#define GNTTABOP_map_grant_ref 0 +typedef struct gnttab_map_grant_ref { + /* IN parameters. 
*/
+ memory_t host_virt_addr;
+ domid_t dom;
+ grant_ref_t ref;
+ u16 flags; /* GNTMAP_* */
+ /* OUT parameters. */
+ s16 handle; /* +ve: handle; -ve: GNTST_* */
+ memory_t dev_bus_addr;
+} gnttab_map_grant_ref_t;
+
+/*
+ * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
+ * tracked by <handle>. If <host_virt_addr> or <dev_bus_addr> is zero, that
+ * field is ignored. If non-zero, they must refer to a device/host mapping
+ * that is tracked by <handle>.
+ * NOTES:
+ * 1. The call may fail in an undefined manner if either mapping is not
+ * tracked by <handle>.
+ * 2. After executing a batch of unmaps, it is guaranteed that no stale
+ * mappings will remain in the device or host TLBs.
+ */
+#define GNTTABOP_unmap_grant_ref 1
+typedef struct gnttab_unmap_grant_ref {
+ /* IN parameters. */
+ memory_t host_virt_addr;
+ memory_t dev_bus_addr;
+ u16 handle;
+ /* OUT parameters. */
+ s16 status; /* GNTST_* */
+} gnttab_unmap_grant_ref_t;
+
+#define GNTUNMAP_DEV_FROM_VIRT (~0U)
+
+/*
+ * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
+ * <nr_frames> pages. The frame addresses are written to the <frame_list>.
+ * Only <nr_frames> addresses are written, even if the table is larger.
+ * NOTES:
+ * 1. <dom> may be specified as DOMID_SELF.
+ * 2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
+ * 3. Xen may not support more than a single grant-table page per domain.
+ */
+#define GNTTABOP_setup_table 2
+typedef struct gnttab_setup_table {
+ /* IN parameters. */
+ domid_t dom;
+ u16 nr_frames;
+ /* OUT parameters. */
+ s16 status; /* GNTST_* */
+ unsigned long *frame_list;
+} gnttab_setup_table_t;
+
+/*
+ * GNTTABOP_dump_table: Dump the contents of the grant table to the
+ * xen console. Debugging use only.
+ */
+#define GNTTABOP_dump_table 3
+typedef struct gnttab_dump_table {
+ /* IN parameters. */
+ domid_t dom;
+ /* OUT parameters. */
+ s16 status; /* GNTST_* */
+} gnttab_dump_table_t;
+
+/*
+ * GNTTABOP_donate_grant_ref: Donate <frame> to a foreign domain. The
+ * foreign domain has previously registered the details of the transfer.
+ * These can be identified from <handle>, a grant reference.
+ */
+#define GNTTABOP_donate 4
+typedef struct {
+ memory_t mfn; /* 0 */
+ domid_t domid; /* 4 */
+ u16 handle; /* 8 */
+ s16 status; /* 10: GNTST_* */
+ u32 __pad;
+} gnttab_donate_t; /* 14 bytes */
+
+/*
+ * Bitfield values for update_pin_status.flags.
+ */
+ /* Map the grant entry for access by I/O devices. */
+#define _GNTMAP_device_map (0)
+#define GNTMAP_device_map (1<<_GNTMAP_device_map)
+ /* Map the grant entry for access by host CPUs. */
+#define _GNTMAP_host_map (1)
+#define GNTMAP_host_map (1<<_GNTMAP_host_map)
+ /* Accesses to the granted frame will be restricted to read-only access. */
+#define _GNTMAP_readonly (2)
+#define GNTMAP_readonly (1<<_GNTMAP_readonly)
+ /*
+ * GNTMAP_host_map subflag:
+ * 0 => The host mapping is usable only by the guest OS.
+ * 1 => The host mapping is usable by guest OS + current application.
+ */
+#define _GNTMAP_application_map (3)
+#define GNTMAP_application_map (1<<_GNTMAP_application_map)
+
+/*
+ * Values for error status returns. All errors are -ve.
+ */
+#define GNTST_okay (0)
+#define GNTST_general_error (-1) /* General undefined error. */
+#define GNTST_bad_domain (-2) /* Unrecognised domain id. */
+#define GNTST_bad_gntref (-3) /* Unrecognised or inappropriate gntref. */
+#define GNTST_bad_handle (-4) /* Unrecognised or inappropriate handle.
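 *
 * [Editorial note] How a mapping request is assembled, as a sketch
 * (the GNTTABOP hypercall wrapper is outside this header; dom, ref and
 * vaddr are assumed inputs):
 *
 *   gnttab_map_grant_ref_t map;
 *   map.host_virt_addr = vaddr;
 *   map.dom = dom;
 *   map.ref = ref;
 *   map.flags = GNTMAP_host_map | GNTMAP_readonly;
 *   // ...issue GNTTABOP_map_grant_ref, then:
 *   if (map.handle < 0)
 *       ;                      // failed: one of the GNTST_* codes here
 *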
*/ +#define GNTST_bad_virt_addr (-5) /* Inappropriate virtual address to map. */ +#define GNTST_bad_dev_addr (-6) /* Inappropriate device address to unmap.*/ +#define GNTST_no_device_space (-7) /* Out of space in I/O MMU. */ +#define GNTST_permission_denied (-8) /* Not enough privilege for operation. */ + +#define GNTTABOP_error_msgs { \ + "okay", \ + "undefined error", \ + "unrecognised domain id", \ + "invalid grant reference", \ + "invalid mapping handle", \ + "invalid virtual address", \ + "invalid device address", \ + "no spare translation slot in the I/O MMU", \ + "permission denied" \ +} + +#endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */ Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/blkif.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/blkif.h @@ -0,0 +1,99 @@ +/****************************************************************************** + * blkif.h + * + * Unified block-device I/O interface for Xen guest OSes. + * + * Copyright (c) 2003-2004, Keir Fraser + */ + +#ifndef __XEN_PUBLIC_IO_BLKIF_H__ +#define __XEN_PUBLIC_IO_BLKIF_H__ + +#include "ring.h" + +#ifndef blkif_vdev_t +#define blkif_vdev_t u16 +#endif +#define blkif_sector_t u64 + +#define BLKIF_OP_READ 0 +#define BLKIF_OP_WRITE 1 +#define BLKIF_OP_PROBE 2 + +/* NB. Ring size must be small enough for sizeof(blkif_ring_t) <= PAGE_SIZE. */ +#define BLKIF_RING_SIZE 64 + +/* + * Maximum scatter/gather segments per request. + * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE. + * NB. This could be 12 if the ring indexes weren't stored in the same page. + */ +#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 + +typedef struct blkif_request { + u8 operation; /* BLKIF_OP_??? */ + u8 nr_segments; /* number of segments */ + blkif_vdev_t device; /* only for read/write requests */ + unsigned long id; /* private guest value, echoed in resp */ + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + /* @f_a_s[2:0]=last_sect ; @f_a_s[5:3]=first_sect */ +#ifdef CONFIG_XEN_BLKDEV_GRANT + /* @f_a_s[:16]= grant reference (16 bits) */ +#else + /* @f_a_s[:12]=@frame: machine page frame number. */ +#endif + /* @first_sect: first sector in frame to transfer (inclusive). */ + /* @last_sect: last sector in frame to transfer (inclusive). */ + unsigned long frame_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +} blkif_request_t; + +#define blkif_first_sect(_fas) (((_fas)>>3)&7) +#define blkif_last_sect(_fas) ((_fas)&7) + +#ifdef CONFIG_XEN_BLKDEV_GRANT +#define blkif_gref_from_fas(_fas) ((_fas)>>16) +#endif + +typedef struct blkif_response { + unsigned long id; /* copied from request */ + u8 operation; /* copied from request */ + s16 status; /* BLKIF_RSP_??? */ +} blkif_response_t; + +#define BLKIF_RSP_ERROR -1 /* non-specific 'error' */ +#define BLKIF_RSP_OKAY 0 /* non-specific 'okay' */ + +/* + * Generate blkif ring structures and types. + */ + +DEFINE_RING_TYPES(blkif, blkif_request_t, blkif_response_t); + +/* + * BLKIF_OP_PROBE: + * The request format for a probe request is constrained as follows: + * @operation == BLKIF_OP_PROBE + * @nr_segments == size of probe buffer in pages + * @device == unused (zero) + * @id == any value (echoed in response message) + * @sector_num == unused (zero) + * @frame_and_sects == list of page-sized buffers. + * (i.e., @first_sect == 0, @last_sect == 7). + * + * The response is a list of vdisk_t elements copied into the out-of-band + * probe buffer. 
On success the response status field contains the number + * of vdisk_t elements. + */ + +#define VDISK_CDROM 0x1 +#define VDISK_REMOVABLE 0x2 +#define VDISK_READONLY 0x4 + +typedef struct vdisk { + blkif_sector_t capacity; /* Size in terms of 512-byte sectors. */ + blkif_vdev_t device; /* Device number (opaque 16 bit value). */ + u16 info; /* Device type and flags (VDISK_*). */ + u16 sector_size; /* Minimum alignment for requests. */ +} vdisk_t; /* 16 bytes */ + +#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */ Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/domain_controller.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/domain_controller.h @@ -0,0 +1,783 @@ +/****************************************************************************** + * domain_controller.h + * + * Interface to server controller (e.g., 'xend'). This header file defines the + * interface that is shared with guest OSes. + * + * Copyright (c) 2004, K A Fraser + */ + +#ifndef __XEN_PUBLIC_IO_DOMAIN_CONTROLLER_H__ +#define __XEN_PUBLIC_IO_DOMAIN_CONTROLLER_H__ + +#include "ring.h" + +/* + * CONTROLLER MESSAGING INTERFACE. + */ + +typedef struct control_msg { + u8 type; /* 0: echoed in response */ + u8 subtype; /* 1: echoed in response */ + u8 id; /* 2: echoed in response */ + u8 length; /* 3: number of bytes in 'msg' */ + u8 msg[60]; /* 4: type-specific message data */ +} control_msg_t; /* 64 bytes */ + +/* These are used by the control message deferred ring. */ +#define CONTROL_RING_SIZE 8 +typedef u32 CONTROL_RING_IDX; +#define MASK_CONTROL_IDX(_i) ((_i)&(CONTROL_RING_SIZE-1)) + +/* + * Generate control ring structures and types. + * + * CONTROL_RING_MEM is currently an 8-slot ring of ctrl_msg_t structs and + * two 32-bit counters: (64 * 8) + (2 * 4) = 520 + */ +#define CONTROL_RING_MEM 520 +DEFINE_RING_TYPES(ctrl, control_msg_t, control_msg_t); + +typedef struct control_if { + union { + ctrl_sring_t tx_ring; /* guest -> controller */ + char __x[CONTROL_RING_MEM]; + }; + union { + ctrl_sring_t rx_ring; /* controller -> guest */ + char __y[CONTROL_RING_MEM]; + }; +} control_if_t; + +/* + * Top-level command types. + */ +#define CMSG_CONSOLE 0 /* Console */ +#define CMSG_BLKIF_BE 1 /* Block-device backend */ +#define CMSG_BLKIF_FE 2 /* Block-device frontend */ +#define CMSG_NETIF_BE 3 /* Network-device backend */ +#define CMSG_NETIF_FE 4 /* Network-device frontend */ +#define CMSG_SHUTDOWN 6 /* Shutdown messages */ +#define CMSG_MEM_REQUEST 7 /* Memory reservation reqs */ +#define CMSG_USBIF_BE 8 /* USB controller backend */ +#define CMSG_USBIF_FE 9 /* USB controller frontend */ +#define CMSG_VCPU_HOTPLUG 10 /* Hotplug VCPU messages */ +#define CMSG_DEBUG 11 /* PDB backend */ + +/****************************************************************************** + * CONSOLE DEFINITIONS + */ + +/* + * Subtypes for console messages. + */ +#define CMSG_CONSOLE_DATA 0 + + +/****************************************************************************** + * BLOCK-INTERFACE FRONTEND DEFINITIONS + */ + +/* Messages from domain controller to guest. */ +#define CMSG_BLKIF_FE_INTERFACE_STATUS 0 + +/* Messages from guest to domain controller. 
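 *
 * [Editorial note] A control-ring message is a fixed 64-byte
 * control_msg_t (defined above); e.g. announcing the frontend driver
 * as UP looks like this sketch (the ring send itself is driver code,
 * not shown; the constants used are defined just below):
 *
 *   blkif_fe_driver_status_t st = { BLKIF_DRIVER_STATUS_UP };
 *   control_msg_t msg;
 *   msg.type = CMSG_BLKIF_FE;
 *   msg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS;
 *   msg.length = sizeof(st);                 // must fit in msg[60]
 *   memcpy(msg.msg, &st, sizeof(st));
 *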
*/ +#define CMSG_BLKIF_FE_DRIVER_STATUS 32 +#define CMSG_BLKIF_FE_INTERFACE_CONNECT 33 +#define CMSG_BLKIF_FE_INTERFACE_DISCONNECT 34 +#define CMSG_BLKIF_FE_INTERFACE_QUERY 35 + +#ifndef blkif_vdev_t +#define blkif_vdev_t u16 +#endif +#define blkif_pdev_t u32 + +/* + * CMSG_BLKIF_FE_INTERFACE_STATUS: + * Notify a guest about a status change on one of its block interfaces. + * If the interface is DESTROYED or DOWN then the interface is disconnected: + * 1. The shared-memory frame is available for reuse. + * 2. Any unacknowledged messages pending on the interface were dropped. + */ +#define BLKIF_INTERFACE_STATUS_CLOSED 0 /* Interface doesn't exist. */ +#define BLKIF_INTERFACE_STATUS_DISCONNECTED 1 /* Exists but is disconnected. */ +#define BLKIF_INTERFACE_STATUS_CONNECTED 2 /* Exists and is connected. */ +#define BLKIF_INTERFACE_STATUS_CHANGED 3 /* A device has been added or removed. */ +typedef struct blkif_fe_interface_status { + u32 handle; + u32 status; + u16 evtchn; /* (only if status == BLKIF_INTERFACE_STATUS_CONNECTED). */ + domid_t domid; /* status != BLKIF_INTERFACE_STATUS_DESTROYED */ +} blkif_fe_interface_status_t; + +/* + * CMSG_BLKIF_FE_DRIVER_STATUS: + * Notify the domain controller that the front-end driver is DOWN or UP. + * When the driver goes DOWN then the controller will send no more + * status-change notifications. + * If the driver goes DOWN while interfaces are still UP, the domain + * will automatically take the interfaces DOWN. + * + * NB. The controller should not send an INTERFACE_STATUS_CHANGED message + * for interfaces that are active when it receives an UP notification. We + * expect that the frontend driver will query those interfaces itself. + */ +#define BLKIF_DRIVER_STATUS_DOWN 0 +#define BLKIF_DRIVER_STATUS_UP 1 +typedef struct blkif_fe_driver_status { + /* IN */ + u32 status; /* BLKIF_DRIVER_STATUS_??? */ + /* OUT */ + /* Driver should query interfaces [0..max_handle]. */ + u32 max_handle; +} blkif_fe_driver_status_t; + +/* + * CMSG_BLKIF_FE_INTERFACE_CONNECT: + * If successful, the domain controller will acknowledge with a + * STATUS_CONNECTED message. + */ +typedef struct blkif_fe_interface_connect { + u32 handle; + memory_t shmem_frame; + int shmem_ref; +} blkif_fe_interface_connect_t; + +/* + * CMSG_BLKIF_FE_INTERFACE_DISCONNECT: + * If successful, the domain controller will acknowledge with a + * STATUS_DISCONNECTED message. + */ +typedef struct blkif_fe_interface_disconnect { + u32 handle; +} blkif_fe_interface_disconnect_t; + +/* + * CMSG_BLKIF_FE_INTERFACE_QUERY: + */ +typedef struct blkif_fe_interface_query { + /* IN */ + u32 handle; + /* OUT */ + u32 status; + u16 evtchn; /* (only if status == BLKIF_INTERFACE_STATUS_CONNECTED). */ + domid_t domid; /* status != BLKIF_INTERFACE_STATUS_DESTROYED */ +} blkif_fe_interface_query_t; + + +/****************************************************************************** + * BLOCK-INTERFACE BACKEND DEFINITIONS + */ + +/* Messages from domain controller. */ +#define CMSG_BLKIF_BE_CREATE 0 /* Create a new block-device interface. */ +#define CMSG_BLKIF_BE_DESTROY 1 /* Destroy a block-device interface. */ +#define CMSG_BLKIF_BE_CONNECT 2 /* Connect i/f to remote driver. */ +#define CMSG_BLKIF_BE_DISCONNECT 3 /* Disconnect i/f from remote driver. */ +#define CMSG_BLKIF_BE_VBD_CREATE 4 /* Create a new VBD for an interface. */ +#define CMSG_BLKIF_BE_VBD_DESTROY 5 /* Delete a VBD from an interface. */ + +/* Messages to domain controller. 
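 *
 * [Editorial note] A backend driver typically dispatches on the
 * subtype of each control message arriving from the domain controller
 * (the message numbers are listed above), e.g. as a sketch where the
 * handler names are hypothetical and the payload structs are defined
 * below:
 *
 *   switch (msg->subtype) {
 *   case CMSG_BLKIF_BE_CREATE:
 *       do_create((blkif_be_create_t *)msg->msg);
 *       break;
 *   case CMSG_BLKIF_BE_VBD_CREATE:
 *       do_vbd_create((blkif_be_vbd_create_t *)msg->msg);
 *       break;
 *   // ...remaining subtypes are handled the same way...
 *   }
 *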
*/ +#define CMSG_BLKIF_BE_DRIVER_STATUS 32 + +/* + * Message request/response definitions for block-device messages. + */ + +/* Non-specific 'okay' return. */ +#define BLKIF_BE_STATUS_OKAY 0 +/* Non-specific 'error' return. */ +#define BLKIF_BE_STATUS_ERROR 1 +/* The following are specific error returns. */ +#define BLKIF_BE_STATUS_INTERFACE_EXISTS 2 +#define BLKIF_BE_STATUS_INTERFACE_NOT_FOUND 3 +#define BLKIF_BE_STATUS_INTERFACE_CONNECTED 4 +#define BLKIF_BE_STATUS_VBD_EXISTS 5 +#define BLKIF_BE_STATUS_VBD_NOT_FOUND 6 +#define BLKIF_BE_STATUS_OUT_OF_MEMORY 7 +#define BLKIF_BE_STATUS_PHYSDEV_NOT_FOUND 8 +#define BLKIF_BE_STATUS_MAPPING_ERROR 9 + +/* This macro can be used to create an array of descriptive error strings. */ +#define BLKIF_BE_STATUS_ERRORS { \ + "Okay", \ + "Non-specific error", \ + "Interface already exists", \ + "Interface not found", \ + "Interface is still connected", \ + "VBD already exists", \ + "VBD not found", \ + "Out of memory", \ + "Extent not found for VBD", \ + "Could not map domain memory" } + +/* + * CMSG_BLKIF_BE_CREATE: + * When the driver sends a successful response then the interface is fully + * created. The controller will send a DOWN notification to the front-end + * driver. + */ +typedef struct blkif_be_create { + /* IN */ + domid_t domid; /* Domain attached to new interface. */ + u32 blkif_handle; /* Domain-specific interface handle. */ + /* OUT */ + u32 status; +} blkif_be_create_t; + +/* + * CMSG_BLKIF_BE_DESTROY: + * When the driver sends a successful response then the interface is fully + * torn down. The controller will send a DESTROYED notification to the + * front-end driver. + */ +typedef struct blkif_be_destroy { + /* IN */ + domid_t domid; /* Identify interface to be destroyed. */ + u32 blkif_handle; /* ...ditto... */ + /* OUT */ + u32 status; +} blkif_be_destroy_t; + +/* + * CMSG_BLKIF_BE_CONNECT: + * When the driver sends a successful response then the interface is fully + * connected. The controller will send a CONNECTED notification to the + * front-end driver. + */ +typedef struct blkif_be_connect { + /* IN */ + domid_t domid; /* Domain attached to new interface. */ + u32 blkif_handle; /* Domain-specific interface handle. */ + memory_t shmem_frame; /* Page cont. shared comms window. */ + int shmem_ref; /* Grant table reference. */ + u32 evtchn; /* Event channel for notifications. */ + /* OUT */ + u32 status; +} blkif_be_connect_t; + +/* + * CMSG_BLKIF_BE_DISCONNECT: + * When the driver sends a successful response then the interface is fully + * disconnected. The controller will send a DOWN notification to the front-end + * driver. + */ +typedef struct blkif_be_disconnect { + /* IN */ + domid_t domid; /* Domain attached to new interface. */ + u32 blkif_handle; /* Domain-specific interface handle. */ + /* OUT */ + u32 status; +} blkif_be_disconnect_t; + +/* CMSG_BLKIF_BE_VBD_CREATE */ +typedef struct blkif_be_vbd_create { + /* IN */ + domid_t domid; /* Identify blkdev interface. */ + u32 blkif_handle; /* ...ditto... */ + blkif_pdev_t pdevice; + u32 dev_handle; /* Extended device id field. */ + blkif_vdev_t vdevice; /* Interface-specific id for this VBD. */ + u16 readonly; /* Non-zero -> VBD isn't writable. */ + /* OUT */ + u32 status; +} blkif_be_vbd_create_t; + +/* CMSG_BLKIF_BE_VBD_DESTROY */ +typedef struct blkif_be_vbd_destroy { + /* IN */ + domid_t domid; /* Identify blkdev interface. */ + u32 blkif_handle; /* ...ditto... */ + blkif_vdev_t vdevice; /* Interface-specific id of the VBD. 
+
+/*
+ * CMSG_BLKIF_BE_DRIVER_STATUS:
+ * Notify the domain controller that the back-end driver is DOWN or UP.
+ * If the driver goes DOWN while interfaces are still UP, the controller
+ * will automatically send DOWN notifications.
+ */
+typedef struct blkif_be_driver_status {
+    u32 status; /* BLKIF_DRIVER_STATUS_??? */
+} blkif_be_driver_status_t;
+
+
+/******************************************************************************
+ * NETWORK-INTERFACE FRONTEND DEFINITIONS
+ */
+
+/* Messages from domain controller to guest. */
+#define CMSG_NETIF_FE_INTERFACE_STATUS 0
+
+/* Messages from guest to domain controller. */
+#define CMSG_NETIF_FE_DRIVER_STATUS 32
+#define CMSG_NETIF_FE_INTERFACE_CONNECT 33
+#define CMSG_NETIF_FE_INTERFACE_DISCONNECT 34
+#define CMSG_NETIF_FE_INTERFACE_QUERY 35
+
+/*
+ * CMSG_NETIF_FE_INTERFACE_STATUS:
+ * Notify a guest about a status change on one of its network interfaces.
+ * If the interface is CLOSED or DOWN then the interface is disconnected:
+ * 1. The shared-memory frame is available for reuse.
+ * 2. Any unacknowledged messages pending on the interface were dropped.
+ */
+#define NETIF_INTERFACE_STATUS_CLOSED 0 /* Interface doesn't exist. */
+#define NETIF_INTERFACE_STATUS_DISCONNECTED 1 /* Exists but is disconnected. */
+#define NETIF_INTERFACE_STATUS_CONNECTED 2 /* Exists and is connected. */
+#define NETIF_INTERFACE_STATUS_CHANGED 3 /* A device has been added or removed. */
+typedef struct netif_fe_interface_status {
+    u32 handle;
+    u32 status;
+    u16 evtchn; /* status == NETIF_INTERFACE_STATUS_CONNECTED */
+    u8 mac[6]; /* status == NETIF_INTERFACE_STATUS_CONNECTED */
+    domid_t domid; /* status != NETIF_INTERFACE_STATUS_CLOSED */
+} netif_fe_interface_status_t;
+
+/*
+ * CMSG_NETIF_FE_DRIVER_STATUS:
+ * Notify the domain controller that the front-end driver is DOWN or UP.
+ * When the driver goes DOWN then the controller will send no more
+ * status-change notifications.
+ * If the driver goes DOWN while interfaces are still UP, the domain
+ * will automatically take the interfaces DOWN.
+ *
+ * NB. The controller should not send an INTERFACE_STATUS message
+ * for interfaces that are active when it receives an UP notification. We
+ * expect that the frontend driver will query those interfaces itself.
+ */
+#define NETIF_DRIVER_STATUS_DOWN 0
+#define NETIF_DRIVER_STATUS_UP 1
+typedef struct netif_fe_driver_status {
+    /* IN */
+    u32 status; /* NETIF_DRIVER_STATUS_??? */
+    /* OUT */
+    /* Driver should query interfaces [0..max_handle]. */
+    u32 max_handle;
+} netif_fe_driver_status_t;
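
The max_handle convention implies a probe loop on the frontend after its UP notification, something like the following (ctrl_send_and_wait() and setup_interface() are made-up helpers; the real driver plumbs this through its control-interface callbacks):

    /* Hypothetical sketch: announce UP, then query every interface the
     * controller says may exist ([0..max_handle] inclusive). */
    static void example_netif_fe_probe(void)
    {
        netif_fe_driver_status_t st = { .status = NETIF_DRIVER_STATUS_UP };
        u32 h;

        ctrl_send_and_wait(CMSG_NETIF_FE_DRIVER_STATUS, &st, sizeof(st));
        for (h = 0; h <= st.max_handle; h++) {
            netif_fe_interface_query_t q = { .handle = h };
            ctrl_send_and_wait(CMSG_NETIF_FE_INTERFACE_QUERY, &q, sizeof(q));
            if (q.status == NETIF_INTERFACE_STATUS_CONNECTED)
                setup_interface(h, q.evtchn, q.mac); /* evtchn/mac valid
                                                      * only when CONNECTED */
        }
    }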
+
+/*
+ * CMSG_NETIF_FE_INTERFACE_CONNECT:
+ * If successful, the domain controller will acknowledge with a
+ * STATUS_CONNECTED message.
+ */
+typedef struct netif_fe_interface_connect {
+    u32 handle;
+    memory_t tx_shmem_frame;
+    memory_t rx_shmem_frame;
+} netif_fe_interface_connect_t;
+
+/*
+ * CMSG_NETIF_FE_INTERFACE_DISCONNECT:
+ * If successful, the domain controller will acknowledge with a
+ * STATUS_DISCONNECTED message.
+ */
+typedef struct netif_fe_interface_disconnect {
+    u32 handle;
+} netif_fe_interface_disconnect_t;
+
+/*
+ * CMSG_NETIF_FE_INTERFACE_QUERY:
+ */
+typedef struct netif_fe_interface_query {
+    /* IN */
+    u32 handle;
+    /* OUT */
+    u32 status;
+    u16 evtchn; /* status == NETIF_INTERFACE_STATUS_CONNECTED */
+    u8 mac[6]; /* status == NETIF_INTERFACE_STATUS_CONNECTED */
+    domid_t domid; /* status != NETIF_INTERFACE_STATUS_CLOSED */
+} netif_fe_interface_query_t;
+
+
+/******************************************************************************
+ * NETWORK-INTERFACE BACKEND DEFINITIONS
+ */
+
+/* Messages from domain controller. */
+#define CMSG_NETIF_BE_CREATE 0 /* Create a new net-device interface. */
+#define CMSG_NETIF_BE_DESTROY 1 /* Destroy a net-device interface. */
+#define CMSG_NETIF_BE_CONNECT 2 /* Connect i/f to remote driver. */
+#define CMSG_NETIF_BE_DISCONNECT 3 /* Disconnect i/f from remote driver. */
+#define CMSG_NETIF_BE_CREDITLIMIT 4 /* Limit i/f to a given credit limit. */
+
+/* Messages to domain controller. */
+#define CMSG_NETIF_BE_DRIVER_STATUS 32
+
+/*
+ * Message request/response definitions for net-device messages.
+ */
+
+/* Non-specific 'okay' return. */
+#define NETIF_BE_STATUS_OKAY 0
+/* Non-specific 'error' return. */
+#define NETIF_BE_STATUS_ERROR 1
+/* The following are specific error returns. */
+#define NETIF_BE_STATUS_INTERFACE_EXISTS 2
+#define NETIF_BE_STATUS_INTERFACE_NOT_FOUND 3
+#define NETIF_BE_STATUS_INTERFACE_CONNECTED 4
+#define NETIF_BE_STATUS_OUT_OF_MEMORY 5
+#define NETIF_BE_STATUS_MAPPING_ERROR 6
+
+/* This macro can be used to create an array of descriptive error strings. */
+#define NETIF_BE_STATUS_ERRORS { \
+    "Okay", \
+    "Non-specific error", \
+    "Interface already exists", \
+    "Interface not found", \
+    "Interface is still connected", \
+    "Out of memory", \
+    "Could not map domain memory" }
+
+/*
+ * CMSG_NETIF_BE_CREATE:
+ * When the driver sends a successful response then the interface is fully
+ * created. The controller will send a DOWN notification to the front-end
+ * driver.
+ */
+typedef struct netif_be_create {
+    /* IN */
+    domid_t domid; /* Domain attached to new interface. */
+    u32 netif_handle; /* Domain-specific interface handle. */
+    u8 mac[6];
+    u8 be_mac[6];
+    /* OUT */
+    u32 status;
+} netif_be_create_t;
+
+/*
+ * CMSG_NETIF_BE_DESTROY:
+ * When the driver sends a successful response then the interface is fully
+ * torn down. The controller will send a DESTROYED notification to the
+ * front-end driver.
+ */
+typedef struct netif_be_destroy {
+    /* IN */
+    domid_t domid; /* Identify interface to be destroyed. */
+    u32 netif_handle; /* ...ditto... */
+    /* OUT */
+    u32 status;
+} netif_be_destroy_t;
+
+/*
+ * CMSG_NETIF_BE_CREDITLIMIT:
+ * Limit a virtual interface to "credit_bytes" bytes per "period_usec"
+ * microseconds.
+ */
+typedef struct netif_be_creditlimit {
+    /* IN */
+    domid_t domid; /* Domain attached to new interface. */
+    u32 netif_handle; /* Domain-specific interface handle. */
+    u32 credit_bytes; /* Vif's credit, in bytes per period. */
+    u32 period_usec; /* Credit replenishment period. */
+    /* OUT */
+    u32 status;
+} netif_be_creditlimit_t;
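
To make the units concrete: credit_bytes per period_usec expresses a rate, so capping a vif at roughly 100Mbit/s with a 10ms replenishment period works out as follows (sketch; the domain and handle values are illustrative):

    /* Hypothetical sketch: 100 Mbit/s = 12,500,000 bytes/s, which over a
     * 10 ms period is 125,000 bytes of credit per period. */
    netif_be_creditlimit_t limit = {
        .domid        = 7,      /* example frontend domain */
        .netif_handle = 0,
        .credit_bytes = 125000, /* bytes permitted per period */
        .period_usec  = 10000,  /* 10 ms */
    };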
+
+/*
+ * CMSG_NETIF_BE_CONNECT:
+ * When the driver sends a successful response then the interface is fully
+ * connected. The controller will send a CONNECTED notification to the
+ * front-end driver.
+ */
+typedef struct netif_be_connect {
+    /* IN */
+    domid_t domid; /* Domain attached to new interface. */
+    u32 netif_handle; /* Domain-specific interface handle. */
+    memory_t tx_shmem_frame; /* Page containing tx shared comms window. */
+    memory_t rx_shmem_frame; /* Page containing rx shared comms window. */
+    u16 evtchn; /* Event channel for notifications. */
+    /* OUT */
+    u32 status;
+} netif_be_connect_t;
+
+/*
+ * CMSG_NETIF_BE_DISCONNECT:
+ * When the driver sends a successful response then the interface is fully
+ * disconnected. The controller will send a DOWN notification to the front-end
+ * driver.
+ */
+typedef struct netif_be_disconnect {
+    /* IN */
+    domid_t domid; /* Domain attached to the interface. */
+    u32 netif_handle; /* Domain-specific interface handle. */
+    /* OUT */
+    u32 status;
+} netif_be_disconnect_t;
+
+/*
+ * CMSG_NETIF_BE_DRIVER_STATUS:
+ * Notify the domain controller that the back-end driver is DOWN or UP.
+ * If the driver goes DOWN while interfaces are still UP, the controller
+ * will automatically send DOWN notifications.
+ */
+typedef struct netif_be_driver_status {
+    u32 status; /* NETIF_DRIVER_STATUS_??? */
+} netif_be_driver_status_t;
+
+
+
+/******************************************************************************
+ * USB-INTERFACE FRONTEND DEFINITIONS
+ */
+
+/* Messages from domain controller to guest. */
+#define CMSG_USBIF_FE_INTERFACE_STATUS_CHANGED 0
+
+/* Messages from guest to domain controller. */
+#define CMSG_USBIF_FE_DRIVER_STATUS_CHANGED 32
+#define CMSG_USBIF_FE_INTERFACE_CONNECT 33
+#define CMSG_USBIF_FE_INTERFACE_DISCONNECT 34
+/*
+ * CMSG_USBIF_FE_INTERFACE_STATUS_CHANGED:
+ * Notify a guest about a status change on one of its USB interfaces.
+ * If the interface is DESTROYED or DOWN then the interface is disconnected:
+ * 1. The shared-memory frame is available for reuse.
+ * 2. Any unacknowledged messages pending on the interface were dropped.
+ */
+#define USBIF_INTERFACE_STATUS_DESTROYED 0 /* Interface doesn't exist. */
+#define USBIF_INTERFACE_STATUS_DISCONNECTED 1 /* Exists but is disconnected. */
+#define USBIF_INTERFACE_STATUS_CONNECTED 2 /* Exists and is connected. */
+typedef struct usbif_fe_interface_status_changed {
+    u32 status;
+    u16 evtchn; /* (only if status == USBIF_INTERFACE_STATUS_CONNECTED). */
+    domid_t domid; /* status != USBIF_INTERFACE_STATUS_DESTROYED */
+    u32 bandwidth;
+    u32 num_ports;
+} usbif_fe_interface_status_changed_t;
+
+/*
+ * CMSG_USBIF_FE_DRIVER_STATUS_CHANGED:
+ * Notify the domain controller that the front-end driver is DOWN or UP.
+ * When the driver goes DOWN then the controller will send no more
+ * status-change notifications.
+ * If the driver goes DOWN while interfaces are still UP, the domain
+ * will automatically take the interfaces DOWN.
+ *
+ * NB. The controller should not send an INTERFACE_STATUS_CHANGED message
+ * for interfaces that are active when it receives an UP notification. We
+ * expect that the frontend driver will query those interfaces itself.
+ */
+#define USBIF_DRIVER_STATUS_DOWN 0
+#define USBIF_DRIVER_STATUS_UP 1
+typedef struct usbif_fe_driver_status_changed {
+    /* IN */
+    u32 status; /* USBIF_DRIVER_STATUS_??? */
+} usbif_fe_driver_status_changed_t;
+
+/*
+ * CMSG_USBIF_FE_INTERFACE_CONNECT:
+ * If successful, the domain controller will acknowledge with a
+ * STATUS_CONNECTED message.
+ */
+typedef struct usbif_fe_interface_connect {
+    memory_t shmem_frame;
+} usbif_fe_interface_connect_t;
+
+/*
+ * CMSG_USBIF_FE_INTERFACE_DISCONNECT:
+ * If successful, the domain controller will acknowledge with a
+ * STATUS_DISCONNECTED message.
+ */
+typedef struct usbif_fe_interface_disconnect {
+    int dummy; /* make struct non-empty */
+} usbif_fe_interface_disconnect_t;
+
+
+/******************************************************************************
+ * USB-INTERFACE BACKEND DEFINITIONS
+ */
+
+/* Messages from domain controller. */
+#define CMSG_USBIF_BE_CREATE 0 /* Create a new USB interface. */
+#define CMSG_USBIF_BE_DESTROY 1 /* Destroy a USB interface. */
+#define CMSG_USBIF_BE_CONNECT 2 /* Connect i/f to remote driver. */
+#define CMSG_USBIF_BE_DISCONNECT 3 /* Disconnect i/f from remote driver. */
+#define CMSG_USBIF_BE_CLAIM_PORT 4 /* Claim host port for a domain. */
+#define CMSG_USBIF_BE_RELEASE_PORT 5 /* Release host port. */
+/* Messages to domain controller. */
+#define CMSG_USBIF_BE_DRIVER_STATUS_CHANGED 32
+
+/* Non-specific 'okay' return. */
+#define USBIF_BE_STATUS_OKAY 0
+/* Non-specific 'error' return. */
+#define USBIF_BE_STATUS_ERROR 1
+/* The following are specific error returns. */
+#define USBIF_BE_STATUS_INTERFACE_EXISTS 2
+#define USBIF_BE_STATUS_INTERFACE_NOT_FOUND 3
+#define USBIF_BE_STATUS_INTERFACE_CONNECTED 4
+#define USBIF_BE_STATUS_OUT_OF_MEMORY 7
+#define USBIF_BE_STATUS_MAPPING_ERROR 9
+
+/* This macro can be used to create an array of descriptive error strings.
+ * NB. Codes 5, 6 and 8 are unused here, so the array carries placeholder
+ * entries to keep the strings aligned with the status values above. */
+#define USBIF_BE_STATUS_ERRORS { \
+    "Okay", \
+    "Non-specific error", \
+    "Interface already exists", \
+    "Interface not found", \
+    "Interface is still connected", \
+    "Unused error code (5)", \
+    "Unused error code (6)", \
+    "Out of memory", \
+    "Unused error code (8)", \
+    "Could not map domain memory" }
+
+/*
+ * CMSG_USBIF_BE_CREATE:
+ * When the driver sends a successful response then the interface is fully
+ * created. The controller will send a DOWN notification to the front-end
+ * driver.
+ */
+typedef struct usbif_be_create {
+    /* IN */
+    domid_t domid; /* Domain attached to new interface. */
+    /* OUT */
+    u32 status;
+} usbif_be_create_t;
+
+/*
+ * CMSG_USBIF_BE_DESTROY:
+ * When the driver sends a successful response then the interface is fully
+ * torn down. The controller will send a DESTROYED notification to the
+ * front-end driver.
+ */
+typedef struct usbif_be_destroy {
+    /* IN */
+    domid_t domid; /* Identify interface to be destroyed. */
+    /* OUT */
+    u32 status;
+} usbif_be_destroy_t;
+
+/*
+ * CMSG_USBIF_BE_CONNECT:
+ * When the driver sends a successful response then the interface is fully
+ * connected. The controller will send a CONNECTED notification to the
+ * front-end driver.
+ */
+typedef struct usbif_be_connect {
+    /* IN */
+    domid_t domid; /* Domain attached to new interface. */
+    memory_t shmem_frame; /* Page containing shared comms window. */
+    u32 evtchn; /* Event channel for notifications. */
+    u32 bandwidth; /* Bandwidth allocated for isoch / int - us
+                    * per 1ms frame (ie between 0 and 900 or 800
+                    * depending on USB version). */
+    /* OUT */
+    u32 status;
+} usbif_be_connect_t;
+
+/*
+ * CMSG_USBIF_BE_DISCONNECT:
+ * When the driver sends a successful response then the interface is fully
+ * disconnected. The controller will send a DOWN notification to the front-end
+ * driver.
+ */
+typedef struct usbif_be_disconnect {
+    /* IN */
+    domid_t domid; /* Domain attached to the interface. */
+    /* OUT */
+    u32 status;
+} usbif_be_disconnect_t;
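
Because the status codes above are sparse (5, 6 and 8 are unused), any lookup into the string table should be bounds-checked; a defensive accessor might read (sketch, helper name invented):

    /* Hypothetical sketch: map a USBIF_BE_STATUS_* code to its string. */
    static const char *example_usbif_status_string(u32 status)
    {
        static const char *errors[] = USBIF_BE_STATUS_ERRORS;

        if (status >= sizeof(errors) / sizeof(errors[0]))
            return "Unknown status code";
        return errors[status];
    }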
+
+/*
+ * CMSG_USBIF_BE_DRIVER_STATUS_CHANGED:
+ * Notify the domain controller that the back-end driver is DOWN or UP.
+ * If the driver goes DOWN while interfaces are still UP, the controller
+ * will automatically send DOWN notifications.
+ */
+typedef struct usbif_be_driver_status_changed {
+    u32 status; /* USBIF_DRIVER_STATUS_??? */
+} usbif_be_driver_status_changed_t;
+
+#define USB_PATH_LEN 16
+
+/*
+ * CMSG_USBIF_BE_CLAIM_PORT:
+ * Instruct the backend driver to claim any device plugged into the specified
+ * host port and to allow the specified domain to control that port.
+ */
+typedef struct usbif_be_claim_port {
+    /* IN */
+    domid_t domid; /* which domain */
+    u32 usbif_port; /* port on the virtual root hub */
+    u32 status; /* status of operation */
+    char path[USB_PATH_LEN]; /* Currently specified in the Linux style - may
+                              * need to be converted to some OS-independent
+                              * format at some stage. */
+} usbif_be_claim_port_t;
+
+/*
+ * CMSG_USBIF_BE_RELEASE_PORT:
+ * Instruct the backend driver to release any device plugged into the
+ * specified host port.
+ */
+typedef struct usbif_be_release_port {
+    char path[USB_PATH_LEN];
+} usbif_be_release_port_t;
+
+/******************************************************************************
+ * SHUTDOWN DEFINITIONS
+ */
+
+/*
+ * Subtypes for shutdown messages.
+ */
+#define CMSG_SHUTDOWN_POWEROFF 0 /* Clean shutdown (SHUTDOWN_poweroff). */
+#define CMSG_SHUTDOWN_REBOOT 1 /* Clean shutdown (SHUTDOWN_reboot). */
+#define CMSG_SHUTDOWN_SUSPEND 2 /* Create suspend info, then */
+                                /* SHUTDOWN_suspend. */
+#define CMSG_SHUTDOWN_SYSRQ 3
+
+typedef struct shutdown_sysrq {
+    char key; /* sysrq key */
+} shutdown_sysrq_t;
+
+/******************************************************************************
+ * VCPU HOTPLUG CONTROLS
+ */
+
+/*
+ * Subtypes for vcpu hotplug messages.
+ */
+#define CMSG_VCPU_HOTPLUG_OFF 0 /* turn vcpu off */
+#define CMSG_VCPU_HOTPLUG_ON 1 /* turn vcpu on */
+
+/*
+ * CMSG_VCPU_HOTPLUG:
+ * Indicate which vcpu's state should change
+ */
+typedef struct vcpu_hotplug {
+    u32 vcpu; /* VCPU whose state will change. */
+    u32 status; /* Return code indicates success or failure. */
+} vcpu_hotplug_t;
+
+/******************************************************************************
+ * MEMORY CONTROLS
+ */
+
+#define CMSG_MEM_REQUEST_SET 0 /* Request a domain to set its mem footprint. */
+
+/*
+ * CMSG_MEM_REQUEST:
+ * Request that the domain change its memory reservation.
+ */
+typedef struct mem_request {
+    /* OUT */
+    u32 target; /* Target memory reservation in pages. */
+    /* IN */
+    u32 status; /* Return code indicates success or failure. */
+} mem_request_t;
+
+
+/******************************************************************************
+ * PDB INTERFACE DEFINITIONS
+ */
+
+#define CMSG_DEBUG_CONNECTION_STATUS 0
+typedef struct pdb_Connection {
+#define PDB_CONNECTION_STATUS_UP 1
+#define PDB_CONNECTION_STATUS_DOWN 2
+    u32 status;
+    memory_t ring; /* status: UP */
+    u32 evtchn; /* status: UP */
+} pdb_connection_t, *pdb_connection_p;
+
+#endif /* __XEN_PUBLIC_IO_DOMAIN_CONTROLLER_H__ */
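
A guest's handler for the shutdown subtypes defined above is essentially a four-way switch (illustrative only; the do_*() names are invented, and the real suspend path involves considerably more machinery):

    /* Hypothetical sketch of a guest-side shutdown dispatcher. */
    static void example_shutdown_handler(u8 subtype, shutdown_sysrq_t *sysrq)
    {
        switch (subtype) {
        case CMSG_SHUTDOWN_POWEROFF: do_poweroff();        break;
        case CMSG_SHUTDOWN_REBOOT:   do_reboot();          break;
        case CMSG_SHUTDOWN_SUSPEND:  do_suspend();         break;
        case CMSG_SHUTDOWN_SYSRQ:    do_sysrq(sysrq->key); break;
        }
    }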
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/ioreq.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/ioreq.h
@@ -0,0 +1,70 @@
+/*
+ * ioreq.h: I/O request definitions for device models
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#ifndef _IOREQ_H_
+#define _IOREQ_H_
+
+#define IOREQ_READ 1
+#define IOREQ_WRITE 0
+
+#define STATE_INVALID 0
+#define STATE_IOREQ_READY 1
+#define STATE_IOREQ_INPROCESS 2
+#define STATE_IORESP_READY 3
+#define STATE_IORESP_HOOK 4
+
+/* VMExit dispatcher should cooperate with instruction decoder to
+   prepare this structure and notify service OS and DM by sending
+   virq */
+typedef struct {
+    u64 addr;  /* physical address */
+    u64 size;  /* size in bytes */
+    u64 count; /* for rep prefixes */
+    union {
+        u64 data;    /* data */
+        void *pdata; /* pointer to data */
+    } u;
+    u8 state:4;
+    u8 pdata_valid:1; /* if 1, use pdata above */
+    u8 dir:1;         /* 1=read, 0=write */
+    u8 port_mm:1;     /* 0=portio, 1=mmio */
+    u8 df:1;
+} ioreq_t;
+
+#define MAX_VECTOR 256
+#define BITS_PER_BYTE 8
+#define INTR_LEN (MAX_VECTOR/(BITS_PER_BYTE * sizeof(u64)))
+
+typedef struct {
+    u64 pic_intr[INTR_LEN];
+    u64 pic_mask[INTR_LEN];
+    int eport; /* Event channel port */
+} global_iodata_t;
+
+typedef struct {
+    ioreq_t vp_ioreq;
+    unsigned long vp_intr[INTR_LEN];
+} vcpu_iodata_t;
+
+typedef struct {
+    global_iodata_t sp_global;
+    vcpu_iodata_t vcpu_iodata[1];
+} shared_iopage_t;
+
+#endif /* _IOREQ_H_ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/netif.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/netif.h
@@ -0,0 +1,108 @@
+/******************************************************************************
+ * netif.h
+ *
+ * Unified network-device I/O interface for Xen guest OSes.
+ *
+ * Copyright (c) 2003-2004, Keir Fraser
+ */
+
+#ifndef __XEN_PUBLIC_IO_NETIF_H__
+#define __XEN_PUBLIC_IO_NETIF_H__
+
+typedef struct netif_tx_request {
+    memory_t addr;    /* Machine address of packet. */
+    u16 csum_blank:1; /* Proto csum field blank? */
+    u16 id:15;        /* Echoed in response message. */
+    u16 size;         /* Packet size in bytes. */
+} netif_tx_request_t;
+
+typedef struct netif_tx_response {
+    u16 id;
+    s8 status;
+} netif_tx_response_t;
+
+typedef struct {
+    u16 id; /* Echoed in response message. */
+#ifdef CONFIG_XEN_NETDEV_GRANT_RX
+    grant_ref_t gref; /* 2: Reference to incoming granted frame */
+#endif
+} netif_rx_request_t;
+
+typedef struct {
+#ifdef CONFIG_XEN_NETDEV_GRANT_TX
+    u32 addr; /* 0: Offset in page of start of received packet */
+#else
+    memory_t addr; /* Machine address of packet. */
+#endif
+    u16 csum_valid:1; /* Protocol checksum is validated? */
+    u16 id:15;
+    s16 status; /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */
+} netif_rx_response_t;
+
+/*
+ * We use a special capitalised type name because it is _essential_ that all
+ * arithmetic on indexes is done on an integer type of the correct size.
+ */
+typedef u32 NETIF_RING_IDX;
+
+/*
+ * Ring indexes are 'free running'. That is, they are not stored modulo the
+ * size of the ring buffer. The following macros convert a free-running counter
+ * into a value that can directly index a ring-buffer array.
+ */
+#define MASK_NETIF_RX_IDX(_i) ((_i)&(NETIF_RX_RING_SIZE-1))
+#define MASK_NETIF_TX_IDX(_i) ((_i)&(NETIF_TX_RING_SIZE-1))
+
+#ifdef __x86_64__
+/*
+ * This restriction can be lifted when we move netfront/netback to use
+ * grant tables. This will remove memory_t fields from the above structures
+ * and thus relax natural alignment restrictions.
+ */
+#define NETIF_TX_RING_SIZE 128
+#define NETIF_RX_RING_SIZE 128
+#else
+#define NETIF_TX_RING_SIZE 256
+#define NETIF_RX_RING_SIZE 256
+#endif
+
+/* This structure must fit in a memory page. */
+typedef struct netif_tx_interface {
+    /*
+     * Frontend places packets into ring at tx_req_prod.
+     * Frontend receives event when tx_resp_prod passes tx_event.
+     * 'req_cons' is a shadow of the backend's request consumer -- the frontend
+     * may use it to determine if all queued packets have been seen by the
+     * backend.
+     */
+    NETIF_RING_IDX req_prod;
+    NETIF_RING_IDX req_cons;
+    NETIF_RING_IDX resp_prod;
+    NETIF_RING_IDX event;
+    union {
+        netif_tx_request_t req;
+        netif_tx_response_t resp;
+    } ring[NETIF_TX_RING_SIZE];
+} netif_tx_interface_t;
+
+/* This structure must fit in a memory page. */
+typedef struct netif_rx_interface {
+    /*
+     * Frontend places empty buffers into ring at rx_req_prod.
+     * Frontend receives event when rx_resp_prod passes rx_event.
+     */
+    NETIF_RING_IDX req_prod;
+    NETIF_RING_IDX resp_prod;
+    NETIF_RING_IDX event;
+    union {
+        netif_rx_request_t req;
+        netif_rx_response_t resp;
+    } ring[NETIF_RX_RING_SIZE];
+} netif_rx_interface_t;
+
+/* Descriptor status values */
+#define NETIF_RSP_DROPPED -2
+#define NETIF_RSP_ERROR -1
+#define NETIF_RSP_OKAY 0
+
+#endif
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/ring.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/ring.h
@@ -0,0 +1,199 @@
+/*
+ * Shared producer-consumer ring macros.
+ * Tim Deegan and Andrew Warfield November 2004.
+ */
+
+#ifndef __XEN_PUBLIC_IO_RING_H__
+#define __XEN_PUBLIC_IO_RING_H__
+
+typedef unsigned int RING_IDX;
+
+/* Round a 32-bit unsigned constant down to the nearest power of two. */
+#define __RD2(_x)  (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1))
+#define __RD4(_x)  (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2 : __RD2(_x))
+#define __RD8(_x)  (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4 : __RD4(_x))
+#define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8 : __RD8(_x))
+#define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x))
+
+/*
+ * Calculate size of a shared ring, given the total available space for the
+ * ring and indexes (_sz), and the name tag of the request/response structure.
+ * A ring contains as many entries as will fit, rounded down to the nearest
+ * power of two (so we can mask with (size-1) to loop around).
+ */
+#define __RING_SIZE(_s, _sz) \
+    (__RD32(((_sz) - 2*sizeof(RING_IDX)) / sizeof((_s)->ring[0])))
+
+/*
+ * Macros to make the correct C datatypes for a new kind of ring.
+ *
+ * To make a new ring datatype, you need to have two message structures,
+ * let's say request_t, and response_t already defined.
+ *
+ * In a header where you want the ring datatype declared, you then do:
+ *
+ *     DEFINE_RING_TYPES(mytag, request_t, response_t);
+ *
+ * These expand out to give you a set of types, as you can see below.
+ * The most important of these are:
+ *
+ *     mytag_sring_t      - The shared ring.
+ *     mytag_front_ring_t - The 'front' half of the ring.
+ *     mytag_back_ring_t  - The 'back' half of the ring.
+ *
+ * To initialize a ring in your code you need to know the location and size
+ * of the shared memory area (PAGE_SIZE, for instance). To initialise
+ * the front half:
+ *
+ *     mytag_front_ring_t front_ring;
+ *
+ *     SHARED_RING_INIT((mytag_sring_t *)shared_page);
+ *     FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
+ *
+ * Initializing the back follows similarly...
+ */
+
+#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t) \
+ \
+/* Shared ring entry */ \
+union __name##_sring_entry { \
+    __req_t req; \
+    __rsp_t rsp; \
+}; \
+ \
+/* Shared ring page */ \
+struct __name##_sring { \
+    RING_IDX req_prod; \
+    RING_IDX rsp_prod; \
+    union __name##_sring_entry ring[1]; /* variable-length */ \
+}; \
+ \
+/* "Front" end's private variables */ \
+struct __name##_front_ring { \
+    RING_IDX req_prod_pvt; \
+    RING_IDX rsp_cons; \
+    unsigned int nr_ents; \
+    struct __name##_sring *sring; \
+}; \
+ \
+/* "Back" end's private variables */ \
+struct __name##_back_ring { \
+    RING_IDX rsp_prod_pvt; \
+    RING_IDX req_cons; \
+    unsigned int nr_ents; \
+    struct __name##_sring *sring; \
+}; \
+ \
+/* Syntactic sugar */ \
+typedef struct __name##_sring __name##_sring_t; \
+typedef struct __name##_front_ring __name##_front_ring_t; \
+typedef struct __name##_back_ring __name##_back_ring_t;
+
+/*
+ * Macros for manipulating rings.
+ *
+ * FRONT_RING_whatever works on the "front end" of a ring: here
+ * requests are pushed on to the ring and responses taken off it.
+ *
+ * BACK_RING_whatever works on the "back end" of a ring: here
+ * requests are taken off the ring and responses put on.
+ *
+ * N.B. these macros do NO INTERLOCKS OR FLOW CONTROL.
+ * This is OK in 1-for-1 request-response situations where the
+ * requestor (front end) never has more than RING_SIZE()-1
+ * outstanding requests.
+ */
+
+/* Initialising empty rings */
+#define SHARED_RING_INIT(_s) do { \
+    (_s)->req_prod = 0; \
+    (_s)->rsp_prod = 0; \
+} while(0)
+
+#define FRONT_RING_INIT(_r, _s, __size) do { \
+    (_r)->req_prod_pvt = 0; \
+    (_r)->rsp_cons = 0; \
+    (_r)->nr_ents = __RING_SIZE(_s, __size); \
+    (_r)->sring = (_s); \
+} while (0)
+
+#define BACK_RING_INIT(_r, _s, __size) do { \
+    (_r)->rsp_prod_pvt = 0; \
+    (_r)->req_cons = 0; \
+    (_r)->nr_ents = __RING_SIZE(_s, __size); \
+    (_r)->sring = (_s); \
+} while (0)
+
+/* Initialize to existing shared indexes -- for recovery */
+#define FRONT_RING_ATTACH(_r, _s, __size) do { \
+    (_r)->sring = (_s); \
+    (_r)->req_prod_pvt = (_s)->req_prod; \
+    (_r)->rsp_cons = (_s)->rsp_prod; \
+    (_r)->nr_ents = __RING_SIZE(_s, __size); \
+} while (0)
+
+#define BACK_RING_ATTACH(_r, _s, __size) do { \
+    (_r)->sring = (_s); \
+    (_r)->rsp_prod_pvt = (_s)->rsp_prod; \
+    (_r)->req_cons = (_s)->req_prod; \
+    (_r)->nr_ents = __RING_SIZE(_s, __size); \
+} while (0)
+
+/* How big is this ring? */
+#define RING_SIZE(_r) \
+    ((_r)->nr_ents)
+
+/* How many requests are outstanding (produced but not yet responded to)
+ * on a ring? */
+#define RING_PENDING_REQUESTS(_r) \
+    ( ((_r)->req_prod_pvt - (_r)->rsp_cons) )
+
+/* Test if there is an empty slot available on the front ring.
+ * (This is only meaningful from the front.)
+ */
+#define RING_FULL(_r) \
+    (((_r)->req_prod_pvt - (_r)->rsp_cons) == RING_SIZE(_r))
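
Putting the pieces above together, a complete (if toy) instantiation of a ring looks like this; demo_req_t/demo_rsp_t are stand-ins for real message types:

    typedef struct { u32 op; u32 id; } demo_req_t;     /* stand-in request  */
    typedef struct { u32 id; s32 status; } demo_rsp_t; /* stand-in response */

    DEFINE_RING_TYPES(demo, demo_req_t, demo_rsp_t);

    /* Hypothetical sketch: given one shared page, initialise the shared
     * indexes and the frontend's private half of the ring. */
    static void example_ring_setup(void *shared_page, demo_front_ring_t *front)
    {
        demo_sring_t *sring = (demo_sring_t *)shared_page;

        SHARED_RING_INIT(sring);
        FRONT_RING_INIT(front, sring, PAGE_SIZE);
        /* The backend, handed the same page, calls BACK_RING_INIT()
         * on its own demo_back_ring_t instead. */
    }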
+
+/* Test if there are outstanding messages to be processed on a ring. */
+#define RING_HAS_UNCONSUMED_RESPONSES(_r) \
+    ( (_r)->rsp_cons != (_r)->sring->rsp_prod )
+
+#define RING_HAS_UNCONSUMED_REQUESTS(_r) \
+    ( ((_r)->req_cons != (_r)->sring->req_prod ) && \
+      (((_r)->req_cons - (_r)->rsp_prod_pvt) != \
+       RING_SIZE(_r)) )
+
+/* Test if there are messages waiting to be pushed. */
+#define RING_HAS_UNPUSHED_REQUESTS(_r) \
+    ( (_r)->req_prod_pvt != (_r)->sring->req_prod )
+
+#define RING_HAS_UNPUSHED_RESPONSES(_r) \
+    ( (_r)->rsp_prod_pvt != (_r)->sring->rsp_prod )
+
+/* Copy the private producer pointer into the shared ring so the other end
+ * can see the updates we've made. */
+#define RING_PUSH_REQUESTS(_r) do { \
+    wmb(); \
+    (_r)->sring->req_prod = (_r)->req_prod_pvt; \
+} while (0)
+
+#define RING_PUSH_RESPONSES(_r) do { \
+    wmb(); \
+    (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \
+} while (0)
+
+/* Direct access to individual ring elements, by index. */
+#define RING_GET_REQUEST(_r, _idx) \
+    (&((_r)->sring->ring[ \
+        ((_idx) & (RING_SIZE(_r) - 1)) \
+    ].req))
+
+#define RING_GET_RESPONSE(_r, _idx) \
+    (&((_r)->sring->ring[ \
+        ((_idx) & (RING_SIZE(_r) - 1)) \
+    ].rsp))
+
+/* Loop termination condition: Would the specified index overflow the ring? */
+#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \
+    (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
+
+#endif /* __XEN_PUBLIC_IO_RING_H__ */
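
Continuing the demo ring from the previous sketch, the intended request flow on the front end is then roughly as follows (notify_backend() stands in for the event-channel kick):

    /* Hypothetical sketch: queue one request and make it visible. */
    static void example_ring_produce(demo_front_ring_t *front, u32 op)
    {
        demo_req_t *req;

        if (RING_FULL(front))
            return; /* no free slot: the caller must back off and retry */

        req = RING_GET_REQUEST(front, front->req_prod_pvt);
        req->op = op;
        req->id = front->req_prod_pvt; /* echoed back in the response */
        front->req_prod_pvt++;

        RING_PUSH_REQUESTS(front); /* wmb(), then publish req_prod */
        notify_backend();          /* e.g. an event-channel notification */
    }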
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/usbif.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/usbif.h
@@ -0,0 +1,66 @@
+/******************************************************************************
+ * usbif.h
+ *
+ * Unified USB I/O interface for Xen guest OSes.
+ *
+ * Copyright (c) 2003-2004, Keir Fraser
+ */
+
+#ifndef __SHARED_USBIF_H__
+#define __SHARED_USBIF_H__
+
+#define usbif_vdev_t u16
+#define usbif_sector_t u64
+
+#define USBIF_OP_IO 0 /* Request IO to a device */
+#define USBIF_OP_PROBE 1 /* Is there a device on this port? */
+#define USBIF_OP_RESET 2 /* Reset a virtual USB port. */
+
+typedef struct {
+    unsigned long id; /* private guest value, echoed in resp */
+    u8 operation; /* USBIF_OP_??? */
+    u8 __pad1;
+    usbif_vdev_t port; /* guest virtual USB port */
+    unsigned long devnum :7; /* Device address, as seen by the guest.*/
+    unsigned long endpoint :4; /* Device endpoint. */
+    unsigned long direction :1; /* Pipe direction. */
+    unsigned long speed :1; /* Pipe speed. */
+    unsigned long pipe_type :2; /* Pipe type (iso, bulk, int, ctrl) */
+    unsigned long __pad2 :18;
+    unsigned long transfer_buffer; /* Machine address */
+    unsigned long length; /* Buffer length */
+    unsigned long transfer_flags; /* For now just pass Linux transfer
+                                   * flags - this may change. */
+    unsigned char setup[8]; /* Embed setup packets directly. */
+    unsigned long iso_schedule; /* Machine address of transfer sched (iso
+                                 * only) */
+    unsigned long num_iso; /* length of iso schedule */
+    unsigned long timeout; /* timeout in ms */
+} usbif_request_t;
+
+/* Data we need to pass:
+ * - Transparently handle short packets or complain at us?
+ */
+
+typedef struct {
+    unsigned long id; /* copied from request */
+    u8 operation; /* copied from request */
+    u8 data; /* Small chunk of in-band data */
+    s16 status; /* USBIF_RSP_??? */
+    unsigned long transfer_mutex; /* Used for cancelling requests atomically. */
+    unsigned long length; /* How much data we really got */
+} usbif_response_t;
+
+#define USBIF_RSP_ERROR -1 /* non-specific 'error' */
+#define USBIF_RSP_OKAY 0 /* non-specific 'okay' */
+
+DEFINE_RING_TYPES(usbif, usbif_request_t, usbif_response_t);
+
+typedef struct {
+    unsigned long length; /* IN = expected, OUT = actual */
+    unsigned long buffer_offset; /* IN offset in buffer specified in main
+                                    packet */
+    unsigned long status; /* OUT Status for this packet. */
+} usbif_iso_t;
+
+#endif /* __SHARED_USBIF_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/vmx_vlapic.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/vmx_vlapic.h
@@ -0,0 +1,57 @@
+#ifndef _VMX_VLAPIC_H
+#define _VMX_VLAPIC_H
+
+/*
+   We extend the delivery mode by one bit to encode the PIC type.
+ */
+#define VLAPIC_DELIV_MODE_FIXED 0x0
+#define VLAPIC_DELIV_MODE_LPRI 0x1
+#define VLAPIC_DELIV_MODE_SMI 0x2
+#define VLAPIC_DELIV_MODE_NMI 0x4
+#define VLAPIC_DELIV_MODE_INIT 0x5
+#define VLAPIC_DELIV_MODE_STARTUP 0x6
+#define VLAPIC_DELIV_MODE_EXT 0x7
+#define VLAPIC_DELIV_MODE_MASK 0x8
+
+#define VLAPIC_MSG_LEVEL 4
+
+#define INTR_EXT 0
+#define INTR_APIC 1
+#define INTR_LAPIC 2
+
+#define VL_STATE_EOI 1
+#define VL_STATE_EXT_LOCK 2
+#define VL_STATE_MSG_LOCK 3
+#define VL_STATE_EOI_LOCK 3
+
+#define VLOCAL_APIC_MAX_INTS 256
+#define VLAPIC_INT_COUNT (VLOCAL_APIC_MAX_INTS/(BITS_PER_BYTE * sizeof(u64)))
+#define VLAPIC_INT_COUNT_32 (VLOCAL_APIC_MAX_INTS/(BITS_PER_BYTE * sizeof(u32)))
+
+struct vapic_bus_message {
+    u8 deliv_mode:4; /* delivery mode, including fixed, LPRI, etc */
+    u8 level:1; /* level or edge */
+    u8 trig_mod:1; /* assert or deassert */
+    u8 reserved:2;
+    u8 vector;
+};
+
+typedef struct {
+    /* interrupt for PIC and ext type IOAPIC interrupt */
+    u64 vl_ext_intr[VLAPIC_INT_COUNT];
+    u64 vl_ext_intr_mask[VLAPIC_INT_COUNT];
+    u64 vl_apic_intr[VLAPIC_INT_COUNT];
+    u64 vl_apic_tmr[VLAPIC_INT_COUNT];
+    u64 vl_eoi[VLAPIC_INT_COUNT];
+    u32 vl_lapic_id;
+    u32 direct_intr;
+    u32 vl_apr;
+    u32 vl_logical_dest;
+    u32 vl_dest_format;
+    u32 vl_arb_id;
+    u32 vl_state;
+    u32 apic_msg_count;
+    struct vapic_bus_message vl_apic_msg[24];
+} vlapic_info;
+
+#endif /* _VMX_VLAPIC_H_ */
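
Since VLOCAL_APIC_MAX_INTS is 256 and the arrays hold u64 words, VLAPIC_INT_COUNT works out to 4; testing a vector in one of these bitmaps is the usual divide-and-mask (self-contained sketch):

    /* Hypothetical sketch: test whether 'vector' is pending in a u64
     * bitmap such as vl_apic_intr[] (256 vectors / 64 bits = 4 words). */
    static int example_vector_test(const u64 *bitmap, unsigned int vector)
    {
        return (int)((bitmap[vector / 64] >> (vector % 64)) & 1);
    }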
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/physdev.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/physdev.h
@@ -0,0 +1,70 @@
+
+#ifndef __XEN_PUBLIC_PHYSDEV_H__
+#define __XEN_PUBLIC_PHYSDEV_H__
+
+/* Commands to HYPERVISOR_physdev_op() */
+#define PHYSDEVOP_IRQ_UNMASK_NOTIFY 4
+#define PHYSDEVOP_IRQ_STATUS_QUERY 5
+#define PHYSDEVOP_SET_IOPL 6
+#define PHYSDEVOP_SET_IOBITMAP 7
+#define PHYSDEVOP_APIC_READ 8
+#define PHYSDEVOP_APIC_WRITE 9
+#define PHYSDEVOP_ASSIGN_VECTOR 10
+
+typedef struct physdevop_irq_status_query {
+    /* IN */
+    u32 irq;
+    /* OUT */
+/* Need to call PHYSDEVOP_IRQ_UNMASK_NOTIFY when the IRQ has been serviced? */
+#define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY (1<<0)
+    u32 flags;
+} physdevop_irq_status_query_t;
+
+typedef struct physdevop_set_iopl {
+    /* IN */
+    u32 iopl;
+} physdevop_set_iopl_t;
+
+typedef struct physdevop_set_iobitmap {
+    /* IN */
+    memory_t bitmap;
+    u32 nr_ports;
+} physdevop_set_iobitmap_t;
+
+typedef struct physdevop_apic {
+    /* IN */
+    u32 apic;
+    u32 offset;
+    /* IN or OUT */
+    u32 value;
+} physdevop_apic_t;
+
+typedef struct physdevop_irq {
+    /* IN */
+    u32 irq;
+    /* OUT */
+    u32 vector;
+} physdevop_irq_t;
+
+typedef struct physdev_op {
+    u32 cmd;
+    union {
+        physdevop_irq_status_query_t irq_status_query;
+        physdevop_set_iopl_t set_iopl;
+        physdevop_set_iobitmap_t set_iobitmap;
+        physdevop_apic_t apic_op;
+        physdevop_irq_t irq_op;
+    } u;
+} physdev_op_t;
+
+#endif /* __XEN_PUBLIC_PHYSDEV_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
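
Typical usage of the query and its flag looks like this (sketch; it assumes the HYPERVISOR_physdev_op() hypercall wrapper that the rest of the port declares elsewhere):

    /* Hypothetical sketch: ask Xen whether this IRQ wants an explicit
     * PHYSDEVOP_IRQ_UNMASK_NOTIFY after it has been serviced. */
    static int example_irq_needs_unmask_notify(u32 irq)
    {
        physdev_op_t op;

        op.cmd = PHYSDEVOP_IRQ_STATUS_QUERY;
        op.u.irq_status_query.irq = irq;
        if (HYPERVISOR_physdev_op(&op) != 0)
            return 0; /* treat failure as 'no notify needed' */
        return !!(op.u.irq_status_query.flags &
                  PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY);
    }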
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/policy_ops.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/policy_ops.h
@@ -0,0 +1,70 @@
+/******************************************************************************
+ * policy_ops.h
+ *
+ * Copyright (C) 2005 IBM Corporation
+ *
+ * Author:
+ * Reiner Sailer <sailer@xxxxxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * Process policy command requests from the guest OS.
+ * Access is checked by the policy; these operations are not restricted
+ * to DOM0.
+ */
+
+#ifndef __XEN_PUBLIC_POLICY_OPS_H__
+#define __XEN_PUBLIC_POLICY_OPS_H__
+
+#include "xen-public/xen.h"
+#include "xen-public/sched_ctl.h"
+
+/*
+ * Make sure you increment the interface version whenever you modify this file!
+ * This makes sure that old versions of policy tools will stop working in a
+ * well-defined way (rather than crashing the machine, for instance).
+ */
+#define POLICY_INTERFACE_VERSION 0xAAAA0003
+
+/************************************************************************/
+
+#define POLICY_SETPOLICY 4
+typedef struct policy_setpolicy {
+    /* IN variables. */
+    u16 policy_type;
+    /* OUT variables */
+    void *pushcache;
+    u16 pushcache_size;
+} policy_setpolicy_t;
+
+
+#define POLICY_GETPOLICY 5
+typedef struct policy_getpolicy {
+    /* IN variables. */
+    u16 policy_type;
+    /* OUT variables */
+    void *pullcache;
+    u16 pullcache_size;
+} policy_getpolicy_t;
+
+#define POLICY_DUMPSTATS 6
+typedef struct policy_dumpstats {
+    void *pullcache;
+    u16 pullcache_size;
+} policy_dumpstats_t;
+
+
+typedef struct policy_op {
+    u32 cmd;
+    u32 interface_version; /* POLICY_INTERFACE_VERSION */
+    union {
+        policy_setpolicy_t setpolicy;
+        policy_getpolicy_t getpolicy;
+        policy_dumpstats_t dumpstats;
+    } u;
+} policy_op_t;
+
+#endif /* __XEN_PUBLIC_POLICY_OPS_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/sched_ctl.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/sched_ctl.h
@@ -0,0 +1,58 @@
+/******************************************************************************
+ * Generic scheduler control interface.
+ *
+ * Mark Williamson, (C) 2004 Intel Research Cambridge
+ */
+
+#ifndef __XEN_PUBLIC_SCHED_CTL_H__
+#define __XEN_PUBLIC_SCHED_CTL_H__
+
+/* Scheduler types. */
+#define SCHED_BVT 0
+#define SCHED_SEDF 4
+
+/* Set or get info? */
+#define SCHED_INFO_PUT 0
+#define SCHED_INFO_GET 1
+
+/*
+ * Generic scheduler control command - used to adjust system-wide scheduler
+ * parameters
+ */
+struct sched_ctl_cmd {
+    u32 sched_id;
+    u32 direction;
+    union {
+        struct bvt_ctl {
+            u32 ctx_allow;
+        } bvt;
+    } u;
+};
+
+struct sched_adjdom_cmd {
+    u32 sched_id;
+    u32 direction;
+    domid_t domain;
+    union {
+        struct bvt_adjdom
+        {
+            u32 mcu_adv; /* mcu advance: inverse of weight */
+            u32 warpback; /* warp? */
+            s32 warpvalue; /* warp value */
+            long long warpl; /* warp limit */
+            long long warpu; /* unwarp time requirement */
+        } bvt;
+
+        struct sedf_adjdom
+        {
+            u64 period;
+            u64 slice;
+            u64 latency;
+            u16 extratime;
+            u16 weight;
+        } sedf;
+
+    } u;
+};
+
+#endif /* __XEN_PUBLIC_SCHED_CTL_H__ */
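
As a worked example of the SEDF half of the union: granting a domain a 10ms slice in every 100ms period would be filled in roughly as below. The values assume the period/slice fields are in nanoseconds, which is how the tools appear to use them, though the header itself leaves the units unstated:

    /* Hypothetical sketch: SEDF parameters for an example domain. */
    struct sched_adjdom_cmd cmd = {
        .sched_id  = SCHED_SEDF,
        .direction = SCHED_INFO_PUT,
        .domain    = 3,                /* example domain id */
        .u.sedf = {
            .period    = 100000000ULL, /* 100 ms, assuming nanoseconds */
            .slice     = 10000000ULL,  /* 10 ms */
            .latency   = 0,
            .extratime = 0,
            .weight    = 0,
        },
    };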
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/trace.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/trace.h
@@ -0,0 +1,59 @@
+/******************************************************************************
+ * include/public/trace.h
+ *
+ * Mark Williamson, (C) 2004 Intel Research Cambridge
+ * Copyright (C) 2005 Bin Ren
+ */
+
+#ifndef __XEN_PUBLIC_TRACE_H__
+#define __XEN_PUBLIC_TRACE_H__
+
+/* Trace classes */
+#define TRC_GEN 0x00010000 /* General trace */
+#define TRC_SCHED 0x00020000 /* Xen Scheduler trace */
+#define TRC_DOM0OP 0x00040000 /* Xen DOM0 operation trace */
+#define TRC_VMX 0x00080000 /* Xen VMX trace */
+#define TRC_ALL 0xffff0000
+
+/* Trace events per class */
+
+#define TRC_SCHED_DOM_ADD (TRC_SCHED + 1)
+#define TRC_SCHED_DOM_REM (TRC_SCHED + 2)
+#define TRC_SCHED_SLEEP (TRC_SCHED + 3)
+#define TRC_SCHED_WAKE (TRC_SCHED + 4)
+#define TRC_SCHED_YIELD (TRC_SCHED + 5)
+#define TRC_SCHED_BLOCK (TRC_SCHED + 6)
+#define TRC_SCHED_SHUTDOWN (TRC_SCHED + 7)
+#define TRC_SCHED_CTL (TRC_SCHED + 8)
+#define TRC_SCHED_ADJDOM (TRC_SCHED + 9)
+#define TRC_SCHED_SWITCH (TRC_SCHED + 10)
+#define TRC_SCHED_S_TIMER_FN (TRC_SCHED + 11)
+#define TRC_SCHED_T_TIMER_FN (TRC_SCHED + 12)
+#define TRC_SCHED_DOM_TIMER_FN (TRC_SCHED + 13)
+
+#define TRC_VMX_VMEXIT (TRC_VMX + 1)
+#define TRC_VMX_VECTOR (TRC_VMX + 2)
+#define TRC_VMX_INT (TRC_VMX + 3)
+
+/* This structure represents a single trace buffer record. */
+struct t_rec {
+    u64 cycles; /* cycle counter timestamp */
+    u32 event; /* event ID */
+    unsigned long data[5]; /* event data items */
+};
+
+/*
+ * This structure contains the metadata for a single trace buffer. The
+ * rec_idx field indexes into an array of struct t_rec's.
+ */
+struct t_buf {
+    /* Used by both Xen and user space. */
+    atomic_t rec_idx; /* the next record to save to */
+    unsigned int rec_num; /* number of records in this trace buffer */
+    /* Used by Xen only. */
+    struct t_rec *rec; /* start of records */
+    /* Used by user space only. */
+    unsigned long rec_addr; /* machine address of the start of records */
+};
+
+#endif /* __XEN_PUBLIC_TRACE_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/vmx_assist.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/vmx_assist.h
@@ -0,0 +1,101 @@
+/*
+ * vmx_assist.h: Context definitions for the VMXASSIST world switch.
+ *
+ * Leendert van Doorn, leendert@xxxxxxxxxxxxxx
+ * Copyright (c) 2005, International Business Machines Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#ifndef _VMX_ASSIST_H_
+#define _VMX_ASSIST_H_
+
+#define VMXASSIST_BASE 0xD0000
+#define VMXASSIST_MAGIC 0x17101966
+#define VMXASSIST_MAGIC_OFFSET (VMXASSIST_BASE+8)
+
+#define VMXASSIST_NEW_CONTEXT (VMXASSIST_BASE + 12)
+#define VMXASSIST_OLD_CONTEXT (VMXASSIST_NEW_CONTEXT + 4)
+
+#ifndef __ASSEMBLY__
+
+union vmcs_arbytes {
+    struct arbyte_fields {
+        unsigned int seg_type : 4,
+            s : 1,
+            dpl : 2,
+            p : 1,
+            reserved0 : 4,
+            avl : 1,
+            reserved1 : 1,
+            default_ops_size: 1,
+            g : 1,
+            null_bit : 1,
+            reserved2 : 15;
+    } fields;
+    unsigned int bytes;
+};
+
+/*
+ * World switch state
+ */
+typedef struct vmx_assist_context {
+    u32 eip; /* execution pointer */
+    u32 esp; /* stack pointer */
+    u32 eflags; /* flags register */
+    u32 cr0;
+    u32 cr3; /* page table directory */
+    u32 cr4;
+    u32 idtr_limit; /* idt */
+    u32 idtr_base;
+    u32 gdtr_limit; /* gdt */
+    u32 gdtr_base;
+    u32 cs_sel; /* cs selector */
+    u32 cs_limit;
+    u32 cs_base;
+    union vmcs_arbytes cs_arbytes;
+    u32 ds_sel; /* ds selector */
+    u32 ds_limit;
+    u32 ds_base;
+    union vmcs_arbytes ds_arbytes;
+    u32 es_sel; /* es selector */
+    u32 es_limit;
+    u32 es_base;
+    union vmcs_arbytes es_arbytes;
+    u32 ss_sel; /* ss selector */
+    u32 ss_limit;
+    u32 ss_base;
+    union vmcs_arbytes ss_arbytes;
+    u32 fs_sel; /* fs selector */
+    u32 fs_limit;
+    u32 fs_base;
+    union vmcs_arbytes fs_arbytes;
+    u32 gs_sel; /* gs selector */
+    u32 gs_limit;
+    u32 gs_base;
+    union vmcs_arbytes gs_arbytes;
+    u32 tr_sel; /* task selector */
+    u32 tr_limit;
+    u32 tr_base;
+    union vmcs_arbytes tr_arbytes;
+    u32 ldtr_sel; /* ldtr selector */
+    u32 ldtr_limit;
+    u32 ldtr_base;
+    union vmcs_arbytes ldtr_arbytes;
+} vmx_assist_context_t;
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _VMX_ASSIST_H_ */
+
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/xen.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/xen.h
@@ -0,0 +1,466 @@
+/******************************************************************************
+ * xen.h
+ *
+ * Guest OS interface to Xen.
+ * + * Copyright (c) 2004, K A Fraser + */ + +#ifndef __XEN_PUBLIC_XEN_H__ +#define __XEN_PUBLIC_XEN_H__ + +#if defined(__i386__) +#include "xen-public/arch-x86_32.h" +#elif defined(__x86_64__) +#include "xen-public/arch-x86_64.h" +#elif defined(__ia64__) +#include "xen-public/arch-ia64.h" +#else +#error "Unsupported architecture" +#endif + +/* + * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS). + */ + +/* + * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5. + * EAX = return value + * (argument registers may be clobbered on return) + * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6. + * RAX = return value + * (argument registers not clobbered on return; RCX, R11 are) + */ +#define __HYPERVISOR_set_trap_table 0 +#define __HYPERVISOR_mmu_update 1 +#define __HYPERVISOR_set_gdt 2 +#define __HYPERVISOR_stack_switch 3 +#define __HYPERVISOR_set_callbacks 4 +#define __HYPERVISOR_fpu_taskswitch 5 +#define __HYPERVISOR_sched_op 6 +#define __HYPERVISOR_dom0_op 7 +#define __HYPERVISOR_set_debugreg 8 +#define __HYPERVISOR_get_debugreg 9 +#define __HYPERVISOR_update_descriptor 10 +#define __HYPERVISOR_dom_mem_op 12 +#define __HYPERVISOR_multicall 13 +#define __HYPERVISOR_update_va_mapping 14 +#define __HYPERVISOR_set_timer_op 15 +#define __HYPERVISOR_event_channel_op 16 +#define __HYPERVISOR_xen_version 17 +#define __HYPERVISOR_console_io 18 +#define __HYPERVISOR_physdev_op 19 +#define __HYPERVISOR_grant_table_op 20 +#define __HYPERVISOR_vm_assist 21 +#define __HYPERVISOR_update_va_mapping_otherdomain 22 +#define __HYPERVISOR_switch_vm86 23 /* x86/32 only */ +#define __HYPERVISOR_switch_to_user 23 /* x86/64 only */ +#define __HYPERVISOR_boot_vcpu 24 +#define __HYPERVISOR_set_segment_base 25 /* x86/64 only */ +#define __HYPERVISOR_mmuext_op 26 +#define __HYPERVISOR_policy_op 27 + +/* + * VIRTUAL INTERRUPTS + * + * Virtual interrupts that a guest OS may receive from Xen. + */ +#define VIRQ_TIMER 0 /* Timebase update, and/or requested timeout. */ +#define VIRQ_DEBUG 1 /* Request guest to dump debug info. */ +#define VIRQ_CONSOLE 2 /* (DOM0) bytes received on emergency console. */ +#define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */ +#define VIRQ_PARITY_ERR 4 /* (DOM0) NMI parity error. */ +#define VIRQ_IO_ERR 5 /* (DOM0) NMI I/O error. */ +#define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */ +#define NR_VIRQS 7 + +/* + * MMU-UPDATE REQUESTS + * + * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs. + * A foreigndom (FD) can be specified (or DOMID_SELF for none). + * Where the FD has some effect, it is described below. + * ptr[1:0] specifies the appropriate MMU_* command. + * + * ptr[1:0] == MMU_NORMAL_PT_UPDATE: + * Updates an entry in a page table. If updating an L1 table, and the new + * table entry is valid/present, the mapped frame must belong to the FD, if + * an FD has been specified. If attempting to map an I/O page then the + * caller assumes the privilege of the FD. + * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller. + * FD == DOMID_XEN: Map restricted areas of Xen's heap space. + * ptr[:2] -- Machine address of the page-table entry to modify. + * val -- Value to write. + * + * ptr[1:0] == MMU_MACHPHYS_UPDATE: + * Updates an entry in the machine->pseudo-physical mapping table. + * ptr[:2] -- Machine address within the frame whose mapping to modify. + * The frame must belong to the FD, if one is specified. + * val -- Value to write into the mapping entry. 
+ */ +#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ +#define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */ + +/* + * MMU EXTENDED OPERATIONS + * + * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures. + * A foreigndom (FD) can be specified (or DOMID_SELF for none). + * Where the FD has some effect, it is described below. + * + * cmd: MMUEXT_(UN)PIN_*_TABLE + * mfn: Machine frame number to be (un)pinned as a p.t. page. + * The frame must belong to the FD, if one is specified. + * + * cmd: MMUEXT_NEW_BASEPTR + * mfn: Machine frame number of new page-table base to install in MMU. + * + * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only] + * mfn: Machine frame number of new page-table base to install in MMU + * when in user space. + * + * cmd: MMUEXT_TLB_FLUSH_LOCAL + * No additional arguments. Flushes local TLB. + * + * cmd: MMUEXT_INVLPG_LOCAL + * linear_addr: Linear address to be flushed from the local TLB. + * + * cmd: MMUEXT_TLB_FLUSH_MULTI + * vcpumask: Pointer to bitmap of VCPUs to be flushed. + * + * cmd: MMUEXT_INVLPG_MULTI + * linear_addr: Linear address to be flushed. + * vcpumask: Pointer to bitmap of VCPUs to be flushed. + * + * cmd: MMUEXT_TLB_FLUSH_ALL + * No additional arguments. Flushes all VCPUs' TLBs. + * + * cmd: MMUEXT_INVLPG_ALL + * linear_addr: Linear address to be flushed from all VCPUs' TLBs. + * + * cmd: MMUEXT_FLUSH_CACHE + * No additional arguments. Writes back and flushes cache contents. + * + * cmd: MMUEXT_SET_LDT + * linear_addr: Linear address of LDT base (NB. must be page-aligned). + * nr_ents: Number of entries in LDT. + * + * cmd: MMUEXT_REASSIGN_PAGE + * mfn: Machine frame number to be reassigned to the FD. + * (NB. page must currently belong to the calling domain). + */ +#define MMUEXT_PIN_L1_TABLE 0 +#define MMUEXT_PIN_L2_TABLE 1 +#define MMUEXT_PIN_L3_TABLE 2 +#define MMUEXT_PIN_L4_TABLE 3 +#define MMUEXT_UNPIN_TABLE 4 +#define MMUEXT_NEW_BASEPTR 5 +#define MMUEXT_TLB_FLUSH_LOCAL 6 +#define MMUEXT_INVLPG_LOCAL 7 +#define MMUEXT_TLB_FLUSH_MULTI 8 +#define MMUEXT_INVLPG_MULTI 9 +#define MMUEXT_TLB_FLUSH_ALL 10 +#define MMUEXT_INVLPG_ALL 11 +#define MMUEXT_FLUSH_CACHE 12 +#define MMUEXT_SET_LDT 13 +#define MMUEXT_REASSIGN_PAGE 14 +#define MMUEXT_NEW_USER_BASEPTR 15 + +#ifndef __ASSEMBLY__ +struct mmuext_op { + unsigned int cmd; + union { + /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR, REASSIGN_PAGE */ + memory_t mfn; + /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */ + memory_t linear_addr; + }; + union { + /* SET_LDT */ + unsigned int nr_ents; + /* TLB_FLUSH_MULTI, INVLPG_MULTI */ + void *vcpumask; + }; +}; +#endif + +/* These are passed as 'flags' to update_va_mapping. They can be ORed. */ +/* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap. */ +/* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer. */ +#define UVMF_NONE (0UL<<0) /* No flushing at all. */ +#define UVMF_TLB_FLUSH (1UL<<0) /* Flush entire TLB(s). */ +#define UVMF_INVLPG (2UL<<0) /* Flush only one entry. */ +#define UVMF_FLUSHTYPE_MASK (3UL<<0) +#define UVMF_MULTI (0UL<<2) /* Flush subset of TLBs. */ +#define UVMF_LOCAL (0UL<<2) /* Flush local TLB. */ +#define UVMF_ALL (1UL<<2) /* Flush all TLBs. */ + +/* + * Commands to HYPERVISOR_sched_op(). + */ +#define SCHEDOP_yield 0 /* Give up the CPU voluntarily. */ +#define SCHEDOP_block 1 /* Block until an event is received. */ +#define SCHEDOP_shutdown 2 /* Stop executing this domain. */ +#define SCHEDOP_vcpu_down 3 /* make target VCPU not-runnable. 
*/ +#define SCHEDOP_vcpu_up 4 /* make target VCPU runnable. */ +#define SCHEDOP_cmdmask 255 /* 8-bit command. */ +#define SCHEDOP_reasonshift 8 /* 8-bit reason code. (SCHEDOP_shutdown) */ +#define SCHEDOP_vcpushift 8 /* 8-bit VCPU target. (SCHEDOP_up|down) */ + +/* + * Reason codes for SCHEDOP_shutdown. These may be interpreted by control + * software to determine the appropriate action. For the most part, Xen does + * not care about the shutdown code (SHUTDOWN_crash excepted). + */ +#define SHUTDOWN_poweroff 0 /* Domain exited normally. Clean up and kill. */ +#define SHUTDOWN_reboot 1 /* Clean up, kill, and then restart. */ +#define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */ +#define SHUTDOWN_crash 3 /* Tell controller we've crashed. */ + +/* + * Commands to HYPERVISOR_console_io(). + */ +#define CONSOLEIO_write 0 +#define CONSOLEIO_read 1 + +/* + * Commands to HYPERVISOR_dom_mem_op(). + */ +#define MEMOP_increase_reservation 0 +#define MEMOP_decrease_reservation 1 + +/* + * Commands to HYPERVISOR_vm_assist(). + */ +#define VMASST_CMD_enable 0 +#define VMASST_CMD_disable 1 +#define VMASST_TYPE_4gb_segments 0 +#define VMASST_TYPE_4gb_segments_notify 1 +#define VMASST_TYPE_writable_pagetables 2 +#define MAX_VMASST_TYPE 2 + +#ifndef __ASSEMBLY__ + +typedef u16 domid_t; + +/* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */ +#define DOMID_FIRST_RESERVED (0x7FF0U) + +/* DOMID_SELF is used in certain contexts to refer to oneself. */ +#define DOMID_SELF (0x7FF0U) + +/* + * DOMID_IO is used to restrict page-table updates to mapping I/O memory. + * Although no Foreign Domain need be specified to map I/O pages, DOMID_IO + * is useful to ensure that no mappings to the OS's own heap are accidentally + * installed. (e.g., in Linux this could cause havoc as reference counts + * aren't adjusted on the I/O-mapping code path). + * This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can + * be specified by any calling domain. + */ +#define DOMID_IO (0x7FF1U) + +/* + * DOMID_XEN is used to allow privileged domains to map restricted parts of + * Xen's heap space (e.g., the machine_to_phys table). + * This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if + * the caller is privileged. + */ +#define DOMID_XEN (0x7FF2U) + +/* + * Send an array of these to HYPERVISOR_mmu_update(). + * NB. The fields are natural pointer/address size for this architecture. + */ +typedef struct +{ + u64 ptr; /* Machine address of PTE. */ + u64 val; /* New contents of PTE. */ +} mmu_update_t; + +/* + * Send an array of these to HYPERVISOR_multicall(). + * NB. The fields are natural register size for this architecture. + */ +typedef struct +{ + unsigned long op, result; + unsigned long args[6]; +} multicall_entry_t; + +/* Event channel endpoints per domain. */ +#define NR_EVENT_CHANNELS 1024 + +/* + * Per-VCPU information goes here. This will be cleaned up more when Xen + * actually supports multi-VCPU guests. + */ +typedef struct vcpu_info { + /* + * 'evtchn_upcall_pending' is written non-zero by Xen to indicate + * a pending notification for a particular VCPU. It is then cleared + * by the guest OS /before/ checking for pending work, thus avoiding + * a set-and-check race. Note that the mask is only accessed by Xen + * on the CPU that is currently hosting the VCPU. This means that the + * pending and mask flags can be updated by the guest without special + * synchronisation (i.e., no need for the x86 LOCK prefix). 
+ * This may seem suboptimal because if the pending flag is set by + * a different CPU then an IPI may be scheduled even when the mask + * is set. However, note: + * 1. The task of 'interrupt holdoff' is covered by the per-event- + * channel mask bits. A 'noisy' event that is continually being + * triggered can be masked at source at this very precise + * granularity. + * 2. The main purpose of the per-VCPU mask is therefore to restrict + * reentrant execution: whether for concurrency control, or to + * prevent unbounded stack usage. Whatever the purpose, we expect + * that the mask will be asserted only for short periods at a time, + * and so the likelihood of a 'spurious' IPI is suitably small. + * The mask is read before making an event upcall to the guest: a + * non-zero mask therefore guarantees that the VCPU will not receive + * an upcall activation. The mask is cleared when the VCPU requests + * to block: this avoids wakeup-waiting races. + */ + u8 evtchn_upcall_pending; + u8 evtchn_upcall_mask; + u32 evtchn_pending_sel; +#ifdef __ARCH_HAS_VCPU_INFO + arch_vcpu_info_t arch; +#endif +} vcpu_info_t; + +typedef struct vcpu_time_info { + /* + * The following values are updated periodically (and not necessarily + * atomically!). The guest OS detects this because 'time_version1' is + * incremented just before updating these values, and 'time_version2' is + * incremented immediately after. See the Xen-specific Linux code for an + * example of how to read these values safely (arch/xen/kernel/time.c). + */ + u32 time_version1; + u32 time_version2; + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_time; /* Time, in nanosecs, since boot. */ + /* + * Current system time: + * system_time + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul + * CPU frequency (Hz): + * ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift + */ + u32 tsc_to_system_mul; + s8 tsc_shift; +} vcpu_time_info_t; + +/* + * Xen/kernel shared data -- pointer provided in start_info. + * NB. We expect that this struct is smaller than a page. + */ +typedef struct shared_info { + vcpu_info_t vcpu_data[MAX_VIRT_CPUS]; + + vcpu_time_info_t vcpu_time[MAX_VIRT_CPUS]; + + u32 n_vcpu; + + /* + * A domain can have up to 1024 "event channels" on which it can send + * and receive asynchronous event notifications. There are three classes + * of event that are delivered by this mechanism: + * 1. Bi-directional inter- and intra-domain connections. Domains must + * arrange out-of-band to set up a connection (usually the setup + * is initiated and organised by a privileged third party such as + * software running in domain 0). + * 2. Physical interrupts. A domain with suitable hardware-access + * privileges can bind an event-channel port to a physical interrupt + * source. + * 3. Virtual interrupts ('events'). A domain can bind an event-channel + * port to a virtual interrupt source, such as the virtual-timer + * device or the emergency console. + * + * Event channels are addressed by a "port index" between 0 and 1023. + * Each channel is associated with two bits of information: + * 1. PENDING -- notifies the domain that there is a pending notification + * to be processed. This bit is cleared by the guest. + * 2. MASK -- if this bit is clear then a 0->1 transition of PENDING + * will cause an asynchronous upcall to be scheduled. This bit is only + * updated by the guest. It is read-only within Xen. 
If a channel + becomes pending while the channel is masked then the 'edge' is lost + (i.e., when the channel is unmasked, the guest must manually handle + pending notifications as no upcall will be scheduled by Xen). + * + * To expedite scanning of pending notifications, any 0->1 pending + transition on an unmasked channel causes a corresponding bit in a + 32-bit selector to be set. Each bit in the selector covers a 32-bit + word in the PENDING bitfield array. + */ + u32 evtchn_pending[32]; + u32 evtchn_mask[32]; + + /* + * Wallclock time: updated only by control software. Guests should base + * their gettimeofday() syscall on this wallclock-base value. + */ + u32 wc_sec; /* Secs since 00:00:00 UTC, Jan 1, 1970. */ + u32 wc_usec; /* Usecs since 00:00:00 UTC, Jan 1, 1970. */ + + arch_shared_info_t arch; + +} shared_info_t; + +/* + * Start-of-day memory layout for the initial domain (DOM0): + * 1. The domain is started within a contiguous virtual-memory region. + * 2. The contiguous region begins and ends on an aligned 4MB boundary. + * 3. The region start corresponds to the load address of the OS image. + * If the load address is not 4MB aligned then the address is rounded down. + * 4. This is the order of bootstrap elements in the initial virtual region: + * a. relocated kernel image + * b. initial ram disk [mod_start, mod_len] + * c. list of allocated page frames [mfn_list, nr_pages] + * d. bootstrap page tables [pt_base, CR3 (x86)] + * e. start_info_t structure [register ESI (x86)] + * f. bootstrap stack [register ESP (x86)] + * 5. Bootstrap elements are packed together, but each is 4kB-aligned. + * 6. The initial ram disk may be omitted. + * 7. The list of page frames forms a contiguous 'pseudo-physical' memory + * layout for the domain. In particular, the bootstrap virtual-memory + * region is a 1:1 mapping to the first section of the pseudo-physical map. + * 8. All bootstrap elements are mapped read-writable for the guest OS. The + * only exception is the bootstrap page table, which is mapped read-only. + * 9. There is guaranteed to be at least 512kB padding after the final + * bootstrap element. If necessary, the bootstrap virtual region is + * extended by an extra 4MB to ensure this. + */ + +#define MAX_GUEST_CMDLINE 1024 +typedef struct start_info { + /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */ + memory_t nr_pages; /* Total pages allocated to this domain. */ + memory_t shared_info; /* MACHINE address of shared info struct. */ + u32 flags; /* SIF_xxx flags. */ + u16 domain_controller_evtchn; + /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */ + memory_t pt_base; /* VIRTUAL address of page directory. */ + memory_t nr_pt_frames; /* Number of bootstrap p.t. frames. */ + memory_t mfn_list; /* VIRTUAL address of page-frame list. */ + memory_t mod_start; /* VIRTUAL address of pre-loaded module. */ + memory_t mod_len; /* Size (bytes) of pre-loaded module. */ + s8 cmd_line[MAX_GUEST_CMDLINE]; + memory_t store_mfn; /* MACHINE page number of shared page. */ + u16 store_evtchn; /* Event channel for store communication. */ +} start_info_t; + +/* These flags are passed in the 'flags' field of start_info_t. */ +#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ +#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ +#define SIF_BLK_BE_DOMAIN (1<<4) /* Is this a block backend domain? */ +#define SIF_NET_BE_DOMAIN (1<<5) /* Is this a net backend domain? */ +#define SIF_USB_BE_DOMAIN (1<<6) /* Is this a usb backend domain?
*/ +/* For use in guest OSes. */ +extern shared_info_t *HYPERVISOR_shared_info; + +typedef u64 cpumap_t; + +#endif /* !__ASSEMBLY__ */ + +#endif /* __XEN_PUBLIC_XEN_H__ */ Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen_synch_bitops.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen_synch_bitops.h @@ -0,0 +1,140 @@ +#ifndef __XEN_SYNCH_BITOPS_H__ +#define __XEN_SYNCH_BITOPS_H__ + +/* + * Copyright 1992, Linus Torvalds. + * Heavily modified to provide guaranteed strong synchronisation + * when communicating with Xen or other guest OSes running on other CPUs. + */ + +#include <linux/config.h> + +#define ADDR (*(volatile long *) addr) + +static __inline__ void synch_set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btsl %1,%0" + : "=m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btrl %1,%0" + : "=m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_change_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btcl %1,%0" + : "=m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btsl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btrl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__ ( + "lock btcl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +struct __synch_xchg_dummy { unsigned long a[100]; }; +#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x)) + +#define synch_cmpxchg(ptr, old, new) \ +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\ + (unsigned long)(old), \ + (unsigned long)(new), \ + sizeof(*(ptr)))) + +static inline unsigned long __synch_cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + __asm__ __volatile__("lock; cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + case 2: + __asm__ __volatile__("lock; cmpxchgw %w1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#ifdef CONFIG_X86_64 + case 4: + __asm__ __volatile__("lock; cmpxchgl %k1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + case 8: + __asm__ __volatile__("lock; cmpxchgq %1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#else + case 4: + __asm__ __volatile__("lock; cmpxchgl %1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#endif + } + return old; +} + +static __inline__ int synch_const_test_bit(int nr, const volatile void * addr) +{ + return ((1UL << (nr & 31)) & + (((const volatile unsigned int *) addr)[nr >> 5])) != 0; +} + +static __inline__ int synch_var_test_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "btl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) ); + return 
oldbit; +} + +#define synch_test_bit(nr,addr) \ +(__builtin_constant_p(nr) ? \ + synch_const_test_bit((nr),(addr)) : \ + synch_var_test_bit((nr),(addr))) + +#endif /* __XEN_SYNCH_BITOPS_H__ */ Index: linux-2.6.12-xen0-arch/include/linux/xen/privcmd.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/linux/xen/privcmd.h @@ -0,0 +1,90 @@ +/****************************************************************************** + * privcmd.h + * + * Interface to /proc/xen/privcmd. + * + * Copyright (c) 2003-2004, K A Fraser + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __PRIVCMD_H__ +#define __PRIVCMD_H__ + +typedef struct privcmd_hypercall +{ + unsigned long op; + unsigned long arg[5]; +} privcmd_hypercall_t; + +typedef struct privcmd_mmap_entry { + unsigned long va; + unsigned long mfn; + unsigned long npages; +} privcmd_mmap_entry_t; + +typedef struct privcmd_mmap { + int num; + domid_t dom; /* target domain */ + privcmd_mmap_entry_t *entry; +} privcmd_mmap_t; + +typedef struct privcmd_mmapbatch { + int num; /* number of pages to populate */ + domid_t dom; /* target domain */ + unsigned long addr; /* virtual address */ + unsigned long *arr; /* array of mfns - top nibble set on err */ +} privcmd_mmapbatch_t; + +typedef struct privcmd_blkmsg +{ + unsigned long op; + void *buf; + int buf_size; +} privcmd_blkmsg_t; + +/* + * @cmd: IOCTL_PRIVCMD_HYPERCALL + * @arg: &privcmd_hypercall_t + * Return: Value returned from execution of the specified hypercall. + */ +#define IOCTL_PRIVCMD_HYPERCALL \ + _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t)) + +/* + * @cmd: IOCTL_PRIVCMD_INITDOMAIN_EVTCHN + * @arg: n/a + * Return: Port associated with domain-controller end of control event channel + * for the initial domain. 
+ */ +#define IOCTL_PRIVCMD_INITDOMAIN_EVTCHN \ + _IOC(_IOC_NONE, 'P', 1, 0) +#define IOCTL_PRIVCMD_MMAP \ + _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t)) +#define IOCTL_PRIVCMD_MMAPBATCH \ + _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t)) +#define IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN \ + _IOC(_IOC_READ, 'P', 4, sizeof(unsigned long)) +#define IOCTL_PRIVCMD_INITDOMAIN_STORE \ + _IOC(_IOC_READ, 'P', 5, 0) + +#endif /* __PRIVCMD_H__ */ Index: linux-2.6.12-xen0-arch/include/linux/xen/suspend.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/linux/xen/suspend.h @@ -0,0 +1,43 @@ +/****************************************************************************** + * suspend.h + * + * Copyright (c) 2003-2004, K A Fraser + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __ASM_XEN_SUSPEND_H__ +#define __ASM_XEN_SUSPEND_H__ + +typedef struct suspend_record_st { + /* To be filled in before resume. */ + start_info_t resume_info; + /* + * The number of a machine frame containing, in sequence, the number of + * each machine frame that contains PFN -> MFN translation table data. + */ + unsigned long pfn_to_mfn_frame_list; + /* Number of entries in the PFN -> MFN translation table. */ + unsigned long nr_pfns; +} suspend_record_t; + +#endif /* __ASM_XEN_SUSPEND_H__ */ Index: linux-2.6.12-xen0-arch/include/linux/xen/xen_proc.h =================================================================== --- /dev/null +++ linux-2.6.12-xen0-arch/include/linux/xen/xen_proc.h @@ -0,0 +1,12 @@ +#ifndef __LINUX_XEN_PROC_H__ +#define __LINUX_XEN_PROC_H__ + +#include <linux/config.h> +#include <linux/proc_fs.h> + +extern struct proc_dir_entry *create_xen_proc_entry( + const char *name, mode_t mode); +extern void remove_xen_proc_entry( + const char *name); + +#endif /* __LINUX_XEN_PROC_H__ */
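A few illustrative notes on the interfaces above follow; none of the code below is part of the patch. First, the SCHEDOP_* encoding: the low 8 bits (SCHEDOP_cmdmask) select the command and, for SCHEDOP_shutdown, the next 8 bits carry the reason code. A minimal sketch of packing such a request, assuming the usual HYPERVISOR_sched_op(unsigned long) stub from elsewhere in the 2.6.12-xen tree (SCHEDOP_shutdown itself is defined earlier in this header, above the excerpt quoted here):

/* Illustrative only: pack a shutdown command with a reason code as
 * described by SCHEDOP_cmdmask/SCHEDOP_reasonshift above. Assumes a
 * hypercall stub, long HYPERVISOR_sched_op(unsigned long op), defined
 * elsewhere in the tree. */
static void sched_shutdown_sketch(unsigned int reason)
{
    unsigned long op;

    op  = SCHEDOP_shutdown & SCHEDOP_cmdmask;            /* 8-bit command */
    op |= (unsigned long)(reason & 0xff) << SCHEDOP_reasonshift;
    HYPERVISOR_sched_op(op);        /* e.g. reason == SHUTDOWN_poweroff */
}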
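mmu_update_t entries are consumed in batches by HYPERVISOR_mmu_update(). A sketch of a single-entry update; the wrapper signature assumed here (request array, count, success count, target domain) matches the hypercall stubs of this era but is not introduced by this patch:

/* Illustrative only: point one PTE (identified by its machine address)
 * at new contents. DOMID_SELF restricts the update to our own tables. */
static int mmu_update_one_sketch(u64 pte_machine_addr, u64 new_pte_val)
{
    mmu_update_t req;
    int success;

    req.ptr = pte_machine_addr;   /* machine address of the PTE */
    req.val = new_pte_val;        /* new PTE contents */
    return HYPERVISOR_mmu_update(&req, 1, &success, DOMID_SELF);
}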
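multicall_entry_t batches several hypercalls into one guest/Xen crossing: 'op' takes a __HYPERVISOR_* number from the public headers and 'args' the hypercall's register arguments, with each entry's result written back to its 'result' field. A hedged sketch, assuming a stub int HYPERVISOR_multicall(multicall_entry_t *, int):

/* Illustrative only: wrap one mmu_update in a multicall entry. A real
 * caller would batch several entries; a one-entry multicall gains
 * nothing but shows the layout. */
static void multicall_sketch(mmu_update_t *req, int count, int *success)
{
    multicall_entry_t mc[1];

    mc[0].op      = __HYPERVISOR_mmu_update;
    mc[0].args[0] = (unsigned long)req;
    mc[0].args[1] = (unsigned long)count;
    mc[0].args[2] = (unsigned long)success;
    mc[0].args[3] = DOMID_SELF;
    HYPERVISOR_multicall(mc, 1);
}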
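The time_version1/time_version2 pair in vcpu_time_info implements a seqlock-style protocol: readers retry whenever the two counters differ across the read. A sketch of a safe read, assuming rdtscll() and rmb() from the stock i386 headers; note that the CPU-frequency formula in the struct comment implies tsc_to_system_mul is a 32.32 fixed-point multiplier, so the 64x32 product must keep bits [32,96):

/* Illustrative only: scale a TSC delta by the 32.32 fixed-point
 * multiplier, keeping the upper 64 bits of the 96-bit product. */
static u64 scale_delta_sketch(u64 delta, u32 mul_frac, s8 shift)
{
    if (shift < 0)
        delta >>= -shift;
    else
        delta <<= shift;
    return (((delta & 0xffffffffULL) * mul_frac) >> 32)
         + ((delta >> 32) * mul_frac);
}

/* Illustrative only: seqlock-style read of one VCPU's time record. */
static u64 read_system_time_sketch(volatile vcpu_time_info_t *t)
{
    u32 v1, v2, mul;
    u64 base, tsc;
    s8 shift;

    do {
        v1 = t->time_version1;
        rmb();                      /* read values after version1 ... */
        base  = t->system_time;
        mul   = t->tsc_to_system_mul;
        shift = t->tsc_shift;
        rdtscll(tsc);
        tsc  -= t->tsc_timestamp;
        rmb();                      /* ... and before version2 */
        v2 = t->time_version2;
    } while (v1 != v2);             /* updater ran concurrently: retry */

    return base + scale_delta_sketch(tsc, mul, shift);
}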
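The pending/mask/selector comments in shared_info describe a two-level scan: the 32-bit evtchn_pending_sel narrows the search to dirty words of evtchn_pending[], each word covering 32 ports. A sketch of the upcall path using the synch_* operations from xen_synch_bitops.h above; xchg() and __ffs() are the stock i386 primitives, and do_event() stands in for the guest's real per-port dispatch:

/* Illustrative only: drain pending, unmasked event channels for one VCPU. */
static void evtchn_do_upcall_sketch(shared_info_t *s, unsigned int cpu)
{
    vcpu_info_t *v = &s->vcpu_data[cpu];
    u32 sel, pend;
    unsigned int word, bit, port;

    v->evtchn_upcall_pending = 0;           /* clear /before/ scanning */
    sel = xchg(&v->evtchn_pending_sel, 0);  /* take the whole selector */

    while (sel != 0) {
        word = __ffs(sel);
        sel &= ~(1u << word);

        /* Masked channels must not generate upcall work. */
        pend = s->evtchn_pending[word] & ~s->evtchn_mask[word];
        while (pend != 0) {
            bit = __ffs(pend);
            pend &= ~(1u << bit);

            port = (word << 5) + bit;       /* 32 ports per word */
            synch_clear_bit(port, &s->evtchn_pending[0]);
            do_event(port);                 /* hypothetical handler */
        }
    }
}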
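Finally, a user-space view of privcmd.h: a dom0 tool opens /proc/xen/privcmd and pushes a privcmd_hypercall_t through IOCTL_PRIVCMD_HYPERCALL. A hedged sketch; __HYPERVISOR_xen_version (number 17 in the public headers of this era) is used only as a harmless read-only example, the header also needs xen.h's domid_t in scope to compile, and error handling is minimal:

/* Illustrative only: query the running Xen version from user space. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/xen/privcmd.h>       /* as added by this patch */

#define __HYPERVISOR_xen_version 17  /* from the Xen public headers */

int main(void)
{
    privcmd_hypercall_t call = { .op = __HYPERVISOR_xen_version };
    long ret;
    int fd = open("/proc/xen/privcmd", O_RDWR);

    if (fd < 0) {
        perror("open /proc/xen/privcmd");
        return 1;
    }
    ret = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
    printf("xen_version hypercall returned %#lx\n", ret);
    close(fd);
    return ret < 0;
}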