[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [ANNOUNCE] CoW source released
Hello, I've been working on VM snapshots/CoW for Xen (as seen at Xen Summit this year). I'm happy to release my first version. There are some known issues with capturing dirtied pages for HVM guests. I think it's related to the QEMU code (I believe I'm not catching the pages that QEMU dirties). However, it works with both 32-bit and 64-bit PV guests. This release includes modifications to Linux, Xen, and some tools (a library, a FUSE file system, and a testing tool). The FUSE file system will take a snapshot if you try to create a file in the directory where you mount the xencowfs file system. The testing tool pauses a domain, enables CoW, takes a dump of the CoW image, takes two live memory dumps, unpauses the domain for a bit, then takes another CoW image. It compares all the images and reports which pages are different. These patches are against xen-unstable revision 19425; however, I had no trouble applying them to the current revision of xen-unstable (19553). Please remember this is an alpha release, so there are likely to be some problems. Please let me know if you find any! 
Patrick diff -r 832aac894efd drivers/xen/Kconfig --- a/drivers/xen/Kconfig Wed Nov 19 13:15:46 2008 +0000 +++ b/drivers/xen/Kconfig Mon Mar 16 00:01:12 2009 -0700 @@ -312,4 +312,7 @@ config XEN_DEVMEM def_bool y +config XEN_XENCOW + def_bool y + endif diff -r 832aac894efd drivers/xen/Makefile --- a/drivers/xen/Makefile Wed Nov 19 13:15:46 2008 +0000 +++ b/drivers/xen/Makefile Mon Mar 16 00:01:12 2009 -0700 @@ -23,3 +23,4 @@ obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_UTIL) += sfc_netutil/ obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_FRONTEND) += sfc_netfront/ obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_BACKEND) += sfc_netback/ +obj-$(CONFIG_XEN_XENCOW) += xencow/ diff -r 832aac894efd drivers/xen/xencow/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/drivers/xen/xencow/Makefile Mon Mar 16 00:01:12 2009 -0700 @@ -0,0 +1,2 @@ + +obj-m := xencow.o diff -r 832aac894efd drivers/xen/xencow/common.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/drivers/xen/xencow/common.h Mon Mar 16 00:01:12 2009 -0700 @@ -0,0 +1,74 @@ +/****************************************************************************** + * common.h + * + * Copyright (c) 2009 University of British Columbia (Patrick Colp) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission 
notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __XEN_XENCOW_COMMON_H__ +#define __XEN_XENCOW_COMMON_H__ + + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/err.h> +#include <linux/kernel.h> +#include <linux/gfp.h> +#include <xen/interface/platform.h> +#include <xen/driver_util.h> +#include <asm/io.h> +#include <asm/uaccess.h> +#include <linux/config.h> +#include <linux/version.h> +#include <linux/cdev.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <xen/interface/io/ring.h> +#include <xen/interface/io/xencow.h> + + +#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) + +#define WPRINTK(fmt, args...) 
printk(KERN_WARNING "xen_cow: " fmt, ##args) + + +#endif /* __XEN_XENCOW_COMMON_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 832aac894efd drivers/xen/xencow/xencow.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/drivers/xen/xencow/xencow.c Mon Mar 16 00:01:12 2009 -0700 @@ -0,0 +1,246 @@ +/****************************************************************************** + * xencow.c + * + * Xen Copy-on-Write Kernel Driver - Initialises CoW buffer for userspace + * + * Copyright (c) 2009 University of British Columbia (Patrick Colp) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+
+#include "common.h"
+
+
+/* Major number handed out by register_chrdev() in xencow_init(). */
+static int xencow_major;
+
+
+/*
+ * Drop the page references held on the first 'num' entries of 'pages'.
+ * Each page is flagged up to date before its reference is put.
+ */
+static void xencow_release_user_pages(struct page *pages[], int num)
+{
+    int i;
+
+    for ( i = 0; i < num; i++ )
+    {
+        struct page *page = pages[i];
+        SetPageUptodate(page);
+        put_page(page);
+    }
+}
+
+/*
+ * Look up 'num' pages of the calling process, starting at user address
+ * 'addr', and store them in 'pages'.
+ *
+ * Returns 0 when exactly 'num' pages were obtained.  If fewer were obtained
+ * they are released again and -E2BIG is returned; a negative error from
+ * get_user_pages() is passed through unchanged.
+ */
+static int xencow_get_user_pages(unsigned long addr,
+                                 int num,
+                                 struct page *pages[])
+{
+    int ret;
+
+    down_read(&current->mm->mmap_sem);
+    /*
+     * NOTE(review): the pages are requested with write=0/force=0 although
+     * their MFNs are handed out as a buffer.  Presumably userspace has
+     * already touched and locked the buffer - confirm against the library.
+     */
+    ret = get_user_pages(current, current->mm, addr, num, 0, 0, pages, NULL);
+    up_read(&current->mm->mmap_sem);
+
+    if ( ret != num )
+    {
+        if ( ret >= 0 )
+        {
+            /* Partial success: give back what we got */
+            xencow_release_user_pages(pages, ret);
+            ret = -E2BIG;
+        }
+
+        return ret;
+    }
+
+    return 0;
+}
+
+/* Translate a struct page into its machine frame number. */
+static inline unsigned long xencow_page_to_mfn(struct page *page)
+{
+    unsigned long pfn;
+    unsigned long mfn;
+
+    pfn = page_to_pfn(page);
+    mfn = pfn_to_mfn(pfn);
+
+    return mfn;
+}
+
+/*
+ * Fill 'mfns' with the machine frame numbers backing 'num' user pages
+ * starting at user address 'addr'.
+ *
+ * Returns 0 on success, -EINVAL for a negative or overflowing 'num',
+ * -ENOMEM if the temporary page array cannot be allocated, -EFAULT if a page
+ * translates to MFN 0, or the error from xencow_get_user_pages().
+ * No page references are held when this function returns.
+ */
+static int xencow_get_page_mfns(unsigned long addr,
+                                int num,
+                                unsigned long mfns[])
+{
+    struct page **pages;
+    unsigned long mfn;
+    int ret;
+    int i;
+
+    if ( num == 0 )
+        return 0;
+
+    if ( (num < 0) ||
+         ((unsigned long)num > (ULONG_MAX / sizeof(*pages))) )
+        return -EINVAL;
+
+    /*
+     * 'num' is supplied by userspace, so the page array must not live on
+     * the (small) kernel stack.
+     */
+    pages = kmalloc(num * sizeof(*pages), GFP_KERNEL);
+    if ( pages == NULL )
+        return -ENOMEM;
+
+    /* Get user pages */
+    ret = xencow_get_user_pages(addr, num, pages);
+    if ( ret != 0 )
+        goto out_free;
+
+    /* Get MFNs for the pages */
+    for ( i = 0; i < num; i++ )
+    {
+        mfn = xencow_page_to_mfn(pages[i]);
+        if ( mfn == 0 )
+        {
+            ret = -EFAULT;
+            break;
+        }
+
+        mfns[i] = mfn;
+    }
+
+    /* Return user pages, on the error path as well as on success */
+    xencow_release_user_pages(pages, num);
+
+ out_free:
+    kfree(pages);
+
+    return ret;
+}
+
+/*
+ * ioctl entry point.
+ *
+ * XEN_COW_IOCTL_INIT: 'arg' points to a user xencow_init_t.  The driver reads
+ * 'addr' and 'num_mfns' from it, looks up the MFNs of the num_mfns pages
+ * starting at 'addr', and writes them back into the user structure's mfns[]
+ * array.
+ *
+ * Returns 0 on success, -ENOTTY for an unknown command, -EINVAL for an
+ * invalid num_mfns, -ENOMEM or -EFAULT on allocation or user-access failure,
+ * or the error from xencow_get_page_mfns().
+ */
+static int xencow_ioctl(struct inode *inode, struct file *filp,
+                        unsigned int cmd, unsigned long arg)
+{
+    int ret = 0;
+
+    switch ( cmd )
+    {
+
+    case XEN_COW_IOCTL_INIT:
+    {
+        xencow_init_t *cow_init;
+        xencow_init_t __user *cow_init_u;
+        int num_mfns;
+        int i;
+
+        cow_init_u = (xencow_init_t __user *)arg;
+
+        /* Check access on user init struct */
+        ret = -EFAULT;
+        if ( !access_ok(VERIFY_READ, cow_init_u, sizeof(xencow_init_t)) )
+            break;
+
+        /* Get the number of frames in the buffer */
+        ret = __get_user(num_mfns, &cow_init_u->num_mfns);
+        if ( ret != 0 )
+            break;
+
+        /*
+         * num_mfns is untrusted: reject negative counts and counts for
+         * which the allocation size below would wrap around.
+         */
+        ret = -EINVAL;
+        if ( (num_mfns < 0) ||
+             ((unsigned long)num_mfns >
+              ((ULONG_MAX - sizeof(xencow_init_t)) / sizeof(unsigned long))) )
+            break;
+
+        /* Allocate space */
+        ret = -ENOMEM;
+        cow_init = kmalloc(sizeof(xencow_init_t) +
+                           (num_mfns * sizeof(unsigned long)),
+                           GFP_KERNEL);
+        if ( cow_init == NULL )
+            break;
+
+        cow_init->num_mfns = num_mfns;
+
+        /* Get start address of buffer */
+        ret = __get_user(cow_init->addr, &cow_init_u->addr);
+        if ( ret != 0 )
+            goto init_out;
+
+        /* Get page buffer MFNs */
+        ret = xencow_get_page_mfns(cow_init->addr,
+                                   cow_init->num_mfns,
+                                   cow_init->mfns);
+        if ( ret != 0 )
+            goto init_out;
+
+        /* Check access on user page buffer MFNs array */
+        ret = -EFAULT;
+        if ( !access_ok(VERIFY_WRITE, &cow_init_u->mfns,
+                        cow_init->num_mfns * sizeof(unsigned long)) )
+            goto init_out;
+
+        /* Send page buffer MFNs to user */
+        ret = 0;
+        for ( i = 0; i < cow_init->num_mfns; i++ )
+            ret |= __put_user(cow_init->mfns[i], &cow_init_u->mfns[i]);
+
+    init_out:
+        kfree(cow_init);
+    }
+    break;
+
+    default:
+        ret = -ENOTTY;
+        break;
+    }
+
+    return ret;
+}
+
+static const struct file_operations xencow_fops = {
+    .owner = THIS_MODULE,
+    .ioctl = xencow_ioctl,
+};
+
+/*
+ * Module init: register the character device and, if the xen class is
+ * available, create the "xencow0" class device.  Returns -ENODEV when not
+ * running on Xen, or the error from register_chrdev().
+ */
+static int __init xencow_init(void)
+{
+    int ret;
+    struct class *class;
+
+    if ( !is_running_on_xen() )
+        return -ENODEV;
+
+    ret = register_chrdev(0, "xencow", &xencow_fops);
+    if ( ret < 0 )
+    {
+        WPRINTK("Couldn't register /dev/xen/xencow\n");
+        return ret;
+    }
+
+    xencow_major = ret;
+
+    DPRINTK("Created misc_dev [/dev/xen/xencow%d]\n", xencow_major);
+
+    /* Make sure the xen class exists */
+    class = get_xen_class();
+    if ( class != NULL )
+        class_device_create(class, NULL, MKDEV(xencow_major, 0),
+                            NULL, "xencow0");
+    else
+        /* This is bad, but not fatal */
+        WPRINTK("sysfs xen_class not created\n");
+
+    DPRINTK("XenCoW device successfully created\n");
+
+    return 0;
+}
+
+/*
+ * Module exit: remove the class device created by xencow_init() and
+ * unregister the character device.
+ */
+static void __exit xencow_exit(void)
+{
+    struct class *class;
+    int ret;
+
+    /* Undo class_device_create() so no stale device is left behind */
+    class = get_xen_class();
+    if ( class != NULL )
+        class_device_destroy(class, MKDEV(xencow_major, 0));
+
+    ret = unregister_chrdev(xencow_major, "xencow");
+    if ( ret < 0 )
+    {
+        WPRINTK("Error: Couldn't unregister /dev/xen/xencow: %d\n", ret);
+        return;
+    }
+
+    DPRINTK("XenCoW device successfully removed\n");
+}
+
+module_init(xencow_init);
+module_exit(xencow_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff -r 832aac894efd include/xen/interface/io/xencow.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/include/xen/interface/io/xencow.h Mon Mar 16 00:01:12 2009 -0700
@@ -0,0 +1,70 @@
+/*****************************************************************************
+ * xencow.h
+ *
+ * XenCoW Common Structures
+ *
+ * Copyright (C) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+#ifndef _XEN_PUBLIC_IO_XENCOW_H
+#define _XEN_PUBLIC_IO_XENCOW_H
+
+
+#include "ring.h"
+
+
+#define XEN_COW_IOC_MAGIC 'w'
+#define XEN_COW_IOCTL_INIT _IO(XEN_COW_IOC_MAGIC, 1)
+
+
+/* Some definitions for the XenCow ring buffer. */
+typedef struct xencow_request_st {
+    ulong mfn;
+} xencow_request_t;
+
+typedef struct xencow_response_st {
+    ulong pfn;
+} xencow_response_t;
+
+DEFINE_RING_TYPES(xencow, xencow_request_t, xencow_response_t);
+
+
+/* The structure used to initialise a XenCoW snapshot. 
*/ +typedef struct xencow_init_st { + /* Start address of buffer: a user virtual address in the calling process, read by XEN_COW_IOCTL_INIT */ + unsigned long addr; + /* Number of frames in buffer; also the number of entries in mfns[] */ + int num_mfns; + /* MFNs of buffer frames; written back to the caller by XEN_COW_IOCTL_INIT */ + unsigned long mfns[]; +} xencow_init_t; + + + +#endif /* _XEN_PUBLIC_IO_XENCOW_H */ + + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 0477f9061c8a tools/Makefile --- a/tools/Makefile Fri Mar 20 17:42:46 2009 +0000 +++ b/tools/Makefile Mon Apr 20 10:21:49 2009 -0700 @@ -26,6 +26,7 @@ SUBDIRS-$(CONFIG_Linux) += fs-back SUBDIRS-$(CONFIG_IOEMU) += ioemu-dir SUBDIRS-y += xenpmd +SUBDIRS-y += xencow # These don't cross-compile ifeq ($(XEN_COMPILE_ARCH),$(XEN_TARGET_ARCH)) diff -r 0477f9061c8a tools/xencow/COPYING --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xencow/COPYING Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. 
+ + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. 
You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. 
+ + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff -r 0477f9061c8a tools/xencow/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xencow/Makefile Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,10 @@ +XEN_ROOT=../.. +include $(XEN_ROOT)/tools/Rules.mk + +SUBDIRS-y := +SUBDIRS-y += lib +SUBDIRS-y += xencowfs +SUBDIRS-y += test + +.PHONY: all clean install +all install clean: %: subdirs-% diff -r 0477f9061c8a tools/xencow/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xencow/README Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,19 @@ +Xen Copy on Write +----------------------- +Provide copy on write functionality for the memory of Xen domains. + + + + +Usage Notes and issues +---------------------- + + +Future Work +----------- + +Authors +------- +Chris Matthews <cmatthew@xxxxxxxxxx> +Geoffrey Lifebvre <geoffrey@xxxxxxxxx> +Brendan Cully <brendan@xxxxxxxxx> diff -r 0477f9061c8a tools/xencow/lib/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xencow/lib/Makefile Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,66 @@ +XEN_ROOT=../../.. +include $(XEN_ROOT)/tools/Rules.mk + +MAKE_LINK=ln -sf + +MAJOR = 0 +MINOR = 0 +SONAME = libxencow.so.$(MAJOR) + +CFLAGS += -I $(XEN_XC) +CFLAGS += -I ./ +CFLAGS += $(CFLAGS_libxenctrl) +LDFLAGS += $(LDFLAGS_libxenctrl) + +SRCS := +SRCS += xc.c xencow.c + +CFLAGS += -Werror +CFLAGS += -Wno-unused +CFLAGS += -fPIC +CFLAGS += -g + +CTRL_LIB_OBJS := $(patsubst %.c,%.o,$(CTRL_SRCS-y)) +CTRL_PIC_OBJS += $(patsubst %.c,%.opic,$(CTRL_SRCS-y)) + +# Get gcc to generate the dependencies for us. 
+CFLAGS += -Wp,-MD,.$(@F).d +DEPS = .*.d + +OBJS = $(SRCS:.c=.o) +OBJS_PIC = $(SRCS:.c=.opic) +IBINS := + +LIB = libxencow.a libxencow.so.$(MAJOR).$(MINOR) + +.PHONY: all +all: $(LIB) + +.PHONY: install +install: all + $(INSTALL_DIR) $(DESTDIR)$(LIBDIR) + $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR) + $(INSTALL_DATA) $(LIB) $(DESTDIR)$(LIBDIR) + $(MAKE_LINK) libxencow.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libxencow.so.$(MAJOR) + $(MAKE_LINK) libxencow.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libxencow.so + $(INSTALL_DATA) xencow.h $(DESTDIR)$(INCLUDEDIR) + $(INSTALL_DATA) xencow_list.h $(DESTDIR)$(INCLUDEDIR) + +.PHONY: clean +clean: + rm -rf *.a *.so* *.o *.opic $(LIB) *~ $(DEPS) xen TAGS + +libxencow.so.$(MAJOR).$(MINOR): $(OBJS_PIC) + $(CC) $(CFLAGS) $(LDFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,libxencow.so.$(MAJOR) $(SHLIB_CFLAGS) -o $@ $^ + $(MAKE_LINK) libxencow.so.$(MAJOR).$(MINOR) libxencow.so.$(MAJOR) + $(MAKE_LINK) libxencow.so.$(MAJOR) libxencow.so + +libxencow.a: $(OBJS) + $(AR) rcs $@ $^ + +.PHONY: TAGS +TAGS: + etags -t $(SRCS) *.h + +-include $(DEPS) + diff -r 0477f9061c8a tools/xencow/lib/xc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xencow/lib/xc.c Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,357 @@ +/****************************************************************************** + * tools/xencow/lib/xc.c + * + * libxc refactorisation. This should be put in libxc ultimately. + * + * Copyright (c) 2009 University of British Columbia (Patrick Colp) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#include <xg_private.h> +#include <xg_save_restore.h> +#include "xc.h" + + +/* + * Returns TRUE if the given machine frame number has a unique mapping + * in the guest's pseudophysical map. + */ +#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \ + (((_mfn) < (max_mfn)) && \ + ((mfn_to_pfn(_mfn) < (p2m_size)) && \ + (pfn_to_mfn(mfn_to_pfn(_mfn)) == (_mfn)))) + + +xen_pfn_t *xc_map_m2p(int xc_handle, unsigned long max_mfn, int prot, + unsigned long *m2p_mfn0) +{ + struct xen_machphys_mfn_list xmml; + privcmd_mmap_entry_t *entries; + unsigned long m2p_chunks; + unsigned long m2p_size; + xen_pfn_t *m2p; + xen_pfn_t *extent_start; + int i; + + m2p = NULL; + m2p_size = M2P_SIZE(max_mfn); + m2p_chunks = M2P_CHUNKS(max_mfn); + + xmml.max_extents = m2p_chunks; + + extent_start = calloc(m2p_chunks, sizeof(xen_pfn_t)); + if ( !extent_start ) + { + ERROR("failed to allocate space for m2p mfns"); + goto err0; + } + set_xen_guest_handle(xmml.extent_start, extent_start); + + if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) + || (xmml.nr_extents != m2p_chunks) ) + { + ERROR("xc_get_m2p_mfns"); + goto err1; + } + + entries = calloc(m2p_chunks, sizeof(privcmd_mmap_entry_t)); + if (entries == NULL) + { + ERROR("failed to allocate space for mmap entries"); + goto err1; + } + + for ( i = 0; i < m2p_chunks; i++ ) + entries[i].mfn = extent_start[i]; + + m2p = xc_map_foreign_ranges(xc_handle, DOMID_XEN, m2p_size, prot, + M2P_CHUNK_SIZE, entries, m2p_chunks); + if (m2p == NULL) + { + ERROR("xc_mmap_foreign_ranges failed"); + goto err2; + } + + *m2p_mfn0 = entries[0].mfn; + + err2: + free(entries); + err1: + free(extent_start); + err0: + return m2p; +} + +/* During transfer (or in the state file), all page-table pages must be + * converted into a 'canonical' form 
where references to actual mfns + * are replaced with references to the corresponding pfns. + * + * This function performs the appropriate conversion, taking into account + * which entries do not require canonicalisation (in particular, those + * entries which map the virtual address reserved for the hypervisor). */ +int xc_canonicalise_pagetable(unsigned long type, unsigned long pfn, + const void *spage, void *dpage, + xen_pfn_t *live_p2m_table, + xen_pfn_t *live_m2p_table, unsigned long m2p_mfn0, + unsigned long p2m_size, unsigned long max_mfn, + unsigned long hvirt_start, unsigned int pt_levels, + unsigned int guest_width) +{ + uint64_t pte; + int pte_last; + int xen_start; + int xen_end; + int i; + int race = 0; + + /* + * We need to determine which entries in this page table hold + * reserved hypervisor mappings. This depends on the current + * page table type as well as the number of paging levels. + */ + xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8); + + if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) ) + xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT); + + if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) ) + xen_start = L3_PAGETABLE_ENTRIES_PAE; + + /* + * In PAE only the L2 mapping the top 1GB contains Xen mappings. + * We can spot this by looking for the guest's mapping of the m2p. + * Guests must ensure that this check will fail for other L2s. + */ + if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) ) + { + int hstart; + uint64_t he; + + hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff; + he = ((const uint64_t *)spage)[hstart]; + + if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 ) + { + /* hvirt starts with xen stuff... */ + xen_start = hstart; + } + else if ( hvirt_start != 0xf5800000 ) + { + /* old L2s from before hole was shrunk... 
*/ + hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff; + he = ((const uint64_t *)spage)[hstart]; + if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 ) + xen_start = hstart; + } + } + + if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) ) + { + /* + * XXX SMH: should compute these from hvirt_start (which we have) + * and hvirt_end (which we don't) + */ + xen_start = 256; + xen_end = 272; + } + + /* Now iterate through the page table, canonicalising each PTE */ + for ( i = 0; i < pte_last; i++ ) + { + unsigned long pfn; + unsigned long mfn; + + if ( pt_levels == 2 ) + pte = ((const uint32_t*)spage)[i]; + else + pte = ((const uint64_t*)spage)[i]; + + if ( (i >= xen_start) && (i < xen_end) ) + pte = 0; + + if ( pte & _PAGE_PRESENT ) + { + mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86; + if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) + { + /* + * This will happen if the type info is stale which + * is quite feasible under live migration + */ + pfn = 0; /* zap it - we'll retransmit this page later */ + /* + * XXX: We can't spot Xen mappings in compat-mode L2es + * from 64-bit tools, but the only thing in them is the + * compat m2p, so we quietly zap them. This doesn't + * count as a race, so don't report it. + */ + if ( !(type == XEN_DOMCTL_PFINFO_L2TAB + && sizeof(unsigned long) > guest_width) ) + race = 1; /* inform the caller; fatal if !live */ + } + else + pfn = mfn_to_pfn(mfn); + + pte &= ~MADDR_MASK_X86; + pte |= (uint64_t)pfn << PAGE_SHIFT; + + /* + * PAE guest L3Es can contain these flags when running on + * a 64bit hypervisor. We zap these here to avoid any + * surprise at restore time... 
+ */ + if ( (pt_levels == 3) + && (type == XEN_DOMCTL_PFINFO_L3TAB) + && (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) ) + pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); + } + + if ( pt_levels == 2 ) + ((uint32_t*)dpage)[i] = pte; + else + ((uint64_t*)dpage)[i] = pte; + } + + return race; +} + +xen_pfn_t *xc_get_live_p2m_table(int xc_handle, domid_t domain_id, + unsigned long p2m_size, + unsigned int guest_width) +{ + xc_dominfo_t info; + shared_info_t *live_shared_info = NULL; + xen_pfn_t *live_p2m_frame_list_list = NULL; + xen_pfn_t *live_p2m_frame_list = NULL; + xen_pfn_t *p2m_frame_list_list = NULL; + xen_pfn_t *p2m_frame_list = NULL; + xen_pfn_t *live_p2m_table = NULL; + int i; + + /* Map the shared info frame */ + if ( xc_domain_getinfo(xc_handle, domain_id, 1, &info) != 1 ) + { + ERROR("could not get domain info"); + goto out; + } + + live_shared_info = xc_map_foreign_range(xc_handle, domain_id, + PAGE_SIZE, PROT_READ, + info.shared_info_frame); + if ( live_shared_info == NULL ) + { + ERROR("could not map live shared info"); + goto out; + } + + /* Get the p2m frame list list */ + live_p2m_frame_list_list = + xc_map_foreign_range(xc_handle, domain_id, PAGE_SIZE, PROT_READ, + live_shared_info->arch.pfn_to_mfn_frame_list_list); + if ( live_p2m_frame_list_list == NULL ) + { + ERROR("could not map live p2m frame list list"); + goto out; + } + + /* Get a local copy of the live_P2M_frame_list_list */ + p2m_frame_list_list = malloc(PAGE_SIZE); + if ( !p2m_frame_list_list ) + { + ERROR("could not allocate p2m_frame_list_list array"); + goto out; + } + memcpy(p2m_frame_list_list, live_p2m_frame_list_list, PAGE_SIZE); + + /* Canonicalise guest's unsigned long vs ours */ + if ( guest_width > sizeof(unsigned long) ) + for ( i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++ ) + if ( i < PAGE_SIZE/guest_width ) + p2m_frame_list_list[i] = ((uint64_t *)p2m_frame_list_list)[i]; + else + p2m_frame_list_list[i] = 0; + else if ( guest_width < sizeof(unsigned long) ) + for ( i = 
PAGE_SIZE/sizeof(unsigned long) - 1; i >= 0; i-- ) + p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i]; + + /* Get the p2m frame list */ + live_p2m_frame_list = xc_map_foreign_batch(xc_handle, domain_id, PROT_READ, + p2m_frame_list_list, + P2M_FLL_ENTRIES); + if ( live_p2m_frame_list == NULL ) + { + ERROR("could not map live p2m frame list"); + goto out; + } + + /* Get a local copy of the live p2m frame_list */ + p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE); + if ( !p2m_frame_list ) + { + ERROR("could not allocate p2m frame list array"); + goto out; + } + memset(p2m_frame_list, 0, P2M_TOOLS_FL_SIZE); + memcpy(p2m_frame_list, live_p2m_frame_list, P2M_GUEST_FL_SIZE); + + /* Canonicalise guest's unsigned long vs ours */ + if ( guest_width > sizeof(unsigned long) ) + for ( i = 0; i < P2M_FL_ENTRIES; i++ ) + p2m_frame_list[i] = ((uint64_t *)p2m_frame_list)[i]; + else if ( guest_width < sizeof(unsigned long) ) + for ( i = P2M_FL_ENTRIES - 1; i >= 0; i-- ) + p2m_frame_list[i] = ((uint32_t *)p2m_frame_list)[i]; + + /* Get the p2m table */ + live_p2m_table = xc_map_foreign_batch(xc_handle, domain_id, PROT_READ, + p2m_frame_list, + P2M_FL_ENTRIES); + if ( live_p2m_table == NULL ) + { + ERROR("could not map live p2m table"); + goto out; + } + + out: + if ( live_p2m_frame_list ) + munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE); + + if ( live_p2m_frame_list_list ) + munmap(live_p2m_frame_list_list, PAGE_SIZE); + + if ( live_shared_info ) + munmap(live_shared_info, PAGE_SIZE); + + if ( p2m_frame_list ) + free(p2m_frame_list); + + if ( p2m_frame_list_list ) + free(p2m_frame_list_list); + + return live_p2m_table; +} + + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 0477f9061c8a tools/xencow/lib/xc.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xencow/lib/xc.h Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,72 @@ 
+/****************************************************************************** + * tools/xencow/lib/xc.h + * + * libxc refactorisation. This should be put in libxc ultimately. + * + * Copyright (c) 2009 University of British Columbia (Patrick Colp) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#define mfn_to_pfn(_mfn) (live_m2p_table[(_mfn)]) + +#define pfn_to_mfn(_pfn) \ + ((xen_pfn_t) (((guest_width)==8) \ + ? (((uint64_t *)live_p2m_table)[(_pfn)]) \ + : ((((uint32_t *)live_p2m_table)[(_pfn)]) == 0xffffffffU \ + ? 
(-1UL) : (((uint32_t *)live_p2m_table)[(_pfn)])))) + + +#if 0 +typedef struct xc_domain_st { + domid_t domain_id + xen_pfn_t *live_p2m_table; + xen_pfn_t *live_m2p_table; + unsigned long m2p_mfn0; + unsigned long p2m_size; + unsigned long max_mfn; + unsigned long hvirt_start; + unsigned int pt_levels; + unsigned int guest_width; +} xc_domain_t; +#endif + + +xen_pfn_t *xc_map_m2p(int xc_handle, unsigned long max_mfn, int prot, + unsigned long *m2p_mfn0); + +int xc_canonicalise_pagetable(unsigned long type, unsigned long pfn, + const void *spage, void *dpage, + xen_pfn_t *live_p2m_table, + xen_pfn_t *live_m2p_table, unsigned long m2p_mfn0, + unsigned long p2m_size, unsigned long max_mfn, + unsigned long hvirt_start, unsigned int pt_levels, + unsigned int guest_width); + +xen_pfn_t *xc_get_live_p2m_table(int xc_handle, domid_t domain_id, + unsigned long p2m_size, + unsigned int guest_width); + + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 0477f9061c8a tools/xencow/lib/xencow.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xencow/lib/xencow.c Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,1072 @@ +/****************************************************************************** + * tools/xencow/lib/xencow.c + * + * VM memory Copy-on-Write library. + * + * Copyright (c) 2009 University of British Columbia (Patrick Colp) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+
+#include <pthread.h>
+#include <signal.h>
+
+#include <xen/domctl.h>
+
+#include <xc_private.h>
+#include <xg_save_restore.h>
+
+#include "xc.h"
+#include "xencow.h"
+
+
+/*
+ * Create filename (truncating it if it already exists) and close it again.
+ * Returns 0 on success, -EIO if the file cannot be opened.
+ */
+static int xencow_create_file(const char *filename)
+{
+    mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
+    int flags = O_CREAT | O_TRUNC | O_RDWR;
+    int fd;
+
+    fd = open(filename, flags, mode);
+    if ( fd < 0 )
+    {
+        ERROR("Error opening file %s", filename);
+        return -EIO;
+    }
+    close(fd);
+
+    return 0;
+}
+
+/*
+ * Send an ioctl to the xencow device.
+ * Opens /dev/xencow0 for the duration of the call.  Returns -EIO if the
+ * device cannot be opened, otherwise the value returned by ioctl().
+ */
+static int xencow_send_ioctl(int cmd, unsigned long arg)
+{
+    int fd;
+    int ret;
+
+    ret = -EIO;
+    fd = open("/dev/xencow0", O_RDWR);
+    if ( fd < 0 )
+    {
+        ERROR("Failed to open xencow device (/dev/xencow0)");
+        goto out;
+    }
+
+    ret = ioctl(fd, cmd, arg);
+    if ( ret != 0 )
+        ERROR("Error during ioctl of xencow device");
+
+    close(fd);
+
+ out:
+    return ret;
+}
+
+/*
+ * Release the ring/page buffer, the MFN array and the cached p2m table.
+ * The xencow_t itself is not freed here.
+ *
+ * NOTE(review): live_p2m_table is filled in by xencow_p2m() from
+ * xc_get_live_p2m_table(), which hands back a foreign mapping rather than
+ * heap memory; free() looks wrong for it and munmap() is probably needed --
+ * confirm before changing.
+ */
+static void xencow_free(xencow_t *cow)
+{
+    munlock(cow->buffer, BUFFER_SIZE);
+    free(cow->buffer);
+    free(cow->mfns);
+
+    if ( cow->live_p2m_table )
+        free(cow->live_p2m_table);
+}
+
+/*
+ * Make sure *bitmap can hold bitmap_size bits and clear all of them.
+ * A buffer is allocated only when *bitmap is NULL; an existing buffer is
+ * reused and must already be large enough.
+ * Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+static int xencow_alloc_bitmap(unsigned long **bitmap, unsigned long bitmap_size)
+{
+    /*
+     * Round up to whole words: a size that is not a multiple of
+     * BITS_PER_LONG still needs a word for its trailing bits.
+     */
+    unsigned long nr_longs = (bitmap_size + BITS_PER_LONG - 1) / BITS_PER_LONG;
+
+    if ( *bitmap == NULL )
+    {
+        *bitmap = calloc(nr_longs, sizeof(unsigned long));
+        if ( *bitmap == NULL )
+            return -ENOMEM;
+    }
+
+    memset(*bitmap, 0, nr_longs * sizeof(unsigned long));
+
+    return 0;
+}
+
+/*
+ * Event loop for the handler thread; c is the xencow_t to service.
+ * Each iteration waits for an event with a timeout of 10 (passed as the
+ * "ms" argument) and never returns.
+ */
+static void *xencow_handle_events(void *c)
+{
+    xencow_t *cow = (xencow_t *)c;
+
+    IPRINTF("Starting resume thread\n");
+
+    while (1)
+    {
+        int port = xencow_wait_for_event_or_timeout(cow, 10);
+
+        /* A timeout (-1) is handled like a buffer event: flush */
+        if ( port == cow->buffer_port || port == -1 )
+        {
+            if ( port == cow->buffer_port )
+                DPRINTF("Got buffer event\n");
+            xencow_flush_buffer(cow);
+        }
+        else if ( port == cow->pause_port )
+        {
+            /* If it was a pause event, flush buffer and resume domain */
+            int rc;
+
+            DPRINTF("Got pause event\n");
+
+            xencow_flush_buffer(cow);
+
+            rc = xencow_resume(cow);
+            if ( rc != 0 )
+                ERROR("Failed to resume domain");
+        }
+        else
+            ERROR("Unknown event");
+    }
+}
+
+/*
+ * Start a detached thread running __start_routine(cow).
+ *
+ * SIGTERM, SIGINT, SIGHUP and SIGQUIT are blocked while the thread is
+ * created so that the new thread inherits them blocked; the caller's own
+ * mask is put back before returning, on success and on failure alike.
+ * Returns 0 on success, -EIO if the thread cannot be created.
+ */
+static int xencow_start_thread(xencow_t *cow, void *(*__start_routine) (void *))
+{
+    pthread_t thread;
+    sigset_t oldset;
+    sigset_t newset;
+    int ret;
+
+    sigemptyset(&newset);
+    sigaddset(&newset, SIGTERM);
+    sigaddset(&newset, SIGINT);
+    sigaddset(&newset, SIGHUP);
+    sigaddset(&newset, SIGQUIT);
+    pthread_sigmask(SIG_BLOCK, &newset, &oldset);
+
+    ret = pthread_create(&thread, NULL, __start_routine, cow);
+
+    /* Restore the mask first so a failed create does not leave the
+     * calling thread with these signals blocked. */
+    pthread_sigmask(SIG_SETMASK, &oldset, NULL);
+
+    if ( ret != 0 )
+    {
+        ERROR("Failed to create thread");
+        return -EIO;
+    }
+
+    pthread_detach(thread);
+
+    return 0;
+}
+
+/*
+ * Allocate and lock the shared buffer (ring followed by page buffer),
+ * initialise the ring, and register the buffer with the xencow device.
+ */
+static int xencow_init_buffer(xencow_t *cow)
+{
+    void *buffer;
+    cow_init_t *cow_init;
+    cow_request_t req;
+    RING_IDX req_prod;
+    int num_pages;
+    int i;
+    int ret;
+
+    DPRINTF("buffer size: %ld\n", BUFFER_SIZE);
+
+    /* Allocate page-aligned buffer */
+    ret = posix_memalign(&buffer, PAGE_SIZE, BUFFER_SIZE);
+    if ( ret != 0 )
+        goto out_alloc;
+
+    /* Lock buffer in memory so it can't be paged out */
+    ret = mlock(buffer, BUFFER_SIZE);
+    if ( ret != 0 )
+        goto out_lock;
+
+    cow->buffer = buffer;
+    cow->page_buffer = buffer + XEN_COW_RING_SIZE;
+
+    /* Initialise ring */
+    SHARED_RING_INIT((cow_sring_t *)cow->buffer);
+    FRONT_RING_INIT(&cow->front_ring, (cow_sring_t *)cow->buffer, XEN_COW_RING_SIZE);
+
+    num_pages = XEN_COW_RING_PAGES + RING_SIZE(&cow->front_ring);
+
+    DPRINTF("number of ring entries: %u\n", RING_SIZE(&cow->front_ring));
+
+    /* Allocate memory for ioctl struct */
+    ret = -ENOMEM;
+    cow_init = malloc(sizeof(cow_init_t) + (sizeof(unsigned long) * num_pages));
+    if ( cow_init ==
NULL )
+        goto out_init;  /* buffer is locked by now: munlock it before freeing */
+
+    /* Initialise ioctl struct */
+    cow_init->addr = (unsigned long)(cow->buffer);
+    cow_init->num_mfns = num_pages;
+
+    /* Get MFNs */
+    ret = xencow_send_ioctl(XEN_COW_IOCTL_INIT, (unsigned long)cow_init);
+    if ( ret != 0 )
+        goto out;
+
+    /* Allocate memory for CoW struct */
+    cow->num_mfns = RING_SIZE(&cow->front_ring);
+    cow->mfns = calloc(cow->num_mfns, sizeof(unsigned long));
+    if ( cow->mfns == NULL )
+    {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    /* Copy MFNs: entry 0 is the ring page, the page buffer MFNs follow
+     * after the first XEN_COW_RING_PAGES entries */
+    cow->sring_mfn = cow_init->mfns[0];
+    memcpy(cow->mfns, &cow_init->mfns[XEN_COW_RING_PAGES],
+           sizeof(unsigned long) * cow->num_mfns);
+
+    /* Fill ring with page buffer MFNs */
+    req_prod = cow->front_ring.req_prod_pvt;
+    for ( i = 0; i < cow->num_mfns; i++ )
+    {
+        req.mfn = cow->mfns[i];
+        memcpy(RING_GET_REQUEST(&cow->front_ring, req_prod + i), &req,
+               sizeof(cow_request_t));
+    }
+
+    cow->front_ring.req_prod_pvt = req_prod + i;
+    RING_PUSH_REQUESTS(&cow->front_ring);
+
+    free(cow_init);
+    return 0;
+
+ out:
+    free(cow_init);
+ out_init:
+    /*
+     * cow->buffer aliases buffer from here on.  Clear it so that a later
+     * xencow_free() does not munlock/free the same memory a second time.
+     */
+    cow->buffer = NULL;
+    cow->page_buffer = NULL;
+    munlock(buffer, BUFFER_SIZE);
+ out_lock:
+    free(buffer);
+ out_alloc:
+    return ret;
+}
+
+/*
+ * Open the Xen control interface and the event channel, and bind the
+ * VIRQ_COW_BUFFER and VIRQ_COW_PAUSE ports.
+ * Returns 0 on success, -EINVAL on any failure.  Handles opened before
+ * the failure are left in cow for the caller to deal with.
+ */
+static int xencow_init_xen(xencow_t *cow)
+{
+    /* Open connection to Xen */
+    cow->xc_handle = xc_interface_open();
+    if ( cow->xc_handle < 0 )
+    {
+        ERROR("Failed to connect to Xen");
+        goto err;
+    }
+
+    /* Open event channel */
+    cow->xce_handle = xc_evtchn_open();
+    if ( cow->xce_handle < 0 )
+    {
+        ERROR("Failed to open event channel");
+        goto err;
+    }
+
+    /* Bind VIRQ ports for event notification */
+    cow->buffer_port = xc_evtchn_bind_virq(cow->xce_handle, VIRQ_COW_BUFFER);
+    if ( cow->buffer_port < 0 )
+    {
+        ERROR("Failed to bind VIRQ");
+        goto err;
+    }
+
+    cow->pause_port = xc_evtchn_bind_virq(cow->xce_handle, VIRQ_COW_PAUSE);
+    if ( cow->pause_port < 0 )
+    {
+        ERROR("Failed to bind VIRQ");
+        goto err;
+    }
+
+    return 0;
+
+ err:
+    return -EINVAL;
+}
+
+/*
+ * Fill in the per-domain fields of cow: HVM flag, p2m size and platform
+ * info.  Returns 0 on success, -EINVAL on failure.
+ */
+static int xencow_init_domain_info(xencow_t *cow)
+{
+    xc_dominfo_t info;
+    int rc;
+
+    /* Get HVM info */
+    rc = xc_domain_getinfo(cow->xc_handle,
cow->domain_id, 1, &info);
+    if ( rc != 1 )
+    {
+        ERROR("Failed to get domain info");
+        goto err;
+    }
+    cow->is_hvm = info.hvm;
+
+    /* Get memory size */
+    cow->p2m_size = xc_memory_op(cow->xc_handle, XENMEM_maximum_gpfn,
+                                 &cow->domain_id) + 1;
+
+    /* Get platform info */
+    rc = get_platform_info(cow->xc_handle, cow->domain_id,
+                           &cow->platform_info.max_mfn,
+                           &cow->platform_info.hvirt_start,
+                           &cow->platform_info.pt_levels,
+                           &cow->platform_info.guest_width);
+    if ( rc != 1 )
+    {
+        ERROR("Failed to get platform info");
+        goto err;
+    }
+
+    return 0;
+
+ err:
+    return -EINVAL;
+}
+
+/*
+ * Create and initialise a CoW context for domain domid: allocate the
+ * context, set up the shared buffer, connect to Xen, read the domain
+ * info and start the event handler thread.
+ *
+ * Returns the new context, or NULL on failure.  On failure everything
+ * acquired so far (buffer, Xen handles, the context itself) is released.
+ * errno is set to ENOMEM only when the context itself cannot be allocated.
+ */
+xencow_t *xencow_init(domid_t domid)
+{
+    xencow_t *cow;
+    int rc;
+
+    /* Initialise CoW struct (zero-filled) */
+    cow = calloc(1, sizeof(xencow_t));
+    if ( cow == NULL )
+    {
+        errno = ENOMEM;
+        goto cow_out;
+    }
+
+    cow->domain_id = domid;
+    cow->xc_handle = -1;
+    cow->xce_handle = -1;
+
+    INIT_LIST_HEAD(&cow->snapshots);
+
+    /* Initialise locks */
+    cow_ring_lock_init(cow);
+    cow_snapshots_lock_init(cow);
+
+    /* Initialise buffer */
+    IPRINTF("Initialising buffer\n");
+    rc = xencow_init_buffer(cow);
+    if ( rc != 0 )
+    {
+        ERROR("Failed to initialise buffer");
+        /*
+         * xencow_init_buffer() releases its own allocations when it
+         * fails, so xencow_free() must not run on them again.
+         */
+        goto out_free_cow;
+    }
+
+    /* Initialise connection to Xen */
+    rc = xencow_init_xen(cow);
+    if ( rc != 0 )
+    {
+        ERROR("Failed to initialise connection to Xen");
+        goto out;
+    }
+
+    /* Get domain info */
+    rc = xencow_init_domain_info(cow);
+    if ( rc != 0 )
+    {
+        ERROR("Failed to get domain info");
+        goto out;
+    }
+
+    /* Start event handler thread */
+    rc = xencow_start_thread(cow, xencow_handle_events);
+    if ( rc != 0 )
+    {
+        ERROR("Failed to start event handler thread");
+        goto out;
+    }
+
+    return cow;
+
+ out:
+    /* Both handles start out as -1, so only close what was opened */
+    if ( cow->xce_handle >= 0 )
+        xc_evtchn_close(cow->xce_handle);
+    if ( cow->xc_handle >= 0 )
+        xc_interface_close(cow->xc_handle);
+    xencow_free(cow);
+ out_free_cow:
+    free(cow);
+ cow_out:
+    return NULL;
+}
+
+/*
+ * Open, read-only, the file of snapshot that holds pfn: the state file
+ * when state_pfn(pfn) is true, the backing file otherwise.
+ * Returns the descriptor from open(), negative on failure.
+ */
+static int xencow_open_snapshot_file_for_reading(xencow_snapshot_t *snapshot,
+                                                 unsigned long pfn)
+{
+    int open_flags = O_RDONLY;
+    mode_t open_mode = S_IRUSR | S_IRGRP | S_IROTH;
+    char *filename;
+
+    /* Open file */
+    if ( state_pfn(pfn) )
+        filename = snapshot->state_file;
+    else
+        filename = snapshot->backing_file;
+
+    return open(filename,
open_flags, open_mode);
+}
+
+/*
+ * Pick the snapshot a page with ring index "now" belongs to (the last
+ * snapshot in the list whose "when" is <= now), store it in *snapshot,
+ * and open its state or backing file for read/write depending on
+ * state_pfn(pfn).
+ *
+ * Returns the descriptor from open().  Returns -1 with errno = ENOENT,
+ * and *snapshot set to NULL, when no snapshot qualifies.
+ */
+static int xencow_open_snapshot_file_for_writing(xencow_t *cow,
+                                                 xencow_snapshot_t **snapshot,
+                                                 RING_IDX now,
+                                                 unsigned long pfn)
+{
+    int open_flags = O_RDWR;
+    mode_t open_mode = S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR | S_IWGRP | S_IWOTH;
+    char *filename;
+
+    /* Get the right snapshot to copy this page to */
+    list_for_each_entry_reverse ( (*snapshot), &cow->snapshots, list )
+        if ( (*snapshot)->when <= now )
+            break;
+
+    /*
+     * If the walk ran off the list (empty list, or every snapshot is
+     * newer than "now") the cursor refers to the list head rather than
+     * to a real snapshot and must not be dereferenced.
+     */
+    if ( &(*snapshot)->list == &cow->snapshots )
+    {
+        *snapshot = NULL;
+        errno = ENOENT;
+        return -1;
+    }
+
+    /* Open file */
+    if ( state_pfn(pfn) )
+        filename = (*snapshot)->state_file;
+    else
+        filename = (*snapshot)->backing_file;
+
+    return open(filename, open_flags, open_mode);
+}
+
+/*
+ * Read the PAGE_SIZE bytes stored for pfn from fd into buffer_page.
+ * The file offset is pfn_offset(pfn).
+ *
+ * Returns 0 once a full page has been read, a negative errno value if
+ * seeking or reading fails, and -EIO if the file ends before a whole
+ * page is available.
+ */
+static int xencow_read_page(int fd, unsigned long pfn, void *buffer_page)
+{
+    off_t offset;
+    off_t seek_ret;
+    int total_read;
+    int ret;
+
+    offset = pfn_offset(pfn);
+
+    seek_ret = lseek64(fd, offset, SEEK_SET);
+    if ( seek_ret == (off_t)-1 )
+    {
+        ret = -errno;
+        goto err;
+    }
+
+    total_read = 0;
+    while ( total_read < PAGE_SIZE )
+    {
+        void *p = buffer_page + total_read;
+        int bytes_read = read(fd, p, PAGE_SIZE - total_read);
+
+        if ( bytes_read < 0 )
+        {
+            /* Interrupted before any data was transferred: retry */
+            if ( errno == EINTR )
+                continue;
+            ret = -errno;
+            goto err;
+        }
+
+        if ( bytes_read == 0 )
+        {
+            /* EOF does not set errno; report the short page explicitly
+             * instead of returning 0, which would look like success */
+            ret = -EIO;
+            goto err;
+        }
+
+        total_read += bytes_read;
+    }
+
+    return 0;
+
+ err:
+    ERROR("Read error");
+    return ret;
+}
+
+/*
+ * Fill buffer_page with the current contents of guest page pfn, or with
+ * zeroes when the page is not mapped or is not to be copied.
+ */
+static int xencow_read_live_page(xencow_t *cow, unsigned long pfn, void *buffer_page)
+{
+    unsigned long mfn;
+    int ret;
+
+    /* Get MFN */
+    mfn = xencow_p2m(cow, pfn);
+
+    /* Check if MFN is mapped */
+    if ( is_mapped(mfn) )
+    {
+        void *page = xc_map_foreign_batch(cow->xc_handle, cow->domain_id, PROT_READ, &mfn, 1);
+        int copy_frame = 0;
+
+        if ( cow->is_hvm )
+        {
+            if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) != XEN_DOMCTL_PFINFO_XTAB )
+                copy_frame = 1;
+        }
+        else
+        {
+            /* Hand the MFN to the type query as a 32-bit value, in place */
+            ((uint32_t *)(&mfn))[0] = mfn;
+
+            ret = xc_get_pfn_type_batch(cow->xc_handle, cow->domain_id, 1, (uint32_t *)(&mfn));
+            if ( ret != 0 )
+            {
+                ERROR("get_pfn_type_batch failed");
+                goto err;
+            }
+            mfn =
(uint32_t)mfn; + + if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) != XEN_DOMCTL_PFINFO_XTAB ) + { + /* Canonicalise mfn -> pfn */ + mfn = (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) | pfn; + copy_frame = 1; + } + } + + if ( copy_frame ) + { + /* Copy live page */ + if ( page != NULL ) + memcpy(buffer_page, page, PAGE_SIZE); + else + memset(buffer_page, 0, PAGE_SIZE); + } + else + /* Copy blank page */ + memset(buffer_page, 0, PAGE_SIZE); + + munmap(page, PAGE_SIZE); + } + else + /* Copy blank page */ + memset(buffer_page, 0, PAGE_SIZE); + + return 0; + + err: + return ret; +} + +static int xencow_read_snapshot_page(xencow_snapshot_t *snapshot, + unsigned long pfn, + void *buffer_page) +{ + int ret; + + if ( test_bit(pfn, snapshot->bitmap) ) + { + /* Open file for reading */ + int fd = xencow_open_snapshot_file_for_reading(snapshot, pfn); + if ( fd < 0 ) + { + ERROR("Error opening file"); + ret = -errno; + goto out; + } + + /* Read file and close */ + ret = xencow_read_page(fd, pfn, buffer_page); + close(fd); + } + else + ret = -ENOENT; + + out: + return ret; +} + +int xencow_read_buffer(xencow_t *cow, int snapshot_num, unsigned long start_pfn, + int num_pages, void *buffer) +{ + int open_flags = O_RDONLY; + mode_t open_mode = S_IRUSR | S_IRGRP | S_IROTH; + xencow_snapshot_t *snapshot; + int current_num; + int fd; + int i; + int ret; + + cow_snapshots_lock(cow); + + current_num = 0; + list_for_each_entry ( snapshot, &cow->snapshots, list ) + { + if ( current_num == snapshot_num ) + break; + current_num++; + } + + /* Open file */ + fd = xencow_open_snapshot_file_for_reading(snapshot, start_pfn); + if ( fd < 0 ) + { + ret = -errno; + goto out_open; + } + + /* Read pages */ + for ( i = 0; i < num_pages; i++ ) + { + void *buffer_page = buffer + (i * PAGE_SIZE); + unsigned long pfn = start_pfn + i; + + /* Check bitmap for page */ + if ( test_bit(pfn, snapshot->bitmap) ) + { + ret = xencow_read_page(fd, pfn, buffer_page); + if ( ret != 0 ) + goto out; + } + else + { + int found = 0; + 
+ /* Check later snapshots */ + list_for_each_entry_continue ( snapshot, &cow->snapshots, list ) + { + ret = xencow_read_snapshot_page(snapshot, pfn, buffer_page); + if ( ret == 0 ) + { + found = 1; + break; + } + } + + if ( !found ) + { + /* If not found, read page from live domain */ + ret = xencow_read_live_page(cow, pfn, buffer_page); + if ( ret != 0 ) + goto out; + + /* Flush buffer */ + cow_snapshots_unlock(cow); + xencow_flush_buffer(cow); + cow_snapshots_lock(cow); + + /* Check (latest) bitmap for page again */ + snapshot = list_bottom(&cow->snapshots, xencow_snapshot_t, list); + + ret = xencow_read_snapshot_page(snapshot, pfn, buffer_page); + if ( ret == 0 ) + DPRINTF("Page dirtied since read from live\n"); + } + } + } + + ret = 0; + + out: + close(fd); + out_open: + cow_snapshots_unlock(cow); + return ret; +} + +static int xencow_write_page(int fd, off_t offset, void *page) +{ + int total_written; + int ret; + off_t seek_ret; + + seek_ret = lseek64(fd, offset, SEEK_SET); +#if 0 + if ( ret < 0 ) + { + ERROR("Error seeking: %ld (%lx)\n", (long)offset, offset_pfn(offset)); + ret = -errno; + goto out; + } +#endif + + /* Write page */ + total_written = 0; + while ( total_written < PAGE_SIZE ) + { + void *p = page + total_written; + int bytes_written = write(fd, p, PAGE_SIZE - total_written); + + DPRINTF("Writing first chunk: %lx\n", *((unsigned long *)p)); + + if ( bytes_written <= 0 ) + { + ERROR("Error writing"); + ret = -errno; + goto out; + } + + total_written += bytes_written; + + DPRINTF("Wrote %d bytes\n", bytes_written); + } + + ret = 0; + + out: + return ret; +} + +static int xencow_flush_page(xencow_t *cow, xencow_snapshot_t *snapshot, + RING_IDX now, int fd, unsigned long pfn, + void *page) +{ + off_t offset; + int ret; + + if ( state_pfn(pfn) ) + offset = pfn_offset(now - snapshot->when); + else if ( !test_and_set_bit(pfn, snapshot->bitmap) ) + offset = pfn_offset(pfn); + else + return -1; + + /* Write to file */ + ret = xencow_write_page(fd, 
offset, page);
+    if ( ret != 0 && !state_pfn(pfn) )
+        clear_bit(pfn, snapshot->bitmap);
+
+    return ret;
+}
+
+#define BATCH_REQS 1
+
+/*
+ * Drain the response ring: write every buffered page to the file of the
+ * snapshot it belongs to, and hand the page's MFN back through the
+ * request ring.  Does nothing until the first snapshot has been taken
+ * (cow->next_snapshot == 0).
+ *
+ * Holds the ring lock for the whole drain and takes the snapshots lock
+ * only while choosing/opening the output file.  If a file cannot be
+ * opened the drain stops and both locks are released before returning.
+ *
+ * With BATCH_REQS set, the ring indices are published once after the
+ * loop instead of after every page.
+ */
+void xencow_flush_buffer(xencow_t *cow)
+{
+    cow_request_t req;
+    RING_IDX req_prod;
+    RING_IDX rsp_prod;
+    RING_IDX i;
+    xencow_snapshot_t *snapshot = NULL;
+    int fd = -1;
+    int fd_is_state = 0;
+
+    if ( cow->next_snapshot == 0 )
+        return;
+
+    cow_ring_lock(cow);
+
+    rsp_prod = cow->front_ring.sring->rsp_prod;
+    req_prod = cow->front_ring.sring->req_prod;
+
+    /* Flush buffer pages */
+    for ( i = cow->front_ring.rsp_cons; i != rsp_prod; i++ )
+    {
+        cow_response_t rsp;
+        void *page = cow->page_buffer +
+            (RING_MASK(&cow->front_ring, i) << PAGE_SHIFT);
+
+        memcpy(&rsp, RING_GET_RESPONSE(&cow->front_ring, i),
+               sizeof(cow_response_t));
+
+        DPRINTF("num: %lx; pfn: %lx; page first chunk: %lx\n",
+                (unsigned long)i, rsp.pfn, *((unsigned long *)page));
+
+        /* Open appropriate file */
+        cow_snapshots_lock(cow);
+
+        /*
+         * (Re)open only when no file is open yet or the page kind
+         * (state vs. normal) differs from the file currently open.
+         */
+        if ( (fd < 0) ||
+             (state_pfn(rsp.pfn) && !fd_is_state) ||
+             (!state_pfn(rsp.pfn) && fd_is_state))
+        {
+            if ( fd >= 0 )
+                close(fd);
+
+            fd = xencow_open_snapshot_file_for_writing(cow, &snapshot, i, rsp.pfn);
+            if ( fd < 0 )
+            {
+                ERROR("Error opening file");
+                /* Do not leave with the locks held: the next flush
+                 * would block on them forever */
+                cow_snapshots_unlock(cow);
+                cow_ring_unlock(cow);
+                return;
+            }
+
+            fd_is_state = state_pfn(rsp.pfn);
+            if ( fd_is_state )
+                DPRINTF("state page: %d\n", i);
+            else
+                DPRINTF("normal page: %d\n", i);
+        }
+
+        cow_snapshots_unlock(cow);
+
+        /* Flush buffer page */
+        xencow_flush_page(cow, snapshot, i, fd, rsp.pfn, page);
+
+#if !BATCH_REQS
+        cow->front_ring.rsp_cons = i + 1;
+        cow->front_ring.sring->rsp_event = i + 2;
+#endif
+
+        /* Put buffer page MFN in ring */
+        req.mfn = cow->mfns[RING_MASK(&cow->front_ring, i)];
+        memcpy(RING_GET_REQUEST(&cow->front_ring, req_prod), &req,
+               sizeof(cow_request_t));
+        req_prod++;
+
+#if !BATCH_REQS
+        /* Push added MFN out */
+        cow->front_ring.req_prod_pvt = req_prod;
+        RING_PUSH_REQUESTS(&cow->front_ring);
+#endif
+    }
+
+    if ( fd >= 0 )
+        close(fd);
+
+#if BATCH_REQS
+
cow->front_ring.rsp_cons = i; + cow->front_ring.sring->rsp_event = i + 1; + + /* Push added MFNs out */ + cow->front_ring.req_prod_pvt = req_prod; + RING_PUSH_REQUESTS(&cow->front_ring); +#endif + + cow_ring_unlock(cow); +} + +int xencow_resume(xencow_t *cow) +{ + DECLARE_DOMCTL; + + domctl.cmd = XEN_DOMCTL_cow_resume; + domctl.domain = cow->domain_id; + + return do_domctl(cow->xc_handle, &domctl); +} + +int xencow_enable(xencow_t *cow) +{ + DECLARE_DOMCTL; + + domctl.cmd = XEN_DOMCTL_cow_enable; + domctl.domain = cow->domain_id; + domctl.u.cow_enable.mfn = cow->sring_mfn; + + return do_domctl(cow->xc_handle, &domctl); +} + +int xencow_disable(xencow_t *cow) +{ + DECLARE_DOMCTL; + + domctl.cmd = XEN_DOMCTL_cow_disable; + domctl.domain = cow->domain_id; + + return do_domctl(cow->xc_handle, &domctl); +} + +unsigned long xencow_p2m(xencow_t *cow, unsigned long pfn) +{ + unsigned long mnf; + unsigned long *live_p2m_table; + + if ( cow->is_hvm ) + return pfn; + + if ( cow->live_p2m_table == NULL ) + cow->live_p2m_table = + xc_get_live_p2m_table(cow->xc_handle, cow->domain_id, cow->p2m_size, + cow->platform_info.guest_width); + + return cow->live_p2m_table[pfn]; +} + +static int xencow_create_snapshot_files(xencow_t *cow, xencow_snapshot_t *snapshot) +{ + char *fuse_file = malloc(8 * sizeof(int) + 4); + char *backing_file = malloc(200); + char *state_file = malloc(200); + int ret; + + /* FIXME: Don't hardcode the path */ + /* Get file names */ + sprintf(fuse_file, "%d.%d", cow->domain_id, cow->next_snapshot); + sprintf(backing_file, "/tmp/xencow%s", fuse_file); + sprintf(state_file, "%s.state", backing_file); + + DPRINTF("fuse: %s; backing: %s; state: %s\n", fuse_file, backing_file, state_file); + + /* Create backing files */ + ret = xencow_create_file(backing_file); + if ( ret != 0 ) + return ret; + + ret = xencow_create_file(state_file); + if ( ret != 0 ) + return ret; + + /* Store file names */ + snapshot->xencowfs_file = malloc(strlen(fuse_file) + 1); + 
strncpy(snapshot->xencowfs_file, fuse_file, strlen(fuse_file) + 1); + + snapshot->backing_file = malloc(strlen(backing_file) + 1); + strncpy(snapshot->backing_file, backing_file, strlen(backing_file) + 1); + + snapshot->state_file = malloc(strlen(state_file) + 1); + strncpy(snapshot->state_file, state_file, strlen(state_file) + 1); + + return 0; +} + +static int xencow_init_snapshot(xencow_t *cow) +{ + xencow_snapshot_t *snapshot = malloc(sizeof(xencow_snapshot_t)); + int ret; + + memset(snapshot, 0, sizeof(xencow_snapshot_t)); + + ret = xencow_alloc_bitmap(&snapshot->bitmap, cow->p2m_size); + if ( ret != 0 ) + { + ERROR("Error allocating bitmap"); + return ret; + } + + ret = xencow_create_snapshot_files(cow, snapshot); + if ( ret != 0 ) + { + ERROR("Error creating backing files"); + return ret; + } + + list_add_tail(&snapshot->list, &cow->snapshots); + + return 0; +} + +static int xencow_take_snapshot(xencow_t *cow) +{ + struct timeval before; + struct timeval after; + double time_diff; + xencow_snapshot_t *snapshot; + int ret; + DECLARE_DOMCTL; + + domctl.cmd = XEN_DOMCTL_cow_snapshot; + domctl.domain = cow->domain_id; + + gettimeofday(&before, NULL); + + ret = do_domctl(cow->xc_handle, &domctl); + if ( ret != 0 ) + { + ERROR("Error taking snapshot"); + return ret; + } + + gettimeofday(&after, NULL); + + time_diff = difftime(after.tv_usec, before.tv_usec); + IPRINTF("Time spent paused: %fus\n", time_diff); + + ret = xencow_init_snapshot(cow); + if ( ret != 0 ) + { + ERROR("Error initialising snapshot"); + return ret; + } + + snapshot = list_bottom(&cow->snapshots, xencow_snapshot_t, list); + snapshot->when = domctl.u.cow_snapshot.when; + + cow->next_snapshot++; + + DPRINTF("when = %d\n", snapshot->when); + + return ret; +} + +void xencow_cleanup(xencow_t *cow) +{ + /* Disable CoW */ + xencow_disable(cow); + + /* Reset variables */ + cow->p2m_size = 0; + cow->num_mfns = 0; + cow->sring_mfn = 0; + + /* Close event channel */ + xc_evtchn_close(cow->xce_handle); + 
cow->xce_handle = -1; + + /* Close connection to Xen */ + xc_interface_close(cow->xc_handle); + cow->xc_handle = -1; + + /* Free memory */ + xencow_free(cow); +} + +int xencow_snapshot(xencow_t *cow) +{ + int ret = -1; + + cow_snapshots_lock(cow); + ret = xencow_take_snapshot(cow); + cow_snapshots_unlock(cow); + + if ( ret != 0 ) + { + ERROR("Error taking snapshot"); + return ret; + } + + return 0; +} + +int xencow_wait_for_event_or_timeout(xencow_t *cow, unsigned long ms) +{ + struct pollfd fd = { .fd = cow->xce_handle, .events = POLLIN | POLLERR }; + int port; + int rc; + + rc = poll(&fd, 1, ms); + if ( rc == -1 ) + { + if (errno == EINTR) + return 0; + ERROR("Poll exited with an error"); + return -2; + } + + if ( rc == 1 ) + { + port = xc_evtchn_pending(cow->xce_handle); + if ( port == -1 ) + { + ERROR("Failed to read port from event channel"); + return -2; + } + + rc = xc_evtchn_unmask(cow->xce_handle, port); + if ( rc == -1 ) + { + ERROR("Failed to unmask event channel port"); + return -2; + } + } + else + port = -1; + + return port; +} + +int xencow_wait_for_event(xencow_t *cow) +{ + return xencow_wait_for_event_or_timeout(cow, -1); +} + +int xencow_page_type(xencow_t *cow, unsigned long pfn, + unsigned long *count_info, unsigned long *type_info) +{ + unsigned long mfn; + int ret; + DECLARE_DOMCTL; + + mfn = xencow_p2m(cow, pfn); + + domctl.cmd = XEN_DOMCTL_cow_page_type; + domctl.domain = cow->domain_id; + domctl.u.cow_page_type.mfn = mfn; + + ret = do_domctl(cow->xc_handle, &domctl); + if ( ret != 0 ) + { + *count_info = domctl.u.cow_page_type.count_info; + *type_info = domctl.u.cow_page_type.type_info; + } + + return ret; +} + + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 0477f9061c8a tools/xencow/lib/xencow.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xencow/lib/xencow.h Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,284 @@ 
+/****************************************************************************** + * tools/xencow/lib/xencow.h + * + * VM memory Copy-on-Write library. + * + * Copyright (c) 2009 University of British Columbia (Patrick Colp) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#ifndef __XEN_COW_H__ +#define __XEN_COW_H__ + + +#include <inttypes.h> +#include <xen/xen.h> +#include <xen/io/cow.h> +#include <xen/event_channel.h> +#include <xen/domctl.h> +#include <xenctrl.h> +#include <xc_private.h> +#include "xencow_list.h" + + +#define STATE_MFN ((unsigned long)(-1)) + +#define state_pfn(_pfn) ((_pfn) == STATE_MFN) + +#define BUFFER_SIZE \ + ((((PAGE_SIZE >> 1) / sizeof(unsigned long)) + XEN_COW_RING_PAGES) \ + << PAGE_SHIFT) + + +#define offset_pfn(_offset) ((_offset) >> PAGE_SHIFT) + +#define pfn_offset(_pfn) (((off_t)(_pfn)) << PAGE_SHIFT) + + +#define BITS_PER_LONG (sizeof(unsigned long) * 8) + +/* XXX: stolen from xen/asm/bitops.h */ +/* XXX: should these be in here? are they required to work externally? 
*/ +#ifdef CONFIG_SMP +#define LOCK_PREFIX "lock ; " +#else +#define LOCK_PREFIX "" +#endif + +#define ADDR (*(volatile long *) addr) +#define CONST_ADDR (*(const volatile long *) addr) + +/** + * clear_bit - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * clear_bit() is atomic and may not be reordered. However, it does + * not contain a memory barrier, so if it is used for locking purposes, + * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * in order to ensure changes are visible on other processors. + */ +static inline void clear_bit(int nr, volatile void *addr) +{ + asm volatile ( + LOCK_PREFIX + "btrl %1,%0" + : "=m" (ADDR) + : "Ir" (nr), "m" (ADDR) : "memory"); +} + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int test_and_set_bit(int nr, volatile void *addr) +{ + int oldbit; + + asm volatile ( + LOCK_PREFIX + "btsl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) + : "Ir" (nr), "m" (ADDR) : "memory"); + return oldbit; +} + +static inline int test_bit(int nr, const volatile void *addr) +{ + int oldbit; + + asm volatile ( + "btl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit) + : "m" (CONST_ADDR), "Ir" (nr) : "memory" ); + return oldbit; +} + +static inline int testandset (int *p) +{ + long int readval = 0; + + __asm__ __volatile__ ("lock; cmpxchgl %2, %0" + : "+m" (*p), "+a" (readval) + : "r" (1) + : "cc"); + return readval; +} + + +/* Spin lock */ +typedef int spinlock_t; + +#define SPIN_LOCK_UNLOCKED 0 + +static inline void spin_lock(spinlock_t *lock) +{ +// while ( test_and_set_bit(1, lock) ); + while ( testandset(lock) ); +} + +static inline void spin_lock_init(spinlock_t *lock) +{ + *lock = SPIN_LOCK_UNLOCKED; +} + +static inline void spin_unlock(spinlock_t *lock) +{ + *lock = SPIN_LOCK_UNLOCKED; +} 
+ +static inline int spin_trylock(spinlock_t *lock) +{ + return !testandset(lock); +} + +/* CoW ring lock */ +#define cow_ring_lock_init(_c) spin_lock_init(&(_c)->ring_lock) +#define cow_ring_lock(_c) spin_lock(&(_c)->ring_lock) +#define cow_ring_unlock(_c) spin_unlock(&(_c)->ring_lock) + +/* CoW snapshots list */ +#define cow_snapshots_lock_init(_c) spin_lock_init(&(_c)->snapshots_lock) +#define cow_snapshots_lock(_c) spin_lock(&(_c)->snapshots_lock) +#define cow_snapshots_unlock(_c) spin_unlock(&(_c)->snapshots_lock) + + +typedef struct xencow_snapshot_st { + struct list_head list; + + /* bitmap of PFNs that have been saved */ + unsigned long *bitmap; + + /* when the snapshot was taken */ + RING_IDX when; + + /* files for snapshot image */ + char *xencowfs_file; + char *state_file; + char *backing_file; +} xencow_snapshot_t; + +typedef struct platform_info_st { + unsigned long max_mfn; + unsigned long hvirt_start; + unsigned int pt_levels; + unsigned int guest_width; +} platform_info_t; + +typedef struct xencow_st { + domid_t domain_id; + int is_hvm; + unsigned long p2m_size; + unsigned long *live_p2m_table; + + int xc_handle; + int xce_handle; + + evtchn_port_t buffer_port; + evtchn_port_t pause_port; + + platform_info_t platform_info; + + size_t buffer_size; + void *buffer; + + int num_mfns; + unsigned long *mfns; + + unsigned long sring_mfn; + cow_front_ring_t front_ring; + + spinlock_t ring_lock; + + void *page_buffer; + + struct list_head snapshots; + unsigned int next_snapshot; + + spinlock_t snapshots_lock; +} xencow_t; + + +xen_pfn_t *xc_map_m2p(int xc_handle, unsigned long max_mfn, int prot, + unsigned long *m2p_mfn0); + +int xc_canonicalise_pagetable(unsigned long type, unsigned long pfn, + const void *spage, void *dpage, + xen_pfn_t *live_p2m_table, + xen_pfn_t *live_m2p_table, unsigned long m2p_mfn0, + unsigned long p2m_size, unsigned long max_mfn, + unsigned long hvirt_start, unsigned int pt_levels, + unsigned int guest_width); + +xen_pfn_t 
*xc_get_live_p2m_table(int xc_handle, domid_t domain_id, + unsigned long p2m_size, + unsigned int guest_width); + + +/* Initialise CoW for a domain */ +xencow_t *xencow_init(domid_t domid); + +/* Enable CoW */ +int xencow_enable(xencow_t *cow); + +/* Disable CoW */ +int xencow_disable(xencow_t *cow); + +/* Take a snapshot */ +int xencow_snapshot(xencow_t *cow); + +/* Resume a domain paused because of CoW */ +int xencow_resume(xencow_t *cow); + +/* Get the MFN for a PFN */ +unsigned long xencow_p2m(xencow_t *cow, unsigned long pfn); + +/* Cleanup a CoW struct */ +void xencow_cleanup(xencow_t *cow); + +/* Wait for an event */ +int xencow_wait_for_event(xencow_t *cow); +int xencow_wait_for_event_or_timeout(xencow_t *cow, unsigned long ms); + +/* Flush the pre-dirtied page buffer */ +void xencow_flush_buffer(xencow_t *cow); + +/* Read pages from the pre-dirtied buffer */ +int xencow_read_buffer(xencow_t *cow, int snapshot_num, unsigned long start_pfn, + int num_pages, void *buffer); + +/* Get info for a page */ +int xencow_page_type(xencow_t *cow, unsigned long pfn, + unsigned long *count_info, unsigned long *type_info); + + +#endif /* __XEN_COW_H__ */ + + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 0477f9061c8a tools/xencow/lib/xencow_list.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xencow/lib/xencow_list.h Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,555 @@ +/****************************************************************************** + * tools/xencow/lib/xencow_list.h + * + * Linked list. + * + * Copyright (c) 2009 University of British Columbia (Patrick Colp) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#ifndef __XEN_COW_LIST_H__ +#define __XEN_COW_LIST_H__ + + +/* Taken from Linux kernel code, but de-kernelized for userspace. */ +#include <stddef.h> + +#undef LIST_HEAD_INIT +#undef LIST_HEAD +#undef INIT_LIST_HEAD + +/* + * These are non-NULL pointers that will result in page faults + * under normal circumstances, used to verify that nobody uses + * non-initialized list entries. + */ +#define LIST_POISON1 ((void *) 0x00100100) +#define LIST_POISON2 ((void *) 0x00200200) + +#define container_of(ptr, type, member) ({ \ + typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +#define list_top(head, type, member) \ +({ \ + struct list_head *_head = (head); \ + list_empty(_head) ? 
NULL : list_entry(_head->next, type, member); \ +}) + +#define list_bottom(head, type, member) \ +({ \ + struct list_head *_head = (head); \ + list_empty(_head) ? NULL : list_entry(_head->prev, type, member); \ +}) + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +/** + * list_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static __inline__ void __list_add_rcu(struct list_head * new, + struct list_head * prev, + struct list_head * next) +{ + new->next = next; + new->prev = prev; + next->prev = new; + prev->next = new; +} + +/** + * list_add_rcu - add a new entry to rcu-protected list + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. 
+ */ +static __inline__ void list_add_rcu(struct list_head *new, struct list_head *head) +{ + __list_add_rcu(new, head, head->next); +} + +/** + * list_add_tail_rcu - add a new entry to rcu-protected list + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static __inline__ void list_add_tail_rcu(struct list_head *new, struct list_head *head) +{ + __list_add_rcu(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is + * in an undefined state. + */ +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} + +/** + * list_del_rcu - deletes entry from list without re-initialization + * @entry: the element to delete from the list. + * + * Note: list_empty on entry does not return true after this, + * the entry is in an undefined state. It is useful for RCU based + * lockfree traversal. + * + * In particular, it means that we can not poison the forward + * pointers that may still be used for walking the list. + */ +static inline void list_del_rcu(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->prev = LIST_POISON2; +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. 
+ */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. + */ +static inline int list_empty(struct list_head *head) +{ + return head->next == head; +} + +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * list_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static inline void list_splice(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. 
+ * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +/** + * list_for_each_prev - iterate over a list backwards + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev; pos != (head); pos = pos->prev) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * list_for_each_entry_reverse - iterate backwards over list of given type. + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. 
+ */ +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + + +/** + * list_for_each_entry_continue - iterate over list of given type + * continuing after existing point + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop counter. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + + +/* + * Double linked lists with a single pointer list head. + * Mostly useful for hash tables where the two pointer list head is + * too wasteful. + * You lose the ability to access the tail in O(1). 
+ */ + +struct hlist_head { + struct hlist_node *first; +}; + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +#define HLIST_HEAD_INIT { .first = NULL } +#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } +#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) +#define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL, (ptr)->pprev = NULL) + +static __inline__ int hlist_unhashed(struct hlist_node *h) +{ + return !h->pprev; +} + +static __inline__ int hlist_empty(struct hlist_head *h) +{ + return !h->first; +} + +static __inline__ void __hlist_del(struct hlist_node *n) +{ + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; + *pprev = next; + if (next) + next->pprev = pprev; +} + +static __inline__ void hlist_del(struct hlist_node *n) +{ + __hlist_del(n); + n->next = LIST_POISON1; + n->pprev = LIST_POISON2; +} + +/** + * hlist_del_rcu - deletes entry from hash list without re-initialization + * @entry: the element to delete from the hash list. + * + * Note: list_unhashed() on entry does not return true after this, + * the entry is in an undefined state. It is useful for RCU based + * lockfree traversal. + * + * In particular, it means that we can not poison the forward + * pointers that may still be used for walking the hash list. 
+ */ +static inline void hlist_del_rcu(struct hlist_node *n) +{ + __hlist_del(n); + n->pprev = LIST_POISON2; +} + +static __inline__ void hlist_del_init(struct hlist_node *n) +{ + if (n->pprev) { + __hlist_del(n); + INIT_HLIST_NODE(n); + } +} + +#define hlist_del_rcu_init hlist_del_init + +static __inline__ void hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + if (first) + first->pprev = &n->next; + h->first = n; + n->pprev = &h->first; +} + +static __inline__ void hlist_add_head_rcu(struct hlist_node *n, struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + n->pprev = &h->first; + if (first) + first->pprev = &n->next; + h->first = n; +} + +/* next must be != NULL */ +static __inline__ void hlist_add_before(struct hlist_node *n, struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + next->pprev = &n->next; + *(n->pprev) = n; +} + +static __inline__ void hlist_add_after(struct hlist_node *n, + struct hlist_node *next) +{ + next->next = n->next; + *(next->pprev) = n; + n->next = next; +} + +#define hlist_entry(ptr, type, member) container_of(ptr,type,member) + +/* Cannot easily do prefetch unfortunately */ +#define hlist_for_each(pos, head) \ + for (pos = (head)->first; pos; pos = pos->next) + +#define hlist_for_each_safe(pos, n, head) \ + for (pos = (head)->first; n = pos ? pos->next : 0, pos; \ + pos = n) + +/** + * hlist_for_each_entry - iterate over list of given type + * @tpos: the type * to use as a loop counter. + * @pos: the &struct hlist_node to use as a loop counter. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. 
+ */ +#define hlist_for_each_entry(tpos, pos, head, member) \ + for (pos = (head)->first; \ + pos && ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * hlist_for_each_entry_continue - iterate over a hlist continuing after existing point + * @tpos: the type * to use as a loop counter. + * @pos: the &struct hlist_node to use as a loop counter. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_continue(tpos, pos, member) \ + for (pos = (pos)->next; \ + pos && ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * hlist_for_each_entry_from - iterate over a hlist continuing from existing point + * @tpos: the type * to use as a loop counter. + * @pos: the &struct hlist_node to use as a loop counter. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_from(tpos, pos, member) \ + for (; pos && ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @tpos: the type * to use as a loop counter. + * @pos: the &struct hlist_node to use as a loop counter. + * @n: another &struct hlist_node to use as temporary storage + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_safe(tpos, pos, n, head, member) \ + for (pos = (head)->first; \ + pos && ({ n = pos->next; 1; }) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = n) + + +#endif /* __XEN_COW_LIST_H__ */ + + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 0477f9061c8a tools/xencow/test/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xencow/test/Makefile Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,32 @@ +XEN_ROOT=../../.. 
+include $(XEN_ROOT)/tools/Rules.mk + +CFLAGS += -I $(XEN_XC) +CFLAGS += $(CFLAGS_libxenctrl) + +SRCS += cow_compare.c + +CFLAGS += -Werror +CFLAGS += -g +CFLAGS += -Wl,-rpath,.. + +LDFLAGS += $(LDFLAGS_libxenctrl) -lxencow + +OBJS = $(SRCS:.c=.o) +IBINS = cow_compare + +all: $(IBINS) + +cow_compare: $(OBJS) + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) + +install: all + $(INSTALL_DIR) $(DESTDIR)$(SBINDIR) + $(INSTALL_PROG) $(IBINS) $(DESTDIR)$(SBINDIR) + +clean: + rm -f *.o *~ $(DEPS) xen TAGS $(IBINS) $(LIB) + +.PHONY: clean install + +-include $(DEPS) diff -r 0477f9061c8a tools/xencow/test/cow_compare.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xencow/test/cow_compare.c Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,896 @@ +/****************************************************************************** + * tools/xencow/test/cow_compare.c + * + * Test application to compare CoW image and live memory dumps + * + * Copyright (c) 2009 University of British Columbia (Patrick Colp) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#include <sys/types.h> +#include <unistd.h> + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> + +#include <string.h> +#include <errno.h> +#include <sys/mman.h> + +#if 0 +#include <xen/xen.h> +#include <xc_private.h> +#endif +#include <xg_private.h> +#include <xg_save_restore.h> +#include <xenctrl.h> + +#include "../lib/xc.h" +#include "../lib/xencow.h" + + +#define COW 1 + +#if COW +#define COW_FILE1 "vm_dump.cow1" +#define COW_FILE2 "vm_dump.cow2" +#endif +#define LIVE_FILE1 "vm_dump.live1" +#define LIVE_FILE2 "vm_dump.live2" + + +#define SLEEP_TIME (10 * 1) /* 1 min */ + + +/* Printing functions */ +#if 1 +#define pr_debug(_f, _a...) \ + printf("%s(): " _f, __func__, ##_a) +#else +#define pr_debug(_f, _a...) ((void)0) +#endif + +#define warning(_f, _a...) 
\ + fprintf(stderr, "%s(): " _f, __func__, ##_a) + + +static int debug_mode = 0; + +#if 0 +typedef struct page_info_st { + unsigned long count_info; + unsigned long type_info; +} page_info_t; + +page_info_t get_page_type_info(int xc_handle, domid_t domain_id, xen_pfn_t mfn) +{ + page_info_t page_info; + int rc; + DECLARE_DOMCTL; + + domctl.cmd = XEN_DOMCTL_cow_page_type; + domctl.domain = domain_id; + domctl.u.cow_page_type.mfn = mfn; + + rc = do_domctl(xc_handle, &domctl); + if ( rc != 0 ) + { + printf("error getting page type for %lx\n", mfn); + page_info.count_info = 0; + page_info.type_info = 0; + return page_info; + } + + page_info.count_info = domctl.u.cow_page_type.count_info; + page_info.type_info = domctl.u.cow_page_type.type_info; + + return page_info; +} +#endif + +int compare_pages(void *page1, void *page2, uint32_t page_size) +{ + uint32_t i; + int rc = 0; + + for ( i = 0; i < page_size; i++ ) + { + if ( ((char *)page1)[i] != ((char *)page2)[i] ) + { + rc--; +#if VERBOSE + printf("images do not match at offset %x (%u): ", i, i); + printf("(%x) (%x)\n", ((char *)page1)[i], ((char *)page2)[i]); +#endif + } + } + + return rc; +} + +int compare(char *file1, char *file2, int xc_handle, domid_t domain_id, + xen_pfn_t *live_p2m_table, xen_pfn_t *live_m2p_table, + unsigned long m2p_mfn0, + unsigned long p2m_size, unsigned long max_mfn, + unsigned long hvirt_start, unsigned int pt_levels, + unsigned int guest_width) +{ + int open_flags = O_RDONLY; + mode_t open_mode = S_IRUSR | S_IRGRP | S_IROTH; + int fd1; + int fd2; + void *page1 = malloc(PAGE_SIZE); + void *page2 = malloc(PAGE_SIZE); + unsigned long pfn; + int hvm = 0; + int rc; + int ret = 0; + + if ( !live_p2m_table ) + hvm = 1; + + /* Open image files */ + fd1 = open(file1, open_flags, open_mode); + if ( fd1 < 0 ) + { + perror("failed to open file1"); + return -1; + } + + fd2 = open(file2, open_flags, open_mode); + if ( fd2 < 0 ) + { + perror("failed to open file2"); + return -1; + } + + /* Read images */ 
+ pfn = 0; + while ( pfn < p2m_size ) + { + /* Read pages */ + off64_t offset = pfn_offset(pfn); + off64_t ret_seek; + int total_read; + + ret_seek = lseek64(fd1, offset, SEEK_SET); +#if 0 + if ( ret_fd1 < 0 ) + { + perror("failed to seek file1"); + return -1; + } +#endif + + ret_seek = lseek64(fd2, offset, SEEK_SET); +#if 0 + if ( ret_fd2 < 0 ) + { + perror("failed to seek file2"); + return -1; + } +#endif + + total_read = 0; + while ( total_read < PAGE_SIZE ) + { + void *p = page1 + total_read; + int bytes_read = read(fd1, p, PAGE_SIZE - total_read); + if ( bytes_read <= 0 ) + { + perror("failed to read from file1"); + return -1; + } + total_read += bytes_read; + } + + total_read = 0; + while ( total_read < PAGE_SIZE ) + { + void *p = page2 + total_read; + int bytes_read = read(fd2, p, PAGE_SIZE - total_read); + if ( bytes_read <= 0 ) + { + perror("failed to read from file2"); + return -1; + } + total_read += bytes_read; + } + + rc = compare_pages(page1, page2, PAGE_SIZE); + + /* Check if the pages are different */ + if ( rc != 0 ) + { + xen_pfn_t mfn; + int check_frame = 0; + int nonhypervisor_bytes = 0; + + /* Get MFN */ + if ( hvm ) + mfn = pfn; + else + mfn = pfn_to_mfn(pfn); + +#if 1 + if ( is_mapped(mfn) ) + { +#endif + if ( hvm ) + { + if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) != XEN_DOMCTL_PFINFO_XTAB ) + check_frame = 1; + } + else + { + ((uint32_t *)(&mfn))[0] = mfn; + mfn = (uint32_t)mfn; + + if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) != XEN_DOMCTL_PFINFO_XTAB ) + { + /* Canonicalise mfn -> pfn */ + mfn = (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) | pfn; + check_frame = 1; + } + } + + if ( check_frame ) + { + unsigned long addr; + unsigned long type; + + addr = mfn & ~XEN_DOMCTL_PFINFO_LTAB_MASK; + type = mfn & XEN_DOMCTL_PFINFO_LTAB_MASK; + + /* Check if the page is present */ + if ( type != XEN_DOMCTL_PFINFO_XTAB ) + { + type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; + + if ( (type >= XEN_DOMCTL_PFINFO_L1TAB) && + (type <= XEN_DOMCTL_PFINFO_L4TAB) ) + { + int 
pte_last; + int xen_start; + int xen_end; + int i; + + printf("page table page: %lx\n", pfn); + + /* + * We need to determine which entries in this page table hold + * reserved hypervisor mappings. This depends on the current + * page table type as well as the number of paging levels. + */ + xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8); + + if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) ) + xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT); + + if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) ) + xen_start = L3_PAGETABLE_ENTRIES_PAE; + + /* + * In PAE only the L2 mapping the top 1GB contains Xen mappings. + * We can spot this by looking for the guest's mapping of the m2p. + * Guests must ensure that this check will fail for other L2s. + */ + if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) ) + { + int hstart; + uint64_t he; + + hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff; + he = ((const uint64_t *)page1)[hstart]; + + if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 ) + { + /* hvirt starts with xen stuff... */ + xen_start = hstart; + } + else if ( hvirt_start != 0xf5800000 ) + { + /* old L2s from before hole was shrunk... 
*/ + hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff; + he = ((const uint64_t *)page1)[hstart]; + if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 ) + xen_start = hstart; + } + } + + if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) ) + { + /* + * XXX SMH: should compute these from hvirt_start (which we have) + * and hvirt_end (which we don't) + */ + xen_start = 256; + xen_end = 272; + } + + /* + * Scan for changed bytes that aren't reserved by + * the hypervisor + */ + for ( i = 0; i < pte_last; i++ ) + if ( ((char *)page1)[i] != ((char *)page2)[i] ) + if ( !((i >= xen_start) && (i < xen_end)) ) + nonhypervisor_bytes++; + } + + switch (type) + { + + case XEN_DOMCTL_PFINFO_NOTAB: + { + printf(" normal page: %lx: %d %d\n", pfn, -rc, nonhypervisor_bytes); + } + break; + + case XEN_DOMCTL_PFINFO_L1TAB: + { + printf(" l1 table page: %lx: %d %d\n", pfn, -rc, nonhypervisor_bytes); + } + break; + + case XEN_DOMCTL_PFINFO_L2TAB: + { + printf(" l2 table page: %lx: %d %d\n", pfn, -rc, nonhypervisor_bytes); + } + break; + + case XEN_DOMCTL_PFINFO_L3TAB: + { + printf(" l3 table page: %lx: %d %d\n", pfn, -rc, nonhypervisor_bytes); + } + break; + + case XEN_DOMCTL_PFINFO_L4TAB: + { + printf(" l4 table page: %lx: %d %d\n", pfn, -rc, nonhypervisor_bytes); + } + break; + + case XEN_DOMCTL_PFINFO_LPINTAB: + { + printf(" pin page: %lx: %d %d\n", pfn, -rc, nonhypervisor_bytes); + } + break; + + case XEN_DOMCTL_PFINFO_XTAB: + { + printf(" invalid page: %lx: %d %d\n", pfn, -rc, nonhypervisor_bytes); + } + break; + + default: + printf(" unknown page: %lx: %d %d\n", pfn, -rc, nonhypervisor_bytes); + } + + } + } +#if 1 + } +#endif + + if ( debug_mode ) + printf("images do not match at page %lx (%lx): %d (%d) bytes different\n", + pfn, mfn, -rc, nonhypervisor_bytes); + + if ( nonhypervisor_bytes + rc != 0) + ret--; + } + + /* Move to next page */ + pfn++; + } + + close(fd1); + close(fd2); + + return ret; +} + +int dump_memory(char *filename, int xc_handle, domid_t 
domain_id, + xen_pfn_t *live_p2m_table, xen_pfn_t *live_m2p_table, + unsigned long m2p_mfn0, + unsigned long p2m_size, unsigned long max_mfn, + unsigned long hvirt_start, unsigned int pt_levels, + unsigned int guest_width) +{ + int open_flags = O_CREAT | O_TRUNC | O_RDWR; + mode_t open_mode = S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR | S_IWGRP | S_IWOTH; + int fd; + unsigned long pfn; + void *page; + int hvm = 0; +#if 1 + int rc; +#endif + + if ( !live_p2m_table ) + hvm = 1; + + /* Open file */ + fd = open(filename, open_flags, open_mode); + if ( fd < 0 ) + { + perror("failed to open file"); + return -1; + } + + /* Write out memory contents */ + pfn = 0; + while ( pfn < p2m_size ) + { + size_t bytes_written; + xen_pfn_t mfn; + int copy_frame = 0; + int pt_page = 0; + + page = NULL; + + if ( hvm ) + mfn = pfn; + else + mfn = pfn_to_mfn(pfn); + + /* Read page */ + if ( is_mapped(mfn) ) + { + if ( hvm ) + { + if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) != XEN_DOMCTL_PFINFO_XTAB ) + { + page = xc_map_foreign_batch(xc_handle, domain_id, PROT_READ, &mfn, 1); + copy_frame = 1; + } + } + else + { + page = xc_map_foreign_range(xc_handle, domain_id, PAGE_SIZE, PROT_READ, mfn); + + ((uint32_t *)(&mfn))[0] = mfn; + + rc = xc_get_pfn_type_batch(xc_handle, domain_id, 1, (uint32_t *)(&mfn)); + if ( rc ) + { + ERROR("get_pfn_type_batch failed"); + goto out; + } + mfn = (uint32_t)mfn; + + if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) != XEN_DOMCTL_PFINFO_XTAB ) + { + /* Canonicalise mfn -> pfn */ + mfn = (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) | pfn; + copy_frame = 1; + } + else + { + munmap(page, PAGE_SIZE); + page = NULL; + } + } + +#if 0 + if ( copy_frame ) + { + unsigned long addr; + unsigned long type; + + addr = mfn & ~XEN_DOMCTL_PFINFO_LTAB_MASK; + type = mfn & XEN_DOMCTL_PFINFO_LTAB_MASK; + + /* Check if the page is present */ + if ( type != XEN_DOMCTL_PFINFO_XTAB ) + { + type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; + + if ( (type >= XEN_DOMCTL_PFINFO_L1TAB) && + (type <= 
XEN_DOMCTL_PFINFO_L4TAB) ) + { + int race; + void *dpage = malloc(PAGE_SIZE); + + race = xc_canonicalise_pagetable(type, addr, page, dpage, + live_p2m_table, + live_m2p_table, + m2p_mfn0, + p2m_size, max_mfn, + hvirt_start, pt_levels, + guest_width); + + munmap(page, PAGE_SIZE); + page = dpage; + pt_page = 1; + } + } + else + { + munmap(page, PAGE_SIZE); + copy_frame = 0; + } + } +#endif + if ( copy_frame ) + { + off64_t seek_ret; + int total_written = 0; + + seek_ret = lseek64(fd, pfn_offset(pfn), SEEK_SET); + + while ( total_written < PAGE_SIZE ) + { + void *p = page + total_written; + bytes_written = write(fd, p, PAGE_SIZE - total_written); + + if ( bytes_written <= 0 ) + { + perror("failed to write to file"); + return -1; + } + + total_written += bytes_written; + } + + /* Free or unmap page if needed */ + if ( pt_page ) + free(page); + else + munmap(page, PAGE_SIZE); + } + } + + pfn++; + } + +#if 1 + out: +#endif + close(fd); + + return 0; +} + +#if COW +int dump_cow(char *filename, unsigned long p2m_size, char *cow_file) +{ + int open_flags = O_CREAT | O_TRUNC | O_RDWR; + mode_t open_mode = S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR | S_IWGRP | S_IWOTH; + int fd_cow; + int fd_dump; + unsigned long pfn; + void *page = malloc(PAGE_SIZE); + + /* Open files */ + fd_cow = open(cow_file, O_RDONLY, open_mode); + if ( fd_cow < 0 ) + { + perror("failed to open cow file"); + return -1; + } + + fd_dump = open(filename, open_flags, open_mode); + if ( fd_dump < 0 ) + { + perror("failed to open dump file"); + return -1; + } + + /* Write out memory contents */ + pfn = 0; + while ( pfn < p2m_size ) + { + off64_t seek_ret; + int total_read; + int total_written; + + /* Read page */ + seek_ret = lseek64(fd_cow, pfn_offset(pfn), SEEK_SET); +#if 0 + if ( ret < 0 ) + { + perror("failed to seek cow file"); + return -1; + } +#endif + + total_read = 0; + while ( total_read < PAGE_SIZE ) + { + void *p = page + total_read; + int bytes_read = read(fd_cow, p, PAGE_SIZE - total_read); + if ( 
bytes_read <= 0 ) + { + perror("failed to read cow file"); + return -1; + } + + total_read += bytes_read; + } + + /* Write memory contents to file */ + seek_ret = lseek64(fd_dump, pfn_offset(pfn), SEEK_SET); +#if 0 + if ( ret < 0 ) + { + perror("failed to seek cow file"); + return -1; + } +#endif + + total_written = 0; + while ( total_written < PAGE_SIZE ) + { + void *p = page + total_written; + int bytes_written = write(fd_dump, p, PAGE_SIZE - total_written); + if ( bytes_written <= 0 ) + { + perror("failed to write dump file"); + return -1; + } + + total_written += bytes_written; + } + + pfn++; + } + + close(fd_dump); + close(fd_cow); + + return 0; +} +#endif + +int main(int argc, char *argv[]) +{ + xc_dominfo_t info; + domid_t domain_id; + int snapshot_num; + int xc_handle; + xen_pfn_t *live_m2p_table; + xen_pfn_t *live_p2m_table; + unsigned long m2p_mfn0; + unsigned long p2m_size; + unsigned long max_mfn; + unsigned long hvirt_start; + unsigned int pt_levels; + unsigned int guest_width; +#if 1 +#if COW + char fuse_file[200]; +#endif +#endif + int rc; + + domain_id = atoi(argv[1]); + /* TODO: find this automatically */ + snapshot_num = atoi(argv[2]); + + if ( argc > 3 ) + { + if ( strcmp(argv[3], "-d") == 0 ) + debug_mode = 1; + } + + /* Open connection to Xen */ + rc = xc_interface_open(); + if ( rc < 0 ) + { + warning("failed to connect to Xen\n"); + goto out; + } + xc_handle = rc; + + /* Get some info */ + p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &domain_id) + 1; + + rc = get_platform_info(xc_handle, domain_id, &max_mfn, &hvirt_start, + &pt_levels, &guest_width); + if ( rc != 1 ) + { + warning("failed to get platform info\n"); + goto out; + } + + /* Get HVM info */ + rc = xc_domain_getinfo(xc_handle, domain_id, 1, &info); + if ( rc != 1 ) + { + warning("failed to get domain info\n"); + goto out; + } + + /* Print info */ + if ( debug_mode ) + { + printf("p2m_size: %lu\n", p2m_size); + printf("max_mfn: %lx (%lu)\n", max_mfn, max_mfn); + 
printf("hvirt_start: %lx (%lu)\n", hvirt_start, hvirt_start); + printf("pt_levels: %x (%u)\n", pt_levels, pt_levels); + printf("guest_width: %x (%u)\n", guest_width, guest_width); + printf("shared_info_frame: %lx (%lu)\n", info.shared_info_frame, info.shared_info_frame); + } + + /* Setup the ofn to mfn table mapping */ + if ( info.hvm ) + { + if ( debug_mode ) + printf("HVM guest\n"); + + live_p2m_table = NULL; + } + else + { + if ( debug_mode ) + printf("PV guest\n"); + + /* Get live p2m table */ + live_p2m_table = xc_get_live_p2m_table(xc_handle, domain_id, p2m_size, + guest_width); + if ( !live_p2m_table ) + { + warning("failed to get live p2m table\n"); + goto out; + } + } + + /* Setup the mfn to pfn table mapping */ + live_m2p_table = xc_map_m2p(xc_handle, max_mfn, PROT_READ, &m2p_mfn0); + if ( !live_m2p_table ) + { + warning("failed to map live m2p table\n"); + goto out; + } + + /* Pause domain */ + printf("Pausing domain\n"); + rc = xc_domain_pause(xc_handle, domain_id); + if ( rc != 0 ) + { + warning("failed to pause domain"); + goto out; + } + sleep(1); + +#if 1 +#if COW + /* Take snapshot */ + printf("Taking snapshot\n"); + rc = system("touch /tmp/foo/1"); + sleep(1); + + /* Dump CoW image */ + printf("Dumping CoW image (1)... "); + fflush(stdout); + sprintf(fuse_file, "/tmp/foo/%d.%d", domain_id, snapshot_num); + dump_cow(COW_FILE1, p2m_size, fuse_file); + printf("done\n"); +#endif +#endif + + /* Dump live VM image */ + printf("Dumping live VM image (1)... "); + fflush(stdout); + dump_memory(LIVE_FILE1, xc_handle, domain_id, live_p2m_table, live_m2p_table, + m2p_mfn0, p2m_size, max_mfn, hvirt_start, pt_levels, guest_width); + printf("done\n"); + + /* Dump another live VM image */ + printf("Dumping live VM image (2)... 
"); + fflush(stdout); + dump_memory(LIVE_FILE2, xc_handle, domain_id, live_p2m_table, live_m2p_table, + m2p_mfn0, p2m_size, max_mfn, hvirt_start, pt_levels, guest_width); + printf("done\n"); + + /* Unpause domain */ + printf("Unpausing domain\n"); + rc = xc_domain_unpause(xc_handle, domain_id); + if ( rc != 0 ) + { + warning("failed to unpause domain"); + goto out; + } + +#if 1 +#if COW + /* Let domain run for a bit */ + printf("Sleeping for %d seconds\n", SLEEP_TIME); + sleep(SLEEP_TIME); + + /* Dump CoW image */ + printf("Dumping CoW image (2)... "); + fflush(stdout); + sprintf(fuse_file, "/tmp/foo/%d.0", domain_id); + dump_cow(COW_FILE2, p2m_size, fuse_file); + printf("done\n"); +#endif +#endif + + /* Compare images*/ + printf("-- Comparing images --\n"); + + printf("Comparing live1 live2...\n"); + rc = compare(LIVE_FILE1, LIVE_FILE2, xc_handle, domain_id, live_p2m_table, live_m2p_table, + m2p_mfn0, p2m_size, max_mfn, hvirt_start, pt_levels, guest_width); + if ( rc != 0 ) + printf("Images do not match (%d pages different)\n", -rc); + else + printf("Images match\n"); + +#if COW + printf("Comparing cow1 cow2...\n"); + rc = compare(COW_FILE1, COW_FILE2, xc_handle, domain_id, live_p2m_table, live_m2p_table, + m2p_mfn0, p2m_size, max_mfn, hvirt_start, pt_levels, guest_width); + if ( rc != 0 ) + printf("Images do not match (%d pages different)\n", -rc); + else + printf("Images match\n"); + + printf("Comparing live1 cow1...\n"); + rc = compare(LIVE_FILE1, COW_FILE1, xc_handle, domain_id, live_p2m_table, live_m2p_table, + m2p_mfn0, p2m_size, max_mfn, hvirt_start, pt_levels, guest_width); + if ( rc != 0 ) + printf("Images do not match (%d pages different)\n", -rc); + else + printf("Images match\n"); + + printf("Comparing live1 cow2...\n"); +// rc = compare(LIVE_FILE1, COW_FILE2, PAGE_SIZE, p2m_size, xc_handle, domain_id, live_p2m_table); + rc = compare(LIVE_FILE1, COW_FILE2, xc_handle, domain_id, live_p2m_table, live_m2p_table, + m2p_mfn0, p2m_size, max_mfn, hvirt_start, 
pt_levels, guest_width); + if ( rc != 0 ) + printf("Images do not match (%d pages different)\n", -rc); + else + printf("Images match\n"); + + printf("Comparing live2 cow1...\n"); +// rc = compare(LIVE_FILE2, COW_FILE1, PAGE_SIZE, p2m_size, xc_handle, domain_id, live_p2m_table); + rc = compare(LIVE_FILE2, COW_FILE1, xc_handle, domain_id, live_p2m_table, live_m2p_table, + m2p_mfn0, p2m_size, max_mfn, hvirt_start, pt_levels, guest_width); + if ( rc != 0 ) + printf("Images do not match (%d pages different)\n", -rc); + else + printf("Images match\n"); + + printf("Comparing live2 cow2...\n"); +// rc = compare(LIVE_FILE2, COW_FILE2, PAGE_SIZE, p2m_size, xc_handle, domain_id, live_p2m_table); + rc = compare(LIVE_FILE2, COW_FILE2, xc_handle, domain_id, live_p2m_table, live_m2p_table, + m2p_mfn0, p2m_size, max_mfn, hvirt_start, pt_levels, guest_width); + if ( rc != 0 ) + printf("Images do not match (%d pages different)\n", -rc); + else + printf("Images match\n"); +#endif + + out: + return 0; +} + + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 0477f9061c8a tools/xencow/xencowfs/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xencow/xencowfs/Makefile Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,35 @@ +XEN_ROOT=../../.. +include $(XEN_ROOT)/tools/Rules.mk + +CFLAGS += -I $(XEN_XC) +CFLAGS += -I ../lib +CFLAGS += $(CFLAGS_libxenctrl) + +SRCS += xencowfs.c + +CFLAGS += -Werror +CFLAGS += -Wno-unused +CFLAGS += -D_FILE_OFFSET_BITS=64 +CFLAGS += -g +CFLAGS += -Wl,-rpath,.. 
+ +LDFLAGS += $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenguest) -L../lib -lxencow -lfuse -lpthread + +OBJS = $(SRCS:.c=.o) +IBINS = xencowfs + +all: $(IBINS) + +xencowfs: $(OBJS) + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) + +install: all + $(INSTALL_DIR) $(DESTDIR)$(SBINDIR) + $(INSTALL_PROG) $(IBINS) $(DESTDIR)$(SBINDIR) + +clean: + rm -f *.o *~ $(DEPS) xen TAGS $(IBINS) $(LIB) + +.PHONY: clean install + +-include $(DEPS) diff -r 0477f9061c8a tools/xencow/xencowfs/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xencow/xencowfs/README Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,24 @@ + + +First, launch the desired target domain. Create a mount point for the xencow FUSE module to use and, from the tools/xencow/xencowfs directory, run: + +sudo ./xencowfs <mount point> <domid> + +This will initialise CoW for the domain. To take a snapshot, simply poke the FUSE mount point (e.g. touch <mount point>/1). The file name doesn't matter as FUSE will create its own file named: + +<mount point>/<domid>.<snapshot> + +e.g. xencow/1.0, xencow/1.1 + +Currently, the backing files are hardcoded to appear in /tmp with the following names: + +/tmp/xencow<domid>.<snapshot> +/tmp/xencow<domid>.<snapshot>.state + +The <domid>.<snapshot> pair corresponds to the FUSE file. The .state file contains state pages (e.g. CPU registers), while the other file contains the pre-dirtied pages for that domain. + +It is possible to use XenAccess in file mode to access the snapshot image. A slightly modified version of the memory-dump example from XenAccess 0.5, designed to work on the CoW image, is included. To use it, run (I've only tried running it from the xenaccess-0.5/examples/ directory): + +sudo dump-memory-cow <FUSE image file> <output file> + +This will create a complete memory image of the running domain at the time the snapshot was taken. 
\ No newline at end of file diff -r 0477f9061c8a tools/xencow/xencowfs/xencowfs.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xencow/xencowfs/xencowfs.c Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,253 @@ +/****************************************************************************** + * tools/xencow/xencowfs/xencowfs.c + * + * VM memory Copy-on-Write FUSE module + * + * Copyright (c) 2009 University of British Columbia (Patrick Colp) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#define FUSE_USE_VERSION 26 + + +#include <fuse.h> +#include <string.h> +#include <errno.h> +//#include <xc_private.h> + +#include "../lib/xencow.h" + + +#define DEBUG_OUTPUT 0 + + +static xencow_t *cow; + +static inline int get_snapshot_num(const char *path) +{ + return atoi(strrchr(path, '.')); +} + +static int path_exists(const char *path) +{ + xencow_snapshot_t *snapshot; + + list_for_each_entry ( snapshot, &cow->snapshots, list ) + if ( (strcmp(snapshot->xencowfs_file, path) == 0) + || ((path[0] == '/') + && (strcmp(snapshot->xencowfs_file, path + 1) == 0)) ) + return 1; + + return 0; +} + +static int xencowfs_create(const char *path, mode_t mode, + struct fuse_file_info *fi) +{ + return xencow_snapshot(cow); +} + +static void xencowfs_destroy(void *data) +{ + (void) data; + + 
xencow_disable(cow); +} + +static int xencowfs_getattr(const char *path, struct stat *stat) +{ + int res = 0; + + memset(stat, 0, sizeof(struct stat)); + + if ( strcmp(path, "/") == 0 ) + { + stat->st_mode = S_IFDIR | 0755; + stat->st_nlink = 2; + } + else if ( path_exists(path) ) + { + stat->st_mode = S_IFREG | S_IRUSR | S_IRGRP | S_IROTH; + stat->st_nlink = 1; + stat->st_size = cow->p2m_size << PAGE_SHIFT; + stat->st_blksize = PAGE_SIZE; + stat->st_blocks = cow->p2m_size; + } + else + res = -ENOENT; + + return res; +} + +static void *xencowfs_init(struct fuse_conn_info *conn) +{ + return NULL; +} + +static int xencowfs_open(const char *path, struct fuse_file_info *fi) +{ + if ( !path_exists(path) ) + return -ENOENT; + + if ( (fi->flags & 3) != O_RDONLY ) + return -EACCES; + + return 0; +} + +static int xencowfs_read(const char *path, char *buffer, size_t size, + off_t offset, struct fuse_file_info *fi) +{ + int snapshot_num; + unsigned long start_pfn; + int num_pages; + int ret; + + (void) fi; + + /* TODO: Worry about offsets not page aligned */ + start_pfn = offset_pfn(offset); + num_pages = size >> PAGE_SHIFT; + + /* Page align check */ + if ( pfn_offset(start_pfn) != offset ) + ERROR("Offset not page aligned!"); + + /* Check that it doesn't read past the end of the domain's memory */ + if ( start_pfn + num_pages > cow->p2m_size ) + num_pages = cow->p2m_size - start_pfn; + + IPRINTF("size = %lx; offset = %lx; num_pages = %d\n", + (unsigned long)size, (unsigned long)offset, num_pages); + + /* Get the appropriate snapshot */ + snapshot_num = get_snapshot_num(path); + + ret = xencow_read_buffer(cow, snapshot_num, start_pfn, num_pages, buffer); + if ( ret != 0 ) + goto out; + + ret = size; + + out: + return ret; +} + +static int xencowfs_readdir(const char *path, void *buffer, + fuse_fill_dir_t filler, off_t offset, + struct fuse_file_info *fi) +{ + xencow_snapshot_t *snapshot; + + (void) offset; + (void) fi; + + if ( strcmp(path, "/") != 0) + return -ENOENT; + + 
filler(buffer, ".", NULL, 0); + filler(buffer, "..", NULL, 0); + + list_for_each_entry(snapshot, &cow->snapshots, list) + filler(buffer, snapshot->xencowfs_file, NULL, 0); + + return 0; +} + +static int xencowfs_statfs(const char *path, struct statvfs *buf) +{ + (void) path; + + buf->f_bsize = PAGE_SIZE; + buf->f_blocks = cow->p2m_size; + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_files = 0; + buf->f_ffree = 0; + buf->f_fsid = 0; + buf->f_namemax = 255; + buf->f_favail = 0; + buf->f_frsize = buf->f_blocks; + buf->f_flag = 0; + + return 0; +} + +static struct fuse_operations xencowfs_oper = { + .create = xencowfs_create, + .destroy = xencowfs_destroy, + .getattr = xencowfs_getattr, + .init = xencowfs_init, + .open = xencowfs_open, + .read = xencowfs_read, + .readdir = xencowfs_readdir, + .statfs = xencowfs_statfs, +}; + +int main(int argc, char *argv[]) +{ + domid_t domid; + int rc; + + IPRINTF("Start\n"); + + /* The last arg is the domain number */ + rc = -EINVAL; + domid = atoi(argv[argc - 1]); + if ( domid == 0 ) + { + ERROR("Invalid domain"); + exit(rc); + } + + /* Initialise CoW */ + IPRINTF("Initialise CoW\n"); + rc = -ENOMEM; + cow = xencow_init(domid); + if ( cow == NULL ) + { + ERROR("Could not initialise CoW"); + exit(rc); + } + + /* Enable CoW */ + IPRINTF("Enable CoW\n"); + rc = xencow_enable(cow); + if ( rc != 0 ) + { + ERROR("Could not enable CoW: rc = %d", rc); + exit(rc); + } + IPRINTF("CoW enabled\n"); + + rc = fuse_main(argc - 1, argv, &xencowfs_oper, NULL); + if ( rc != 0 ) + xencow_disable(cow); + + return rc; +} + + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 0477f9061c8a xen/arch/x86/domctl.c --- a/xen/arch/x86/domctl.c Fri Mar 20 17:42:46 2009 +0000 +++ b/xen/arch/x86/domctl.c Mon Apr 20 10:21:49 2009 -0700 @@ -28,6 +28,7 @@ #include <asm/processor.h> #include <xsm/xsm.h> #include <xen/iommu.h> +#include <asm/cow.h> long arch_do_domctl( 
struct xen_domctl *domctl, @@ -1087,6 +1088,132 @@ } break; + /* TODO: replace with XEN_DOMCTL_cow_op */ + case XEN_DOMCTL_cow_enable: + { + struct domain *d; + void *ring_page; + + ret = -EINVAL; + if ( domctl->domain == current->domain->domain_id ) + break; + + ret = -ESRCH; + d = rcu_lock_domain_by_id(domctl->domain); + if ( d == NULL ) + break; + + /* FIXME: Some other error code? */ + ret = -EINVAL; + ring_page = map_domain_page_global(domctl->u.cow_enable.mfn); + if ( ring_page == NULL ) + goto cow_enable_out; + + BACK_RING_INIT(&d->arch.paging.cow.back_ring, (cow_sring_t *)ring_page, PAGE_SIZE); + + ret = 0; + + cow_enable_out: + printk("CoW: enabled: ret: %ld\n", ret); + rcu_unlock_domain(d); + } + break; + + case XEN_DOMCTL_cow_snapshot: + { + struct domain *d; + RING_IDX when; + + ret = -EINVAL; + if ( domctl->domain == current->domain->domain_id ) + break; + + ret = -ESRCH; + d = rcu_lock_domain_by_id(domctl->domain); + if ( d == NULL ) + break; + + ret = cow_snapshot(d, &when); + if ( ret != 0 ) + goto cow_snapshot_out; + + domctl->u.cow_snapshot.when = when; + ret = 0; + + if ( copy_to_guest(u_domctl, domctl, 1) ) + ret = -EFAULT; + + cow_snapshot_out: + rcu_unlock_domain(d); + } + break; + + case XEN_DOMCTL_cow_resume: + { + struct domain *d; + + + ret = -EINVAL; + if ( domctl->domain == current->domain->domain_id ) + break; + + ret = -ESRCH; + d = rcu_lock_domain_by_id(domctl->domain); + if ( d == NULL ) + break; + + ret = cow_resume(d); + + rcu_unlock_domain(d); + } + break; + + case XEN_DOMCTL_cow_disable: + { + struct domain *d; + + ret = -EINVAL; + if ( domctl->domain == current->domain->domain_id ) + break; + + ret = -ESRCH; + d = rcu_lock_domain_by_id(domctl->domain); + if ( d == NULL ) + break; + + ret = paging_log_dirty_disable(d); + + rcu_unlock_domain(d); + } + break; + + case XEN_DOMCTL_cow_page_type: + { + struct domain *d; + struct page_info *page; + + ret = -EINVAL; + if ( domctl->domain == current->domain->domain_id ) + break; + + ret 
= -ESRCH; + d = rcu_lock_domain_by_id(domctl->domain); + if ( d == NULL ) + break; + + page = mfn_to_page(domctl->u.cow_page_type.mfn); + + domctl->u.cow_page_type.count_info = page->count_info; + domctl->u.cow_page_type.type_info = page->u.inuse.type_info; + ret = 0; + + if ( copy_to_guest(u_domctl, domctl, 1) ) + ret = -EFAULT; + + rcu_unlock_domain(d); + } + break; + default: ret = -ENOSYS; break; diff -r 0477f9061c8a xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c Fri Mar 20 17:42:46 2009 +0000 +++ b/xen/arch/x86/hvm/hvm.c Mon Apr 20 10:21:49 2009 -0700 @@ -1542,8 +1542,8 @@ } else { + paging_mark_dirty(curr->domain, mfn); memcpy(p, buf, count); - paging_mark_dirty(curr->domain, mfn); } } else diff -r 0477f9061c8a xen/arch/x86/mm/Makefile --- a/xen/arch/x86/mm/Makefile Fri Mar 20 17:42:46 2009 +0000 +++ b/xen/arch/x86/mm/Makefile Mon Apr 20 10:21:49 2009 -0700 @@ -6,6 +6,7 @@ obj-y += guest_walk_2.o obj-y += guest_walk_3.o obj-$(x86_64) += guest_walk_4.o +obj-y += cow.o guest_walk_%.o: guest_walk.c Makefile $(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@ diff -r 0477f9061c8a xen/arch/x86/mm/cow.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/mm/cow.c Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,832 @@ +/****************************************************************************** + * arch/x86/mm/cow.c + * + * CoW paging support + * Copyright (c) 2009 University of British Columbia (Patrick Colp) + * Parts based on earlier work by Geoffrey Lefebvre + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#include <asm/cow.h> +#include <asm/paging.h> +#include <xen/event.h> + + +#define COW_DOMAIN_PAUSE 0 +#define COW_DEBUG_OUTPUT 0 + + +/* Printouts */ +#define PAGING_PRINTK(_f, _a...) \ + debugtrace_printk("pg: %s(): " _f, __func__, ##_a) +#define PAGING_ERROR(_f, _a...) \ + printk("pg error: %s(): " _f, __func__, ##_a) +#if COW_DEBUG_OUTPUT +#define PAGING_DEBUG(flag, _f, _a...) \ + do { \ + if (PAGING_DEBUG_ ## flag) \ + printk("pgdebug: %s(): " _f, __func__, ##_a); \ + } while (0) +#else +#define PAGING_DEBUG(flag, _f, _a...) \ + do { \ + if (PAGING_DEBUG_ ## flag) \ + debugtrace_printk("pgdebug: %s(): " _f, __func__, ##_a); \ + } while (0) +#endif + + +#define STATE_MFN ((unsigned long)(-1)) + +#define xen_mb() mb() +#define xen_rmb() rmb() +#define xen_wmb() wmb() + +/* Override macros from asm/page.h to make them work with mfn_t */ +#undef mfn_to_page +#define mfn_to_page(_m) (frame_table + mfn_x(_m)) +#undef mfn_valid +#define mfn_valid(_mfn) (mfn_x(_mfn) < max_page) +#undef page_to_mfn +#define page_to_mfn(_pg) (_mfn((_pg) - frame_table)) + +/* The CoW lock. This protects the log-dirty bitmap from concurrent accesses + * (and teardowns, etc). + * + * Locking discipline: always acquire log dirty lock before this one. 
*/ + +#define cow_lock_init(_d) \ + do { \ + spin_lock_init(&(_d)->arch.paging.cow.lock); \ + (_d)->arch.paging.cow.locker = -1; \ + (_d)->arch.paging.cow.locker_function = "nobody"; \ + } while (0) + +#define cow_lock(_d) \ + do { \ + if (unlikely((_d)->arch.paging.cow.locker==current->processor)) \ + { \ + printk("Error: paging cow lock held by %s\n", \ + (_d)->arch.paging.cow.locker_function); \ + BUG(); \ + } \ + spin_lock(&(_d)->arch.paging.cow.lock); \ + ASSERT((_d)->arch.paging.cow.locker == -1); \ + (_d)->arch.paging.cow.locker = current->processor; \ + (_d)->arch.paging.cow.locker_function = __func__; \ + } while (0) + +#define cow_unlock(_d) \ + do { \ + ASSERT((_d)->arch.paging.cow.locker == current->processor); \ + (_d)->arch.paging.cow.locker = -1; \ + (_d)->arch.paging.cow.locker_function = "nobody"; \ + spin_unlock(&(_d)->arch.paging.cow.lock); \ + } while (0) + + +/* XXX: ugly cut and paste from common/grant_table.c */ +#define ACGNT_PER_PAGE (PAGE_SIZE / sizeof(struct active_grant_entry)) +#define active_entry(t, e) ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE]) + + +static void cow_notify_dom0_pause(unsigned long unused) +{ + printk("cow: notifying dom0 that domain is paused\n"); + send_guest_global_virq(dom0, VIRQ_COW_PAUSE); +} +static DECLARE_TASKLET(cow_notify_dom0_pause_tasklet, cow_notify_dom0_pause, 0); + +static void cow_notify_dom0_high_water(unsigned long flag_addr) +{ + printk("cow: notifying dom0 that ring buffer passed high water mark\n"); + send_guest_global_virq(dom0, VIRQ_COW_BUFFER); + (*(bool_t *)flag_addr) = 0; +} +static DECLARE_TASKLET(cow_notify_dom0_high_water_tasklet, cow_notify_dom0_high_water, 0); + +static void paging_free_cow_bitmap(unsigned long **bitmap) +{ + if ( likely(*bitmap != NULL) ) + { + printk("cow: freeing bitmap\n"); + xfree(*bitmap); + *bitmap = NULL; + } +} + +static void paging_free_cow(struct domain *d) +{ + printk("cow: freeing bitmaps\n"); + 
paging_free_cow_bitmap(&d->arch.paging.cow.precow_foreign_bitmap); + paging_free_cow_bitmap(&d->arch.paging.cow.bitmap); +} + +static int paging_alloc_cow_bitmap(unsigned long **bitmap, + unsigned long bitmap_size) +{ + BUG_ON(bitmap_size == 0); + + if ( unlikely(*bitmap == NULL) ) + { + *bitmap = xmalloc_array(unsigned long, bitmap_size / BITS_PER_LONG); + + if ( unlikely(*bitmap == NULL) ) + return -ENOMEM; + } + + memset(*bitmap, 0, bitmap_size / 8); + + return 0; +} + +/* Get address of current buffer page for a given domain */ +static unsigned long cow_get_buffer_page(struct domain *d) +{ + cow_request_t req; + cow_back_ring_t *back_ring; + RING_IDX req_cons; + + cow_ring_lock(d); + + back_ring = &d->arch.paging.cow.back_ring; + req_cons = back_ring->req_cons; + +#if COW_DEBUG_OUTPUT + printk("cow: xen_page_for_domain %d\n", d->domain_id); +#endif + + /* Get buffer page */ + memcpy(&req, (RING_GET_REQUEST(back_ring, req_cons)), sizeof(req)); + req_cons++; + + back_ring->req_cons = req_cons; + back_ring->sring->req_event = req_cons + 1; + +#if COW_DEBUG_OUTPUT + printk("cow: num: %ld buffer mfn %" PRI_mfn "\n", (unsigned long)req_cons, req.mfn); +#endif + + cow_ring_unlock(d); + + return req.mfn; +} + +static void cow_copy_page(struct domain *d, unsigned long guest_mfn, + void *guest_page) +{ + mfn_t gmfn; + unsigned long pfn; + unsigned long buffer_mfn; + void *buffer_page; + cow_response_t rsp; + cow_back_ring_t *back_ring; + +#if COW_DEBUG_OUTPUT + printk("cow: copy page: start\n"); +#endif + + /* We /really/ mean PFN here, even for non-translated guests. 
*/ + if ( guest_mfn != STATE_MFN ) + { + gmfn = _mfn(guest_mfn); + pfn = get_gpfn_from_mfn(mfn_x(gmfn)); + + BUG_ON(!VALID_M2P(pfn)); + } + else + pfn = STATE_MFN; + +#if COW_DEBUG_OUTPUT + printk("cow: copy page: locking ring\n"); +#endif + + buffer_mfn = cow_get_buffer_page(d); + buffer_page = map_domain_page(buffer_mfn); + +#if COW_DEBUG_OUTPUT + printk("cow: copy page: mapped buffer page\n"); +#endif + + /* Copy page */ +#if COW_DEBUG_OUTPUT + printk("cow: copy guest page\n"); +#endif + memcpy(buffer_page, guest_page, PAGE_SIZE); + + PAGING_DEBUG(COW, + "copied page: mfn %" PRI_mfn + "; pfn %lx; page first chunk (%lx, %lx) from dom %d\n", + guest_mfn, pfn, *((unsigned long*)guest_page), + *((unsigned long*)buffer_page), d->domain_id); + + /* Unmap pages */ + unmap_domain_page(buffer_page); + + /* Replace mfn in ring with pfn */ + cow_ring_lock(d); + + back_ring = &d->arch.paging.cow.back_ring; + + rsp.pfn = pfn; + memcpy(RING_GET_RESPONSE(back_ring, back_ring->rsp_prod_pvt), + &rsp, sizeof(rsp)); + + /* Update number of pages copied */ + back_ring->rsp_prod_pvt++; + RING_PUSH_RESPONSES(back_ring); + + cow_ring_unlock(d); +} + +/* Save a page into a buffer */ +static void cow_save_page(struct domain *d, unsigned long guest_mfn) +{ + void *guest_page; + + guest_page = map_domain_page(guest_mfn); + cow_copy_page(d, guest_mfn, guest_page); + unmap_domain_page(guest_page); +} + +static void cow_pause_domain(struct domain *d, unsigned long guest_mfn, + bool_t is_pre_dirty) +{ +#if !COW_DOMAIN_PAUSE + struct vcpu *v; +#endif + + if ( d->arch.paging.cow.is_paused ) + { + PAGING_DEBUG(COW, + "domain already paused domain %d; mfn: %" PRI_mfn "\n", + d->domain_id, guest_mfn); + return; + } + + d->arch.paging.cow.is_paused = 1; + d->arch.paging.cow.is_paused_pre_dirty = is_pre_dirty; + d->arch.paging.cow.paused_guest_mfn = guest_mfn; + + PAGING_DEBUG(COW, + "not enough buffer space, pausing domain %d; mfn: %" + PRI_mfn "\n", d->domain_id, guest_mfn); + + printk("cow: 
pausing domain\n"); + +#if COW_DOMAIN_PAUSE + domain_pause(d); +#else + atomic_inc(&d->pause_count); + + for_each_vcpu( d, v ) +// vcpu_pause_nosync(v); + vcpu_sleep_nosync(v); +#endif + + tasklet_schedule(&cow_notify_dom0_pause_tasklet); +} + +static int cow_new_snapshot(struct domain *d) +{ + int ret; + + ret = paging_alloc_cow_bitmap(&d->arch.paging.cow.precow_foreign_bitmap, + d->arch.paging.cow.bitmap_size); + if ( unlikely(ret != 0) ) + goto free_log_dirty; + + ret = paging_alloc_cow_bitmap(&d->arch.paging.cow.bitmap, + d->arch.paging.cow.bitmap_size); + if ( unlikely(ret != 0) ) + goto free_precow_foreign; + +#if COW_DEBUG_OUTPUT + printk("cow enabled for dom %d\n", d->domain_id); +#endif + + return 0; + + free_precow_foreign: + paging_free_cow_bitmap(&d->arch.paging.cow.precow_foreign_bitmap); + free_log_dirty: + /* FIXME: This probably shouldn't be here any more... */ + paging_free_log_dirty_bitmap(d); + return ret; +} + +/* Check to make sure there's enough space in the buffer to continue */ +static int cow_check_threshold(struct domain *d) +{ + RING_IDX req_prod; + RING_IDX req_cons; + RING_IDX free_slots; + + req_prod = d->arch.paging.cow.back_ring.sring->req_prod; + req_cons = d->arch.paging.cow.back_ring.req_cons; + + if ( unlikely(d->arch.paging.cow.is_paused) ) + { + printk("cow_paging: check_threshold: domain still paused\n"); + return -EBUSY; + } + + free_slots = req_prod - req_cons; + + if ( unlikely(free_slots < XEN_COW_RING_THRESHOLD) ) + { + printk("cow_paging: check_threshold: no space left: req_prod = %d;" + "req_cons = %d; free_slots = %d\n", + req_prod, req_cons, free_slots); + return -ENOSPC; + } + + /* Notify ring buffer consumer that we've crossed the high water mark */ + if ( !d->arch.paging.cow.notified_high_water + && (free_slots < d->arch.paging.cow.ring_high_water) ) + { +#if COW_DEBUG_OUTPUT + printk("cow: check_threshold: passed high water mark\n"); +#endif + + d->arch.paging.cow.notified_high_water = 1; + 
cow_notify_dom0_high_water_tasklet.data = (unsigned long)&d->arch.paging.cow.notified_high_water; + tasklet_schedule(&cow_notify_dom0_high_water_tasklet); + } + + return 0; +} + +static int cow_save_state(struct domain *d) +{ + struct vcpu *v; + void *vcpu_page; + int i = 0; + +#if COW_DEBUG_OUTPUT + printk("cow: save state: start\n"); +#endif + + vcpu_page = xmalloc_bytes(PAGE_SIZE); + if ( unlikely(vcpu_page == NULL) ) + return -ENOMEM; + +#if COW_DEBUG_OUTPUT + printk("cow: save state: allocated page\n"); +#endif + + memset(vcpu_page, 0, PAGE_SIZE); + +#if COW_DEBUG_OUTPUT + printk("cow: save state: cleared page\n"); +#endif + + /* Save state for each vcpu */ + for_each_vcpu(d, v) + { + void *p = vcpu_page + (i * sizeof(v->arch.guest_context.user_regs)); + memcpy(p, &v->arch.guest_context.user_regs, + sizeof(v->arch.guest_context.user_regs)); + + i++; + } + +#if COW_DEBUG_OUTPUT + printk("cow: save state: copied CPU info\n"); +#endif + +#if 0 + cow_copy_page(d, STATE_MFN, d->shared_info); +#endif + cow_copy_page(d, STATE_MFN, vcpu_page); + +#if COW_DEBUG_OUTPUT + printk("cow: save state: copied pages\n"); +#endif + + xfree(vcpu_page); + +#if COW_DEBUG_OUTPUT + printk("cow: save state: done\n"); +#endif + + return 0; +} + +static int cow_scan_foreign_mapping(struct domain *d) +{ + RING_IDX req_prod; + RING_IDX req_cons; + int free_slots; + unsigned int num_entries; + unsigned int i; + int ret; + + ASSERT(d->arch.paging.cow.precow_foreign_bitmap != NULL); + + /* Get the grant table lock */ + spin_lock(&d->grant_table->lock); + + /* Find active entries */ + num_entries = 0; + for ( i = 0; i < nr_grant_entries(d->grant_table); i++ ) + { + struct active_grant_entry *act = &active_entry(d->grant_table, i); + + /* XXX: Is pin guaranteed to be zero for an inactive grant? */ + /* XXX: Do I need to worry about device mapping? 
*/ + if ( act->pin & GNTPIN_hstw_mask || act->pin & GNTPIN_devw_mask ) + num_entries++; + } + + /* Make sure there's enough buffer space for this */ + req_prod = d->arch.paging.cow.back_ring.sring->req_prod; + req_cons = d->arch.paging.cow.back_ring.req_cons; + free_slots = req_prod - req_cons; + + ret = -ENOSPC; + if ( unlikely(free_slots < num_entries + XEN_COW_RING_THRESHOLD) ) + { + printk("cow_paging: scan_foreign: not enough space left\n"); + d->arch.paging.cow.is_paused_scan_foreign = 1; + goto out; + } + + /* For each entry in the active list, save the page */ + for ( i = 0; i < nr_grant_entries(d->grant_table); i++ ) + { + struct active_grant_entry *act = &active_entry(d->grant_table, i); + + if ( act->pin & GNTPIN_hstw_mask || act->pin & GNTPIN_devw_mask ) + { + mfn_t gmfn; + unsigned long pfn; + + gmfn = _mfn(act->frame); + + /* We /really/ mean PFN here, even for non-translated guests. */ + pfn = get_gpfn_from_mfn(mfn_x(gmfn)); + + ASSERT(VALID_M2P(pfn)); + ASSERT(mfn_valid(gmfn)); + + /* Set the bit in the precow bitmap */ +#if 1 + __set_bit(pfn, d->arch.paging.cow.precow_foreign_bitmap); +#else + set_bit(pfn, d->arch.paging.cow.precow_foreign_bitmap); +#endif + + /* + * If we have mapping with other domain, we won't be able + * to coordinate with them so just save page to be safe + */ + cow_save_page(d, act->frame); + } + } + + ret = 0; + out: + /* Release lock */ + spin_unlock(&d->grant_table->lock); + return ret; +} + +static int cow_take_snapshot(struct domain *d) +{ + int ret; + +#if COW_DEBUG_OUTPUT + printk("cow checking threshold\n"); +#endif + + ret = cow_check_threshold(d); + if ( unlikely(ret != 0) ) + return ret; + +#if COW_DEBUG_OUTPUT + printk("cow new snapshot\n"); +#endif + + ret = cow_new_snapshot(d); + if ( unlikely(ret != 0) ) + return ret; + +#if COW_DEBUG_OUTPUT + printk("cow saving state\n"); +#endif + + ret = cow_save_state(d); + if ( unlikely(ret != 0) ) + return ret; + +#if COW_DEBUG_OUTPUT + printk("cow scan foreign\n"); 
+#endif + + /* + * Fill the precow bitmap by scanning the active grant list. + * We are racing with devices here, so we need to coordinate + * We will probably only coordinate with dom0. + */ + cow_scan_foreign_mapping(d); + +#if COW_DEBUG_OUTPUT + printk("cow snapshot taken\n"); +#endif + + return 0; +} + +void cow_init(struct domain *d) +{ + cow_lock_init(d); + cow_ring_lock_init(d); + disable_cow(d); +} + +void cow_teardown(struct domain *d) +{ + cow_lock(d); + paging_free_cow(d); + cow_unlock(d); +} + +int cow_enable(struct domain *d) +{ + int ret; + + cow_lock(d); + + ret = -EINVAL; + if ( cow_enabled(d) ) + goto out; + + d->arch.paging.cow.bitmap_size = + (domain_get_maximum_gpfn(d) + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); + + /* 50% high water mark */ + d->arch.paging.cow.ring_high_water = RING_SIZE(&d->arch.paging.cow.back_ring) >> 1; + d->arch.paging.cow.notified_high_water = 0; + + d->arch.paging.cow.is_paused = 0; + d->arch.paging.cow.is_paused_pre_dirty = 0; + d->arch.paging.cow.is_paused_scan_foreign = 0; + d->arch.paging.cow.paused_guest_mfn = 0; + + enable_cow(d); + + ret = 0; + + out: + cow_unlock(d); + return ret; +} + +void cow_disable(struct domain *d) +{ + printk("cow: disable cow for domain %d\n", d->domain_id); + + disable_cow(d); + + cow_lock(d); + paging_free_cow(d); + cow_unlock(d); +} + +/* Take proper action when a page is mapped writable in a foreign domain */ +void cow_pre_dirty(struct domain *d, unsigned long guest_mfn) +{ + unsigned long pfn; + mfn_t gmfn; + int rc; + + gmfn = _mfn(guest_mfn); + + /* We /really/ mean PFN here, even for non-translated guests. 
*/ + pfn = get_gpfn_from_mfn(mfn_x(gmfn)); + + BUG_ON(!VALID_M2P(pfn)); + + cow_lock(d); + + ASSERT(d->arch.paging.cow.precow_foreign_bitmap != NULL); + ASSERT(d->arch.paging.cow.bitmap != NULL); + BUG_ON( test_bit(pfn, d->arch.paging.cow.precow_foreign_bitmap) ); + + if ( !__test_and_set_bit(pfn, d->arch.paging.cow.bitmap) ) + { +#if COW_DEBUG_OUTPUT + printk("cow: pre dirty: mfn = %lx\n", guest_mfn); +#endif + + rc = cow_check_threshold(d); + if ( rc != 0 ) + { + __clear_bit(pfn, d->arch.paging.cow.bitmap); + cow_pause_domain(d, guest_mfn, 1); + goto out; + } + + cow_save_page(d, guest_mfn); + } + + out: + cow_unlock(d); +} + +void cow_mark_dirty(struct domain *d, unsigned long guest_mfn) +{ + unsigned long pfn; + mfn_t gmfn; + int rc; + + cow_lock(d); + + ASSERT(d->arch.paging.cow.precow_foreign_bitmap != NULL); + ASSERT(d->arch.paging.cow.bitmap != NULL); + + gmfn = _mfn(guest_mfn); + + /* We /really/ mean PFN here, even for non-translated guests. */ + pfn = get_gpfn_from_mfn(mfn_x(gmfn)); + + /* + * Values with the MSB set denote MFNs that aren't really part of the + * domain's pseudo-physical memory map (e.g., the shared info frame). + * Nothing to do here... + */ + if ( unlikely(!VALID_M2P(pfn)) ) + goto out; + + /* Test saved_page bitmap */ +#if 0 +#if COW_DEBUG_OUTPUT + if ( test_bit(pfn, d->arch.paging.cow.bitmap) ) + printk("cow: already marked dirty: mfn = %lx\n", guest_mfn); +#endif +#endif + + /* Test precow bitmap */ + if ( test_bit(pfn, d->arch.paging.cow.precow_foreign_bitmap) ) + { + /* + * This is either a ring page (ok) or + * the guest is racing with a device to + * write to the page but since we are racing + * with a device, we can't really save the page either. + * In the latter case, the checkpoint will most + * likely be broken. 
+ */ + PAGING_DEBUG(COW, + "write to precow foreign page %" PRI_mfn + " (pfn=%lx), dom %d\n", + mfn_x(gmfn), pfn, d->domain_id); + + /* + * We clear this bit, since the state of the page is now defined + * and part of the snapshot, so we want to protect the page if we + * write to it. + */ + __clear_bit(pfn, d->arch.paging.cow.precow_foreign_bitmap); + + if ( !__test_and_set_bit(pfn, d->arch.paging.cow.bitmap) ) + { + PAGING_DEBUG(COW, + "marked precow foreign mfn %" + PRI_mfn " (pfn=%lx), dom %d\n", + mfn_x(gmfn), pfn, d->domain_id); + } + } + else if ( !__test_and_set_bit(pfn, d->arch.paging.cow.bitmap) ) + { + /* Save the page */ +#if COW_DEBUG_OUTPUT + printk("cow: mark dirty: mfn = %lx\n", guest_mfn); +#endif + rc = cow_check_threshold(d); + if ( rc != 0 ) + { + __clear_bit(pfn, d->arch.paging.cow.bitmap); + cow_pause_domain(d, guest_mfn, 0); + goto out; + } + + cow_save_page(d, guest_mfn); + } + + out: + cow_unlock(d); +} + +int cow_snapshot(struct domain *d, RING_IDX *when) +{ + int ret; + + /* FIXME: Try not to pause/unpause all the time */ + domain_pause(d); + + /* FIXME: Try not to disable/enable log dirty all the time */ + if ( cow_enabled(d) ) + paging_log_dirty_disable(d); + + if ( !cow_enabled(d) ) + { + ret = paging_log_dirty_enable(d, 1); + if ( ret != 0 ) + goto out; + } + + cow_lock(d); + *when = d->arch.paging.cow.back_ring.sring->rsp_prod; +#if COW_DEBUG_OUTPUT + printk("cow: snapshot: when = %d\n", *when); +#endif + ret = cow_take_snapshot(d); +#if COW_DEBUG_OUTPUT + printk("cow: snapshot: took snapshot = %d\n", ret); +#endif + cow_unlock(d); + + out: + domain_unpause(d); + + return ret; +} + +int cow_resume(struct domain *d) +{ +#if !COW_DOMAIN_PAUSE + struct vcpu *v; +#endif + int ret; + + cow_lock(d); + + ret = -EINVAL; + if ( !cow_enabled(d) ) + goto out; + + if ( d->arch.paging.cow.is_paused == 0 ) + { + ret = 0; + goto out; + } + + d->arch.paging.cow.is_paused = 0; + ret = cow_check_threshold(d); + if ( ret != 0 ) + { + 
d->arch.paging.cow.is_paused = 1; + goto out; + } + + if ( d->arch.paging.cow.is_paused_pre_dirty ) + { + d->arch.paging.cow.is_paused_pre_dirty = 0; + cow_unlock(d); + cow_pre_dirty(d, d->arch.paging.cow.paused_guest_mfn); + } + else if ( d->arch.paging.cow.is_paused_scan_foreign ) + { + d->arch.paging.cow.is_paused_scan_foreign = 0; + cow_scan_foreign_mapping(d); + cow_unlock(d); + } + else + { + cow_unlock(d); + cow_mark_dirty(d, d->arch.paging.cow.paused_guest_mfn); + } + + d->arch.paging.cow.paused_guest_mfn = 0; + +#if COW_DOMAIN_PAUSE + domain_unpause(d); +#else + if ( atomic_dec_and_test(&d->pause_count) ) + for_each_vcpu( d, v ) +// vcpu_unpause(v); + vcpu_wake(v); +#endif + + return 0; + + out: + cow_unlock(d); + return ret; +} diff -r 0477f9061c8a xen/arch/x86/mm/paging.c --- a/xen/arch/x86/mm/paging.c Fri Mar 20 17:42:46 2009 +0000 +++ b/xen/arch/x86/mm/paging.c Mon Apr 20 10:21:49 2009 -0700 @@ -26,8 +26,10 @@ #include <asm/p2m.h> #include <asm/hap.h> #include <asm/guest_access.h> +#include <asm/cow.h> #include <xen/numa.h> #include <xsm/xsm.h> +#include <xen/grant_table.h> #define hap_enabled(d) (is_hvm_domain(d) && (d)->arch.hvm_domain.hap_enabled) @@ -158,7 +160,7 @@ { d->arch.paging.log_dirty.allocs--; free_domheap_page(mfn_to_page(mfn)); -} +} void paging_free_log_dirty_bitmap(struct domain *d) { @@ -207,7 +209,7 @@ d->arch.paging.log_dirty.failed_allocs = 0; } -int paging_log_dirty_enable(struct domain *d) +int paging_log_dirty_enable(struct domain *d, bool_t enable_cow) { int ret; @@ -226,6 +228,9 @@ paging_free_log_dirty_bitmap(d); goto out; } + + if ( enable_cow ) + cow_enable(d); log_dirty_unlock(d); @@ -253,11 +258,33 @@ ret = d->arch.paging.log_dirty.disable_log_dirty(d); log_dirty_lock(d); if ( !paging_mode_log_dirty(d) ) + { paging_free_log_dirty_bitmap(d); + + if ( cow_enabled(d) ) + cow_disable(d); + } log_dirty_unlock(d); domain_unpause(d); return ret; +} + +void paging_pre_dirty(struct domain *d, unsigned long guest_mfn) +{ + mfn_t 
gmfn; + + gmfn = _mfn(guest_mfn); + + if ( !paging_mode_log_dirty(d) || !mfn_valid(gmfn) ) + return; + + log_dirty_lock(d); + + if ( cow_enabled(d) ) + cow_pre_dirty(d, guest_mfn); + + log_dirty_unlock(d); } /* Mark a page as dirty */ @@ -327,11 +354,14 @@ unmap_domain_page(l1); if ( changed ) { - PAGING_DEBUG(LOGDIRTY, + PAGING_DEBUG(LOGDIRTY, "marked mfn %" PRI_mfn " (pfn=%lx), dom %d\n", mfn_x(gmfn), pfn, d->domain_id); d->arch.paging.log_dirty.dirty_count++; } + + if ( cow_enabled(d) ) + cow_mark_dirty(d, guest_mfn); out: log_dirty_unlock(d); @@ -471,13 +501,20 @@ d->arch.paging.log_dirty.disable_log_dirty = disable_log_dirty; d->arch.paging.log_dirty.clean_dirty_bitmap = clean_dirty_bitmap; d->arch.paging.log_dirty.top = _mfn(INVALID_MFN); + + cow_init(d); } /* This function fress log dirty bitmap resources. */ void paging_log_dirty_teardown(struct domain*d) { log_dirty_lock(d); + paging_free_log_dirty_bitmap(d); + + if ( cow_enabled(d) ) + cow_teardown(d); + log_dirty_unlock(d); } /************************************************/ @@ -552,11 +589,11 @@ switch ( sc->op ) { case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY: - return paging_log_dirty_enable(d); + return paging_log_dirty_enable(d, 0); case XEN_DOMCTL_SHADOW_OP_ENABLE: if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY ) - return paging_log_dirty_enable(d); + return paging_log_dirty_enable(d, 0); case XEN_DOMCTL_SHADOW_OP_OFF: if ( paging_mode_log_dirty(d) ) diff -r 0477f9061c8a xen/arch/x86/mm/shadow/multi.c --- a/xen/arch/x86/mm/shadow/multi.c Fri Mar 20 17:42:46 2009 +0000 +++ b/xen/arch/x86/mm/shadow/multi.c Mon Apr 20 10:21:49 2009 -0700 @@ -36,6 +36,7 @@ #include <asm/hvm/cacheattr.h> #include <asm/mtrr.h> #include <asm/guest_pt.h> +#include <asm/paging.h> #include "private.h" #include "types.h" @@ -4598,6 +4599,8 @@ } #endif + paging_pre_dirty(v->domain, mfn_x(sh_ctxt->mfn1)); + /* Unaligned writes mean probably this isn't a pagetable */ if ( vaddr & (bytes - 1) ) sh_remove_shadows(v, 
sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ ); @@ -4623,6 +4626,8 @@ MAPPING_EXCEPTION : (mfn_x(sh_ctxt->mfn2) == READONLY_GFN) ? MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE); + + paging_pre_dirty(v->domain, mfn_x(sh_ctxt->mfn2)); /* Cross-page writes mean probably not a pagetable */ sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ ); diff -r 0477f9061c8a xen/common/grant_table.c --- a/xen/common/grant_table.c Fri Mar 20 17:42:46 2009 +0000 +++ b/xen/common/grant_table.c Mon Apr 20 10:21:49 2009 -0700 @@ -352,6 +352,15 @@ goto undo_out; } + /* + * If the mapping is writable, do something before the page is mapped. + * We may end up doing something useless if the mapping fails but + * otherwise we could end up racing with the guest + * (although very unlikely) + */ + if ( !(op->flags & GNTMAP_readonly) ) + gnttab_pre_dirty(rd, frame); + rc = create_grant_host_mapping( op->host_addr, frame, op->flags, cache_flags); if ( rc != GNTST_okay ) @@ -582,7 +591,7 @@ /* If just unmapped a writable mapping, mark as dirtied */ if ( !(op->flags & GNTMAP_readonly) ) - gnttab_mark_dirty(rd, op->frame); + gnttab_post_dirty(rd, op->frame); unmap_out: op->status = rc; @@ -1255,7 +1264,7 @@ } else { - gnttab_mark_dirty(rd, r_frame); + gnttab_post_dirty(rd, r_frame); act->pin -= GNTPIN_hstw_inc; if ( !(act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) ) @@ -1444,6 +1453,8 @@ goto error_out; } + gnttab_pre_dirty(dd, d_frame); + sp = map_domain_page(s_frame); dp = map_domain_page(d_frame); @@ -1452,7 +1463,7 @@ unmap_domain_page(dp); unmap_domain_page(sp); - gnttab_mark_dirty(dd, d_frame); + gnttab_post_dirty(dd, d_frame); put_page_and_type(mfn_to_page(d_frame)); error_out: diff -r 0477f9061c8a xen/include/asm-x86/cow.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/include/asm-x86/cow.h Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,83 @@ +/****************************************************************************** + * include/asm-x86/cow.h + * + * Common interface for cow 
support. + * + * Copyright (c) 2009 University of British Columbia (Patrick Colp) + * Parts based on earlier work by Geoffrey Lefebvre + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#ifndef __COW_H__ +#define __COW_H__ + + +#include <xen/sched.h> + + +/* Flag used for CoW debug */ +#define PAGING_DEBUG_COW 1 + + +/* CoW helper functions */ +#define cow_enabled(_d) ((_d)->is_cow) +#define enable_cow(_d) ((_d)->is_cow = 1) +#define disable_cow(_d) ((_d)->is_cow = 0) + +/* CoW lock */ +#define cow_ring_lock_init(_d) spin_lock_init(&(_d)->arch.paging.cow.ring_lock) +#define cow_ring_lock(_d) spin_lock(&(_d)->arch.paging.cow.ring_lock) +#define cow_ring_unlock(_d) spin_unlock(&(_d)->arch.paging.cow.ring_lock) + + +/* Enable CoW */ +int cow_enable(struct domain *d); + +/* Disable CoW */ +void cow_disable(struct domain *d); + +/* CoW initialisation */ +void cow_init(struct domain *d); + +/* CoW teardown */ +void cow_teardown(struct domain *d); + +/* Take a CoW snapshot */ +int cow_snapshot(struct domain *d, RING_IDX *when); + +/* Resume a domain paused because of CoW (buffer was full) */ +int cow_resume(struct domain *d); + +/* We use the mapping and unmapping of the page as a conservative boundary + * on the page being written to by the foreign domain */ +void cow_pre_dirty(struct domain *d, 
unsigned long guest_mfn); + +/* Copy pages out and mark them as dirty so they don't get copied again */ +void cow_mark_dirty(struct domain *d, unsigned long guest_mfn); + + +#endif /* __COW_H__ */ + + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 0477f9061c8a xen/include/asm-x86/domain.h --- a/xen/include/asm-x86/domain.h Fri Mar 20 17:42:46 2009 +0000 +++ b/xen/include/asm-x86/domain.h Mon Apr 20 10:21:49 2009 -0700 @@ -6,6 +6,7 @@ #include <asm/hvm/vcpu.h> #include <asm/hvm/domain.h> #include <asm/e820.h> +#include <public/io/cow.h> #define has_32bit_shinfo(d) ((d)->arch.has_32bit_shinfo) #define is_pv_32bit_domain(d) ((d)->arch.is_32bit_pv) @@ -149,6 +150,41 @@ }; /************************************************/ +/* copy-on-write */ +/************************************************/ +struct cow_domain { + /* cow lock */ + spinlock_t lock; + int locker; /* processor that holds the lock */ + const char *locker_function; /* func that took it */ + + /* ring lock */ + spinlock_t ring_lock; + + /* size of the cow bitmaps */ + unsigned long bitmap_size; + + /* cow bitmap to record foreign pages before cow was enabled */ + unsigned long *precow_foreign_bitmap; + + /* cow bitmap to record pages that have been saved */ + unsigned long *bitmap; + + /* back-end ring for reading mfns and storing pfns */ + cow_back_ring_t back_ring; + + /* high water mark for ring */ + RING_IDX ring_high_water; + bool_t notified_high_water; + + /* paused domain */ + bool_t is_paused; + bool_t is_paused_pre_dirty; + bool_t is_paused_scan_foreign; + unsigned long paused_guest_mfn; +}; + +/************************************************/ /* common paging data structure */ /************************************************/ struct log_dirty_domain { @@ -181,6 +217,8 @@ struct hap_domain hap; /* log dirty support */ struct log_dirty_domain log_dirty; + /* cow support */ + struct cow_domain cow; }; struct 
paging_vcpu { diff -r 0477f9061c8a xen/include/asm-x86/grant_table.h --- a/xen/include/asm-x86/grant_table.h Fri Mar 20 17:42:46 2009 +0000 +++ b/xen/include/asm-x86/grant_table.h Mon Apr 20 10:21:49 2009 -0700 @@ -31,7 +31,8 @@ #define gnttab_shared_gmfn(d, t, i) \ (mfn_to_gmfn(d, gnttab_shared_mfn(d, t, i))) -#define gnttab_mark_dirty(d, f) paging_mark_dirty((d), (f)) +#define gnttab_pre_dirty(d, f) paging_pre_dirty((d), (f)) +#define gnttab_post_dirty(d, f) paging_mark_dirty((d), (f)) static inline void gnttab_clear_flag(unsigned long nr, uint16_t *addr) { diff -r 0477f9061c8a xen/include/asm-x86/paging.h --- a/xen/include/asm-x86/paging.h Fri Mar 20 17:42:46 2009 +0000 +++ b/xen/include/asm-x86/paging.h Mon Apr 20 10:21:49 2009 -0700 @@ -140,7 +140,7 @@ void paging_free_log_dirty_bitmap(struct domain *d); /* enable log dirty */ -int paging_log_dirty_enable(struct domain *d); +int paging_log_dirty_enable(struct domain *d, bool_t enable_cow); /* disable log dirty */ int paging_log_dirty_disable(struct domain *d); @@ -152,6 +152,7 @@ void (*clean_dirty_bitmap)(struct domain *d)); /* mark a page as dirty */ +void paging_pre_dirty(struct domain *d, unsigned long guest_mfn); void paging_mark_dirty(struct domain *d, unsigned long guest_mfn); /* diff -r 0477f9061c8a xen/include/public/domctl.h --- a/xen/include/public/domctl.h Fri Mar 20 17:42:46 2009 +0000 +++ b/xen/include/public/domctl.h Mon Apr 20 10:21:49 2009 -0700 @@ -33,6 +33,7 @@ #endif #include "xen.h" +#include "io/ring.h" #define XEN_DOMCTL_INTERFACE_VERSION 0x00000005 @@ -645,6 +646,41 @@ } xen_domctl_hvmcontext_partial_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_partial_t); +/* FIXME: use types instead of different domctls */ +/* + * Enable/disable Copy-on-write for a domain. 
+ */ +#define XEN_DOMCTL_cow_enable 56 +#define XEN_DOMCTL_cow_snapshot 57 +#define XEN_DOMCTL_cow_resume 58 +#define XEN_DOMCTL_cow_disable 59 +#define XEN_DOMCTL_cow_page_type 60 + +struct xen_domctl_cow_enable { + /* IN: mfn of the ring buffer */ + unsigned long mfn; +}; +typedef struct xen_domctl_cow_enable xen_domctl_cow_enable_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_cow_enable_t); + +struct xen_domctl_cow_snapshot { + /* OUT: when the snapshot took place (rsp_prod) */ + RING_IDX when; +}; +typedef struct xen_domctl_cow_snapshot xen_domctl_cow_snapshot_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_cow_snapshot_t); + +struct xen_domctl_cow_page_type { + /* IN: mfn of the page */ + unsigned long mfn; + /* OUT: count info */ + unsigned long count_info; + /* OUT: page type info */ + unsigned long type_info; +}; +typedef struct xen_domctl_cow_page_type xen_domctl_cow_page_type_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_cow_page_type_t); + struct xen_domctl { uint32_t cmd; @@ -687,6 +723,9 @@ struct xen_domctl_set_target set_target; struct xen_domctl_subscribe subscribe; struct xen_domctl_debug_op debug_op; + struct xen_domctl_cow_enable cow_enable; + struct xen_domctl_cow_snapshot cow_snapshot; + struct xen_domctl_cow_page_type cow_page_type; #if defined(__i386__) || defined(__x86_64__) struct xen_domctl_cpuid cpuid; #endif diff -r 0477f9061c8a xen/include/public/io/cow.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/include/public/io/cow.h Mon Apr 20 10:21:49 2009 -0700 @@ -0,0 +1,82 @@ +/***************************************************************************** + * cow.h + * + * CoW common structures + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit 
persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (C) 2009 University of British Columbia (Patrick Colp) + */ + +#ifndef _XEN_PUBLIC_IO_COW_H +#define _XEN_PUBLIC_IO_COW_H + + +#include "ring.h" + + +#define RING_MASK(_r, _i) ((_i) & (RING_SIZE(_r) - 1)) + + +#define XEN_COW_IOC_MAGIC 'w' +#define XEN_COW_IOCTL_INIT _IO(XEN_COW_IOC_MAGIC, 1) + +#define XEN_COW_RING_PAGES 1 /* TODO: 2+ pages? */ +#define XEN_COW_RING_SIZE (XEN_COW_RING_PAGES << PAGE_SHIFT) + +#define XEN_COW_RING_THRESHOLD 16 + + +/* Some definitions for the XenCow ring buffer. */ +typedef struct cow_request_st { + unsigned long mfn; +} cow_request_t; + +typedef struct cow_response_st { + unsigned long pfn; +} cow_response_t; + + +DEFINE_RING_TYPES(cow, cow_request_t, cow_response_t); + + +/* + * The structure used to initialise CoW. 
+ */ +typedef struct cow_init_st { + /* Start address of buffer */ + unsigned long addr; + /* Number of frames in buffer */ + int num_mfns; + /* MFNs of buffer frames */ + unsigned long mfns[]; +} cow_init_t; + + +#endif /* _XEN_PUBLIC_IO_COW_H */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 0477f9061c8a xen/include/public/xen.h --- a/xen/include/public/xen.h Fri Mar 20 17:42:46 2009 +0000 +++ b/xen/include/public/xen.h Mon Apr 20 10:21:49 2009 -0700 @@ -143,6 +143,8 @@ #define VIRQ_DEBUGGER 6 /* G. (DOM0) A domain has paused for debugging. */ #define VIRQ_XENOPROF 7 /* V. XenOprofile interrupt: new sample available */ #define VIRQ_CON_RING 8 /* G. (DOM0) Bytes received on console */ +#define VIRQ_COW_BUFFER 9 /* G. (DOM0) CoW buffer has pages available */ +#define VIRQ_COW_PAUSE 10 /* G. (DOM0) CoW domain has been paused */ /* Architecture-specific VIRQ definitions. */ #define VIRQ_ARCH_0 16 diff -r 0477f9061c8a xen/include/xen/sched.h --- a/xen/include/xen/sched.h Fri Mar 20 17:42:46 2009 +0000 +++ b/xen/include/xen/sched.h Mon Apr 20 10:21:49 2009 -0700 @@ -223,6 +223,8 @@ bool_t is_paused_by_controller; /* Domain's VCPUs are pinned 1:1 to physical CPUs? */ bool_t is_pinned; + /* Is this guest doing CoW? */ + bool_t is_cow; /* Are any VCPUs polling event channels (SCHEDOP_poll)? */ DECLARE_BITMAP(poll_mask, MAX_VIRT_CPUS); _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |