[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-API] [PATCH 4 of 4] CA-33707: fix queueing deadlocking bug by never entering a queue with a lock held



# HG changeset patch
# User David Scott <dave.scott@xxxxxxxxxxxxx>
# Date 1260486290 0
# Node ID 9edc8c86f01dd5e951b944b5424e5b9383d0780c
# Parent  ca92f46da128588874c6c660aef6409adae119dd
CA-33707: fix queueing deadlocking bug by never entering a queue with a lock 
held.

We now request domain shutdown without the per-VM mutex held. This means these 
may race with the background event thread performing a domain destruction.

The domain destruction/recreation part of {clean,hard}_{shutdown,reboot} is 
placed in the domU_shutdown_queue by both the synchronous API path and the 
event thread.

We remove the Vmops.Domain_shutdown_for_wrong_reason exception and replace it 
with VM_{CRASHED,REBOOTED,HALTED} exceptions. We may yet be able to remove 
these "errors" completely.

Work around many instances of VM_FAILED_SHUTDOWN_ACK by reissuing the shutdown 
request every few seconds until the timeout expires.

If VM.{clean,hard}_reboot runs in parallel with an internal domain reboot then 
only one reboot will probably happen.
If VM.{clean,hard}_shutdown runs in parallel with an internal domain reboot 
then up to 10 retries to shut the VM down will be attempted.

Add FIST points to:
1. disable the event thread's handling of @releaseDomain
2. disable the synchronous API calls handling of domain destruction/recreation
3. disable the artificial VM reboot delay
4. simulate an internal shutdown (via Xc.domain_shutdown)

Add a series of tests to quicktest which run every combination of
* VM.{clean,hard}_{shutdown,reboot}
* parallel internal halt,reboot,crash
* synchronous thread only, event thread only, both

Signed-off-by: David Scott <dave.scott@xxxxxxxxxxxxx>

diff -r ca92f46da128 -r 9edc8c86f01d ocaml/idl/api_errors.ml
--- a/ocaml/idl/api_errors.ml   Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/idl/api_errors.ml   Thu Dec 10 23:04:50 2009 +0000
@@ -136,6 +136,9 @@
 let vm_shutdown_timeout = "VM_SHUTDOWN_TIMEOUT"
 let vm_duplicate_vbd_device = "VM_DUPLICATE_VBD_DEVICE"
 let vm_not_resident_here = "VM_NOT_RESIDENT_HERE"
+let vm_crashed = "VM_CRASHED"
+let vm_rebooted = "VM_REBOOTED"
+let vm_halted = "VM_HALTED"
 let vms_failed_to_cooperate = "VMS_FAILED_TO_COOPERATE"
 let domain_exists = "DOMAIN_EXISTS"
 let cannot_reset_control_domain = "CANNOT_RESET_CONTROL_DOMAIN"
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/idl/datamodel.ml
--- a/ocaml/idl/datamodel.ml    Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/idl/datamodel.ml    Thu Dec 10 23:04:50 2009 +0000
@@ -618,6 +618,12 @@
     ~doc:"VM didn't acknowledge the need to shutdown." ();
   error Api_errors.vm_shutdown_timeout [ "vm"; "timeout" ]
     ~doc:"VM failed to shutdown before the timeout expired" ();
+  error Api_errors.vm_crashed [ "vm" ]
+         ~doc:"The VM crashed" ();
+  error Api_errors.vm_rebooted [ "vm" ]
+         ~doc:"The VM unexpectedly rebooted" ();
+  error Api_errors.vm_halted [ "vm" ]
+         ~doc:"The VM unexpectedly halted" ();
   error Api_errors.bootloader_failed [ "vm"; "msg" ]
     ~doc:"The bootloader returned an error" ();
   error Api_errors.unknown_bootloader [ "vm"; "bootloader" ]
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/OMakefile
--- a/ocaml/xapi/OMakefile      Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/OMakefile      Thu Dec 10 23:04:50 2009 +0000
@@ -48,7 +48,7 @@
 
 #OCamlProgram(upload_receive, $(COMMON) fileupload upload_receive)
 
-OCamlProgram(quicktestbin, quicktest quicktest_common quicktest_ocamltest 
quicktest_storage quicktest_http quicktest_encodings quicktest_vm_placement 
vm_placement ../xenops/squeeze_test quicktest_vm_memory_constraints 
../util/vm_memory_constraints)
+OCamlProgram(quicktestbin, quicktest quicktest_common quicktest_ocamltest 
quicktest_storage quicktest_http quicktest_encodings quicktest_vm_placement 
vm_placement ../xenops/squeeze_test quicktest_vm_memory_constraints 
../util/vm_memory_constraints quicktest_lifecycle)
 OCamlProgram(stresstest, stresstest)
 OCamlProgram(fakeguestagent, fakeguestagent)
 
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/events.ml
--- a/ocaml/xapi/events.ml      Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/events.ml      Thu Dec 10 23:04:50 2009 +0000
@@ -121,15 +121,19 @@
        (try Db.VM.remove_from_other_config ~__context ~self:vm 
~key:Xapi_globs.last_artificial_reboot_delay_key with _ -> ());
        Db.VM.add_to_other_config ~__context ~self:vm 
~key:Xapi_globs.last_artificial_reboot_delay_key ~value:(string_of_int 2);
        0 in
-    debug "Adding artificial delay on reboot for VM: %s. delay time=%d 
seconds" (Ref.string_of vm) delay;
-    Thread.delay (float_of_int delay)
-      
+       if Xapi_fist.disable_reboot_delay ()
+       then debug "FIST: disable_reboot_delay"
+    else begin
+         debug "Adding artificial delay on reboot for VM: %s. delay time=%d 
seconds" (Ref.string_of vm) delay;
+      Thread.delay (float_of_int delay)
+       end
+
   let clear_reboot_delay ~__context ~vm =
     try Db.VM.remove_from_other_config ~__context ~self:vm 
~key:Xapi_globs.last_artificial_reboot_delay_key with _ -> ()
       
   let perform_destroy ~__context ~vm token =
     TaskHelper.set_description ~__context "destroy";
-    Xapi_vm.Shutdown.in_dom0 { Xapi_vm.TwoPhase.__context = __context; vm=vm; 
token=Some token; api_call_name="destroy"; clean=false };
+    Xapi_vm.Shutdown.in_dom0_already_locked { Xapi_vm.TwoPhase.__context = 
__context; vm=vm; api_call_name="destroy"; clean=false };
     update_allowed_ops_using_api ~__context vm
 
   let perform_preserve ~__context ~vm token = 
@@ -523,8 +527,12 @@
                let action_taken = Resync.vm ~__context token vm in
                if action_taken then debug "Action was taken so 
allowed_operations should be updated";             
              in
-             debug "adding Resync.vm to work queue";
-             push vm Local_work_queue.domU_internal_shutdown_queue description 
work_item;
+                 if Xapi_fist.disable_event_lifecycle_path ()
+                 then warn "FIST: disable_event_lifecycle_path: skipping 
Resync.vm"
+                 else begin
+                       debug "adding Resync.vm to work queue";
+                       push vm Local_work_queue.domU_internal_shutdown_queue 
description work_item;
+                 end
           )
        with Vm_corresponding_to_domid_not_in_db domid ->
         error "event could not be processed because VM record not in database"
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/quicktest.ml
--- a/ocaml/xapi/quicktest.ml   Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/quicktest.ml   Thu Dec 10 23:04:50 2009 +0000
@@ -619,6 +619,7 @@
     end;
     vbd_pause_unpause_test s debian;
     powercycle_test s debian;
+       Quicktest_lifecycle.test s debian;
     vm_uninstall test s debian;  
     success test
   with Unable_to_find_suitable_debian_template ->
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/quicktest_lifecycle.ml
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/ocaml/xapi/quicktest_lifecycle.ml Thu Dec 10 23:04:50 2009 +0000
@@ -0,0 +1,194 @@
+
+
+type 'a api_call = 
+  | Shutdown of 'a
+  | Reboot of 'a
+
+type api_mode = 
+  | Clean
+  | Hard
+
+type api = api_mode api_call
+
+type parallel_op = 
+  | Internal_reboot
+  | Internal_halt
+  | Internal_suspend
+  | Internal_crash
+
+type code_path = 
+  | Sync
+  | Event
+  | Both
+
+type result = 
+  | Rebooted
+  | Halted
+
+let final_guest_state = function
+  | Shutdown _ -> Halted
+  | Reboot _ -> Rebooted
+
+type test = { 
+       api: api option;
+       parallel_op: parallel_op option;
+       code_path: code_path;
+}
+
+let string_of_result = function
+  | Rebooted -> "Reboot"
+  | Halted -> "Halt"
+
+let expected_result = function
+  | { api = Some (Shutdown _); parallel_op = Some _; code_path = (Sync|Both) } 
-> Some Halted
+  | { api = Some (Reboot _);   parallel_op = Some _; code_path = (Sync|Both) } 
-> Some Rebooted
+  | { api = Some (Shutdown _); parallel_op = None;   code_path = 
(Sync|Event|Both) } -> Some Halted
+  | { api = Some (Reboot _);   parallel_op = None;   code_path = 
(Sync|Event|Both) } -> Some Rebooted
+  | { parallel_op = Some (Internal_halt | Internal_crash); code_path = Event } 
-> Some Halted
+  | { parallel_op = Some Internal_reboot; code_path = Event } -> Some Rebooted
+  
+  | { api = None; parallel_op = Some (Internal_halt (* | Internal_suspend *) | 
Internal_crash); code_path = (Event|Both) } -> Some Halted
+  | { api = None; parallel_op = Some Internal_reboot; code_path = (Event|Both) 
 } -> Some Rebooted
+  | _ -> None (* invalid test *)
+
+
+let string_of_test x = 
+  let string_of_api = function
+       | Shutdown Clean   -> "clean_shutdown"
+       | Shutdown Hard    -> "hard_shutdown "
+       | Reboot Clean     -> "clean_reboot  "
+       | Reboot Hard      -> "hard_reboot   " in
+  let string_of_parallel_op = function
+       | Internal_reboot  -> "reboot        "
+       | Internal_halt    -> "halt          "
+       | Internal_suspend -> "suspend       "
+       | Internal_crash   -> "crash         " in
+  let string_of_code_path = function
+       | Sync             -> "synch         "
+       | Event            -> "event         " 
+       | Both             -> "both          " in
+  let dm f x = match x with 
+       | None             -> "Nothing       " 
+       | Some x           -> f x in
+  Printf.sprintf "%s %s %s -> %s" 
+         (dm string_of_api x.api) (dm string_of_parallel_op x.parallel_op) 
(string_of_code_path x.code_path)
+         (match expected_result x with None -> "invalid" | Some y -> 
string_of_result y)
+open List
+
+let all_possible_tests =
+  let all_api_variants x = 
+       [ { x with api = None };
+         { x with api = Some (Shutdown Clean) };
+         { x with api = Some (Shutdown Hard) };
+         { x with api = Some (Reboot Clean) };
+         { x with api = Some (Reboot Hard) } ] in
+  let all_parallel_op_variants x = 
+       [ { x with parallel_op = None };
+         { x with parallel_op = Some Internal_reboot };
+         { x with parallel_op = Some Internal_halt };
+         { x with parallel_op = Some Internal_suspend };
+         { x with parallel_op = Some Internal_crash } ] in
+  let all_code_path_variants x = 
+       [ { x with code_path = Sync };
+         { x with code_path = Event };
+         { x with code_path = Both } ] in
+
+  let xs = [ { api = None; parallel_op = None; code_path = Sync } ] in
+  concat (map all_code_path_variants (concat (map all_parallel_op_variants 
(concat (map all_api_variants xs)))))
+                       
+let all_valid_tests = List.filter (fun t -> expected_result t <> None) 
all_possible_tests
+
+         (*
+let _ = 
+  List.iter print_endline (map string_of_test all_valid_tests);
+  Printf.printf "In total there are %d tests.\n" (List.length all_valid_tests)
+         *)
+
+open Quicktest_common
+open Client
+open Pervasiveext
+
+let one s debian test = 
+  let t = make_test (string_of_test test) 1 in
+  start t;
+  let event = "/tmp/fist_disable_event_lifecycle_path" in
+  let sync = "/tmp/fist_disable_sync_lifecycle_path" in
+  let simulate = "/tmp/fist_simulate_internal_shutdown" in
+  let delay = "/tmp/fist_disable_reboot_delay" in
+
+  finally
+         (fun () ->
+                  try
+                        begin 
+                          Unixext.unlink_safe simulate;
+                          Unixext.touch_file delay;
+                          match test.code_path with
+                          | Sync ->
+                                        Unixext.unlink_safe sync;
+                                        Unixext.touch_file event
+                          | Event ->
+                                        Unixext.unlink_safe event;
+                                        Unixext.touch_file sync
+                          | Both ->
+                                        Unixext.unlink_safe sync;
+                                        Unixext.unlink_safe event
+                        end;
+                          if Client.VM.get_power_state !rpc s debian = `Halted
+                          then Client.VM.start !rpc s debian false false;
+                          
+                          let call_api = function
+                                | Shutdown Clean -> Client.VM.clean_shutdown 
!rpc s debian
+                                | Shutdown Hard -> Client.VM.hard_shutdown 
!rpc s debian
+                                | Reboot Clean -> Client.VM.clean_reboot !rpc 
s debian
+                                | Reboot Hard -> Client.VM.hard_reboot !rpc s 
debian in
+                          
+                          let domid = Client.VM.get_domid !rpc s debian in
+                          begin match test with
+                          | { api = None; parallel_op = Some x } ->
+                                        let reason = match x with
+                                          | Internal_reboot -> Xc.Reboot
+                                          | Internal_halt -> Xc.Halt
+                                          | Internal_crash -> Xc.Crash
+                                          | Internal_suspend -> Xc.Suspend in
+                                        Xc.with_intf (fun xc -> 
Xc.domain_shutdown xc (Int64.to_int domid) reason)
+                          | { api = Some x; parallel_op = Some y } ->
+                                        let reason = match y with
+                                          | Internal_reboot -> "reboot"
+                                          | Internal_halt -> "halt"
+                                          | Internal_crash -> "crash"
+                                          | Internal_suspend -> "suspend" in
+                                        Unixext.write_string_to_file simulate 
reason;
+                                        call_api x
+                          | { api = Some x; parallel_op = None } ->
+                                        call_api x
+                          | t -> failwith (Printf.sprintf "Invalid test: %s" 
(string_of_test t))
+                          end;
+                          
+                          let wait_for_domid p =
+                                let start = Unix.gettimeofday () in
+                                let finished = ref false in
+                                while Unix.gettimeofday () -. start < 300. && 
(not !finished) do
+                                  finished := p (Client.VM.get_domid !rpc s 
debian);
+                                        if not !finished then Thread.delay 1.
+                                done;
+                                if not !finished then failwith "timeout"
+                          in
+                          
+                          begin match expected_result test with
+                          | None -> failwith (Printf.sprintf "Invalid test: 
%s" (string_of_test test))
+                          | Some Rebooted ->
+                                        wait_for_domid (fun domid' -> domid <> 
domid')
+                          | Some Halted ->
+                                        wait_for_domid (fun domid' -> domid' = 
-1L)
+                          end
+                  with e -> failed t (Printexc.to_string e)
+         )
+         (fun () ->
+                  Unixext.unlink_safe sync;
+                  Unixext.unlink_safe event;
+                  Unixext.unlink_safe delay
+         );
+  success t
+
+let test s debian = 
+  List.iter (one s debian) all_valid_tests
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/vmops.ml
--- a/ocaml/xapi/vmops.ml       Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/vmops.ml       Thu Dec 10 23:04:50 2009 +0000
@@ -777,59 +777,52 @@
     )
 
 
-
-let match_xal_and_shutdown xalreason reason =
-       debug "Comparing XAL %s with Domain %s"
-             (Xal.string_of_died_reason xalreason)
-             (Domain.string_of_shutdown_reason reason);
-       match xalreason, reason with
-       | Xal.Crashed, _ -> false
-       | Xal.Vanished, _ -> false
-       | Xal.Halted, (Domain.Halt | Domain.PowerOff) -> true
-       | Xal.Rebooted, Domain.Reboot -> true
-       | Xal.Suspended, Domain.Suspend -> true
-       | Xal.Shutdown i, Domain.Unknown i2 -> i = i2
-       | _, _ -> false
-
 (** Thrown if clean_shutdown_with_reason exits for the wrong reason: eg the 
domain
     crashed or rebooted *)
 exception Domain_shutdown_for_wrong_reason of Xal.died_reason
 
-(** Tells a VM to shutdown with a specific reason (reboot/halt/poweroff). *)
+(** Tells a VM to shutdown with a specific reason (reboot/halt/poweroff), 
waits for
+    it to shutdown (or vanish) and then return the reason.
+       Note this is not always called with the per-VM mutex. *)
 let clean_shutdown_with_reason ?(at = fun _ -> ()) ~xal ~__context ~self domid 
reason =
   (* Set the task allowed_operations to include cancel *)
   if reason <> Domain.Suspend then TaskHelper.set_cancellable ~__context;
 
   at 0.25;
-  (* Windows PV drivers will respond within 10s according to ssmith and
-     improving this is likely to happen in a Rio timeframe (CA-3964). It's
-     still possible (although unlikely) for us to timeout just before the
-     drivers activate but the worst we'll suffer is a shutdown failure
-     followed by a spontaneous shutdown (which can happen anyway). Having
-     this check in here allows us to bail out quickly in the common case
-     of the PV drivers being missing. *)
-  with_xs (fun xs ->
-            let xc = Xal.xc_of_ctx xal in
-            if not (Domain.shutdown_ack ~timeout:60. ~xc ~xs domid reason) then
-              raise (Api_errors.Server_error 
(Api_errors.vm_failed_shutdown_ack, []))
-         );
+  let xs = Xal.xs_of_ctx xal in
+  let xc = Xal.xc_of_ctx xal in
+  begin
+       (* Wait for up to 60s for the VM to acknowledge the shutdown request. 
In case the guest
+          misses our original request, keep making additional ones. *)
+       let finished = ref false in
+       let timeout = 60.0 in
+       let start = Unix.gettimeofday () in
+       while Unix.gettimeofday () -. start < timeout && not !finished do
+         try
+               (* Make the shutdown request: this will fail if the domain has 
vanished. *)
+               Domain.shutdown ~xs domid reason;
+               (* Wait for any necessary acknowledgement. If we get a 
Watch.Timeout _ then
+                  we abort early; otherwise we continue in Xal.wait_release 
below. *)
+               Domain.shutdown_wait_for_ack ~timeout:10. ~xc ~xs domid reason;
+               finished := true
+         with 
+         | Watch.Timeout _ -> () (* try again *)
+         | e ->
+                       debug "Caught and ignoring exception: %s" 
(ExnHelper.string_of_exn e);
+                       log_backtrace ();
+                       finished := true
+       done;
+       if not !finished then raise (Api_errors.Server_error 
(Api_errors.vm_failed_shutdown_ack, []))
+  end;
   at 0.50;
   let total_timeout = 20. *. 60. in (* 20 minutes *)
   (* Block for 5s at a time, in between check to see whether we've been 
cancelled
      and update our progress if not *)
   let start = Unix.gettimeofday () in
-  let finished = ref false in
-  while (Unix.gettimeofday () -. start < total_timeout) && not(!finished) do
+  let result = ref None in
+  while (Unix.gettimeofday () -. start < total_timeout) && (!result = None) do
     try
-      let r = Xal.wait_release xal ~timeout:5. domid in
-      if not (match_xal_and_shutdown r reason) then begin
-       let errmsg = Printf.sprintf 
-         "Domain died with reason: %s when it should have been %s" 
-         (Xal.string_of_died_reason r) (Domain.string_of_shutdown_reason 
reason) in
-       debug "%s" errmsg;
-       raise (Domain_shutdown_for_wrong_reason r)
-      end;
-      finished := true;
+      result := Some (Xal.wait_release xal ~timeout:5. domid);
     with Xal.Timeout -> 
       if reason <> Domain.Suspend && TaskHelper.is_cancelling ~__context
       then raise (Api_errors.Server_error(Api_errors.task_cancelled, [ 
Ref.string_of (Context.get_task_id __context) ]));
@@ -837,9 +830,11 @@
       let progress = min ((Unix.gettimeofday () -. start) /. total_timeout) 1. 
in
       at (0.50 +. 0.25 *. progress)
   done;
-  if not(!finished)
-  then raise (Api_errors.Server_error(Api_errors.vm_shutdown_timeout, [ 
Ref.string_of self; string_of_float total_timeout ]));
-  at 1.0
+  match !result with
+  | None -> raise (Api_errors.Server_error(Api_errors.vm_shutdown_timeout, [ 
Ref.string_of self; string_of_float total_timeout ]))
+  | Some x ->
+               at 1.0;
+               x
 
 (* !!! FIX ME  - This allows a 10% overhead on static_max for size of suspend 
image !!! *)
 let get_suspend_space __context vm =
@@ -877,9 +872,19 @@
                                                                        
Domain.suspend ~xc ~xs ~hvm domid fd []
                                                                                
~progress_callback:progress_cb
                                                                                
(fun () ->
-                                                                               
        clean_shutdown_with_reason ~xal
+                                                                               
        match clean_shutdown_with_reason ~xal
                                                                                
                ~__context ~self:vm domid
-                                                                               
                Domain.Suspend
+                                                                               
                Domain.Suspend with
+                                                                               
                | Xal.Suspended -> () (* good *)
+                                                                               
                | Xal.Crashed ->
+                                                                               
                          raise (Api_errors.Server_error(Api_errors.vm_crashed, 
[ Ref.string_of vm ]))
+                                                                               
                | Xal.Rebooted ->
+                                                                               
                          raise 
(Api_errors.Server_error(Api_errors.vm_rebooted, [ Ref.string_of vm ]))
+                                                                               
                | Xal.Halted
+                                                                               
                | Xal.Vanished ->
+                                                                               
                          raise (Api_errors.Server_error(Api_errors.vm_halted, 
[ Ref.string_of vm ]))
+                                                                               
                | Xal.Shutdown x ->
+                                                                               
                          failwith (Printf.sprintf "Expected domain shutdown 
reason: %d" x)
                                                                                
)
                                                                );
                                                        (* If the suspend 
succeeds, set the suspend_VDI *)
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/xapi_fist.ml
--- a/ocaml/xapi/xapi_fist.ml   Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/xapi_fist.ml   Thu Dec 10 23:04:50 2009 +0000
@@ -26,6 +26,8 @@
        try
                Some (Unixext.read_whole_file_to_string ("/tmp/fist_" ^ name))
        with _ -> None
+
+let delete name = Unixext.unlink_safe ("/tmp/fist_" ^ name)
 
 (** Insert 2 * Xapi_globs.max_clock_skew into the heartbeat messages *)
 let insert_clock_skew             () = fistpoint "insert_clock_skew"
@@ -94,3 +96,18 @@
 (** Set the expiry date of a v6-license to the one in the file *)
 let set_expiry_date () = fistpoint_read "set_expiry_date"
 
+(** Forces synchronous lifecycle path to defer to the event thread *)
+let disable_sync_lifecycle_path () = fistpoint "disable_sync_lifecycle_path"
+
+(** Forces synchronous lifecycle path by partially disabling the event thread 
*)
+let disable_event_lifecycle_path () = fistpoint "disable_event_lifecycle_path"
+
+(** If set to "reboot" "halt" "suspend" "crash" this will forcibly shutdown 
the domain during reboot/shutdown *)
+let simulate_internal_shutdown () = 
+  let fist = "simulate_internal_shutdown" in
+  let x = fistpoint_read fist in
+  delete fist;
+  x
+
+(** Disables the artificial reboot delay, for faster testing. *)
+let disable_reboot_delay () = fistpoint "disable_reboot_delay"
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/xapi_vm.ml
--- a/ocaml/xapi/xapi_vm.ml     Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/xapi_vm.ml     Thu Dec 10 23:04:50 2009 +0000
@@ -298,7 +298,6 @@
   (** The signature of a single phase of reboot or shutdown *)
   type args = { __context: Context.t;
                vm: API.ref_VM;
-               token: Locking_helpers.token option;
                api_call_name: string;
                clean: bool }
 
@@ -307,31 +306,85 @@
     in_guest : args -> unit;
     in_dom0 : args -> unit;
   }
-  let execute (x: args) (y: t) = 
-    y.in_guest x;
-    y.in_dom0 x
+
+  (** Called with the per-VM lock held. Evaluates to true if the VM has been 
rebooted (eg by the event thread) *)
+  let is_vm_running x =
+       (* The VM may have been rebooted by the event thread: in this case 
there is no work to do *)
+       let domid = Helpers.domid_of_vm x.__context x.vm in
+       true
+       && domid <> -1 (* someone set the state to Halted *)
+       && (with_xc
+                       (fun xc ->
+                                let di = Xc.domain_getinfo xc domid in
+                                Xal.is_running di))
+
+  (** Called before a regular synchronous reboot/shutdown to simulate parallel 
in-guest shutdowns *)
+  let simulate_internal_shutdown domid = 
+       Helpers.log_exn_continue (Printf.sprintf "simulate_internal_shutdown 
domid=%d" domid)
+               (fun () ->
+                        match Xapi_fist.simulate_internal_shutdown () with
+                        | Some x ->
+                                  let x = String.strip String.isspace x in
+                                  with_xc
+                                          (fun xc ->
+                                                       warn "FIST: simulating 
internal %s for domid=%d" x domid;
+                                                       match x with
+                                                       | "reboot" -> 
Xc.domain_shutdown xc domid Xc.Reboot
+                                                       | "halt" -> 
Xc.domain_shutdown xc domid Xc.Halt
+                                                       | "suspend" -> 
Xc.domain_shutdown xc domid Xc.Suspend
+                                                       | "crash" -> 
Xc.domain_shutdown xc domid Xc.Crash
+                                                       | _ -> failwith 
"Unknown simulate_internal_shutdown code");
+                                  (* pause for 5s which probably lets the 
event thread do something (unless it is disabled) *)
+                                  Thread.delay 5.
+                        | None -> ()
+               ) ()
 end
+
 
 
 module Reboot = struct
   (** This module contains the low-level implementation actions, as distinct 
from the tangle
       of policy which comes later. *)
 
-  let in_guest { TwoPhase.__context = __context; vm=vm; token=token; 
api_call_name=api_call_name; clean=clean } =
-    if clean then begin
-      debug "%s phase 0/3: shutting down existing domain" api_call_name;
-      let domid = Helpers.domid_of_vm ~__context ~self:vm in
-      with_xal (fun xal -> Vmops.clean_shutdown_with_reason ~xal
-                  ~at:(fun x -> TaskHelper.set_progress ~__context (x /. 2.))
-                  ~__context ~self:vm domid Domain.Reboot);
-    end else debug "%s phase 0/3: no shutdown request required since this is a 
hard_reboot" api_call_name
-      
-  let in_dom0 { TwoPhase.__context = __context; vm=vm; token=token; 
api_call_name=api_call_name; clean=clean } =
+  (** Run without the per-VM lock to request the guest shuts itself down (if 
clean) *)
+  let in_guest { TwoPhase.__context = __context; vm=vm; 
api_call_name=api_call_name; clean=clean } =
+    let domid = Helpers.domid_of_vm ~__context ~self:vm in
+       TwoPhase.simulate_internal_shutdown domid;
+
+       (* NB a parallel internal halt may leave the domid as -1. If so then 
there's no work for us 
+          to do here. *)
+       if domid <> -1 then begin
+      if clean then begin
+               debug "%s phase 0/3: shutting down existing domain (domid: %d)" 
api_call_name domid;
+               match with_xal (fun xal -> Vmops.clean_shutdown_with_reason ~xal
+                                                       ~at:(fun x -> 
TaskHelper.set_progress ~__context (x /. 2.))
+                                                       ~__context ~self:vm 
domid Domain.Reboot) with
+               | Xal.Vanished
+               | Xal.Rebooted -> () (* good *)
+               | Xal.Suspended ->
+                         error "VM: %s suspended when asked to reboot" 
(Ref.string_of vm)
+               | Xal.Crashed ->
+                         error "VM: %s crashed when asked to reboot" 
(Ref.string_of vm)
+               | Xal.Halted ->
+                         error "VM: %s halted when asked to reboot" 
(Ref.string_of vm)
+      end else begin
+               debug "%s phase 0/3: no shutdown request required since this is 
a hard_reboot" api_call_name;
+               (* The domain might be killed by the event thread. Again, this 
is ok. *)
+               Helpers.log_exn_continue (Printf.sprintf "Xc.domain_shutdown 
domid=%d Xc.Reboot" domid)
+                       (fun () -> 
+                                with_xc (fun xc -> Xc.domain_shutdown xc domid 
Xc.Reboot)
+                       ) ()
+         end
+       end
+
+  (** Once the domain has shutdown and the VM is locked, perform the reboot 
immediately *)
+  let in_dom0_already_locked { TwoPhase.__context = __context; vm=vm; 
api_call_name=api_call_name; clean=clean } =
     License_check.vm ~__context vm;
     Stats.time_this "VM reboot (excluding clean shutdown phase)"
       (fun () ->
          let new_snapshot = Db.VM.get_record ~__context ~self:vm in
-        let current_snapshot = Helpers.get_boot_record ~__context ~self:vm in
+
+                let current_snapshot = Helpers.get_boot_record ~__context 
~self:vm in
         (* Master will have already checked the new memory_max and placed the 
max of
            the current and new values in the current_snapshot.
            Just in case someone raced with us and bumped the static_max 
*again* we
@@ -343,8 +396,7 @@
         let new_snapshot = { new_snapshot with API.vM_memory_static_max = 
new_mem } in
         
         let localhost = Helpers.get_localhost ~__context in
-        
-         let domid = Helpers.domid_of_vm ~__context ~self:vm in
+        let domid = Helpers.domid_of_vm ~__context ~self:vm in
          debug "%s phase 1/3: destroying old domain" api_call_name;
         (* CA-13585: prevent glitch where power-state goes to Halted in the 
middle of a reboot.
            If an error causes us to leave this function then the event thread 
should resynchronise
@@ -366,7 +418,6 @@
         Helpers.set_boot_record ~__context ~self:vm new_snapshot;
         
          debug "%s phase 2/3: starting new domain" api_call_name;
-        Opt.iter (Locking_helpers.assert_locked vm) token;
         begin
           try
              Vmops.start_paused
@@ -391,12 +442,28 @@
                          );
           Db.VM.set_resident_on ~__context ~self:vm ~value:localhost;
            Db.VM.set_power_state ~__context ~self:vm ~value:`Running;
-          Opt.iter (Locking_helpers.assert_locked vm) token;
         with exn ->
           error "Caught exception during %s: %s" api_call_name 
(ExnHelper.string_of_exn exn);
           with_xc_and_xs (fun xc xs -> Vmops.destroy ~__context ~xc ~xs 
~self:vm domid `Halted);
           raise exn     
       )
+
+  (** In the synchronous API call paths, acquire the VM lock and see if the VM 
hasn't rebooted yet.
+         If necessary we reboot it here. *)
+  let in_dom0_already_queued args = 
+       Locking_helpers.with_lock args.TwoPhase.vm 
+               (fun _ _ -> 
+                        if TwoPhase.is_vm_running args
+                        then debug "VM %s has already rebooted: taking no 
action" (Ref.string_of args.TwoPhase.vm)
+                        else in_dom0_already_locked args) ()
+
+  (** In the synchronouse API call paths, wait in the 
domU_internal_shutdown_queue and then attempt 
+         to reboot the VM. NB this is the same queue used by the event thread. 
*)
+  let in_dom0 args =
+    Local_work_queue.wait_in_line Local_work_queue.domU_internal_shutdown_queue
+      (Context.string_of_task args.TwoPhase.__context)
+      (fun () -> in_dom0_already_queued args)
+
   let actions = { TwoPhase.in_guest = in_guest; in_dom0 = in_dom0 }
 end
 
@@ -404,32 +471,64 @@
   (** This module contains the low-level implementation actions, as distinct 
from the tangle
       of policy which comes later. *)
 
-  let in_guest { TwoPhase.__context=__context; vm=vm; token=token; 
api_call_name=api_call_name; clean=clean } =
-    Opt.iter (Locking_helpers.assert_locked vm) token;
+  (** Run without the per-VM lock to request the guest shuts itself down (if 
clean) *)
+  let in_guest { TwoPhase.__context=__context; vm=vm; 
api_call_name=api_call_name; clean=clean } =
     assert_ha_always_run_is_false ~__context ~vm;
+    let domid = Helpers.domid_of_vm ~__context ~self:vm in
+       TwoPhase.simulate_internal_shutdown domid;
 
-    if clean then begin
-      debug "%s: phase 1/2: waiting for the domain to shutdown" api_call_name;
-      let domid = Helpers.domid_of_vm ~__context ~self:vm in
-      
-      with_xal (fun xal -> Vmops.clean_shutdown_with_reason ~xal
-                 ~at:(TaskHelper.set_progress ~__context)
-                 ~__context ~self:vm domid Domain.Halt);
-    end else debug "%s phase 0/3: no shutdown request required since this is a 
hard_shutdown" api_call_name
+       (* NB a parallel internal halt may leave the domid as -1. If so then 
there's no work for us 
+          to do here. *)
+       if domid <> -1 then begin
+      if clean then begin
+               debug "%s: phase 1/2: waiting for the domain to shutdown" 
api_call_name;
+               
+               match with_xal (fun xal -> Vmops.clean_shutdown_with_reason ~xal
+                                                       
~at:(TaskHelper.set_progress ~__context)
+                                                       ~__context ~self:vm 
domid Domain.Halt) with
+               | Xal.Vanished
+               | Xal.Halted -> () (* good *)
+               | Xal.Suspended ->
+                         (* Log the failure but continue *)
+                         error "VM: %s suspended when asked to shutdown" 
(Ref.string_of vm)
+               | Xal.Crashed ->
+                         (* Log the failure but continue *)
+                       error "VM: %s crashed when asked to shutdown" 
(Ref.string_of vm)
+               | Xal.Rebooted ->
+                         (* Log the failure but continue *)
+                         error "VM: %s attempted to reboot when asked to 
shutdown" (Ref.string_of vm)
+      end else begin
+               debug "%s phase 0/3: no shutdown request required since this is 
a hard_shutdown" api_call_name;
+               (* The domain might be killed by the event thread. Again, this 
is ok. *)
+               Helpers.log_exn_continue (Printf.sprintf "Xc.domain_shutdown 
domid=%d Xc.Halt" domid)
+                       (fun () -> 
+                                with_xc (fun xc -> Xc.domain_shutdown xc domid 
Xc.Halt)
+                       ) ()
+         end
+       end
 
-  let in_dom0 { TwoPhase.__context=__context; vm=vm; token=token; 
api_call_name=api_call_name; clean=clean } =
-    (* Invoke pre_destroy hook *)
-    Xapi_hooks.vm_pre_destroy ~__context ~reason:(if clean then 
Xapi_hooks.reason__clean_shutdown else Xapi_hooks.reason__hard_shutdown) ~vm;
-
+  (** Run with the per-VM lock held to clean up any shutdown domain. Note if 
the VM has been rebooted
+         then we abort with OTHER_OPERATION_IN_PROGRESS. See 
[retry_on_conflict] *)
+  let in_dom0_already_locked { TwoPhase.__context=__context; vm=vm; 
api_call_name=api_call_name; clean=clean } =
+       (* If the VM has been shutdown by the event thread (domid = -1) then 
there's no domain to destroy. *)
+       (* If the VM is running again then throw an error to trigger 
retry_on_conflict *)
     let domid = Helpers.domid_of_vm ~__context ~self:vm in
-    if domid <> -1 then begin
-      debug "%s: phase 2/2: destroying old domain (domid %d)" api_call_name 
domid;
-      with_xc_and_xs (fun xc xs ->
-                     Vmops.destroy ~__context ~xc ~xs ~self:vm domid `Halted;
-                     (* Force an update of the stats - this will cause the 
rrds to be synced back to the master *)
-                     Monitor.do_monitor __context xc
-                  );
-    end;
+       if domid <> -1 then begin
+         with_xc_and_xs 
+                 (fun xc xs ->
+                          let di = Xc.domain_getinfo xc domid in
+                          (* If someone rebooted it while we dropped the lock: 
*)
+                          if Xal.is_running di
+                          then raise 
(Api_errors.Server_error(Api_errors.other_operation_in_progress, [ "VM"; 
Ref.string_of vm ]));
+                          
+                          (* Invoke pre_destroy hook *)
+                          Xapi_hooks.vm_pre_destroy ~__context ~reason:(if 
clean then Xapi_hooks.reason__clean_shutdown else 
Xapi_hooks.reason__hard_shutdown) ~vm;
+                          debug "%s: phase 2/2: destroying old domain (domid 
%d)" api_call_name domid;
+                      Vmops.destroy ~__context ~xc ~xs ~self:vm domid `Halted;
+                      (* Force an update of the stats - this will cause the 
rrds to be synced back to the master *)
+                      Monitor.do_monitor __context xc
+                 )
+       end;
 
     if Db.VM.get_power_state ~__context ~self:vm = `Suspended then begin
       debug "hard_shutdown: destroying any suspend VDI";
@@ -445,6 +544,22 @@
        Xapi_vm_lifecycle.force_state_reset ~__context ~self:vm ~value:`Halted
     end
 
+  (** In the synchronous API call paths, acquire the lock, check if the VM's 
domain has shutdown (if not error out)
+         and continue with the shutdown *)
+  let in_dom0_already_queued args = 
+       Locking_helpers.with_lock args.TwoPhase.vm 
+               (fun _ _ -> 
+                        if TwoPhase.is_vm_running args
+                        then raise 
(Api_errors.Server_error(Api_errors.other_operation_in_progress, [ "VM"; 
Ref.string_of args.TwoPhase.vm ]))
+                        else in_dom0_already_locked args) ()
+
+  (** In the synchronouse API call paths, wait in the 
domU_internal_shutdown_queue and then attempt 
+         to reboot the VM. NB this is the same queue used by the event thread. 
*)
+  let in_dom0 args =
+    Local_work_queue.wait_in_line Local_work_queue.domU_internal_shutdown_queue
+      (Context.string_of_task args.TwoPhase.__context)
+      (fun () -> in_dom0_already_queued args)
+
   let actions = { TwoPhase.in_guest = in_guest; in_dom0 = in_dom0 }
 end
 
@@ -453,13 +568,24 @@
   | `restart -> Reboot.actions
   | `destroy -> Shutdown.actions
 
-(** Add queueing and locking policy for the external API calls *)
-let impose_external_api_policy (x: TwoPhase.t) : TwoPhase.t = 
-  let f args = 
-    Local_work_queue.wait_in_line Local_work_queue.normal_vm_queue
-      (Context.string_of_task args.TwoPhase.__context)
-      (fun () -> x.TwoPhase.in_dom0 args) in
-  { x with TwoPhase.in_dom0 = f }
+(** If our operation conflicts with another parallel operation (i.e. we ask 
for shutdown
+       but guest admin asks for reboot) then we raise an 
OTHER_OPERATION_IN_PROGRESS exception 
+       and retry the whole procedure. *)
+let retry_on_conflict (x: TwoPhase.args) (y: TwoPhase.t) =
+  let rec retry n = 
+       try 
+         y.TwoPhase.in_guest x;
+         if Xapi_fist.disable_sync_lifecycle_path ()
+         then warn "FIST: disable_sync_lifecycle_path: deferring to the event 
thread"
+         else y.TwoPhase.in_dom0 x
+       with 
+       | Api_errors.Server_error(code, _) as e when code = 
Api_errors.other_operation_in_progress ->
+                 let aborting = n < 1 in
+                 debug "Conflict when executing %s: %s" 
x.TwoPhase.api_call_name (if aborting then "aborting" else "retrying");
+                 if aborting then raise e;
+                 retry (n - 1) in
+  retry 10
+  
 
 (** CA-11132: Record information about the shutdown in odd other-config keys 
for Egenera *)
 let record_shutdown_details ~__context ~vm reason initiator action = 
@@ -480,51 +606,36 @@
   
 (** VM.hard_reboot entrypoint *)
 let hard_reboot ~__context ~vm =
-  Locking_helpers.with_lock vm
-    (fun token () ->
-       let action = Db.VM.get_actions_after_reboot ~__context ~self:vm in
-       record_shutdown_details ~__context ~vm Xal.Rebooted "external" action;
-       let args = { TwoPhase.__context=__context; vm=vm; token=Some token; 
api_call_name="VM.hard_reboot"; clean=false } in
-       TwoPhase.execute args (impose_external_api_policy (of_action action))
-       ) ()
+  let action = Db.VM.get_actions_after_reboot ~__context ~self:vm in
+  record_shutdown_details ~__context ~vm Xal.Rebooted "external" action;
+  let args = { TwoPhase.__context=__context; vm=vm; 
api_call_name="VM.hard_reboot"; clean=false } in
+  retry_on_conflict args (of_action action)
 
 (** VM.hard_shutdown entrypoint *)
 let hard_shutdown ~__context ~vm =
-  Locking_helpers.with_lock vm
-    (fun token () ->
-       let action = Db.VM.get_actions_after_shutdown ~__context ~self:vm in
-       record_shutdown_details ~__context ~vm Xal.Halted "external" action;
-       let args = { TwoPhase.__context=__context; vm=vm; token=Some token; 
api_call_name="VM.hard_shutdown"; clean=false } in
-       TwoPhase.execute args (impose_external_api_policy (of_action action))
-    ) ()
+  let action = Db.VM.get_actions_after_shutdown ~__context ~self:vm in
+  record_shutdown_details ~__context ~vm Xal.Halted "external" action;
+  let args = { TwoPhase.__context=__context; vm=vm; 
api_call_name="VM.hard_shutdown"; clean=false } in
+  retry_on_conflict args (of_action action)
 
 (** VM.clean_reboot entrypoint *)
 let clean_reboot ~__context ~vm =
-  Locking_helpers.with_lock vm
-    (fun token () ->
-       let action = Db.VM.get_actions_after_reboot ~__context ~self:vm in
-       record_shutdown_details ~__context ~vm Xal.Rebooted "external" action;
-       let args = { TwoPhase.__context=__context; vm=vm; token=Some token; 
api_call_name="VM.clean_reboot"; clean=true } in
-       TwoPhase.execute args (impose_external_api_policy (of_action action))
-    ) ()
+  let action = Db.VM.get_actions_after_reboot ~__context ~self:vm in
+  record_shutdown_details ~__context ~vm Xal.Rebooted "external" action;
+  let args = { TwoPhase.__context=__context; vm=vm; 
api_call_name="VM.clean_reboot"; clean=true } in
+  retry_on_conflict args (of_action action)
 
 (** VM.clean_shutdown entrypoint *)
 let clean_shutdown ~__context ~vm =
-  Locking_helpers.with_lock vm
-    (fun token () ->
-       let action = Db.VM.get_actions_after_shutdown ~__context ~self:vm in
-       record_shutdown_details ~__context ~vm Xal.Halted "external" action;
-       let args = { TwoPhase.__context=__context; vm=vm; token=Some token; 
api_call_name="VM.clean_shutdown"; clean=true } in
-       TwoPhase.execute args (impose_external_api_policy (of_action action))
-    ) ()
+  let action = Db.VM.get_actions_after_shutdown ~__context ~self:vm in
+  record_shutdown_details ~__context ~vm Xal.Halted "external" action;
+  let args = { TwoPhase.__context=__context; vm=vm; 
api_call_name="VM.clean_shutdown"; clean=true } in
+  retry_on_conflict args (of_action action)
 
 
(***************************************************************************************)
 
-(** VM.hard_reboot_internal: called via the event thread *)
-let hard_reboot_internal ~__context ~vm = 
-  (* VM is locked by the caller *)
-  let args = { TwoPhase.__context=__context; vm=vm; token=None; 
api_call_name="VM.hard_reboot_internal"; clean=false } in
-  Reboot.in_dom0 args
+(** @deprecated *)
+let hard_reboot_internal ~__context ~vm = assert false
 
 
(***************************************************************************************)
 
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/xapi_vm.mli
--- a/ocaml/xapi/xapi_vm.mli    Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/xapi_vm.mli    Thu Dec 10 23:04:50 2009 +0000
@@ -99,27 +99,26 @@
     type args = {
       __context : Context.t;
       vm : API.ref_VM;
-      token : Locking_helpers.token option;
       api_call_name : string;
       clean : bool;
     }
     type t = { in_guest : args -> unit; in_dom0 : args -> unit; }
-    val execute : args -> t -> unit
   end
 module Reboot :
   sig
     val in_guest : TwoPhase.args -> unit
+       val in_dom0_already_locked : TwoPhase.args -> unit
     val in_dom0 : TwoPhase.args -> unit
     val actions : TwoPhase.t
   end
 module Shutdown :
   sig
     val in_guest : TwoPhase.args -> unit
+       val in_dom0_already_locked : TwoPhase.args -> unit
     val in_dom0 : TwoPhase.args -> unit
     val actions : TwoPhase.t
   end
 val of_action : [< `destroy | `restart ] -> TwoPhase.t
-val impose_external_api_policy : TwoPhase.t -> TwoPhase.t
 val record_shutdown_details :
   __context:Context.t ->
   vm:[ `VM ] Ref.t ->
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/xapi_vm_migrate.ml
--- a/ocaml/xapi/xapi_vm_migrate.ml     Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/xapi_vm_migrate.ml     Thu Dec 10 23:04:50 2009 +0000
@@ -126,9 +126,18 @@
   (* If we got the ack, then proceed to shutdown the domain with the suspend
      reason.  If we failed to get the ack, then raise an exception to abort
      the migration *)
-  if (ack = `ACKED) then 
-    Vmops.clean_shutdown_with_reason ~xal ~__context ~self domid Domain.Suspend
-  else 
+  if (ack = `ACKED) then begin
+    match Vmops.clean_shutdown_with_reason ~xal ~__context ~self domid 
Domain.Suspend with
+       | Xal.Suspended -> () (* good *)
+       | Xal.Crashed ->
+                 raise (Api_errors.Server_error(Api_errors.vm_crashed, [ 
Ref.string_of self ]))
+       | Xal.Rebooted ->
+                 raise (Api_errors.Server_error(Api_errors.vm_rebooted, [ 
Ref.string_of self ]))       
+       | Xal.Vanished
+       | Xal.Halted ->
+                 raise (Api_errors.Server_error(Api_errors.vm_halted, [ 
Ref.string_of self ]))
+       | Xal.Shutdown x -> vm_migrate_failed (Printf.sprintf "Domain shutdown 
for unexpected reason: %d" x)
+  end else 
     vm_migrate_failed "Failed to receive suspend acknowledgement within 
timeout period or an abort was requested."
 
 (* ------------------------------------------------------------------- *)
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xenops/domain.mli
--- a/ocaml/xenops/domain.mli   Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xenops/domain.mli   Thu Dec 10 23:04:50 2009 +0000
@@ -85,7 +85,7 @@
 val shutdown: xs:Xs.xsh -> domid -> shutdown_reason -> unit
 
 (** Tell the domain to shutdown with reason ''shutdown_reason', waiting for an 
ack *)
-val shutdown_ack: ?timeout:float -> xc:Xc.handle -> xs:Xs.xsh -> domid -> 
shutdown_reason -> bool
+val shutdown_wait_for_ack: ?timeout:float -> xc:Xc.handle -> xs:Xs.xsh -> 
domid -> shutdown_reason -> unit
 
 (** send a domain a sysrq *)
 val sysrq: xs:Xs.xsh -> domid -> char -> unit
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xenops/xenops.ml
--- a/ocaml/xenops/xenops.ml    Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xenops/xenops.ml    Thu Dec 10 23:04:50 2009 +0000
@@ -76,7 +76,12 @@
        printf "built hvm domain: %u\n" domid
 
 let clean_shutdown_domain ~xal ~domid ~reason ~sync =
-       let acked = Domain.shutdown_ack (Xal.xc_of_ctx xal) (Xal.xs_of_ctx xal) 
domid reason in
+  let xc = Xal.xc_of_ctx xal in
+  let xs = Xal.xs_of_ctx xal in
+  Domain.shutdown ~xs domid reason;
+  (* Wait for any necessary acknowledgement. If we get a Watch.Timeout _ then
+        we abort early; otherwise we continue in Xal.wait_release below. *)
+  let acked = try Domain.shutdown_wait_for_ack ~xc ~xs domid reason; true with 
Watch.Timeout _ -> false in
        if not acked then (
                eprintf "domain %u didn't acknowledged shutdown\n" domid;
        ) else (
13 files changed, 497 insertions(+), 139 deletions(-)
ocaml/idl/api_errors.ml           |    3 
ocaml/idl/datamodel.ml            |    6 
ocaml/xapi/OMakefile              |    2 
ocaml/xapi/events.ml              |   20 +-
ocaml/xapi/quicktest.ml           |    1 
ocaml/xapi/quicktest_lifecycle.ml |  194 ++++++++++++++++++++++++++
ocaml/xapi/vmops.ml               |   91 ++++++------
ocaml/xapi/xapi_fist.ml           |   17 ++
ocaml/xapi/xapi_vm.ml             |  273 ++++++++++++++++++++++++++-----------
ocaml/xapi/xapi_vm.mli            |    5 
ocaml/xapi/xapi_vm_migrate.ml     |   15 +-
ocaml/xenops/domain.mli           |    2 
ocaml/xenops/xenops.ml            |    7 


Attachment: xen-api.hg-4.patch
Description: Text Data

_______________________________________________
xen-api mailing list
xen-api@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/mailman/listinfo/xen-api

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.