[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH] MCE: Fix race condition in mctelem_reserve



Thanks!

Reviewed-by: Liu Jinsong <jinsong.liu@xxxxxxxxx>


Frediano Ziglio wrote:
> Yes,
>   all these patches came from some customer reports. We managed to
> reproduce the problem with xen-mceinj and observed these types of races.
> These patches (the other one is already applied) fix the race issues
> we found. 
> 
> I tested with a CentOS 6.4 version with mcelog.
> 
> Frediano
> 
> On Tue, 2014-02-18 at 08:42 +0000, Liu, Jinsong wrote:
>> This logic (mctelem) is related to dom0 mcelog logic. Have you
>> tested if mcelog works fine with your patch? 
>> 
>> Thanks,
>> Jinsong
>> 
>> Frediano Ziglio wrote:
>>> These lines (in mctelem_reserve)
>>> 
>>> 
>>>         newhead = oldhead->mcte_next;
>>>         if (cmpxchgptr(freelp, oldhead, newhead) == oldhead) {
>>> 
>>> are racy. After you read the newhead pointer it can happen that
>>> another flow (thread or recursive invocation) changes the entire list
>>> but leaves the head with the same value (the classic ABA problem).
>>> So oldhead is the same as *freelp, but you are setting a new head
>>> that could point to any element (even one already in use). 
>>> 
>>> This patch instead uses a bit array and atomic bit operations.
>>> 
>>> It uses unsigned long rather than the bitmap type, as testing for
>>> all zeroes is easier. 
>>> 
>>> Signed-off-by: Frediano Ziglio <frediano.ziglio@xxxxxxxxxx> ---
>>>  xen/arch/x86/cpu/mcheck/mctelem.c |   52
>>>  ++++++++++++++++++++++--------------- 1 file changed, 31
>>> insertions(+), 21 deletions(-)
>>> 
>>> diff --git a/xen/arch/x86/cpu/mcheck/mctelem.c
>>> b/xen/arch/x86/cpu/mcheck/mctelem.c index 895ce1a..e56b6fb 100644
>>> --- a/xen/arch/x86/cpu/mcheck/mctelem.c
>>> +++ b/xen/arch/x86/cpu/mcheck/mctelem.c
>>> @@ -69,6 +69,11 @@ struct mctelem_ent {
>>>  #define    MC_URGENT_NENT          10
>>>  #define    MC_NONURGENT_NENT       20
>>> 
>>> +/* Check if we can fit enough bits in the free bit array */
>>> +#if MC_URGENT_NENT + MC_NONURGENT_NENT > BITS_PER_LONG +#error Too
>>> much elements +#endif
>>> +
>>>  #define    MC_NCLASSES             (MC_NONURGENT + 1)
>>> 
>>>  #define    COOKIE2MCTE(c)          ((struct mctelem_ent *)(c))
>>> @@ -77,11 +82,9 @@ struct mctelem_ent {
>>>  static struct mc_telem_ctl {
>>>     /* Linked lists that thread the array members together.          *
>>> -    * The free lists are singly-linked via mcte_next, and we allocate
>>> -    * from them by atomically unlinking an element from the head.
>>> -    * Consumed entries are returned to the head of the free list.
>>> -    * When an entry is reserved off the free list it is not linked
>>> -    * on any list until it is committed or dismissed.
>>> +    * The free lists is a bit array where bit 1 means free.
>>> +    * This as element number is quite small and is easy to
>>> +    * atomically allocate that way.
>>>      *
>>>      * The committed list grows at the head and we do not maintain a
>>>      * tail pointer; insertions are performed atomically.  The head
>>> @@ -101,7 +104,7 @@ static struct mc_telem_ctl {
>>>      * we can lock it for updates.  The head of the processing list
>>>      * always has the oldest telemetry, and we append (as above)
>>>      * at the tail of the processing list. */
>>> -   struct mctelem_ent *mctc_free[MC_NCLASSES];
>>> +   unsigned long mctc_free[MC_NCLASSES];
>>>     struct mctelem_ent *mctc_committed[MC_NCLASSES];
>>>     struct mctelem_ent *mctc_processing_head[MC_NCLASSES];
>>>     struct mctelem_ent *mctc_processing_tail[MC_NCLASSES];
>>> @@ -214,7 +217,10 @@ static void mctelem_free(struct mctelem_ent
>>>     *tep) BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE);
>>> 
>>>     tep->mcte_prev = NULL;
>>> -   mctelem_xchg_head(&mctctl.mctc_free[target], &tep->mcte_next,
>>> tep); +     tep->mcte_next = NULL; +
>>> +   /* set free in array */
>>> +   set_bit(tep - mctctl.mctc_elems, &mctctl.mctc_free[target]);  }
>>> 
>>>  /* Increment the reference count of an entry that is not linked on
>>> to @@ -284,7 +290,7 @@ void mctelem_init(int reqdatasz)     }
>>> 
>>>     for (i = 0; i < MC_URGENT_NENT + MC_NONURGENT_NENT; i++) {
>>> -           struct mctelem_ent *tep, **tepp;
>>> +           struct mctelem_ent *tep;
>>> 
>>>             tep = mctctl.mctc_elems + i;
>>>             tep->mcte_flags = MCTE_F_STATE_FREE;
>>> @@ -292,16 +298,15 @@ void mctelem_init(int reqdatasz)
>>>             tep->mcte_data = datarr + i * datasz;
>>> 
>>>             if (i < MC_URGENT_NENT) {
>>> -                   tepp = &mctctl.mctc_free[MC_URGENT];
>>> -                   tep->mcte_flags |= MCTE_F_HOME_URGENT;
>>> +                   __set_bit(i, &mctctl.mctc_free[MC_URGENT]);
>>> +                   tep->mcte_flags = MCTE_F_HOME_URGENT;
>>>             } else {
>>> -                   tepp = &mctctl.mctc_free[MC_NONURGENT];
>>> -                   tep->mcte_flags |= MCTE_F_HOME_NONURGENT;
>>> +                   __set_bit(i, &mctctl.mctc_free[MC_NONURGENT]);
>>> +                   tep->mcte_flags = MCTE_F_HOME_NONURGENT;
>>>             }
>>> 
>>> -           tep->mcte_next = *tepp;
>>> +           tep->mcte_next = NULL;
>>>             tep->mcte_prev = NULL;
>>> -           *tepp = tep;
>>>     }
>>>  }
>>> 
>>> @@ -310,18 +315,21 @@ static int mctelem_drop_count;
>>> 
>>>  /* Reserve a telemetry entry, or return NULL if none available.
>>>   * If we return an entry then the caller must subsequently call
>>> exactly one of - * mctelem_unreserve or mctelem_commit for that
>>> entry. + * mctelem_dismiss or mctelem_commit for that entry.   */
>>>  mctelem_cookie_t mctelem_reserve(mctelem_class_t which)  {
>>> -   struct mctelem_ent **freelp;
>>> -   struct mctelem_ent *oldhead, *newhead;
>>> +   unsigned long *freelp;
>>> +   unsigned long oldfree;
>>> +   unsigned bit;
>>>     mctelem_class_t target = (which == MC_URGENT) ?
>>>         MC_URGENT : MC_NONURGENT;
>>> 
>>>     freelp = &mctctl.mctc_free[target];
>>>     for (;;) {
>>> -           if ((oldhead = *freelp) == NULL) {
>>> +           oldfree = *freelp;
>>> +
>>> +           if (oldfree == 0) {
>>>                     if (which == MC_URGENT && target == MC_URGENT) {
>>>                             /* raid the non-urgent freelist */
>>>                             target = MC_NONURGENT;
>>> @@ -333,9 +341,11 @@ mctelem_cookie_t
>>>             mctelem_reserve(mctelem_class_t                         which) 
>>> } }
>>> 
>>> -           newhead = oldhead->mcte_next;
>>> -           if (cmpxchgptr(freelp, oldhead, newhead) == oldhead) {
>>> -                   struct mctelem_ent *tep = oldhead;
>>> +           /* try to allocate, atomically clear free bit */
>>> +           bit = find_first_set_bit(oldfree);
>>> +           if (test_and_clear_bit(bit, freelp)) {
>>> +                   /* return element we got */
>>> +                   struct mctelem_ent *tep = mctctl.mctc_elems + bit;
>>> 
>>>                     mctelem_hold(tep);
>>>                     MCTE_TRANSITION_STATE(tep, FREE, UNCOMMITTED);


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.