From 8dfa47da8cb33ebaf7aae6db6548e75ed86e8f1e Mon Sep 17 00:00:00 2001 From: Andy McFadden Date: Thu, 27 May 2010 10:10:18 -0700 Subject: [PATCH] Atomic/SMP update, part 2. Added new atomic functions, renamed some old ones. Some #defines have been added for backward compatibility. Merged the pre- and post-ARMv6 implementations into a single file. Renamed the semi-private __android_membar_full_smp to USE_SCREAMING_CAPS since that's more appropriate for a macro. Added lots of comments. Note Mac OS X primitives have not been tested. Change-Id: If827260750aeb61ad5c2b760e30658e29dbb26f2 --- include/cutils/atomic-inline.h | 18 +- include/cutils/atomic.h | 107 ++++++-- libcutils/atomic-android-arm.S | 448 +++++++++++++++++++++++++------ libcutils/atomic-android-armv6.S | 174 ------------ libcutils/atomic-android-sh.c | 52 +++- libcutils/atomic.c | 142 ++++++---- 6 files changed, 576 insertions(+), 365 deletions(-) delete mode 100644 libcutils/atomic-android-armv6.S diff --git a/include/cutils/atomic-inline.h b/include/cutils/atomic-inline.h index 4f5ddf761..1c23be9f1 100644 --- a/include/cutils/atomic-inline.h +++ b/include/cutils/atomic-inline.h @@ -27,6 +27,12 @@ * * Anything that does include this file must set ANDROID_SMP to either * 0 or 1, indicating compilation for UP or SMP, respectively. + * + * Macros defined in this header: + * + * void ANDROID_MEMBAR_FULL(void) + * Full memory barrier. Provides a compiler reordering barrier, and + * on SMP systems emits an appropriate instruction. */ #if !defined(ANDROID_SMP) @@ -55,17 +61,17 @@ extern "C" { * This will fail on plain 16-bit Thumb. */ #if defined(__ARM_HAVE_DMB) -# define __android_membar_full_smp() \ +# define _ANDROID_MEMBAR_FULL_SMP() \ do { __asm__ __volatile__ ("dmb" ::: "memory"); } while (0) #else -# define __android_membar_full_smp() ARM_SMP_defined_but_no_DMB() +# define _ANDROID_MEMBAR_FULL_SMP() ARM_SMP_defined_but_no_DMB() #endif #elif defined(__i386__) || defined(__x86_64__) /* * For recent x86, we can use the SSE2 mfence instruction. */ -# define __android_membar_full_smp() \ +# define _ANDROID_MEMBAR_FULL_SMP() \ do { __asm__ __volatile__ ("mfence" ::: "memory"); } while (0) #else @@ -73,7 +79,7 @@ extern "C" { * Implementation not defined for this platform. Hopefully we're building * in uniprocessor mode. */ -# define __android_membar_full_smp() SMP_barrier_not_defined_for_platform() +# define _ANDROID_MEMBAR_FULL_SMP() SMP_barrier_not_defined_for_platform() #endif @@ -88,9 +94,9 @@ extern "C" { * be stale. Other CPUs may do less, but the end result is equivalent. */ #if ANDROID_SMP != 0 -# define android_membar_full() __android_membar_full_smp() +# define ANDROID_MEMBAR_FULL() _ANDROID_MEMBAR_FULL_SMP() #else -# define android_membar_full() \ +# define ANDROID_MEMBAR_FULL() \ do { __asm__ __volatile__ ("" ::: "memory"); } while (0) #endif diff --git a/include/cutils/atomic.h b/include/cutils/atomic.h index 8e12902b0..0200709e1 100644 --- a/include/cutils/atomic.h +++ b/include/cutils/atomic.h @@ -25,51 +25,102 @@ extern "C" { #endif /* - * Unless otherwise noted, the operations below perform a full fence before - * the atomic operation on SMP systems ("release" semantics). + * A handful of basic atomic operations. The appropriate pthread + * functions should be used instead of these whenever possible. 
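[Editor's aside: to illustrate how the new ANDROID_MEMBAR_FULL() macro is meant
to be used. This is an illustrative sketch only, not part of the patch; the
publish/consume functions and globals are hypothetical, and the build is
assumed to define ANDROID_SMP as 0 or 1 before the include, as the header
requires.]

    #include <cutils/atomic-inline.h>

    static int32_t g_payload;             /* data handed from writer to reader */
    static volatile int32_t g_ready = 0;  /* publication flag */

    void publish(int32_t v) {
        g_payload = v;
        ANDROID_MEMBAR_FULL();   /* keep the payload write from sinking below
                                    the flag write (compiler and CPU) */
        g_ready = 1;
    }

    int32_t consume(void) {
        while (g_ready == 0) { } /* spin until the flag is set */
        ANDROID_MEMBAR_FULL();   /* keep the payload read from hoisting above
                                    the flag read */
        return g_payload;
    }

[On a uniprocessor build the macro degrades to a pure compiler barrier, which
is exactly what this pattern still needs.]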
+ * + * The "acquire" and "release" terms can be defined intuitively in terms + * of the placement of memory barriers in a simple lock implementation: + * - wait until compare-and-swap(lock-is-free --> lock-is-held) succeeds + * - barrier + * - [do work] + * - barrier + * - store(lock-is-free) + * In very crude terms, the initial (acquire) barrier prevents any of the + * "work" from happening before the lock is held, and the later (release) + * barrier ensures that all of the work happens before the lock is released. + * (Think of cached writes, cache read-ahead, and instruction reordering + * around the CAS and store instructions.) + * + * The barriers must apply to both the compiler and the CPU. Note it is + * legal for instructions that occur before an "acquire" barrier to be + * moved down below it, and for instructions that occur after a "release" + * barrier to be moved up above it. + * + * The ARM-driven implementation we use here is short on subtlety, + * and actually requests a full barrier from the compiler and the CPU. + * The only difference between acquire and release is in whether they + * are issued before or after the atomic operation with which they + * are associated. To ease the transition to C/C++ atomic intrinsics, + * you should not rely on this, and instead assume that only the minimal + * acquire/release protection is provided. + * + * NOTE: all int32_t* values are expected to be aligned on 32-bit boundaries. + * If they are not, atomicity is not guaranteed. */ -void android_atomic_write(int32_t value, volatile int32_t* addr); - /* - * all these atomic operations return the previous value + * Basic arithmetic and bitwise operations. These all provide a + * barrier with "release" ordering, and return the previous value. + * + * These have the same characteristics (e.g. what happens on overflow) + * as the equivalent non-atomic C operations. */ - int32_t android_atomic_inc(volatile int32_t* addr); int32_t android_atomic_dec(volatile int32_t* addr); - int32_t android_atomic_add(int32_t value, volatile int32_t* addr); int32_t android_atomic_and(int32_t value, volatile int32_t* addr); int32_t android_atomic_or(int32_t value, volatile int32_t* addr); -int32_t android_atomic_swap(int32_t value, volatile int32_t* addr); - /* - * cmpxchg returns zero if the new value was successfully written. This - * will only happen when *addr == oldvalue. + * Perform an atomic load with "acquire" or "release" ordering. * - * (The return value is inverted from implementations on other platforms, but - * matches the ARM ldrex/strex sematics. Note also this is a compare-and-set - * operation, not a compare-and-exchange operation, since we don't return - * the original value.) + * This is only necessary if you need the memory barrier. A 32-bit read + * from a 32-bit aligned address is atomic on all supported platforms. */ -int android_atomic_cmpxchg(int32_t oldvalue, int32_t newvalue, +int32_t android_atomic_acquire_load(volatile int32_t* addr); +int32_t android_atomic_release_load(volatile int32_t* addr); + +/* + * Perform an atomic store with "acquire" or "release" ordering. + * + * This is only necessary if you need the memory barrier. A 32-bit write + * to a 32-bit aligned address is atomic on all supported platforms. + */ +void android_atomic_acquire_store(int32_t value, volatile int32_t* addr); +void android_atomic_release_store(int32_t value, volatile int32_t* addr); + +/* + * Unconditional swap operation with "acquire" or "release" ordering. 
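[Editor's aside: the lock recipe described in the comment above maps directly
onto the new functions. A minimal spin lock sketch, illustrative only --
spin_lock/spin_unlock are hypothetical names, and 0 = free / 1 = held is an
assumed convention:]

    #include <cutils/atomic.h>

    void spin_lock(volatile int32_t* lock) {
        /* the acquire CAS returns 0 on success; its barrier keeps the
           protected "work" from being observed before the lock is held */
        while (android_atomic_acquire_cas(0 /*free*/, 1 /*held*/, lock) != 0) {
            /* spin until we transition lock-is-free -> lock-is-held */
        }
    }

    void spin_unlock(volatile int32_t* lock) {
        /* the release store's barrier makes all of the work above visible
           before the lock is seen as free again */
        android_atomic_release_store(0 /*free*/, lock);
    }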
+ *
+ * Stores the new value at *addr, and returns the previous value.
+ */
+int32_t android_atomic_acquire_swap(int32_t value, volatile int32_t* addr);
+int32_t android_atomic_release_swap(int32_t value, volatile int32_t* addr);
+
+/*
+ * Compare-and-set operation with "acquire" or "release" ordering.
+ *
+ * This returns zero if the new value was successfully stored, which will
+ * only happen when *addr == oldvalue.
+ *
+ * (The return value is inverted from implementations on other platforms,
+ * but matches the ARM ldrex/strex result.)
+ *
+ * Implementations that use the release CAS in a loop may be less efficient
+ * than possible, because we re-issue the memory barrier on each iteration.
+ */
+int android_atomic_acquire_cas(int32_t oldvalue, int32_t newvalue,
+        volatile int32_t* addr);
+int android_atomic_release_cas(int32_t oldvalue, int32_t newvalue,
+        volatile int32_t* addr);
 
 /*
- * Same basic operation as android_atomic_cmpxchg, but with "acquire"
- * semantics. The memory barrier, if required, is performed after the
- * new value is stored. Useful for acquiring a spin lock.
+ * Aliases for code using an older version of this header. These are now
+ * deprecated and should not be used. The definitions will be removed
+ * in a future release.
  */
-int android_atomic_acquire_cmpxchg(int32_t oldvalue, int32_t newvalue,
-        volatile int32_t* addr);
-
-/*
- * Perform an atomic store with "release" semantics. The memory barrier,
- * if required, is performed before the store instruction. Useful for
- * releasing a spin lock.
- */
-#define android_atomic_release_store android_atomic_write
+#define android_atomic_write android_atomic_release_store
+#define android_atomic_cmpxchg android_atomic_release_cas
 
 #ifdef __cplusplus
 } // extern "C"
 
diff --git a/libcutils/atomic-android-arm.S b/libcutils/atomic-android-arm.S
index f918990c8..d8ee15cc6 100644
--- a/libcutils/atomic-android-arm.S
+++ b/libcutils/atomic-android-arm.S
@@ -14,68 +14,353 @@
  * limitations under the License.
  */
 
-/* TODO: insert memory barriers on SMP */
-
 #include <machine/cpu-features.h>
 
+    .text
+    .align
+
+    .global android_atomic_acquire_load
+    .type android_atomic_acquire_load, %function
+    .global android_atomic_release_load
+    .type android_atomic_release_load, %function
+
+    .global android_atomic_acquire_store
+    .type android_atomic_acquire_store, %function
+    .global android_atomic_release_store
+    .type android_atomic_release_store, %function
+
+    .global android_atomic_inc
+    .type android_atomic_inc, %function
+    .global android_atomic_dec
+    .type android_atomic_dec, %function
+
+    .global android_atomic_add
+    .type android_atomic_add, %function
+    .global android_atomic_and
+    .type android_atomic_and, %function
+    .global android_atomic_or
+    .type android_atomic_or, %function
+
+    .global android_atomic_release_swap
+    .type android_atomic_release_swap, %function
+    .global android_atomic_acquire_swap
+    .type android_atomic_acquire_swap, %function
+
+    .global android_atomic_release_cas
+    .type android_atomic_release_cas, %function
+    .global android_atomic_acquire_cas
+    .type android_atomic_acquire_cas, %function
+
+/* must be on or off; cannot be left undefined */
+#if !defined(ANDROID_SMP)
+# error "ANDROID_SMP not defined"
+#endif
+
+
+#if defined(__ARM_HAVE_LDREX_STREX)
 /*
- * NOTE: these atomic operations are SMP safe on all architectures.
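[Editor's aside: since the compatibility #defines above simply alias the old
names onto the release flavors, new code should spell out the variant it
wants. A sketch of the usual compare-and-set retry idiom, leaning on the
zero-on-success return convention -- atomic_max is a hypothetical helper,
not part of this patch:]

    #include <cutils/atomic.h>

    /* Atomically raise *addr to at least `value`; returns the previous value. */
    static int32_t atomic_max(int32_t value, volatile int32_t* addr) {
        int32_t old;
        do {
            old = *addr;
            if (old >= value)
                break;              /* already big enough; nothing to store */
        } while (android_atomic_release_cas(old, value, addr) != 0);
        return old;
    }

[As the header notes, a loop like this re-issues the release barrier on every
retry: correct, but potentially slower than a hand-written assembly loop.]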
+ * ===========================================================================
+ * ARMv6+ implementation
+ * ===========================================================================
+ *
+ * These functions use the LDREX/STREX instructions to perform atomic
+ * operations ("LL/SC" approach). On an SMP build they will include
+ * an appropriate memory barrier.
  */
 
-    .text
-    .align
-
-    .global android_atomic_write
-    .type android_atomic_write, %function
-
-    .global android_atomic_inc
-    .type android_atomic_inc, %function
-    .global android_atomic_dec
-    .type android_atomic_dec, %function
-
-    .global android_atomic_add
-    .type android_atomic_add, %function
-    .global android_atomic_and
-    .type android_atomic_and, %function
-    .global android_atomic_or
-    .type android_atomic_or, %function
-
-    .global android_atomic_swap
-    .type android_atomic_swap, %function
-
-    .global android_atomic_cmpxchg
-    .type android_atomic_cmpxchg, %function
-    .global android_atomic_acquire_cmpxchg
-    .type android_atomic_acquire_cmpxchg, %function
+/* generate the memory barrier instruction when the build requires it */
+#if ANDROID_SMP == 1
+# if defined(__ARM_HAVE_DMB)
+#  define SMP_DMB dmb
+# else
+   /* Data Memory Barrier operation, initiated by writing a value into a
+      specific register with the Move to Coprocessor instruction. We
+      arbitrarily use r0 here. */
+#  define SMP_DMB mcr p15, 0, r0, c7, c10, 5
+# endif
+#else
+# define SMP_DMB
+#endif
 
 /*
- * ----------------------------------------------------------------------------
- * int __kernel_cmpxchg(int oldval, int newval, int *ptr)
- * clobbered: r3, ip, flags
- * return 0 if a swap was made, non-zero otherwise.
- */
-
-    .equ    kernel_cmpxchg, 0xFFFF0FC0
-    .equ    kernel_atomic_base, 0xFFFF0FFF
+ * Sidebar: do we need to use the -EX instructions for atomic load/store?
+ *
+ * Consider the following situation (time advancing downward):
+ *
+ *    P1                  P2
+ *                        val = LDREX(mem)
+ *                        val = val + 1
+ *    STR(mem, otherval)
+ *                        STREX(mem, val)
+ *
+ * If these instructions issue on separate cores, the STREX will correctly
+ * fail because of the intervening store from the other core. If this same
+ * sequence of instructions executes in two threads on the same core, the
+ * STREX will incorrectly succeed.
+ *
+ * There are two ways to fix this:
+ * (1) Use LDREX/STREX for the atomic store operations. This doesn't
+ *     prevent the program from doing a non-exclusive store, but at least
+ *     this way if they always use atomic ops to access the memory location
+ *     there won't be any problems.
+ * (2) Have the kernel clear the LDREX reservation on thread context switch.
+ *     This will sometimes clear the reservation unnecessarily, but guarantees
+ *     correct behavior.
+ *
+ * The Android kernel performs a CLREX (v7) or dummy STREX (pre-v7), so we
+ * can get away with a non-exclusive store here.
+ *
+ * -----
+ *
+ * It's worth noting that using non-exclusive LDR and STR means the "load"
+ * and "store" operations aren't quite the same as read-modify-write or
+ * swap operations. By definition those must read and write memory in a
+ * way that is coherent across all cores, whereas our non-exclusive load
+ * and store have no such requirement.
+ *
+ * In practice this doesn't matter, because the only guarantees we make
+ * about who sees what when are tied to the acquire/release semantics.
+ * Other cores may not see our atomic releasing store as soon as they would
+ * if the code used LDREX/STREX, but a store-release operation doesn't make
+ * any guarantees as to how soon the store will be visible. It's allowable
+ * for operations that happen later in program order to become visible
+ * before the store. For an acquiring store we issue a full barrier after
+ * the store, ensuring that other processors see events in the proper order.
+ */
 
 /*
- * ----------------------------------------------------------------------------
- * android_atomic_write
- * input: r0=value, r1=address
+ * android_atomic_acquire_load / android_atomic_release_load
+ * input: r0 = address
+ * output: r0 = value
+ */
+android_atomic_acquire_load:
+    .fnstart
+    ldr     r0, [r0]
+    SMP_DMB
+    bx      lr
+    .fnend
+
+android_atomic_release_load:
+    .fnstart
+    SMP_DMB
+    ldr     r0, [r0]
+    bx      lr
+    .fnend
+
+
+/*
+ * android_atomic_acquire_store / android_atomic_release_store
+ * input: r0 = value, r1 = address
  * output: void
  */
-
-android_atomic_write:
+android_atomic_acquire_store:
+    .fnstart
     str     r0, [r1]
-    bx      lr;
+    SMP_DMB
+    bx      lr
+    .fnend
+
+android_atomic_release_store:
+    .fnstart
+    SMP_DMB
+    str     r0, [r1]
+    bx      lr
+    .fnend
+
+/*
+ * Common sequence for read-modify-write operations.
+ *
+ * input: r1 = address
+ * output: r0 = original value, returns to caller
+ */
+    .macro  RMWEX   op, arg
+1:  ldrex   r0, [r1]                    @ load current value into r0
+    \op     r2, r0, \arg                @ generate new value into r2
+    strex   r3, r2, [r1]                @ try to store new value; result in r3
+    cmp     r3, #0                      @ success?
+    bxeq    lr                          @ yes, return
+    b       1b                          @ no, retry
+    .endm
+
+
+/*
+ * android_atomic_inc
+ * input: r0 = address
+ * output: r0 = old value
+ */
+android_atomic_inc:
+    .fnstart
+    SMP_DMB
+    mov     r1, r0
+    RMWEX   add, #1
+    .fnend
+
+
+/*
+ * android_atomic_dec
+ * input: r0 = address
+ * output: r0 = old value
+ */
+android_atomic_dec:
+    .fnstart
+    SMP_DMB
+    mov     r1, r0
+    RMWEX   sub, #1
+    .fnend
+
+
+/*
+ * android_atomic_add
+ * input: r0 = value, r1 = address
+ * output: r0 = old value
+ */
+android_atomic_add:
+    .fnstart
+    SMP_DMB
+    mov     ip, r0
+    RMWEX   add, ip
+    .fnend
+
+
+/*
+ * android_atomic_and
+ * input: r0 = value, r1 = address
+ * output: r0 = old value
+ */
+android_atomic_and:
+    .fnstart
+    SMP_DMB
+    mov     ip, r0
+    RMWEX   and, ip
+    .fnend
+
+
+/*
+ * android_atomic_or
+ * input: r0 = value, r1 = address
+ * output: r0 = old value
+ */
+android_atomic_or:
+    .fnstart
+    SMP_DMB
+    mov     ip, r0
+    RMWEX   orr, ip
+    .fnend
+
+
+/*
+ * android_atomic_acquire_swap / android_atomic_release_swap
+ * input: r0 = value, r1 = address
+ * output: r0 = old value
+ */
+android_atomic_acquire_swap:
+    .fnstart
+1:  ldrex   r2, [r1]                    @ load current value into r2
+    strex   r3, r0, [r1]                @ store new value
+    teq     r3, #0                      @ strex success?
+    bne     1b                          @ no, loop
+    mov     r0, r2                      @ return old value
+    SMP_DMB
+    bx      lr
+    .fnend
+
+android_atomic_release_swap:
+    .fnstart
+    SMP_DMB
+1:  ldrex   r2, [r1]
+    strex   r3, r0, [r1]
+    teq     r3, #0
+    bne     1b
+    mov     r0, r2
+    bx      lr
+    .fnend
+
+
+/*
+ * android_atomic_acquire_cas / android_atomic_release_cas
+ * input: r0 = oldvalue, r1 = newvalue, r2 = address
+ * output: r0 = 0 (xchg done) or non-zero (xchg not done)
+ */
+android_atomic_acquire_cas:
+    .fnstart
+1:  mov     ip, #2                      @ ip=2 means "new != old"
+    ldrex   r3, [r2]                    @ load current value into r3
+    teq     r0, r3                      @ new == old?
+    strexeq ip, r1, [r2]                @ yes, try store, set ip to 0 or 1
+    teq     ip, #1                      @ strex failure?
+ beq 1b @ yes, retry + mov r0, ip @ return 0 on success, 2 on failure + SMP_DMB + bx lr + .fnend + +android_atomic_release_cas: + .fnstart + SMP_DMB +1: mov ip, #2 + ldrex r3, [r2] + teq r0, r3 + strexeq ip, r1, [r2] + teq ip, #1 + beq 1b + mov r0, ip + bx lr + .fnend + + +#else /*not defined __ARM_HAVE_LDREX_STREX*/ +/* + * =========================================================================== + * Pre-ARMv6 implementation + * =========================================================================== + * + * These functions call through the kernel cmpxchg facility, or use the + * (now deprecated) SWP instruction. They are not SMP-safe. + */ +#if ANDROID_SMP == 1 +# error "SMP defined, but LDREX/STREX not available" +#endif + +/* + * int __kernel_cmpxchg(int oldval, int newval, int *ptr) + * clobbered: r3, ip, flags + * return 0 if a swap was made, non-zero otherwise. + */ + .equ kernel_cmpxchg, 0xFFFF0FC0 + .equ kernel_atomic_base, 0xFFFF0FFF + + +/* + * android_atomic_acquire_load / android_atomic_release_load + * input: r0 = address + * output: r0 = value + */ +android_atomic_acquire_load: +android_atomic_release_load: + .fnstart + ldr r0, [r0] + bx lr + .fnend + + +/* + * android_atomic_acquire_store / android_atomic_release_store + * input: r0 = value, r1 = address + * output: void + */ +android_atomic_acquire_store: +android_atomic_release_store: + .fnstart + str r0, [r1] + bx lr + .fnend + /* - * ---------------------------------------------------------------------------- * android_atomic_inc * input: r0 = address * output: r0 = old value */ - android_atomic_inc: .fnstart .save {r4, lr} @@ -99,14 +384,13 @@ android_atomic_inc: ldmia sp!, {r4, lr} bx lr .fnend - + + /* - * ---------------------------------------------------------------------------- * android_atomic_dec - * input: r0=address + * input: r0 = address * output: r0 = old value */ - android_atomic_dec: .fnstart .save {r4, lr} @@ -130,14 +414,13 @@ android_atomic_dec: ldmia sp!, {r4, lr} bx lr .fnend - + + /* - * ---------------------------------------------------------------------------- * android_atomic_add - * input: r0=value, r1=address + * input: r0 = value, r1 = address * output: r0 = old value */ - android_atomic_add: .fnstart .save {r4, lr} @@ -162,19 +445,17 @@ android_atomic_add: ldmia sp!, {r4, lr} bx lr .fnend - - + + /* - * ---------------------------------------------------------------------------- * android_atomic_and - * input: r0=value, r1=address + * input: r0 = value, r1 = address * output: r0 = old value */ - android_atomic_and: .fnstart - .save {r4, r5, lr} - stmdb sp!, {r4, r5, lr} + .save {r4, r5, ip, lr} /* include ip for 64-bit stack alignment */ + stmdb sp!, {r4, r5, ip, lr} mov r2, r1 /* r2 = address */ mov r4, r0 /* r4 = the value */ 1: @ android_atomic_and @@ -194,21 +475,20 @@ android_atomic_and: #endif bcc 1b mov r0, r5 - ldmia sp!, {r4, r5, lr} + ldmia sp!, {r4, r5, ip, lr} bx lr .fnend - + + /* - * ---------------------------------------------------------------------------- * android_atomic_or - * input: r0=value, r1=address + * input: r0 = value, r1 = address * output: r0 = old value */ - android_atomic_or: .fnstart - .save {r4, r5, lr} - stmdb sp!, {r4, r5, lr} + .save {r4, r5, ip, lr} /* include ip for 64-bit stack alignment */ + stmdb sp!, {r4, r5, ip, lr} mov r2, r1 /* r2 = address */ mov r4, r0 /* r4 = the value */ 1: @ android_atomic_or @@ -228,40 +508,31 @@ android_atomic_or: #endif bcc 1b mov r0, r5 - ldmia sp!, {r4, r5, lr} + ldmia sp!, {r4, r5, ip, lr} bx lr .fnend + /* - * 
---------------------------------------------------------------------------- - * android_atomic_swap - * input: r0=value, r1=address + * android_atomic_acquire_swap / android_atomic_release_swap + * input: r0 = value, r1 = address * output: r0 = old value */ - -/* replaced swp instruction with ldrex/strex for ARMv6 & ARMv7 */ -android_atomic_swap: -#if defined (__ARM_HAVE_LDREX_STREX) -1: ldrex r2, [r1] - strex r3, r0, [r1] - teq r3, #0 - bne 1b - mov r0, r2 - mcr p15, 0, r0, c7, c10, 5 /* or, use dmb */ -#else +android_atomic_acquire_swap: +android_atomic_release_swap: + .fnstart swp r0, r0, [r1] -#endif bx lr + .fnend + /* - * ---------------------------------------------------------------------------- - * android_atomic_cmpxchg - * input: r0=oldvalue, r1=newvalue, r2=address + * android_atomic_acquire_cas / android_atomic_release_cas + * input: r0 = oldvalue, r1 = newvalue, r2 = address * output: r0 = 0 (xchg done) or non-zero (xchg not done) */ - -android_atomic_acquire_cmpxchg: -android_atomic_cmpxchg: +android_atomic_acquire_cas: +android_atomic_release_cas: .fnstart .save {r4, lr} stmdb sp!, {r4, lr} @@ -287,3 +558,4 @@ android_atomic_cmpxchg: bx lr .fnend +#endif /*not defined __ARM_HAVE_LDREX_STREX*/ diff --git a/libcutils/atomic-android-armv6.S b/libcutils/atomic-android-armv6.S deleted file mode 100644 index 1574c9c92..000000000 --- a/libcutils/atomic-android-armv6.S +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (C) 2008 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - - .text - .align - - .global android_atomic_write - .type android_atomic_write, %function - - .global android_atomic_inc - .type android_atomic_inc, %function - .global android_atomic_dec - .type android_atomic_dec, %function - - .global android_atomic_add - .type android_atomic_add, %function - .global android_atomic_and - .type android_atomic_and, %function - .global android_atomic_or - .type android_atomic_or, %function - - .global android_atomic_swap - .type android_atomic_swap, %function - - .global android_atomic_cmpxchg - .type android_atomic_cmpxchg, %function - - - -/* FIXME: On SMP systems memory barriers may be needed */ -#warning "this file is not safe with SMP systems" - - -/* - * ---------------------------------------------------------------------------- - * android_atomic_write - * input: r0=value, r1=address - * output: void - */ - -android_atomic_write: - str r0, [r1] - bx lr; - -/* - * ---------------------------------------------------------------------------- - * android_atomic_inc - * input: r0 = address - * output: r0 = old value - */ - -android_atomic_inc: - mov r12, r0 -1: ldrex r0, [r12] - add r2, r0, #1 - strex r1, r2, [r12] - cmp r1, #0 - bxeq lr - b 1b - -/* - * ---------------------------------------------------------------------------- - * android_atomic_dec - * input: r0=address - * output: r0 = old value - */ - -android_atomic_dec: - mov r12, r0 -1: ldrex r0, [r12] - sub r2, r0, #1 - strex r1, r2, [r12] - cmp r1, #0 - bxeq lr - b 1b - - -/* - * ---------------------------------------------------------------------------- - * android_atomic_add - * input: r0=value, r1=address - * output: r0 = old value - */ - -android_atomic_add: - mov r12, r0 -1: ldrex r0, [r1] - add r2, r0, r12 - strex r3, r2, [r1] - cmp r3, #0 - bxeq lr - b 1b - -/* - * ---------------------------------------------------------------------------- - * android_atomic_and - * input: r0=value, r1=address - * output: r0 = old value - */ - -android_atomic_and: - mov r12, r0 -1: ldrex r0, [r1] - and r2, r0, r12 - strex r3, r2, [r1] - cmp r3, #0 - bxeq lr - b 1b - - -/* - * ---------------------------------------------------------------------------- - * android_atomic_or - * input: r0=value, r1=address - * output: r0 = old value - */ - -android_atomic_or: - mov r12, r0 -1: ldrex r0, [r1] - orr r2, r0, r12 - strex r3, r2, [r1] - cmp r3, #0 - bxeq lr - b 1b - -/* - * ---------------------------------------------------------------------------- - * android_atomic_swap - * input: r0=value, r1=address - * output: r0 = old value - */ - -android_atomic_swap: - swp r0, r0, [r1] - bx lr - -/* - * ---------------------------------------------------------------------------- - * android_atomic_cmpxchg - * input: r0=oldvalue, r1=newvalue, r2=address - * output: r0 = 0 (xchg done) or non-zero (xchg not done) - */ - -android_atomic_cmpxchg: - mov r12, r1 - ldrex r3, [r2] - eors r0, r0, r3 - strexeq r0, r12, [r2] - bx lr - - - -/* - * ---------------------------------------------------------------------------- - * android_atomic_cmpxchg_64 - * input: r0-r1=oldvalue, r2-r3=newvalue, arg4 (on stack)=address - * output: r0 = 0 (xchg done) or non-zero (xchg not done) - */ -/* TODO: NEED IMPLEMENTATION FOR THIS ARCHITECTURE */ diff --git a/libcutils/atomic-android-sh.c b/libcutils/atomic-android-sh.c index d95b02bdb..abe7d25fb 100644 --- a/libcutils/atomic-android-sh.c +++ b/libcutils/atomic-android-sh.c @@ -35,6 +35,9 @@ * ARM implementation, in this file above. 
 * We follow the fact that the initializer for mutex is a simple zero
 * value.
+ *
+ * (3) These operations are NOT safe for SMP, as there is currently no
+ * definition for a memory barrier operation.
  */
 
 #include <pthread.h>
 
@@ -46,18 +49,35 @@ static pthread_mutex_t  _swap_locks[SWAP_LOCK_COUNT];
 
    &_swap_locks[((unsigned)(void*)(addr) >> 3U) % SWAP_LOCK_COUNT]
 
-void android_atomic_write(int32_t value, volatile int32_t* addr) {
+int32_t android_atomic_acquire_load(volatile int32_t* addr)
+{
+    return *addr;
+}
+
+int32_t android_atomic_release_load(volatile int32_t* addr)
+{
+    return *addr;
+}
+
+void android_atomic_acquire_store(int32_t value, volatile int32_t* addr) {
     int32_t oldValue;
     do {
         oldValue = *addr;
-    } while (android_atomic_cmpxchg(oldValue, value, addr));
+    } while (android_atomic_release_cas(oldValue, value, addr));
+}
+
+void android_atomic_release_store(int32_t value, volatile int32_t* addr) {
+    int32_t oldValue;
+    do {
+        oldValue = *addr;
+    } while (android_atomic_release_cas(oldValue, value, addr));
 }
 
 int32_t android_atomic_inc(volatile int32_t* addr) {
     int32_t oldValue;
     do {
         oldValue = *addr;
-    } while (android_atomic_cmpxchg(oldValue, oldValue+1, addr));
+    } while (android_atomic_release_cas(oldValue, oldValue+1, addr));
     return oldValue;
 }
 
@@ -65,7 +85,7 @@ int32_t android_atomic_dec(volatile int32_t* addr) {
     int32_t oldValue;
     do {
         oldValue = *addr;
-    } while (android_atomic_cmpxchg(oldValue, oldValue-1, addr));
+    } while (android_atomic_release_cas(oldValue, oldValue-1, addr));
     return oldValue;
 }
 
@@ -73,7 +93,7 @@ int32_t android_atomic_add(int32_t value, volatile int32_t* addr) {
     int32_t oldValue;
     do {
         oldValue = *addr;
-    } while (android_atomic_cmpxchg(oldValue, oldValue+value, addr));
+    } while (android_atomic_release_cas(oldValue, oldValue+value, addr));
     return oldValue;
 }
 
@@ -81,7 +101,7 @@ int32_t android_atomic_and(int32_t value, volatile int32_t* addr) {
     int32_t oldValue;
     do {
         oldValue = *addr;
-    } while (android_atomic_cmpxchg(oldValue, oldValue&value, addr));
+    } while (android_atomic_release_cas(oldValue, oldValue&value, addr));
     return oldValue;
 }
 
@@ -89,11 +109,15 @@ int32_t android_atomic_or(int32_t value, volatile int32_t* addr) {
     int32_t oldValue;
     do {
         oldValue = *addr;
-    } while (android_atomic_cmpxchg(oldValue, oldValue|value, addr));
+    } while (android_atomic_release_cas(oldValue, oldValue|value, addr));
     return oldValue;
 }
 
-int32_t android_atomic_swap(int32_t value, volatile int32_t* addr) {
+int32_t android_atomic_acquire_swap(int32_t value, volatile int32_t* addr) {
+    return android_atomic_release_swap(value, addr);
+}
+
+int32_t android_atomic_release_swap(int32_t value, volatile int32_t* addr) {
     int32_t oldValue;
     do {
         oldValue = *addr;
@@ -101,7 +125,12 @@ int32_t android_atomic_swap(int32_t value, volatile int32_t* addr) {
     return oldValue;
 }
 
-int android_atomic_cmpxchg(int32_t oldvalue, int32_t newvalue,
+int android_atomic_acquire_cas(int32_t oldvalue, int32_t newvalue,
+                               volatile int32_t* addr) {
+    return android_atomic_release_cas(oldvalue, newvalue, addr);
+}
+
+int android_atomic_release_cas(int32_t oldvalue, int32_t newvalue,
                            volatile int32_t* addr) {
     int result;
     pthread_mutex_t*  lock = SWAP_LOCK(addr);
@@ -118,8 +147,3 @@
     return result;
 }
 
-int android_atomic_acquire_cmpxchg(int32_t oldvalue, int32_t newvalue,
-                           volatile int32_t* addr) {
-    return android_atomic_cmpxchg(oldValue, newValue, addr);
-}
-
diff --git a/libcutils/atomic.c b/libcutils/atomic.c
index d81890614..4cefa6b93 100644
--- a/libcutils/atomic.c
+++ b/libcutils/atomic.c
@@ -27,11 +27,25 @@
 
 #include <libkern/OSAtomic.h>
 
-void android_atomic_write(int32_t value, volatile int32_t* addr) {
-    int32_t oldValue;
-    do {
-        oldValue = *addr;
-    } while (OSAtomicCompareAndSwap32Barrier(oldValue, value, (int32_t*)addr) == 0);
+int32_t android_atomic_acquire_load(volatile int32_t* addr) {
+    int32_t value = *addr;
+    OSMemoryBarrier();
+    return value;
+}
+
+int32_t android_atomic_release_load(volatile int32_t* addr) {
+    OSMemoryBarrier();
+    return *addr;
+}
+
+void android_atomic_acquire_store(int32_t value, volatile int32_t* addr) {
+    *addr = value;
+    OSMemoryBarrier();
+}
+
+void android_atomic_release_store(int32_t value, volatile int32_t* addr) {
+    OSMemoryBarrier();
+    *addr = value;
 }
 
 int32_t android_atomic_inc(volatile int32_t* addr) {
@@ -47,74 +61,81 @@ int32_t android_atomic_add(int32_t value, volatile int32_t* addr) {
 }
 
 int32_t android_atomic_and(int32_t value, volatile int32_t* addr) {
-    int32_t oldValue;
-    do {
-        oldValue = *addr;
-    } while (OSAtomicCompareAndSwap32Barrier(oldValue, oldValue&value, (int32_t*)addr) == 0);
-    return oldValue;
+    return OSAtomicAnd32OrigBarrier(value, (int32_t*)addr);
 }
 
 int32_t android_atomic_or(int32_t value, volatile int32_t* addr) {
+    return OSAtomicOr32OrigBarrier(value, (int32_t*)addr);
+}
+
+int32_t android_atomic_acquire_swap(int32_t value, volatile int32_t* addr) {
     int32_t oldValue;
     do {
         oldValue = *addr;
-    } while (OSAtomicCompareAndSwap32Barrier(oldValue, oldValue|value, (int32_t*)addr) == 0);
+    } while (android_atomic_acquire_cas(oldValue, value, addr));
     return oldValue;
 }
 
-int32_t android_atomic_swap(int32_t value, volatile int32_t* addr) {
+int32_t android_atomic_release_swap(int32_t value, volatile int32_t* addr) {
     int32_t oldValue;
     do {
         oldValue = *addr;
-    } while (android_atomic_cmpxchg(oldValue, value, addr));
+    } while (android_atomic_release_cas(oldValue, value, addr));
     return oldValue;
 }
 
-int android_atomic_cmpxchg(int32_t oldvalue, int32_t newvalue, volatile int32_t* addr) {
+int android_atomic_release_cas(int32_t oldvalue, int32_t newvalue, volatile int32_t* addr) {
     /* OS X CAS returns zero on failure; invert to return zero on success */
     return OSAtomicCompareAndSwap32Barrier(oldvalue, newvalue, (int32_t*)addr) == 0;
 }
 
-int android_atomic_acquire_cmpxchg(int32_t oldvalue, int32_t newvalue,
+int android_atomic_acquire_cas(int32_t oldvalue, int32_t newvalue,
                            volatile int32_t* addr) {
     int result = (OSAtomicCompareAndSwap32(oldvalue, newvalue, (int32_t*)addr) == 0);
-    if (!result) {
+    if (result == 0) {
+        /* success, perform barrier */
         OSMemoryBarrier();
     }
+    return result;
 }
 
/*****************************************************************************/
 
 #elif defined(__i386__) || defined(__x86_64__)
 
-void android_atomic_write(int32_t value, volatile int32_t* addr) {
-    int32_t oldValue;
-    do {
-        oldValue = *addr;
-    } while (android_atomic_cmpxchg(oldValue, value, addr));
+int32_t android_atomic_acquire_load(volatile int32_t* addr) {
+    int32_t value = *addr;
+    ANDROID_MEMBAR_FULL();
+    return value;
+}
+
+int32_t android_atomic_release_load(volatile int32_t* addr) {
+    ANDROID_MEMBAR_FULL();
+    return *addr;
+}
+
+void android_atomic_acquire_store(int32_t value, volatile int32_t* addr) {
+    *addr = value;
+    ANDROID_MEMBAR_FULL();
+}
+
+void android_atomic_release_store(int32_t value, volatile int32_t* addr) {
+    ANDROID_MEMBAR_FULL();
+    *addr = value;
 }
 
 int32_t android_atomic_inc(volatile int32_t* addr) {
-    int32_t oldValue;
-    do {
-        oldValue = *addr;
-    } while
(android_atomic_cmpxchg(oldValue, oldValue+1, addr)); - return oldValue; + return android_atomic_add(1, addr); } int32_t android_atomic_dec(volatile int32_t* addr) { - int32_t oldValue; - do { - oldValue = *addr; - } while (android_atomic_cmpxchg(oldValue, oldValue-1, addr)); - return oldValue; + return android_atomic_add(-1, addr); } int32_t android_atomic_add(int32_t value, volatile int32_t* addr) { int32_t oldValue; do { oldValue = *addr; - } while (android_atomic_cmpxchg(oldValue, oldValue+value, addr)); + } while (android_atomic_release_cas(oldValue, oldValue+value, addr)); return oldValue; } @@ -122,7 +143,7 @@ int32_t android_atomic_and(int32_t value, volatile int32_t* addr) { int32_t oldValue; do { oldValue = *addr; - } while (android_atomic_cmpxchg(oldValue, oldValue&value, addr)); + } while (android_atomic_release_cas(oldValue, oldValue&value, addr)); return oldValue; } @@ -130,20 +151,13 @@ int32_t android_atomic_or(int32_t value, volatile int32_t* addr) { int32_t oldValue; do { oldValue = *addr; - } while (android_atomic_cmpxchg(oldValue, oldValue|value, addr)); + } while (android_atomic_release_cas(oldValue, oldValue|value, addr)); return oldValue; } -int32_t android_atomic_swap(int32_t value, volatile int32_t* addr) { - int32_t oldValue; - do { - oldValue = *addr; - } while (android_atomic_cmpxchg(oldValue, value, addr)); - return oldValue; -} - -int android_atomic_cmpxchg(int32_t oldvalue, int32_t newvalue, volatile int32_t* addr) { - android_membar_full(); +/* returns 0 on successful swap */ +static inline int cas(int32_t oldvalue, int32_t newvalue, + volatile int32_t* addr) { int xchg; asm volatile ( @@ -156,18 +170,36 @@ int android_atomic_cmpxchg(int32_t oldvalue, int32_t newvalue, volatile int32_t* return xchg; } -int android_atomic_acquire_cmpxchg(int32_t oldvalue, int32_t newvalue, +int32_t android_atomic_acquire_swap(int32_t value, volatile int32_t* addr) { + int32_t oldValue; + do { + oldValue = *addr; + } while (cas(oldValue, value, addr)); + ANDROID_MEMBAR_FULL(); + return oldValue; +} + +int32_t android_atomic_release_swap(int32_t value, volatile int32_t* addr) { + ANDROID_MEMBAR_FULL(); + int32_t oldValue; + do { + oldValue = *addr; + } while (cas(oldValue, value, addr)); + return oldValue; +} + +int android_atomic_acquire_cas(int32_t oldvalue, int32_t newvalue, volatile int32_t* addr) { - int xchg; - asm volatile - ( - " lock; cmpxchg %%ecx, (%%edx);" - " setne %%al;" - " andl $1, %%eax" - : "=a" (xchg) - : "a" (oldvalue), "c" (newvalue), "d" (addr) - ); - android_membar_full(); + int xchg = cas(oldvalue, newvalue, addr); + if (xchg == 0) + ANDROID_MEMBAR_FULL(); + return xchg; +} + +int android_atomic_release_cas(int32_t oldvalue, int32_t newvalue, + volatile int32_t* addr) { + ANDROID_MEMBAR_FULL(); + int xchg = cas(oldvalue, newvalue, addr); return xchg; }
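[Editor's aside: for comparison with the hand-rolled x86 cas() helper above,
the same zero-on-success convention can be expressed with GCC's __sync
intrinsic. This is an illustrative sketch only; note the builtin also implies
a full barrier, so the explicit ANDROID_MEMBAR_FULL() calls in the
acquire/release wrappers would become redundant with it.]

    #include <stdint.h>

    /* Returns 0 if the swap happened, non-zero otherwise (inverted from
       the builtin's boolean result, matching android_atomic_*_cas). */
    static inline int cas_sync(int32_t oldvalue, int32_t newvalue,
                               volatile int32_t* addr) {
        return __sync_bool_compare_and_swap(addr, oldvalue, newvalue) == 0;
    }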