From 96dbb4fc58fe2dcf4390e073dbb42cc77ef2f0b5 Mon Sep 17 00:00:00 2001 From: Martyn Capewell Date: Mon, 7 Dec 2009 13:59:59 +0000 Subject: [PATCH 1/5] Adds UXTB16 support to Pixelflinger * Add support for UXTB16 to the disassembler * Add encoding of the UXTB16 instruction to the Pixelflinger JIT. Introducing the UXTB16 instruction allows removal of some masking code, and is beneficial from a pipeline point of view - lots of UXTB16 followed by MUL sequences. Also, further rescheduling and use of SMULWB brings extra performance improvements. * Use UXTB16 in bilinear filtered texturing Uses UXTB16 to extract channels for SIMD operations, rather than creating and ANDing with masks. Saves a register and is faster on A8, as UXTB16 result can feed into first stage of multiply, unlike AND. Also, used SMULWB rather than SMULBB, which allows removal of MOVs used to rescale results. Code has been scheduled for A8 pipeline, specifically aiming to allow multiplies to issue in pipeline 0, for efficient dual issue operation. Testing on SpriteMethodTest (http://code.google.com/p/apps-for-android/) gives 8% improvement (12.7 vs. 13.7 fps.) SMULBB to SMULWB trick could be used in > 3) << 10) | Rm; +} + }; // namespace android diff --git a/libpixelflinger/codeflinger/ARMAssembler.h b/libpixelflinger/codeflinger/ARMAssembler.h index ef3b66af7..a667cb511 100644 --- a/libpixelflinger/codeflinger/ARMAssembler.h +++ b/libpixelflinger/codeflinger/ARMAssembler.h @@ -123,6 +123,7 @@ public: int RdHi, int RdLo, int Rs, int Rm); virtual void SMLAW(int cc, int y, int Rd, int Rm, int Rs, int Rn); + virtual void UXTB16(int cc, int Rd, int Rm, int rotate); private: ARMAssembler(const ARMAssembler& rhs); diff --git a/libpixelflinger/codeflinger/ARMAssemblerInterface.h b/libpixelflinger/codeflinger/ARMAssemblerInterface.h index 465b3bd9d..ff6af2a22 100644 --- a/libpixelflinger/codeflinger/ARMAssemblerInterface.h +++ b/libpixelflinger/codeflinger/ARMAssemblerInterface.h @@ -203,6 +203,9 @@ public: virtual void SMLAW(int cc, int y, int Rd, int Rm, int Rs, int Rn) = 0; + // byte/half word extract... + virtual void UXTB16(int cc, int Rd, int Rm, int rotate) = 0; + // ----------------------------------------------------------------------- // convenience... 
// ----------------------------------------------------------------------- diff --git a/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp b/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp index 18c461864..7c422dbad 100644 --- a/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp +++ b/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp @@ -195,6 +195,9 @@ void ARMAssemblerProxy::SMLAW(int cc, int y, int Rd, int Rm, int Rs, int Rn) { mTarget->SMLAW(cc, y, Rd, Rm, Rs, Rn); } +void ARMAssemblerProxy::UXTB16(int cc, int Rd, int Rm, int rotate) { + mTarget->UXTB16(cc, Rd, Rm, rotate); +} }; // namespace android diff --git a/libpixelflinger/codeflinger/ARMAssemblerProxy.h b/libpixelflinger/codeflinger/ARMAssemblerProxy.h index 4bdca9cf5..9134cce6f 100644 --- a/libpixelflinger/codeflinger/ARMAssemblerProxy.h +++ b/libpixelflinger/codeflinger/ARMAssemblerProxy.h @@ -114,6 +114,8 @@ public: virtual void SMLAW(int cc, int y, int Rd, int Rm, int Rs, int Rn); + virtual void UXTB16(int cc, int Rd, int Rm, int rotate); + private: ARMAssemblerInterface* mTarget; }; diff --git a/libpixelflinger/codeflinger/disassem.c b/libpixelflinger/codeflinger/disassem.c index 4676da0d8..ee5e63a2b 100644 --- a/libpixelflinger/codeflinger/disassem.c +++ b/libpixelflinger/codeflinger/disassem.c @@ -80,6 +80,7 @@ * f - 1st fp operand (register) (bits 12-14) * g - 2nd fp operand (register) (bits 16-18) * h - 3rd fp operand (register/immediate) (bits 0-4) + * j - xtb rotate literal (bits 10-11) * b - branch address * t - thumb branch address (bits 24, 0-23) * k - breakpoint comment (bits 0-3, 8-19) @@ -122,6 +123,7 @@ static const struct arm32_insn arm32_i[] = { { 0x0fe000f0, 0x00c00090, "smull", "Sdnms" }, { 0x0fe000f0, 0x00a00090, "umlal", "Sdnms" }, { 0x0fe000f0, 0x00e00090, "smlal", "Sdnms" }, + { 0x0fff03f0, 0x06cf0070, "uxtb16", "dmj" }, { 0x0d700000, 0x04200000, "strt", "daW" }, { 0x0d700000, 0x04300000, "ldrt", "daW" }, { 0x0d700000, 0x04600000, "strbt", "daW" }, @@ -406,6 +408,10 @@ disasm(const disasm_interface_t *di, u_int loc, int altfmt) else di->di_printf("f%d", insn & 7); break; + /* j - xtb rotate literal (bits 10-11) */ + case 'j': + di->di_printf("ror #%d", ((insn >> 10) & 3) << 3); + break; /* b - branch address */ case 'b': branch = ((insn << 2) & 0x03ffffff); diff --git a/libpixelflinger/codeflinger/texturing.cpp b/libpixelflinger/codeflinger/texturing.cpp index 90e658407..ba13fb303 100644 --- a/libpixelflinger/codeflinger/texturing.cpp +++ b/libpixelflinger/codeflinger/texturing.cpp @@ -25,6 +25,7 @@ #include "codeflinger/GGLAssembler.h" +#include namespace android { @@ -567,7 +568,7 @@ void GGLAssembler::build_textures( fragment_parts_t& parts, RSB(GE, 0, height, height, imm(0)); MUL(AL, 0, height, stride, height); } else { - // u has not been CLAMPed yet + // v has not been CLAMPed yet CMP(AL, height, reg_imm(v, ASR, FRAC_BITS)); MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS)); MOV(LE, 0, height, imm(0)); @@ -868,6 +869,106 @@ void GGLAssembler::filter24( load(txPtr, texel, 0); } +#if __ARM_ARCH__ >= 6 +// ARMv6 version, using UXTB16, and scheduled for Cortex-A8 pipeline +void GGLAssembler::filter32( + const fragment_parts_t& parts, + pixel_t& texel, const texture_unit_t& tmu, + int U, int V, pointer_t& txPtr, + int FRAC_BITS) +{ + const int adjust = FRAC_BITS*2 - 8; + const int round = 0; + const int prescale = 16 - adjust; + + Scratch scratches(registerFile()); + + int pixel= scratches.obtain(); + int dh = scratches.obtain(); + int u = scratches.obtain(); + int k = scratches.obtain(); + + int temp = 
scratches.obtain(); + int dl = scratches.obtain(); + + int offsetrt = scratches.obtain(); + int offsetlb = scratches.obtain(); + + int pixellb = offsetlb; + + // RB -> U * V + CONTEXT_LOAD(offsetrt, generated_vars.rt); + CONTEXT_LOAD(offsetlb, generated_vars.lb); + if(!round) { + MOV(AL, 0, U, reg_imm(U, LSL, prescale)); + } + ADD(AL, 0, u, offsetrt, offsetlb); + + LDR(AL, pixel, txPtr.reg, reg_scale_pre(u)); + if (round) { + SMULBB(AL, u, U, V); + RSB(AL, 0, U, U, imm(1< (1-U) * V + if (round) { + SMULBB(AL, u, U, V); + } else { + SMULWB(AL, u, U, V); + } + UXTB16(AL, temp, pixellb, 0); + if (round) { + ADD(AL, 0, u, u, imm(1<<(adjust-1))); + MOV(AL, 0, u, reg_imm(u, LSR, adjust)); + } + MLA(AL, 0, dh, temp, u, dh); + UXTB16(AL, temp, pixellb, 8); + MLA(AL, 0, dl, temp, u, dl); + SUB(AL, 0, k, k, u); + + // LT -> (1-U)*(1-V) + RSB(AL, 0, V, V, imm(1< U*(1-V) + LDR(AL, pixel, txPtr.reg, reg_scale_pre(offsetrt)); + SUB(AL, 0, u, k, u); + UXTB16(AL, temp, pixel, 0); + MLA(AL, 0, dh, temp, u, dh); + UXTB16(AL, temp, pixel, 8); + MLA(AL, 0, dl, temp, u, dl); + + UXTB16(AL, dh, dh, 8); + UXTB16(AL, dl, dl, 8); + ORR(AL, 0, texel.reg, dh, reg_imm(dl, LSL, 8)); +} +#else void GGLAssembler::filter32( const fragment_parts_t& parts, pixel_t& texel, const texture_unit_t& tmu, @@ -955,6 +1056,7 @@ void GGLAssembler::filter32( AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8)); ORR(AL, 0, texel.reg, dh, dl); } +#endif void GGLAssembler::build_texture_environment( component_t& fragment, From f9e8ab03bd93d98567e96822535090a877594aba Mon Sep 17 00:00:00 2001 From: Martyn Capewell Date: Mon, 7 Dec 2009 15:00:19 +0000 Subject: [PATCH 2/5] NEON shortcut for flat colour blending into 16-bit This is a shortcut for the needs descriptor 00000077:03515104_00000000_00000000. It requires blending a single 32-bit colour value into a 16-bit framebuffer. It's used when fading out the screen, eg. when a modal requester pops-up. The PF JIT produces code for this using 24 instructions/pixel. The NEON implementation requires 2.1 instructions/pixel. Performance hasn't been benchmarked, but the improvement is quite visible. This code has only been tested by inspection of the fading effect described above, when press+holding a finger on the home screen to pop up the Shortcuts/Widgets/Folders/Wallpaper requester. Along with the NEON version, a fallback v5TE implementation is also provided. This ARM version of col32cb16blend is not fully optimised, but is a reasonable implementation, and better than the version produced by the JIT. It is here as a fallback, if NEON is not available. 
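For reference, the per-pixel operation that both the NEON and v5TE paths implement is the same one as the C fallback this patch adds to scanline.cpp. A minimal standalone C sketch follows (assuming a little-endian RGBA8888 source word and an RGB565 destination; the function name and parameters are illustrative, not part of the patch):

    #include <stdint.h>
    #include <stddef.h>

    /* Reference version of the blend, one pixel at a time.  The source
     * channels are pre-scaled to their 565 widths outside the loop, as in
     * the assembly. */
    static void col32cb16blend_ref(uint16_t *dst, uint32_t src, size_t count)
    {
        int a  = src >> 24;                /* source alpha                */
        int f  = 0x100 - (a + (a >> 7));   /* inverted alpha, 0..256      */
        int sR = (src >> 3)  & 0x1f;       /* top 5 bits of red           */
        int sG = (src >> 10) & 0x3f;       /* top 6 bits of green         */
        int sB = (src >> 19) & 0x1f;       /* top 5 bits of blue          */

        while (count--) {
            uint16_t d = *dst;
            int dR = (d >> 11) & 0x1f;
            int dG = (d >> 5)  & 0x3f;
            int dB =  d        & 0x1f;
            /* d = s + (((0x100 - (a + (a >> 7))) * d) >> 8), per channel */
            dR = sR + ((f * dR) >> 8);
            dG = sG + ((f * dG) >> 8);
            dB = sB + ((f * dB) >> 8);
            *dst++ = (uint16_t)((dR << 11) | (dG << 5) | dB);
        }
    }

The NEON version vectorises exactly this arithmetic, sixteen pixels per iteration, and the v5TE fallback does the same per-pixel computation using SMLABB to fold each multiply and add.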
--- libpixelflinger/Android.mk | 6 + libpixelflinger/col32cb16blend.S | 78 +++++++++++++ libpixelflinger/col32cb16blend_neon.S | 153 ++++++++++++++++++++++++++ libpixelflinger/scanline.cpp | 47 ++++++++ 4 files changed, 284 insertions(+) create mode 100644 libpixelflinger/col32cb16blend.S create mode 100644 libpixelflinger/col32cb16blend_neon.S diff --git a/libpixelflinger/Android.mk b/libpixelflinger/Android.mk index 0cc85d9dc..6491d243b 100644 --- a/libpixelflinger/Android.mk +++ b/libpixelflinger/Android.mk @@ -40,7 +40,13 @@ PIXELFLINGER_SRC_FILES:= \ buffer.cpp ifeq ($(TARGET_ARCH),arm) +ifeq ($(TARGET_ARCH_VERSION),armv7-a) +PIXELFLINGER_SRC_FILES += col32cb16blend_neon.S +PIXELFLINGER_SRC_FILES += col32cb16blend.S +else PIXELFLINGER_SRC_FILES += t32cb16blend.S +PIXELFLINGER_SRC_FILES += col32cb16blend.S +endif endif ifeq ($(TARGET_ARCH),arm) diff --git a/libpixelflinger/col32cb16blend.S b/libpixelflinger/col32cb16blend.S new file mode 100644 index 000000000..1450bde84 --- /dev/null +++ b/libpixelflinger/col32cb16blend.S @@ -0,0 +1,78 @@ +/* libs/pixelflinger/col32cb16blend.S +** +** (C) COPYRIGHT 2009 ARM Limited. +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. +** +*/ + + .text + .align + + .global scanline_col32cb16blend_arm + +// +// This function alpha blends a fixed color into a destination scanline, using +// the formula: +// +// d = s + (((a + (a >> 7)) * d) >> 8) +// +// where d is the destination pixel, +// s is the source color, +// a is the alpha channel of the source color. 
+// + +// r0 = destination buffer pointer +// r1 = color value +// r2 = count + + +scanline_col32cb16blend_arm: + push {r4-r10, lr} // stack ARM regs + + mov r5, r1, lsr #24 // shift down alpha + mov r9, #0xff // create mask + add r5, r5, r5, lsr #7 // add in top bit + rsb r5, r5, #256 // invert alpha + and r10, r1, #0xff // extract red + and r12, r9, r1, lsr #8 // extract green + and r4, r9, r1, lsr #16 // extract blue + mov r10, r10, lsl #5 // prescale red + mov r12, r12, lsl #6 // prescale green + mov r4, r4, lsl #5 // prescale blue + mov r9, r9, lsr #2 // create dest green mask + +1: + ldrh r8, [r0] // load dest pixel + subs r2, r2, #1 // decrement loop counter + mov r6, r8, lsr #11 // extract dest red + and r7, r9, r8, lsr #5 // extract dest green + and r8, r8, #0x1f // extract dest blue + + smlabb r6, r6, r5, r10 // dest red * alpha + src red + smlabb r7, r7, r5, r12 // dest green * alpha + src green + smlabb r8, r8, r5, r4 // dest blue * alpha + src blue + + mov r6, r6, lsr #8 // shift down red + mov r7, r7, lsr #8 // shift down green + mov r6, r6, lsl #11 // shift red into 565 + orr r6, r7, lsl #5 // shift green into 565 + orr r6, r8, lsr #8 // shift blue into 565 + + strh r6, [r0], #2 // store pixel to dest, update ptr + bne 1b // if count != 0, loop + + pop {r4-r10, pc} // return + + + diff --git a/libpixelflinger/col32cb16blend_neon.S b/libpixelflinger/col32cb16blend_neon.S new file mode 100644 index 000000000..17b0d01a8 --- /dev/null +++ b/libpixelflinger/col32cb16blend_neon.S @@ -0,0 +1,153 @@ +/* libs/pixelflinger/col32cb16blend_neon.S +** +** (C) COPYRIGHT 2009 ARM Limited. +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. +** +*/ + + .text + .align + + .global scanline_col32cb16blend_neon + +// +// This function alpha blends a fixed color into a destination scanline, using +// the formula: +// +// d = s + (((a + (a >> 7)) * d) >> 8) +// +// where d is the destination pixel, +// s is the source color, +// a is the alpha channel of the source color. +// +// The NEON implementation processes 16 pixels per iteration. The remaining 0 - 15 +// pixels are processed in ARM code. +// + +// r0 = destination buffer pointer +// r1 = color pointer +// r2 = count + + +scanline_col32cb16blend_neon: + push {r4-r11, lr} // stack ARM regs + + vmov.u16 q15, #256 // create alpha constant + movs r3, r2, lsr #4 // calc. 
sixteens iterations + vmov.u16 q14, #0x1f // create blue mask + + beq 2f // if r3 == 0, branch to singles + + vld4.8 {d0[], d2[], d4[], d6[]}, [r1] // load color into four registers + // split and duplicate them, such that + // d0 = 8 equal red values + // d2 = 8 equal green values + // d4 = 8 equal blue values + // d6 = 8 equal alpha values + vshll.u8 q0, d0, #5 // shift up red and widen + vshll.u8 q1, d2, #6 // shift up green and widen + vshll.u8 q2, d4, #5 // shift up blue and widen + + vshr.u8 d7, d6, #7 // extract top bit of alpha + vaddl.u8 q3, d6, d7 // add top bit into alpha + vsub.u16 q3, q15, q3 // invert alpha + +1: + // This loop processes 16 pixels per iteration. In the comments, references to + // the first eight pixels are suffixed with "0" (red0, green0, blue0), + // the second eight are suffixed "1". + // q8 = dst red0 + // q9 = dst green0 + // q10 = dst blue0 + // q13 = dst red1 + // q12 = dst green1 + // q11 = dst blue1 + + vld1.16 {d20, d21, d22, d23}, [r0] // load 16 dest pixels + vshr.u16 q8, q10, #11 // shift dst red0 to low 5 bits + pld [r0, #63] // preload next dest pixels + vshl.u16 q9, q10, #5 // shift dst green0 to top 6 bits + vand q10, q10, q14 // extract dst blue0 + vshr.u16 q9, q9, #10 // shift dst green0 to low 6 bits + vmul.u16 q8, q8, q3 // multiply dst red0 by src alpha + vshl.u16 q12, q11, #5 // shift dst green1 to top 6 bits + vmul.u16 q9, q9, q3 // multiply dst green0 by src alpha + vshr.u16 q13, q11, #11 // shift dst red1 to low 5 bits + vmul.u16 q10, q10, q3 // multiply dst blue0 by src alpha + vshr.u16 q12, q12, #10 // shift dst green1 to low 6 bits + vand q11, q11, q14 // extract dst blue1 + vadd.u16 q8, q8, q0 // add src red to dst red0 + vmul.u16 q13, q13, q3 // multiply dst red1 by src alpha + vadd.u16 q9, q9, q1 // add src green to dst green0 + vmul.u16 q12, q12, q3 // multiply dst green1 by src alpha + vadd.u16 q10, q10, q2 // add src blue to dst blue0 + vmul.u16 q11, q11, q3 // multiply dst blue1 by src alpha + vshr.u16 q8, q8, #8 // shift down red0 + vadd.u16 q13, q13, q0 // add src red to dst red1 + vshr.u16 q9, q9, #8 // shift down green0 + vadd.u16 q12, q12, q1 // add src green to dst green1 + vshr.u16 q10, q10, #8 // shift down blue0 + vadd.u16 q11, q11, q2 // add src blue to dst blue1 + vsli.u16 q10, q9, #5 // shift & insert green0 into blue0 + vshr.u16 q13, q13, #8 // shift down red1 + vsli.u16 q10, q8, #11 // shift & insert red0 into blue0 + vshr.u16 q12, q12, #8 // shift down green1 + vshr.u16 q11, q11, #8 // shift down blue1 + subs r3, r3, #1 // decrement loop counter + vsli.u16 q11, q12, #5 // shift & insert green1 into blue1 + vsli.u16 q11, q13, #11 // shift & insert red1 into blue1 + + vst1.16 {d20, d21, d22, d23}, [r0]! // write 16 pixels back to dst + bne 1b // if count != 0, loop + +2: + ands r3, r2, #15 // calc. 
single iterations + beq 4f // if r3 == 0, exit + + ldr r4, [r1] // load source color + mov r5, r4, lsr #24 // shift down alpha + add r5, r5, r5, lsr #7 // add in top bit + rsb r5, r5, #256 // invert alpha + and r11, r4, #0xff // extract red + ubfx r12, r4, #8, #8 // extract green + ubfx r4, r4, #16, #8 // extract blue + mov r11, r11, lsl #5 // prescale red + mov r12, r12, lsl #6 // prescale green + mov r4, r4, lsl #5 // prescale blue + +3: + ldrh r8, [r0] // load dest pixel + subs r3, r3, #1 // decrement loop counter + mov r6, r8, lsr #11 // extract dest red + ubfx r7, r8, #5, #6 // extract dest green + and r8, r8, #0x1f // extract dest blue + + smlabb r6, r6, r5, r11 // dest red * alpha + src red + smlabb r7, r7, r5, r12 // dest green * alpha + src green + smlabb r8, r8, r5, r4 // dest blue * alpha + src blue + + mov r6, r6, lsr #8 // shift down red + mov r7, r7, lsr #8 // shift down green + mov r6, r6, lsl #11 // shift red into 565 + orr r6, r7, lsl #5 // shift green into 565 + orr r6, r8, lsr #8 // shift blue into 565 + + strh r6, [r0], #2 // store pixel to dest, update ptr + bne 3b // if count != 0, loop +4: + + pop {r4-r11, pc} // return + + + diff --git a/libpixelflinger/scanline.cpp b/libpixelflinger/scanline.cpp index f70030680..a2f43eb08 100644 --- a/libpixelflinger/scanline.cpp +++ b/libpixelflinger/scanline.cpp @@ -80,6 +80,7 @@ static void scanline_perspective(context_t* c); static void scanline_perspective_single(context_t* c); static void scanline_t32cb16blend(context_t* c); static void scanline_t32cb16(context_t* c); +static void scanline_col32cb16blend(context_t* c); static void scanline_memcpy(context_t* c); static void scanline_memset8(context_t* c); static void scanline_memset16(context_t* c); @@ -93,6 +94,8 @@ static void rect_memcpy(context_t* c, size_t yc); extern "C" void scanline_t32cb16blend_arm(uint16_t*, uint32_t*, size_t); extern "C" void scanline_t32cb16_arm(uint16_t *dst, uint32_t *src, size_t ct); +extern "C" void scanline_col32cb16blend_neon(uint16_t *dst, uint32_t *col, size_t ct); +extern "C" void scanline_col32cb16blend_arm(uint16_t *dst, uint32_t col, size_t ct); // ---------------------------------------------------------------------------- @@ -111,6 +114,9 @@ static shortcut_t shortcuts[] = { { { { 0x03010104, 0x00000077, { 0x00000A01, 0x00000000 } }, { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } }, "565 fb, 8888 tx", scanline_t32cb16, init_y_noop }, + { { { 0x03515104, 0x00000077, { 0x00000000, 0x00000000 } }, + { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0xFFFFFFFF } } }, + "565 fb, 8888 fixed color", scanline_col32cb16blend, init_y_packed }, { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } }, { 0x00000000, 0x00000007, { 0x00000000, 0x00000000 } } }, "(nop) alpha test", scanline_noop, init_y_noop }, @@ -943,6 +949,8 @@ void init_y_packed(context_t* c, int32_t y0) uint8_t f = c->state.buffers.color.format; c->packed = ggl_pack_color(c, f, c->shade.r0, c->shade.g0, c->shade.b0, c->shade.a0); + c->packed8888 = ggl_pack_color(c, GGL_PIXEL_FORMAT_RGBA_8888, + c->shade.r0, c->shade.g0, c->shade.b0, c->shade.a0); c->iterators.y = y0; c->step_y = step_y__nop; // choose the rectangle blitter @@ -1253,6 +1261,45 @@ finish: // ---------------------------------------------------------------------------- +void scanline_col32cb16blend(context_t* c) +{ + int32_t x = c->iterators.xl; + size_t ct = c->iterators.xr - x; + int32_t y = c->iterators.y; + surface_t* cb = &(c->state.buffers.color); + union { + uint16_t* dst; + uint32_t* dst32; + }; + dst = 
reinterpret_cast(cb->data) + (x+(cb->stride*y)); + +#if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__arm__)) +#if defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN + scanline_col32cb16blend_neon(dst, &(c->packed8888), ct); +#else // defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN + scanline_col32cb16blend_arm(dst, GGL_RGBA_TO_HOST(c->packed8888), ct); +#endif // defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN +#else + uint32_t s = GGL_RGBA_TO_HOST(c->packed8888); + int sA = (s>>24); + int f = 0x100 - (sA + (sA>>7)); + while (ct--) { + uint16_t d = *dst; + int dR = (d>>11)&0x1f; + int dG = (d>>5)&0x3f; + int dB = (d)&0x1f; + int sR = (s >> ( 3))&0x1F; + int sG = (s >> ( 8+2))&0x3F; + int sB = (s >> (16+3))&0x1F; + sR += (f*dR)>>8; + sG += (f*dG)>>8; + sB += (f*dB)>>8; + *dst++ = uint16_t((sR<<11)|(sG<<5)|sB); + } +#endif + +} + void scanline_t32cb16(context_t* c) { int32_t x = c->iterators.xl; From cd64315f72537359537c66eebe482495ffefba57 Mon Sep 17 00:00:00 2001 From: Tim Date: Tue, 16 Feb 2010 20:18:29 +0000 Subject: [PATCH 3/5] Add documentation for some adb environmental variables. The ADB_TRACE one is particularly important. Change-Id: I125a5930c43065c8cf505eea40d20e3f209bc858 --- adb/commandline.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/adb/commandline.c b/adb/commandline.c index bb77c4fcd..8003a644e 100644 --- a/adb/commandline.c +++ b/adb/commandline.c @@ -170,6 +170,12 @@ void help() "\n" " - If it is \"system\" or \"data\", only the corresponding partition\n" " is updated.\n" + "\n" + "environmental variables:\n" + " ADB_TRACE - Print debug information. A comma separated list of the following values\n" + " 1 or all, adb, sockets, packets, rwx, usb, sync, sysdeps, transport, jdwp\n" + " ANDROID_SERIAL - The serial number to connect to. -s takes priority over this if given.\n" + " ANDROID_LOG_TAGS - When used with the logcat option, only these debug tags are printed.\n" ); } From f42d2fac2b09547295e353ddffb281aa7932403f Mon Sep 17 00:00:00 2001 From: Martyn Capewell Date: Mon, 7 Dec 2009 15:24:08 +0000 Subject: [PATCH 4/5] Fix LDM addressing mode disassembly The Pixelflinger disassembler does not handle LDM addressing modes correctly, assuming that the P and U bits in the instruction mean the same in both LDM and STM. This results in the disassembler producing sequences like: stmfd r13!, {r4-r11, r14} ... ... ... ldmea r13!, {r4-r11, r14} This small patch fixes it by EORing the P and U bits with the Load/Store bit. 
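The stack-form mnemonics are aliases whose P/U interpretation flips between loads and stores: STMDB and STMFD are the same instruction, but the matching pop is LDMIA, i.e. LDMFD, not LDMEA. The corrected index computation, as a small C sketch (the suffix table contents are shown for illustration; the real table lives in disassem.c):

    #include <stdint.h>

    /* Block-transfer suffixes indexed by the P:U bits: 0=da, 1=ia, 2=db, 3=ib.
     * Stack-form suffixes in the same order, as they apply to stores. */
    static const char *const stack_suffix[4] = { "ed", "ea", "fd", "fa" };

    /* For loads (L bit set) the mapping inverts (ldmia == ldmfd,
     * ldmdb == ldmea); EORing the P:U index with 3 achieves this, using the
     * same expression the patch puts into the insn_stkblktrans() macro. */
    static const char *stk_suffix(uint32_t insn)
    {
        unsigned pu = (insn >> 23) & 3;   /* U = bit 23, P = bit 24 */
        unsigned l  = (insn >> 20) & 1;   /* 1 for LDM, 0 for STM   */
        return stack_suffix[(3 * l) ^ pu];
    }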
Change-Id: Ic7a1556642c4e29415fc3697019f1239b6c26fc2 --- libpixelflinger/codeflinger/disassem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libpixelflinger/codeflinger/disassem.c b/libpixelflinger/codeflinger/disassem.c index ee5e63a2b..c17f3ecc1 100644 --- a/libpixelflinger/codeflinger/disassem.c +++ b/libpixelflinger/codeflinger/disassem.c @@ -278,7 +278,7 @@ static char const insn_fpaconstants[][8] = { #define insn_condition(x) arm32_insn_conditions[(x >> 28) & 0x0f] #define insn_blktrans(x) insn_block_transfers[(x >> 23) & 3] -#define insn_stkblktrans(x) insn_stack_block_transfers[(x >> 23) & 3] +#define insn_stkblktrans(x) insn_stack_block_transfers[(3*((x >> 20)&1))^((x >> 23)&3)] #define op2_shift(x) op_shifts[(x >> 5) & 3] #define insn_fparnd(x) insn_fpa_rounding[(x >> 5) & 0x03] #define insn_fpaprec(x) insn_fpa_precision[(((x >> 18) & 2)|(x >> 7)) & 1] From 9b6c850d24df82451862b81f059361b586f5ef0b Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Queru Date: Mon, 3 May 2010 12:31:13 -0700 Subject: [PATCH 5/5] fix sim build Change-Id: Ide300eafbcbbc6dfae25fe86188302c6676c4a3b --- libpixelflinger/codeflinger/texturing.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libpixelflinger/codeflinger/texturing.cpp b/libpixelflinger/codeflinger/texturing.cpp index ba13fb303..6baa28ed9 100644 --- a/libpixelflinger/codeflinger/texturing.cpp +++ b/libpixelflinger/codeflinger/texturing.cpp @@ -25,7 +25,9 @@ #include "codeflinger/GGLAssembler.h" +#ifdef __arm__ #include +#endif namespace android {
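A note on the UXTB16 usage introduced by the first patch in this series: UXTB16 rotates its source register and zero-extends bytes 0 and 2 of the result into two 16-bit lanes, so one UXTB16 (plus one more with ROR #8 for the other byte pair) replaces loading a 0x00ff00ff mask and ANDing with it, and its result can feed the first stage of a multiply on Cortex-A8, unlike AND. A C model of the instruction, for illustration only:

    #include <stdint.h>

    /* Model of UXTB16 Rd, Rm, ROR #rot  (rot = 0, 8, 16 or 24):
     * rotate Rm right by rot, then zero-extend bytes 0 and 2 of the result
     * into the low and high 16-bit halves, i.e. ROR(Rm, rot) & 0x00ff00ff. */
    static inline uint32_t uxtb16(uint32_t rm, unsigned rot)
    {
        uint32_t r = rot ? ((rm >> rot) | (rm << (32u - rot))) : rm;
        return r & 0x00ff00ff;
    }

    /* For a little-endian 8888 texel this splits the four channels into two
     * registers of two 16-bit lanes each: uxtb16(t, 0) keeps bytes 0 and 2,
     * uxtb16(t, 8) keeps bytes 1 and 3, which is how the new filter32 path
     * feeds the channel pairs into MLA. */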