pixman: update to 0.24.0

* All patches that went upstream are removed

Tested on beagleboard/angstrom

Signed-off-by: Koen Kooi <koen@dominion.thruhere.net>
This commit is contained in:
Koen Kooi
2011-11-30 15:27:26 +01:00
parent f3ca7acd0b
commit 80e3a0c42d
14 changed files with 4 additions and 2153 deletions
@@ -1,114 +0,0 @@
From e17f676f1f42239fb4304d75191f373bb06e8fc0 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Tue, 16 Mar 2010 16:55:28 +0100
Subject: [PATCH 1/4] Generic C implementation of pixman_blt with overlapping support
Uses memcpy/memmove functions to copy pixels, can handle the
case when both source and destination areas are in the same
image (this is useful for scrolling).
It is assumed that copying direction is only important when
using the same image for both source and destination (and
src_stride == dst_stride). Copying direction is undefined
for the images with different source and destination stride
which happen to be in the overlapped areas (but this is an
unrealistic case anyway).
---
pixman/pixman-general.c | 21 ++++++++++++++++++---
pixman/pixman-private.h | 43 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 61 insertions(+), 3 deletions(-)
diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c
index 727affc..fa448f7 100644
--- a/pixman/pixman-general.c
+++ b/pixman/pixman-general.c
@@ -238,9 +238,24 @@ general_blt (pixman_implementation_t *imp,
int width,
int height)
{
- /* We can't blit unless we have sse2 or mmx */
-
- return FALSE;
+ uint8_t *dst_bytes = (uint8_t *)dst_bits;
+ uint8_t *src_bytes = (uint8_t *)src_bits;
+ int bpp;
+
+ if (src_bpp != dst_bpp || src_bpp & 7)
+ return FALSE;
+
+ bpp = src_bpp >> 3;
+ width *= bpp;
+ src_stride *= 4;
+ dst_stride *= 4;
+ pixman_blt_helper (src_bytes + src_y * src_stride + src_x * bpp,
+ dst_bytes + dst_y * dst_stride + dst_x * bpp,
+ src_stride,
+ dst_stride,
+ width,
+ height);
+ return TRUE;
}
static pixman_bool_t
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 60060a9..5369ad9 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -10,6 +10,7 @@
#include "pixman.h"
#include <time.h>
+#include <string.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
@@ -899,4 +900,46 @@ void pixman_timer_register (pixman_timer_t *timer);
#endif /* PIXMAN_TIMERS */
+/* a helper function, can blit 8-bit images with src/dst overlapping support */
+static inline void
+pixman_blt_helper (uint8_t *src_bytes,
+ uint8_t *dst_bytes,
+ int src_stride,
+ int dst_stride,
+ int width,
+ int height)
+{
+ /*
+ * The second part of this check is not strictly needed, but it prevents
+ * unnecessary upside-down processing of areas which belong to different
+ * images. Upside-down processing can be slower with fixed-distance-ahead
+ * prefetch and perceived as having more tearing.
+ */
+ if (src_bytes < dst_bytes + width &&
+ src_bytes + src_stride * height > dst_bytes)
+ {
+ src_bytes += src_stride * height - src_stride;
+ dst_bytes += dst_stride * height - dst_stride;
+ dst_stride = -dst_stride;
+ src_stride = -src_stride;
+ /* Horizontal scrolling to the left needs memmove */
+ if (src_bytes + width > dst_bytes)
+ {
+ while (--height >= 0)
+ {
+ memmove (dst_bytes, src_bytes, width);
+ dst_bytes += dst_stride;
+ src_bytes += src_stride;
+ }
+ return;
+ }
+ }
+ while (--height >= 0)
+ {
+ memcpy (dst_bytes, src_bytes, width);
+ dst_bytes += dst_stride;
+ src_bytes += src_stride;
+ }
+}
+
#endif /* PIXMAN_PRIVATE_H */
--
1.6.6.1
@@ -1,91 +0,0 @@
From 6d8b811414c73df7f75cc192e3b1f1d777615bdc Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Thu, 22 Oct 2009 05:45:47 +0300
Subject: [PATCH 2/4] Support of overlapping src/dst for pixman_blt_mmx
---
pixman/pixman-mmx.c | 55 +++++++++++++++++++++++++++++---------------------
1 files changed, 32 insertions(+), 23 deletions(-)
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 0272347..5bcbd0e 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -2996,34 +2996,43 @@ pixman_blt_mmx (uint32_t *src_bits,
{
uint8_t * src_bytes;
uint8_t * dst_bytes;
- int byte_width;
+ int bpp;
- if (src_bpp != dst_bpp)
+ if (src_bpp != dst_bpp || src_bpp & 7)
return FALSE;
- if (src_bpp == 16)
- {
- src_stride = src_stride * (int) sizeof (uint32_t) / 2;
- dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
- src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
- byte_width = 2 * width;
- src_stride *= 2;
- dst_stride *= 2;
- }
- else if (src_bpp == 32)
+ bpp = src_bpp >> 3;
+ width *= bpp;
+ src_stride *= 4;
+ dst_stride *= 4;
+ src_bytes = (uint8_t *)src_bits + src_y * src_stride + src_x * bpp;
+ dst_bytes = (uint8_t *)dst_bits + dst_y * dst_stride + dst_x * bpp;
+
+ if (src_bpp != 16 && src_bpp != 32)
{
- src_stride = src_stride * (int) sizeof (uint32_t) / 4;
- dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
- src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
- byte_width = 4 * width;
- src_stride *= 4;
- dst_stride *= 4;
+ pixman_blt_helper (src_bytes, dst_bytes, src_stride, dst_stride,
+ width, height);
+ return TRUE;
}
- else
+
+ if (src_bytes < dst_bytes && src_bytes + src_stride * height > dst_bytes)
{
- return FALSE;
+ src_bytes += src_stride * height - src_stride;
+ dst_bytes += dst_stride * height - dst_stride;
+ dst_stride = -dst_stride;
+ src_stride = -src_stride;
+
+ if (src_bytes + width > dst_bytes)
+ {
+ /* TODO: reverse scanline copy using MMX */
+ while (--height >= 0)
+ {
+ memmove (dst_bytes, src_bytes, width);
+ dst_bytes += dst_stride;
+ src_bytes += src_stride;
+ }
+ return TRUE;
+ }
}
while (height--)
@@ -3033,7 +3042,7 @@ pixman_blt_mmx (uint32_t *src_bits,
uint8_t *d = dst_bytes;
src_bytes += src_stride;
dst_bytes += dst_stride;
- w = byte_width;
+ w = width;
while (w >= 2 && ((unsigned long)d & 3))
{
--
1.6.6.1
@@ -1,91 +0,0 @@
From c9ca9dc0f345fa3e2e0f16b2627150e0b696fd7a Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Thu, 22 Oct 2009 05:45:54 +0300
Subject: [PATCH 3/4] Support of overlapping src/dst for pixman_blt_sse2
---
pixman/pixman-sse2.c | 55 +++++++++++++++++++++++++++++--------------------
1 files changed, 32 insertions(+), 23 deletions(-)
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 533b858..9fa7191 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -4691,34 +4691,43 @@ pixman_blt_sse2 (uint32_t *src_bits,
{
uint8_t * src_bytes;
uint8_t * dst_bytes;
- int byte_width;
+ int bpp;
- if (src_bpp != dst_bpp)
+ if (src_bpp != dst_bpp || src_bpp & 7)
return FALSE;
- if (src_bpp == 16)
- {
- src_stride = src_stride * (int) sizeof (uint32_t) / 2;
- dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
- src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
- byte_width = 2 * width;
- src_stride *= 2;
- dst_stride *= 2;
- }
- else if (src_bpp == 32)
+ bpp = src_bpp >> 3;
+ width *= bpp;
+ src_stride *= 4;
+ dst_stride *= 4;
+ src_bytes = (uint8_t *)src_bits + src_y * src_stride + src_x * bpp;
+ dst_bytes = (uint8_t *)dst_bits + dst_y * dst_stride + dst_x * bpp;
+
+ if (src_bpp != 16 && src_bpp != 32)
{
- src_stride = src_stride * (int) sizeof (uint32_t) / 4;
- dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
- src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
- byte_width = 4 * width;
- src_stride *= 4;
- dst_stride *= 4;
+ pixman_blt_helper (src_bytes, dst_bytes, src_stride, dst_stride,
+ width, height);
+ return TRUE;
}
- else
+
+ if (src_bytes < dst_bytes && src_bytes + src_stride * height > dst_bytes)
{
- return FALSE;
+ src_bytes += src_stride * height - src_stride;
+ dst_bytes += dst_stride * height - dst_stride;
+ dst_stride = -dst_stride;
+ src_stride = -src_stride;
+
+ if (src_bytes + width > dst_bytes)
+ {
+ /* TODO: reverse scanline copy using SSE2 */
+ while (--height >= 0)
+ {
+ memmove (dst_bytes, src_bytes, width);
+ dst_bytes += dst_stride;
+ src_bytes += src_stride;
+ }
+ return TRUE;
+ }
}
while (height--)
@@ -4728,7 +4737,7 @@ pixman_blt_sse2 (uint32_t *src_bits,
uint8_t *d = dst_bytes;
src_bytes += src_stride;
dst_bytes += dst_stride;
- w = byte_width;
+ w = width;
while (w >= 2 && ((unsigned long)d & 3))
{
--
1.6.6.1
@@ -1,94 +0,0 @@
From 604f22b515d4d678df4c301ecec3c7da4987ee16 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Wed, 18 Nov 2009 06:08:48 +0200
Subject: [PATCH 4/4] Support of overlapping src/dst for pixman_blt_neon
---
pixman/pixman-arm-neon.c | 62 +++++++++++++++++++++++++++++++++++++--------
1 files changed, 51 insertions(+), 11 deletions(-)
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index e5127a6..b67fed9 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -234,26 +234,66 @@ pixman_blt_neon (uint32_t *src_bits,
int width,
int height)
{
- if (src_bpp != dst_bpp)
+ uint8_t * src_bytes;
+ uint8_t * dst_bytes;
+ int bpp;
+
+ if (src_bpp != dst_bpp || src_bpp & 7)
return FALSE;
+ bpp = src_bpp >> 3;
+ width *= bpp;
+ src_stride *= 4;
+ dst_stride *= 4;
+ src_bytes = (uint8_t *)src_bits + src_y * src_stride + src_x * bpp;
+ dst_bytes = (uint8_t *)dst_bits + dst_y * dst_stride + dst_x * bpp;
+
+ if (src_bpp != 16 && src_bpp != 32)
+ {
+ pixman_blt_helper (src_bytes, dst_bytes, src_stride, dst_stride,
+ width, height);
+ return TRUE;
+ }
+
+ if (src_bytes < dst_bytes && src_bytes + src_stride * height > dst_bytes)
+ {
+ src_bytes += src_stride * height - src_stride;
+ dst_bytes += dst_stride * height - dst_stride;
+ dst_stride = -dst_stride;
+ src_stride = -src_stride;
+
+ if (src_bytes + width > dst_bytes)
+ {
+ /* TODO: reverse scanline copy using NEON */
+ while (--height >= 0)
+ {
+ memmove (dst_bytes, src_bytes, width);
+ dst_bytes += dst_stride;
+ src_bytes += src_stride;
+ }
+ return TRUE;
+ }
+ }
+
switch (src_bpp)
{
case 16:
pixman_composite_src_0565_0565_asm_neon (
- width, height,
- (uint16_t *)(((char *) dst_bits) +
- dst_y * dst_stride * 4 + dst_x * 2), dst_stride * 2,
- (uint16_t *)(((char *) src_bits) +
- src_y * src_stride * 4 + src_x * 2), src_stride * 2);
+ width >> 1,
+ height,
+ (uint16_t *) dst_bytes,
+ dst_stride >> 1,
+ (uint16_t *) src_bytes,
+ src_stride >> 1);
return TRUE;
case 32:
pixman_composite_src_8888_8888_asm_neon (
- width, height,
- (uint32_t *)(((char *) dst_bits) +
- dst_y * dst_stride * 4 + dst_x * 4), dst_stride,
- (uint32_t *)(((char *) src_bits) +
- src_y * src_stride * 4 + src_x * 4), src_stride);
+ width >> 2,
+ height,
+ (uint32_t *) dst_bytes,
+ dst_stride >> 2,
+ (uint32_t *) src_bytes,
+ src_stride >> 2);
return TRUE;
default:
return FALSE;
--
1.6.6.1
@@ -1,331 +0,0 @@
From 809b8d4e3707c8617cafafb8a16b1b48e2477311 Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Tue, 20 Sep 2011 19:46:25 +0900
Subject: [PATCH 1/8] ARM: NEON: Some cleanup of bilinear scanline functions
Use STRIDE, and perform the initial horizontal weight update before
entering the interpolation loop. Add cache preload for mask and dst.
---
pixman/pixman-arm-neon-asm-bilinear.S | 128 +++++++++++++++++----------------
1 files changed, 67 insertions(+), 61 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 3c7fe0f..c5ba929 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -44,10 +44,6 @@
* All temp registers can be used freely outside the code block.
* Assume that symbol(register .req) OUT and MASK are defined at caller of these macro blocks.
*
- * TODOs
- * Support 0565 pixel format
- * Optimization for two and last pixel cases
- *
* Remarks
* There can be lots of pipeline stalls inside code block and between code blocks.
* Further optimizations will be done by new macro templates using head/tail_head/tail scheme.
@@ -92,21 +88,19 @@ fname:
*/
.macro bilinear_load_8888 reg1, reg2, tmp
- mov TMP2, X, asr #16
+ mov TMP1, X, asr #16
add X, X, UX
- add TMP1, TOP, TMP2, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {reg1}, [TMP1]
- vld1.32 {reg2}, [TMP2]
+ add TMP1, TOP, TMP1, asl #2
+ vld1.32 {reg1}, [TMP1], STRIDE
+ vld1.32 {reg2}, [TMP1]
.endm
.macro bilinear_load_0565 reg1, reg2, tmp
- mov TMP2, X, asr #16
+ mov TMP1, X, asr #16
add X, X, UX
- add TMP1, TOP, TMP2, asl #1
- add TMP2, BOTTOM, TMP2, asl #1
- vld1.32 {reg2[0]}, [TMP1]
- vld1.32 {reg2[1]}, [TMP2]
+ add TMP1, TOP, TMP1, asl #1
+ vld1.32 {reg2[0]}, [TMP1], STRIDE
+ vld1.32 {reg2[1]}, [TMP1]
convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
.endm
@@ -134,18 +128,16 @@ fname:
.macro bilinear_load_and_vertical_interpolate_two_0565 \
acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
- mov TMP2, X, asr #16
+ mov TMP1, X, asr #16
add X, X, UX
- mov TMP4, X, asr #16
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
add X, X, UX
- add TMP1, TOP, TMP2, asl #1
- add TMP2, BOTTOM, TMP2, asl #1
- add TMP3, TOP, TMP4, asl #1
- add TMP4, BOTTOM, TMP4, asl #1
- vld1.32 {acc2lo[0]}, [TMP1]
- vld1.32 {acc2hi[0]}, [TMP3]
- vld1.32 {acc2lo[1]}, [TMP2]
- vld1.32 {acc2hi[1]}, [TMP4]
+ add TMP2, TOP, TMP2, asl #1
+ vld1.32 {acc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {acc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {acc2lo[1]}, [TMP1]
+ vld1.32 {acc2hi[1]}, [TMP2]
convert_0565_to_x888 acc2, reg3, reg2, reg1
vzip.u8 reg1, reg3
vzip.u8 reg2, reg4
@@ -161,34 +153,30 @@ fname:
xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
- mov TMP2, X, asr #16
+ mov TMP1, X, asr #16
add X, X, UX
- mov TMP4, X, asr #16
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
add X, X, UX
- add TMP1, TOP, TMP2, asl #1
- add TMP2, BOTTOM, TMP2, asl #1
- add TMP3, TOP, TMP4, asl #1
- add TMP4, BOTTOM, TMP4, asl #1
- vld1.32 {xacc2lo[0]}, [TMP1]
- vld1.32 {xacc2hi[0]}, [TMP3]
- vld1.32 {xacc2lo[1]}, [TMP2]
- vld1.32 {xacc2hi[1]}, [TMP4]
+ add TMP2, TOP, TMP2, asl #1
+ vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {xacc2lo[1]}, [TMP1]
+ vld1.32 {xacc2hi[1]}, [TMP2]
convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
- mov TMP2, X, asr #16
+ mov TMP1, X, asr #16
add X, X, UX
- mov TMP4, X, asr #16
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
add X, X, UX
- add TMP1, TOP, TMP2, asl #1
- add TMP2, BOTTOM, TMP2, asl #1
- add TMP3, TOP, TMP4, asl #1
- add TMP4, BOTTOM, TMP4, asl #1
- vld1.32 {yacc2lo[0]}, [TMP1]
+ add TMP2, TOP, TMP2, asl #1
+ vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
vzip.u8 xreg1, xreg3
- vld1.32 {yacc2hi[0]}, [TMP3]
+ vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
vzip.u8 xreg2, xreg4
- vld1.32 {yacc2lo[1]}, [TMP2]
+ vld1.32 {yacc2lo[1]}, [TMP1]
vzip.u8 xreg3, xreg4
- vld1.32 {yacc2hi[1]}, [TMP4]
+ vld1.32 {yacc2hi[1]}, [TMP2]
vzip.u8 xreg1, xreg2
convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
vmull.u8 xacc1, xreg1, d28
@@ -252,6 +240,7 @@ fname:
.else
.error bilinear_load_mask_8 numpix is unsupported
.endif
+ pld [MASK, #prefetch_offset]
.endm
.macro bilinear_load_mask mask_fmt, numpix, mask
@@ -279,6 +268,7 @@ fname:
.else
.error bilinear_load_dst_8888 numpix is unsupported
.endif
+ pld [OUT, #(prefetch_offset * 4)]
.endm
.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
@@ -303,7 +293,7 @@ fname:
* For two pixel case
* (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
* (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
- * We can do some optimizations for this including one pixel cases.
+ * We can do some optimizations for this including last pixel cases.
*/
.macro bilinear_duplicate_mask_x numpix, mask
.endm
@@ -497,8 +487,7 @@ fname:
bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
vmull.u8 q1, d0, d28
vmlal.u8 q1, d1, d29
- vshr.u16 d30, d24, #8
- /* 4 cycles bubble */
+ /* 5 cycles bubble */
vshll.u16 q0, d2, #8
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
@@ -525,18 +514,18 @@ fname:
q1, q11, d0, d1, d20, d21, d22, d23
bilinear_load_mask mask_fmt, 2, d4
bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
- vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
vshll.u16 q0, d2, #8
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
vshll.u16 q10, d22, #8
vmlsl.u16 q10, d22, d31
vmlal.u16 q10, d23, d31
- vshrn.u32 d30, q0, #16
- vshrn.u32 d31, q10, #16
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q10, #16
bilinear_duplicate_mask mask_fmt, 2, d4
- vmovn.u16 d0, q15
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vmovn.u16 d0, q0
bilinear_interleave_src_dst \
mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
bilinear_apply_mask_to_src \
@@ -554,8 +543,7 @@ fname:
q1, q11, d0, d1, d20, d21, d22, d23 \
q3, q9, d4, d5, d16, d17, d18, d19
pld [TMP1, PF_OFFS]
- vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
+ sub TMP1, TMP1, STRIDE
vshll.u16 q0, d2, #8
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
@@ -567,9 +555,9 @@ fname:
vmlsl.u16 q2, d6, d30
vmlal.u16 q2, d7, d30
vshll.u16 q8, d18, #8
- bilinear_load_mask mask_fmt, 4, d30
+ bilinear_load_mask mask_fmt, 4, d22
bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
- pld [TMP2, PF_OFFS]
+ pld [TMP1, PF_OFFS]
vmlsl.u16 q8, d18, d31
vmlal.u16 q8, d19, d31
vadd.u16 q12, q12, q13
@@ -577,17 +565,19 @@ fname:
vshrn.u32 d1, q10, #16
vshrn.u32 d4, q2, #16
vshrn.u32 d5, q8, #16
- bilinear_duplicate_mask mask_fmt, 4, d30
+ bilinear_duplicate_mask mask_fmt, 4, d22
+ vshr.u16 q15, q12, #8
vmovn.u16 d0, q0
vmovn.u16 d1, q2
+ vadd.u16 q12, q12, q13
bilinear_interleave_src_dst \
mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
bilinear_apply_mask_to_src \
- mask_fmt, 4, d0, d1, q0, d30, \
+ mask_fmt, 4, d0, d1, q0, d22, \
q3, q8, q9, q10
bilinear_combine \
op, 4, d0, d1, q0, d2, d3, q1, \
- q3, q8, q9, q10, d22
+ q3, q8, q9, q10, d23
bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
bilinear_store_&dst_fmt 4, q2, q3
.endm
@@ -610,6 +600,7 @@ pixman_asm_function fname
PF_OFFS .req r7
TMP3 .req r8
TMP4 .req r9
+ STRIDE .req r2
mov ip, sp
push {r4, r5, r6, r7, r8, r9}
@@ -617,6 +608,11 @@ pixman_asm_function fname
ldmia ip, {WB, X, UX, WIDTH}
mul PF_OFFS, PF_OFFS, UX
+ .set prefetch_offset, prefetch_distance
+
+ sub STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
cmp WIDTH, #0
ble 3f
@@ -626,6 +622,8 @@ pixman_asm_function fname
vdup.u8 d29, WB
vadd.u16 d25, d25, d26
vadd.u16 q13, q13, q13
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
subs WIDTH, WIDTH, #4
blt 1f
@@ -648,7 +646,6 @@ pixman_asm_function fname
.unreq OUT
.unreq TOP
- .unreq BOTTOM
.unreq WT
.unreq WB
.unreq X
@@ -659,6 +656,7 @@ pixman_asm_function fname
.unreq PF_OFFS
.unreq TMP3
.unreq TMP4
+ .unreq STRIDE
.endfunc
.endm
@@ -682,6 +680,7 @@ pixman_asm_function fname
PF_OFFS .req r8
TMP3 .req r9
TMP4 .req r10
+ STRIDE .req r3
mov ip, sp
push {r4, r5, r6, r7, r8, r9, r10, ip}
@@ -689,6 +688,11 @@ pixman_asm_function fname
ldmia ip, {WT, WB, X, UX, WIDTH}
mul PF_OFFS, PF_OFFS, UX
+ .set prefetch_offset, prefetch_distance
+
+ sub STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
cmp WIDTH, #0
ble 3f
@@ -698,6 +702,8 @@ pixman_asm_function fname
vdup.u8 d29, WB
vadd.u16 d25, d25, d26
vadd.u16 q13, q13, q13
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
subs WIDTH, WIDTH, #4
blt 1f
@@ -720,7 +726,6 @@ pixman_asm_function fname
.unreq OUT
.unreq TOP
- .unreq BOTTOM
.unreq WT
.unreq WB
.unreq X
@@ -732,6 +737,7 @@ pixman_asm_function fname
.unreq PF_OFFS
.unreq TMP3
.unreq TMP4
+ .unreq STRIDE
.endfunc
.endm
--
1.6.6.1
@@ -1,235 +0,0 @@
From ce2fd2ac6aab2c14916d332ade47d72b06d504c1 Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Tue, 20 Sep 2011 21:32:35 +0900
Subject: [PATCH 2/8] ARM: NEON: Bilinear macro template for instruction scheduling
This macro template takes 6 code blocks.
1. process_last_pixel
2. process_two_pixels
3. process_four_pixels
4. process_pixblock_head
5. process_pixblock_tail
6. process_pixblock_tail_head
process_last_pixel does not need to update the horizontal weight; this
is done by the template. The two-pixel and four-pixel code blocks must
update the horizontal weight themselves. The head/tail/tail_head blocks
make up the unrolled core loop. You can apply instruction scheduling
to the tail_head blocks.
You can also specify the size of the pixel block. Supported sizes are
4 and 8. If you want to use a mask, pass the BILINEAR_FLAG_USE_MASK flag
to the template; you can then use the MASK register. When using the
d8~d15 registers, pass BILINEAR_FLAG_USE_ALL_NEON_REGS to make sure
the registers are properly saved on the stack and later restored.
---
pixman/pixman-arm-neon-asm-bilinear.S | 195 +++++++++++++++++++++++++++++++++
1 files changed, 195 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index c5ba929..784e5df 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -773,3 +773,198 @@ generate_bilinear_scanline_func_src_a8_dst \
generate_bilinear_scanline_func_src_a8_dst \
pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
8888, 8888, add, 2, 28
+
+.set BILINEAR_FLAG_USE_MASK, 1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline functions.
+ *
+ * Bilinear scanline generator macro takes the following arguments:
+ * fname - name of the function to generate
+ * src_fmt - source color format (8888 or 0565)
+ * dst_fmt - destination color format (8888 or 0565)
+ * src/dst_bpp_shift - (1 << bpp_shift) is the size of src/dst pixel in bytes
+ * process_last_pixel - code block that interpolates one pixel and does not
+ * update horizontal weight
+ * process_two_pixels - code block that interpolates two pixels and updates
+ * horizontal weight
+ * process_four_pixels - code block that interpolates four pixels and updates
+ * horizontal weight
+ * process_pixblock_head - head part of middle loop
+ * process_pixblock_tail - tail part of middle loop
+ * process_pixblock_tail_head - tail_head of middle loop
+ * pixblock_size - number of pixels processed in a single middle loop
+ * prefetch_distance - prefetch in the source image by that many pixels ahead
+ */
+
+.macro generate_bilinear_scanline_func \
+ fname, \
+ src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
+ bilinear_process_last_pixel, \
+ bilinear_process_two_pixels, \
+ bilinear_process_four_pixels, \
+ bilinear_process_pixblock_head, \
+ bilinear_process_pixblock_tail, \
+ bilinear_process_pixblock_tail_head, \
+ pixblock_size, \
+ prefetch_distance, \
+ flags
+
+pixman_asm_function fname
+.if pixblock_size == 8
+.elseif pixblock_size == 4
+.else
+ .error unsupported pixblock size
+.endif
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+ OUT .req r0
+ TOP .req r1
+ BOTTOM .req r2
+ WT .req r3
+ WB .req r4
+ X .req r5
+ UX .req r6
+ WIDTH .req ip
+ TMP1 .req r3
+ TMP2 .req r4
+ PF_OFFS .req r7
+ TMP3 .req r8
+ TMP4 .req r9
+ STRIDE .req r2
+
+ mov ip, sp
+ push {r4, r5, r6, r7, r8, r9}
+ mov PF_OFFS, #prefetch_distance
+ ldmia ip, {WB, X, UX, WIDTH}
+.else
+ OUT .req r0
+ MASK .req r1
+ TOP .req r2
+ BOTTOM .req r3
+ WT .req r4
+ WB .req r5
+ X .req r6
+ UX .req r7
+ WIDTH .req ip
+ TMP1 .req r4
+ TMP2 .req r5
+ PF_OFFS .req r8
+ TMP3 .req r9
+ TMP4 .req r10
+ STRIDE .req r3
+
+ mov ip, sp
+ push {r4, r5, r6, r7, r8, r9, r10, ip}
+ mov PF_OFFS, #prefetch_distance
+ ldmia ip, {WT, WB, X, UX, WIDTH}
+.endif
+
+ mul PF_OFFS, PF_OFFS, UX
+
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+ vpush {d8-d15}
+.endif
+
+ sub STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
+ cmp WIDTH, #0
+ ble 3f
+
+ vdup.u16 q12, X
+ vdup.u16 q13, UX
+ vdup.u8 d28, WT
+ vdup.u8 d29, WB
+ vadd.u16 d25, d25, d26
+
+ /* ensure good destination alignment */
+ cmp WIDTH, #1
+ blt 0f
+ tst OUT, #(1 << dst_bpp_shift)
+ beq 0f
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ bilinear_process_last_pixel
+ sub WIDTH, WIDTH, #1
+0:
+ vadd.u16 q13, q13, q13
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+
+ cmp WIDTH, #2
+ blt 0f
+ tst OUT, #(1 << (dst_bpp_shift + 1))
+ beq 0f
+ bilinear_process_two_pixels
+ sub WIDTH, WIDTH, #2
+0:
+.if pixblock_size == 8
+ cmp WIDTH, #4
+ blt 0f
+ tst OUT, #(1 << (dst_bpp_shift + 2))
+ beq 0f
+ bilinear_process_four_pixels
+ sub WIDTH, WIDTH, #4
+0:
+.endif
+ subs WIDTH, WIDTH, #pixblock_size
+ blt 1f
+ mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+ bilinear_process_pixblock_head
+ subs WIDTH, WIDTH, #pixblock_size
+ blt 5f
+0:
+ bilinear_process_pixblock_tail_head
+ subs WIDTH, WIDTH, #pixblock_size
+ bge 0b
+5:
+ bilinear_process_pixblock_tail
+1:
+.if pixblock_size == 8
+ tst WIDTH, #4
+ beq 2f
+ bilinear_process_four_pixels
+2:
+.endif
+ /* handle the remaining trailing pixels */
+ tst WIDTH, #2
+ beq 2f
+ bilinear_process_two_pixels
+2:
+ tst WIDTH, #1
+ beq 3f
+ bilinear_process_last_pixel
+3:
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+ vpop {d8-d15}
+.endif
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+ pop {r4, r5, r6, r7, r8, r9}
+.else
+ pop {r4, r5, r6, r7, r8, r9, r10, ip}
+.endif
+ bx lr
+
+ .unreq OUT
+ .unreq TOP
+ .unreq WT
+ .unreq WB
+ .unreq X
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq TMP2
+ .unreq PF_OFFS
+ .unreq TMP3
+ .unreq TMP4
+ .unreq STRIDE
+.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
+ .unreq MASK
+.endif
+
+.endfunc
+
+.endm
--
1.6.6.1
@@ -1,520 +0,0 @@
From 8d0460c4f1b23f3a13e9ff7282b30dd06f10aee1 Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Fri, 23 Sep 2011 00:03:22 +0900
Subject: [PATCH 3/8] ARM: NEON: Replace old bilinear scanline generator with new template
Bilinear scanline functions in pixman-arm-neon-asm-bilinear.S can
be replaced with new template just by wrapping existing macros.
---
pixman/pixman-arm-neon-asm-bilinear.S | 484 ++++++++++++++++++++-------------
1 files changed, 292 insertions(+), 192 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 784e5df..25bcb24 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -582,198 +582,6 @@ fname:
bilinear_store_&dst_fmt 4, q2, q3
.endm
-.macro generate_bilinear_scanline_func_src_dst \
- fname, src_fmt, dst_fmt, op, \
- bpp_shift, prefetch_distance
-
-pixman_asm_function fname
- OUT .req r0
- TOP .req r1
- BOTTOM .req r2
- WT .req r3
- WB .req r4
- X .req r5
- UX .req r6
- WIDTH .req ip
- TMP1 .req r3
- TMP2 .req r4
- PF_OFFS .req r7
- TMP3 .req r8
- TMP4 .req r9
- STRIDE .req r2
-
- mov ip, sp
- push {r4, r5, r6, r7, r8, r9}
- mov PF_OFFS, #prefetch_distance
- ldmia ip, {WB, X, UX, WIDTH}
- mul PF_OFFS, PF_OFFS, UX
-
- .set prefetch_offset, prefetch_distance
-
- sub STRIDE, BOTTOM, TOP
- .unreq BOTTOM
-
- cmp WIDTH, #0
- ble 3f
-
- vdup.u16 q12, X
- vdup.u16 q13, UX
- vdup.u8 d28, WT
- vdup.u8 d29, WB
- vadd.u16 d25, d25, d26
- vadd.u16 q13, q13, q13
- vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
-
- subs WIDTH, WIDTH, #4
- blt 1f
- mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
-0:
- bilinear_interpolate_four_pixels src_fmt, x, dst_fmt, op
- subs WIDTH, WIDTH, #4
- bge 0b
-1:
- tst WIDTH, #2
- beq 2f
- bilinear_interpolate_two_pixels src_fmt, x, dst_fmt, op
-2:
- tst WIDTH, #1
- beq 3f
- bilinear_interpolate_last_pixel src_fmt, x, dst_fmt, op
-3:
- pop {r4, r5, r6, r7, r8, r9}
- bx lr
-
- .unreq OUT
- .unreq TOP
- .unreq WT
- .unreq WB
- .unreq X
- .unreq UX
- .unreq WIDTH
- .unreq TMP1
- .unreq TMP2
- .unreq PF_OFFS
- .unreq TMP3
- .unreq TMP4
- .unreq STRIDE
-.endfunc
-
-.endm
-
-.macro generate_bilinear_scanline_func_src_a8_dst \
- fname, src_fmt, dst_fmt, op, \
- bpp_shift, prefetch_distance
-
-pixman_asm_function fname
- OUT .req r0
- MASK .req r1
- TOP .req r2
- BOTTOM .req r3
- WT .req r4
- WB .req r5
- X .req r6
- UX .req r7
- WIDTH .req ip
- TMP1 .req r4
- TMP2 .req r5
- PF_OFFS .req r8
- TMP3 .req r9
- TMP4 .req r10
- STRIDE .req r3
-
- mov ip, sp
- push {r4, r5, r6, r7, r8, r9, r10, ip}
- mov PF_OFFS, #prefetch_distance
- ldmia ip, {WT, WB, X, UX, WIDTH}
- mul PF_OFFS, PF_OFFS, UX
-
- .set prefetch_offset, prefetch_distance
-
- sub STRIDE, BOTTOM, TOP
- .unreq BOTTOM
-
- cmp WIDTH, #0
- ble 3f
-
- vdup.u16 q12, X
- vdup.u16 q13, UX
- vdup.u8 d28, WT
- vdup.u8 d29, WB
- vadd.u16 d25, d25, d26
- vadd.u16 q13, q13, q13
- vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
-
- subs WIDTH, WIDTH, #4
- blt 1f
- mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
-0:
- bilinear_interpolate_four_pixels src_fmt, 8, dst_fmt, op
- subs WIDTH, WIDTH, #4
- bge 0b
-1:
- tst WIDTH, #2
- beq 2f
- bilinear_interpolate_two_pixels src_fmt, 8, dst_fmt, op
-2:
- tst WIDTH, #1
- beq 3f
- bilinear_interpolate_last_pixel src_fmt, 8, dst_fmt, op
-3:
- pop {r4, r5, r6, r7, r8, r9, r10, ip}
- bx lr
-
- .unreq OUT
- .unreq TOP
- .unreq WT
- .unreq WB
- .unreq X
- .unreq UX
- .unreq WIDTH
- .unreq MASK
- .unreq TMP1
- .unreq TMP2
- .unreq PF_OFFS
- .unreq TMP3
- .unreq TMP4
- .unreq STRIDE
-.endfunc
-
-.endm
-
-generate_bilinear_scanline_func_src_dst \
- pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
- 8888, 8888, over, 2, 28
-
-generate_bilinear_scanline_func_src_dst \
- pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
- 8888, 8888, add, 2, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
- 8888, 8888, src, 2, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
- 8888, 0565, src, 2, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
- 0565, 8888, src, 1, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
- 0565, 0565, src, 1, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
- 8888, 8888, over, 2, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
- 8888, 8888, add, 2, 28
-
.set BILINEAR_FLAG_USE_MASK, 1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
@@ -855,6 +663,8 @@ pixman_asm_function fname
TMP4 .req r10
STRIDE .req r3
+ .set prefetch_offset, prefetch_distance
+
mov ip, sp
push {r4, r5, r6, r7, r8, r9, r10, ip}
mov PF_OFFS, #prefetch_distance
@@ -968,3 +778,293 @@ pixman_asm_function fname
.endfunc
.endm
+
+/* src_8888_8_8888: every macro below forwards to a bilinear_interpolate_* template with arguments (8888, 8, 8888, src). Unscheduled form: head does a whole four-pixel block, tail is empty, tail_head is tail followed by head. */
+.macro bilinear_src_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_head
+ bilinear_src_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
+ bilinear_src_8888_8_8888_process_pixblock_tail
+ bilinear_src_8888_8_8888_process_pixblock_head
+.endm
+
+/* src_8888_8_0565: every macro below forwards to a bilinear_interpolate_* template with arguments (8888, 8, 0565, src). Unscheduled form: head does a whole four-pixel block, tail is empty, tail_head is tail followed by head. */
+.macro bilinear_src_8888_8_0565_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_head
+ bilinear_src_8888_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
+ bilinear_src_8888_8_0565_process_pixblock_tail
+ bilinear_src_8888_8_0565_process_pixblock_head
+.endm
+
+/* src_0565_8_x888: every macro below forwards to a bilinear_interpolate_* template with arguments (0565, 8, 8888, src). Unscheduled form: head does a whole four-pixel block, tail is empty, tail_head is tail followed by head. */
+.macro bilinear_src_0565_8_x888_process_last_pixel
+ bilinear_interpolate_last_pixel 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_two_pixels
+ bilinear_interpolate_two_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_four_pixels
+ bilinear_interpolate_four_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_head
+ bilinear_src_0565_8_x888_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
+ bilinear_src_0565_8_x888_process_pixblock_tail
+ bilinear_src_0565_8_x888_process_pixblock_head
+.endm
+
+/* src_0565_8_0565: every macro below forwards to a bilinear_interpolate_* template with arguments (0565, 8, 0565, src). Unscheduled form: head does a whole four-pixel block, tail is empty, tail_head is tail followed by head. */
+.macro bilinear_src_0565_8_0565_process_last_pixel
+ bilinear_interpolate_last_pixel 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_two_pixels
+ bilinear_interpolate_two_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_four_pixels
+ bilinear_interpolate_four_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_head
+ bilinear_src_0565_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
+ bilinear_src_0565_8_0565_process_pixblock_tail
+ bilinear_src_0565_8_0565_process_pixblock_head
+.endm
+
+/* over_8888_8888: every macro below forwards to a bilinear_interpolate_* template with arguments (8888, x, 8888, over); the second argument is x here where the *_8_* variants pass 8. Unscheduled form: head does a whole four-pixel block, tail is empty, tail_head is tail followed by head. */
+.macro bilinear_over_8888_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_head
+ bilinear_over_8888_8888_process_four_pixels
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_tail_head
+ bilinear_over_8888_8888_process_pixblock_tail
+ bilinear_over_8888_8888_process_pixblock_head
+.endm
+
+/* over_8888_8_8888: every macro below forwards to a bilinear_interpolate_* template with arguments (8888, 8, 8888, over). Unscheduled form: head does a whole four-pixel block, tail is empty, tail_head is tail followed by head. */
+.macro bilinear_over_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_head
+ bilinear_over_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
+ bilinear_over_8888_8_8888_process_pixblock_tail
+ bilinear_over_8888_8_8888_process_pixblock_head
+.endm
+
+/* add_8888_8888: every macro below forwards to a bilinear_interpolate_* template with arguments (8888, x, 8888, add); the second argument is x here where the *_8_* variants pass 8. Unscheduled form: head does a whole four-pixel block, tail is empty, tail_head is tail followed by head. */
+.macro bilinear_add_8888_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_head
+ bilinear_add_8888_8888_process_four_pixels
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_tail_head
+ bilinear_add_8888_8888_process_pixblock_tail
+ bilinear_add_8888_8888_process_pixblock_head
+.endm
+
+/* add_8888_8_8888: every macro below forwards to a bilinear_interpolate_* template with arguments (8888, 8, 8888, add). Unscheduled form: head does a whole four-pixel block, tail is empty, tail_head is tail followed by head. */
+.macro bilinear_add_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_head
+ bilinear_add_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
+ bilinear_add_8888_8_8888_process_pixblock_tail
+ bilinear_add_8888_8_8888_process_pixblock_head
+.endm
+
+
+/* Bilinear scanline functions: each invocation passes the function name, two format tokens with two small integers, the six per-variant process_* macros, then 4, 28 and a flags value (BILINEAR_FLAG_USE_MASK for the *_8_* names, 0 otherwise). NOTE(review): the integers look like src/dst bpp shifts, pixblock size and prefetch distance - confirm against generate_bilinear_scanline_func. */
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_src_8888_8_8888_process_last_pixel, \
+ bilinear_src_8888_8_8888_process_two_pixels, \
+ bilinear_src_8888_8_8888_process_four_pixels, \
+ bilinear_src_8888_8_8888_process_pixblock_head, \
+ bilinear_src_8888_8_8888_process_pixblock_tail, \
+ bilinear_src_8888_8_8888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
+ 8888, 0565, 2, 1, \
+ bilinear_src_8888_8_0565_process_last_pixel, \
+ bilinear_src_8888_8_0565_process_two_pixels, \
+ bilinear_src_8888_8_0565_process_four_pixels, \
+ bilinear_src_8888_8_0565_process_pixblock_head, \
+ bilinear_src_8888_8_0565_process_pixblock_tail, \
+ bilinear_src_8888_8_0565_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
+ 0565, 8888, 1, 2, \
+ bilinear_src_0565_8_x888_process_last_pixel, \
+ bilinear_src_0565_8_x888_process_two_pixels, \
+ bilinear_src_0565_8_x888_process_four_pixels, \
+ bilinear_src_0565_8_x888_process_pixblock_head, \
+ bilinear_src_0565_8_x888_process_pixblock_tail, \
+ bilinear_src_0565_8_x888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
+ 0565, 0565, 1, 1, \
+ bilinear_src_0565_8_0565_process_last_pixel, \
+ bilinear_src_0565_8_0565_process_two_pixels, \
+ bilinear_src_0565_8_0565_process_four_pixels, \
+ bilinear_src_0565_8_0565_process_pixblock_head, \
+ bilinear_src_0565_8_0565_process_pixblock_tail, \
+ bilinear_src_0565_8_0565_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_over_8888_8888_process_last_pixel, \
+ bilinear_over_8888_8888_process_two_pixels, \
+ bilinear_over_8888_8888_process_four_pixels, \
+ bilinear_over_8888_8888_process_pixblock_head, \
+ bilinear_over_8888_8888_process_pixblock_tail, \
+ bilinear_over_8888_8888_process_pixblock_tail_head, \
+ 4, 28, 0
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_over_8888_8_8888_process_last_pixel, \
+ bilinear_over_8888_8_8888_process_two_pixels, \
+ bilinear_over_8888_8_8888_process_four_pixels, \
+ bilinear_over_8888_8_8888_process_pixblock_head, \
+ bilinear_over_8888_8_8888_process_pixblock_tail, \
+ bilinear_over_8888_8_8888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_add_8888_8888_process_last_pixel, \
+ bilinear_add_8888_8888_process_two_pixels, \
+ bilinear_add_8888_8888_process_four_pixels, \
+ bilinear_add_8888_8888_process_pixblock_head, \
+ bilinear_add_8888_8888_process_pixblock_tail, \
+ bilinear_add_8888_8888_process_pixblock_tail_head, \
+ 4, 28, 0
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_add_8888_8_8888_process_last_pixel, \
+ bilinear_add_8888_8_8888_process_two_pixels, \
+ bilinear_add_8888_8_8888_process_four_pixels, \
+ bilinear_add_8888_8_8888_process_pixblock_head, \
+ bilinear_add_8888_8_8888_process_pixblock_tail, \
+ bilinear_add_8888_8_8888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
--
1.6.6.1
@@ -1,186 +0,0 @@
From b9009d108277b42ebb4c0ea03eb3fb5845106497 Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Wed, 21 Sep 2011 15:52:13 +0900
Subject: [PATCH 4/8] ARM: NEON: Instruction scheduling of bilinear over_8888_8888
Instructions are reordered to eliminate pipeline stalls and get
better memory access.
Performance of before/after on cortex-a8 @ 1GHz
<< 2000 x 2000 with scale factor close to 1.x >>
before : 50.43 Mpix/s
after : 61.09 Mpix/s
---
pixman/pixman-arm-neon-asm-bilinear.S | 149 ++++++++++++++++++++++++++++++++-
1 files changed, 146 insertions(+), 3 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 25bcb24..82d248e 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -893,15 +893,158 @@ pixman_asm_function fname
.endm
.macro bilinear_over_8888_8888_process_pixblock_head
- bilinear_over_8888_8888_process_four_pixels
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+
+ vld1.32 {d22}, [TMP1], STRIDE
+ vld1.32 {d23}, [TMP1]
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ vmull.u8 q8, d22, d28
+ vmlal.u8 q8, d23, d29
+
+ vld1.32 {d22}, [TMP2], STRIDE
+ vld1.32 {d23}, [TMP2]
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmull.u8 q9, d22, d28
+ vmlal.u8 q9, d23, d29
+
+ vld1.32 {d22}, [TMP3], STRIDE
+ vld1.32 {d23}, [TMP3]
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+
+ vshll.u16 q0, d16, #8
+ vmlsl.u16 q0, d16, d30
+ vmlal.u16 q0, d17, d30
+
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+
+ vshll.u16 q1, d18, #8
+ vmlsl.u16 q1, d18, d31
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
.endm
.macro bilinear_over_8888_8888_process_pixblock_tail
+ vshll.u16 q2, d20, #8
+ vmlsl.u16 q2, d20, d30
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #8
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2, d3}, [OUT, :128]
+ pld [OUT, PF_OFFS]
+ vshrn.u32 d4, q2, #16
+ vshr.u16 q15, q12, #8
+ vshrn.u32 d5, q3, #16
+ vmovn.u16 d6, q0
+ vmovn.u16 d7, q2
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vdup.32 d4, d7[1]
+ vmvn.8 d4, d4
+ vmull.u8 q11, d2, d4
+ vmull.u8 q2, d3, d4
+ vrshr.u16 q1, q11, #8
+ vrshr.u16 q10, q2, #8
+ vraddhn.u16 d2, q1, q11
+ vraddhn.u16 d3, q10, q2
+ vqadd.u8 q3, q1, q3
+ vuzp.8 d6, d7
+ vuzp.8 d6, d7
+ vadd.u16 q12, q12, q13
+ vst1.32 {d6, d7}, [OUT, :128]!
.endm
.macro bilinear_over_8888_8888_process_pixblock_tail_head
- bilinear_over_8888_8888_process_pixblock_tail
- bilinear_over_8888_8888_process_pixblock_head
+ vshll.u16 q2, d20, #8
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ vmlsl.u16 q2, d20, d30
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #8
+ vld1.32 {d20}, [TMP1], STRIDE
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vld1.32 {d21}, [TMP1]
+ vmull.u8 q8, d20, d28
+ vmlal.u8 q8, d21, d29
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2, d3}, [OUT, :128]
+ pld [OUT, PF_OFFS]
+ vshrn.u32 d4, q2, #16
+ vshr.u16 q15, q12, #8
+ vld1.32 {d22}, [TMP2], STRIDE
+ vshrn.u32 d5, q3, #16
+ vmovn.u16 d6, q0
+ vld1.32 {d23}, [TMP2]
+ vmull.u8 q9, d22, d28
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmlal.u8 q9, d23, d29
+ vmovn.u16 d7, q2
+ vld1.32 {d22}, [TMP3], STRIDE
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vdup.32 d4, d7[1]
+ vld1.32 {d23}, [TMP3]
+ vmvn.8 d4, d4
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+ vmull.u8 q11, d2, d4
+ vmull.u8 q2, d3, d4
+ vshll.u16 q0, d16, #8
+ vmlsl.u16 q0, d16, d30
+ vrshr.u16 q1, q11, #8
+ vmlal.u16 q0, d17, d30
+ vrshr.u16 q8, q2, #8
+ vraddhn.u16 d2, q1, q11
+ vraddhn.u16 d3, q8, q2
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vqadd.u8 q3, q1, q3
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+ vuzp.8 d6, d7
+ vshll.u16 q1, d18, #8
+ vuzp.8 d6, d7
+ vmlsl.u16 q1, d18, d31
+ vadd.u16 q12, q12, q13
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vst1.32 {d6, d7}, [OUT, :128]!
.endm
/* over_8888_8_8888 */
--
1.6.6.1
@@ -1,206 +0,0 @@
From c98ce663e2a5dd1e65013053f461c3aac9a3922e Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Mon, 26 Sep 2011 19:04:53 +0900
Subject: [PATCH 5/8] ARM: NEON: Instruction scheduling of bilinear over_8888_8_8888
Instructions are reordered to eliminate pipeline stalls and get
better memory access.
Performance of before/after on cortex-a8 @ 1GHz
<< 2000 x 2000 with scale factor close to 1.x >>
before : 40.53 Mpix/s
after : 50.76 Mpix/s
---
pixman/pixman-arm-neon-asm-bilinear.S | 162 ++++++++++++++++++++++++++++++++-
1 files changed, 158 insertions(+), 4 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 82d248e..f7913ad 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -949,7 +949,7 @@ pixman_asm_function fname
vshrn.u32 d0, q0, #16
vshrn.u32 d1, q1, #16
vld1.32 {d2, d3}, [OUT, :128]
- pld [OUT, PF_OFFS]
+ pld [OUT, #(prefetch_offset * 4)]
vshrn.u32 d4, q2, #16
vshr.u16 q15, q12, #8
vshrn.u32 d5, q3, #16
@@ -1061,15 +1061,169 @@ pixman_asm_function fname
.endm
.macro bilinear_over_8888_8_8888_process_pixblock_head
- bilinear_over_8888_8_8888_process_four_pixels
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ vld1.32 {d0}, [TMP1], STRIDE
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vld1.32 {d1}, [TMP1]
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ vld1.32 {d2}, [TMP2], STRIDE
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vld1.32 {d3}, [TMP2]
+ vmull.u8 q2, d0, d28
+ vmull.u8 q3, d2, d28
+ vmlal.u8 q2, d1, d29
+ vmlal.u8 q3, d3, d29
+ vshll.u16 q0, d4, #8
+ vshll.u16 q1, d6, #8
+ vmlsl.u16 q0, d4, d30
+ vmlsl.u16 q1, d6, d31
+ vmlal.u16 q0, d5, d30
+ vmlal.u16 q1, d7, d31
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2}, [TMP3], STRIDE
+ vld1.32 {d3}, [TMP3]
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d4}, [TMP4], STRIDE
+ vld1.32 {d5}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q3, d2, d28
+ vmlal.u8 q3, d3, d29
+ vmull.u8 q1, d4, d28
+ vmlal.u8 q1, d5, d29
+ vshr.u16 q15, q12, #8
+ vld1.32 {d22[0]}, [MASK]!
+ pld [MASK, #prefetch_offset]
+ vadd.u16 q12, q12, q13
+ vmovn.u16 d16, q0
.endm
.macro bilinear_over_8888_8_8888_process_pixblock_tail
+ vshll.u16 q9, d6, #8
+ vshll.u16 q10, d2, #8
+ vmlsl.u16 q9, d6, d30
+ vmlsl.u16 q10, d2, d31
+ vmlal.u16 q9, d7, d30
+ vmlal.u16 q10, d3, d31
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vdup.32 d22, d22[0]
+ vshrn.u32 d18, q9, #16
+ vshrn.u32 d19, q10, #16
+ vmovn.u16 d17, q9
+ vld1.32 {d18, d19}, [OUT, :128]
+ pld [OUT, PF_OFFS]
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vmull.u8 q10, d16, d22
+ vmull.u8 q11, d17, d22
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+ vrshrn.u16 d16, q10, #8
+ vrshrn.u16 d17, q11, #8
+ vdup.32 d22, d17[1]
+ vmvn.8 d22, d22
+ vmull.u8 q10, d18, d22
+ vmull.u8 q11, d19, d22
+ vrshr.u16 q9, q10, #8
+ vrshr.u16 q0, q11, #8
+ vraddhn.u16 d18, q9, q10
+ vraddhn.u16 d19, q0, q11
+ vqadd.u8 q9, q8, q9
+ vuzp.8 d18, d19
+ vuzp.8 d18, d19
+ vst1.32 {d18, d19}, [OUT, :128]!
.endm
.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
- bilinear_over_8888_8_8888_process_pixblock_tail
- bilinear_over_8888_8_8888_process_pixblock_head
+ vshll.u16 q9, d6, #8
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ vshll.u16 q10, d2, #8
+ vld1.32 {d0}, [TMP1], STRIDE
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vmlsl.u16 q9, d6, d30
+ vmlsl.u16 q10, d2, d31
+ vld1.32 {d1}, [TMP1]
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ vmlal.u16 q9, d7, d30
+ vmlal.u16 q10, d3, d31
+ vld1.32 {d2}, [TMP2], STRIDE
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vld1.32 {d3}, [TMP2]
+ vdup.32 d22, d22[0]
+ vshrn.u32 d18, q9, #16
+ vshrn.u32 d19, q10, #16
+ vmull.u8 q2, d0, d28
+ vmull.u8 q3, d2, d28
+ vmovn.u16 d17, q9
+ vld1.32 {d18, d19}, [OUT, :128]
+ pld [OUT, #(prefetch_offset * 4)]
+ vmlal.u8 q2, d1, d29
+ vmlal.u8 q3, d3, d29
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vshll.u16 q0, d4, #8
+ vshll.u16 q1, d6, #8
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vmlsl.u16 q0, d4, d30
+ vmlsl.u16 q1, d6, d31
+ vmull.u8 q10, d16, d22
+ vmull.u8 q11, d17, d22
+ vmlal.u16 q0, d5, d30
+ vmlal.u16 q1, d7, d31
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vrshrn.u16 d16, q10, #8
+ vrshrn.u16 d17, q11, #8
+ vld1.32 {d2}, [TMP3], STRIDE
+ vdup.32 d22, d17[1]
+ vld1.32 {d3}, [TMP3]
+ vmvn.8 d22, d22
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d4}, [TMP4], STRIDE
+ vmull.u8 q10, d18, d22
+ vmull.u8 q11, d19, d22
+ vld1.32 {d5}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q3, d2, d28
+ vrshr.u16 q9, q10, #8
+ vrshr.u16 q15, q11, #8
+ vmlal.u8 q3, d3, d29
+ vmull.u8 q1, d4, d28
+ vraddhn.u16 d18, q9, q10
+ vraddhn.u16 d19, q15, q11
+ vmlal.u8 q1, d5, d29
+ vshr.u16 q15, q12, #8
+ vqadd.u8 q9, q8, q9
+ vld1.32 {d22[0]}, [MASK]!
+ vuzp.8 d18, d19
+ vadd.u16 q12, q12, q13
+ vuzp.8 d18, d19
+ vmovn.u16 d16, q0
+ vst1.32 {d18, d19}, [OUT, :128]!
.endm
/* add_8888_8888 */
--
1.6.6.1
@@ -1,129 +0,0 @@
From 2851a24d4562437cfb333568fcab1ce9861033a8 Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Mon, 26 Sep 2011 17:03:54 +0900
Subject: [PATCH 6/8] ARM: NEON: Standard fast path src_n_8_8888
Performance numbers of before/after on cortex-a8 @ 1GHz
- before
L1: 32.39 L2: 31.79 M: 30.84 ( 13.77%) HT: 21.58 VT: 19.75 R: 18.83 RT: 10.46 ( 106Kops/s)
- after
L1: 516.25 L2: 372.00 M:193.49 ( 85.59%) HT:136.93 VT:109.10 R:104.48 RT: 34.77 ( 253Kops/s)
---
pixman/pixman-arm-neon-asm.S | 73 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-neon.c | 7 ++++
2 files changed, 80 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 3fcd07d..1db02db 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1219,6 +1219,79 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_src_n_8_8888_process_pixblock_head
+ /* expecting solid source in {d0, d1, d2, d3} */
+ /* mask is in d24 (d25, d26, d27 are unused) */
+
+ /* in: multiply every solid-source register by the mask, then start the rounded divide by 255 */
+ vmull.u8 q8, d24, d0 /* q8 = d24 * d0, widened to 16 bits per lane */
+ vmull.u8 q9, d24, d1 /* q9 = d24 * d1 */
+ vmull.u8 q10, d24, d2 /* q10 = d24 * d2 */
+ vmull.u8 q11, d24, d3 /* q11 = d24 * d3 */
+ vrsra.u16 q8, q8, #8 /* q8 += (q8 + 128) >> 8; the final rounding narrow is left to the tail macro */
+ vrsra.u16 q9, q9, #8 /* same step for q9 */
+ vrsra.u16 q10, q10, #8 /* same step for q10 */
+ vrsra.u16 q11, q11, #8 /* same step for q11 */
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail /* finish the divide started in the head: round, shift right by 8 and narrow to 8 bits */
+ vrshrn.u16 d28, q8, #8 /* d28 = (q8 + 128) >> 8, narrowed */
+ vrshrn.u16 d29, q9, #8 /* d29 from q9 */
+ vrshrn.u16 d30, q10, #8 /* d30 from q10 */
+ vrshrn.u16 d31, q11, #8 /* d31 from q11 */
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head /* tail of the previous block interleaved with head of the next one and with PF-prefixed lines */
+ fetch_mask_pixblock /* NOTE(review): presumably loads the next block's mask into d24 - helper is defined outside this hunk */
+ PF add PF_X, PF_X, #8
+ vrshrn.u16 d28, q8, #8 /* tail work: same narrowing as in ..._process_pixblock_tail */
+ PF tst PF_CTL, #0x0F
+ vrshrn.u16 d29, q9, #8
+ PF addne PF_X, PF_X, #8
+ vrshrn.u16 d30, q10, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vrshrn.u16 d31, q11, #8
+ PF cmp PF_X, ORIG_W
+ vmull.u8 q8, d24, d0 /* head work for the next block: same multiplies as in ..._process_pixblock_head */
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] /* preload hint at PF_MASK + (PF_X << mask_bpp_shift) */
+ vmull.u8 q9, d24, d1
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q10, d24, d2
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q11, d24, d3
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! /* 4-way interleaved store of the finished block; DST_W is post-incremented */
+ vrsra.u16 q8, q8, #8 /* head work: same rounding accumulate as in ..._process_pixblock_head */
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_init /* load one 32-bit stack argument and spread its four bytes over d0-d3 */
+ add DUMMY, sp, #ARGS_STACK_OFFSET /* DUMMY = sp + ARGS_STACK_OFFSET */
+ vld1.32 {d3[0]}, [DUMMY] /* load a 32-bit word into lane 0 of d3 (presumably the solid source colour) */
+ vdup.8 d0, d3[0] /* d0 = byte 0 of that word in every lane */
+ vdup.8 d1, d3[1] /* d1 = byte 1 in every lane */
+ vdup.8 d2, d3[2] /* d2 = byte 2 in every lane */
+ vdup.8 d3, d3[3] /* d3 = byte 3 in every lane; done last because d3 is also the source register */
+.endm
+
+.macro pixman_composite_src_n_8_8888_cleanup /* empty body: this variant emits no cleanup code */
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, /* NOTE(review): 0, 8, 32 look like src/mask/dst bits per pixel - confirm against generate_composite_function */ \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8888_init, \
+ pixman_composite_src_n_8_8888_cleanup, \
+ pixman_composite_src_n_8_8888_process_pixblock_head, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail_head, /* NOTE(review): trailing comma and continuation run into the following blank line - confirm this is intended */ \
+
+/******************************************************************************/
+
.macro pixman_composite_over_n_8_8888_process_pixblock_head
/* expecting deinterleaved source data in {d8, d9, d10, d11} */
/* d8 - blue, d9 - green, d10 - red, d11 - alpha */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index effb50b..3db9adf 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -90,6 +90,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
@@ -289,6 +291,11 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8b8g8r8, neon_composite_src_rpixbuf_8888),
PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8r8g8b8, neon_composite_src_rpixbuf_8888),
PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8b8g8r8, neon_composite_src_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888),
+
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, neon_composite_over_n_8_0565),
--
1.6.6.1
@@ -1,118 +0,0 @@
From 34ce640914e06f2e23a0a93a3a49ec0bfff7497b Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Mon, 26 Sep 2011 18:33:27 +0900
Subject: [PATCH 7/8] ARM: NEON: Standard fast path src_n_8_8
Performance numbers of before/after on cortex-a8 @ 1GHz
- before
L1: 28.05 L2: 28.26 M: 26.97 ( 4.48%) HT: 19.79 VT: 19.14 R: 17.61 RT: 9.88 ( 101Kops/s)
- after
L1:1430.28 L2:1252.10 M:421.93 ( 75.48%) HT:170.16 VT:138.03 R:145.86 RT: 35.51 ( 255Kops/s)
---
pixman/pixman-arm-neon-asm.S | 66 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-neon.c | 3 ++
2 files changed, 69 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 1db02db..da8f054 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1292,6 +1292,72 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_src_n_8_8_process_pixblock_head /* multiply d24-d27 by d16 (filled by the init macro) and start the rounded divide by 255 */
+ vmull.u8 q0, d24, d16 /* q0 = d24 * d16, widened to 16 bits per lane */
+ vmull.u8 q1, d25, d16 /* q1 = d25 * d16 */
+ vmull.u8 q2, d26, d16 /* q2 = d26 * d16 */
+ vmull.u8 q3, d27, d16 /* q3 = d27 * d16 */
+ vrsra.u16 q0, q0, #8 /* q0 += (q0 + 128) >> 8; the final rounding narrow is left to the tail macro */
+ vrsra.u16 q1, q1, #8 /* same step for q1 */
+ vrsra.u16 q2, q2, #8 /* same step for q2 */
+ vrsra.u16 q3, q3, #8 /* same step for q3 */
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail /* finish the divide started in the head: round, shift right by 8 and narrow to 8 bits */
+ vrshrn.u16 d28, q0, #8 /* d28 = (q0 + 128) >> 8, narrowed */
+ vrshrn.u16 d29, q1, #8 /* d29 from q1 */
+ vrshrn.u16 d30, q2, #8 /* d30 from q2 */
+ vrshrn.u16 d31, q3, #8 /* d31 from q3 */
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail_head /* tail of the previous block interleaved with head of the next one and with PF-prefixed lines */
+ fetch_mask_pixblock /* NOTE(review): presumably loads the next block's mask into d24-d27 - helper is defined outside this hunk */
+ PF add PF_X, PF_X, #8
+ vrshrn.u16 d28, q0, #8 /* tail work: same narrowing as in ..._process_pixblock_tail */
+ PF tst PF_CTL, #0x0F
+ vrshrn.u16 d29, q1, #8
+ PF addne PF_X, PF_X, #8
+ vrshrn.u16 d30, q2, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vrshrn.u16 d31, q3, #8
+ PF cmp PF_X, ORIG_W
+ vmull.u8 q0, d24, d16 /* head work for the next block: same multiplies as in ..._process_pixblock_head */
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] /* preload hint at PF_MASK + (PF_X << mask_bpp_shift) */
+ vmull.u8 q1, d25, d16
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q2, d26, d16
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q3, d27, d16
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store the finished block from d28-d31; DST_W is post-incremented */
+ vrsra.u16 q0, q0, #8 /* head work: same rounding accumulate as in ..._process_pixblock_head */
+ vrsra.u16 q1, q1, #8
+ vrsra.u16 q2, q2, #8
+ vrsra.u16 q3, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_init /* load one 32-bit stack argument and broadcast its byte 3 into d16 */
+ add DUMMY, sp, #ARGS_STACK_OFFSET /* DUMMY = sp + ARGS_STACK_OFFSET */
+ vld1.32 {d16[0]}, [DUMMY] /* load a 32-bit word into lane 0 of d16 (presumably the solid source colour) */
+ vdup.8 d16, d16[3] /* d16 = byte 3 of that word in every lane (presumably the alpha byte - confirm) */
+.endm
+
+.macro pixman_composite_src_n_8_8_cleanup /* empty body: this variant emits no cleanup code */
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, /* NOTE(review): 0, 8, 8 look like src/mask/dst bits per pixel - confirm against generate_composite_function */ \
+ FLAG_DST_WRITEONLY, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8_init, \
+ pixman_composite_src_n_8_8_cleanup, \
+ pixman_composite_src_n_8_8_process_pixblock_head, \
+ pixman_composite_src_n_8_8_process_pixblock_tail, \
+ pixman_composite_src_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
.macro pixman_composite_over_n_8_8888_process_pixblock_head
/* expecting deinterleaved source data in {d8, d9, d10, d11} */
/* d8 - blue, d9 - green, d10 - red, d11 - alpha */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 3db9adf..ca139de 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -92,6 +92,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8,
+ uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
@@ -295,6 +297,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, neon_composite_src_n_8_8888),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8, neon_composite_src_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
--
1.6.6.1
@@ -1,31 +0,0 @@
require pixman.inc
# Some artefacts were observed in WebKit scrolling; it still needs to be checked whether this is a regression.
DEFAULT_PREFERENCE = "-1"
LICENSE = "MIT & MIT-style & Public Domain"
LIC_FILES_CHKSUM = "file://COPYING;md5=14096c769ae0cbb5fcb94ec468be11b3\
file://pixman/pixman-matrix.c;endline=25;md5=ba6e8769bfaaee2c41698755af04c4be \
file://pixman/pixman-arm-neon-asm.h;endline=24;md5=9a9cc1e51abbf1da58f4d9528ec9d49b \
"
PR = "${INC_PR}.3"
SRC_URI = "http://xorg.freedesktop.org/archive/individual/lib/${BPN}-${PV}.tar.gz \
file://0001-ARM-NEON-Some-cleanup-of-bilinear-scanline-functions.patch \
file://0002-ARM-NEON-Bilinear-macro-template-for-instruction-sch.patch \
file://0003-ARM-NEON-Replace-old-bilinear-scanline-generator-wit.patch \
file://0004-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch \
file://0005-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch \
file://0006-ARM-NEON-Standard-fast-path-src_n_8_8888.patch \
file://0007-ARM-NEON-Standard-fast-path-src_n_8_8.patch \
file://0008-Generic-C-implementation-of-pixman_blt-with-overlapp.patch \
"
SRC_URI[md5sum] = "27eb7a0ec440c89cccd7c396c3581041"
SRC_URI[sha256sum] = "4e35f49474e78a9430d93caaaea8bbf7e30b65f0da33c31f15a988c25a3ac369"
NEON = " --disable-arm-neon "
NEON_armv7a = " "
EXTRA_OECONF = "${NEON} --disable-gtk"
@@ -9,14 +9,11 @@ LIC_FILES_CHKSUM = "file://COPYING;md5=14096c769ae0cbb5fcb94ec468be11b3\
PR = "${INC_PR}.0"
SRC_URI = "http://xorg.freedesktop.org/archive/individual/lib/${BPN}-${PV}.tar.gz \
file://0001-Generic-C-implementation-of-pixman_blt-with-overlapp.patch \
file://0002-Support-of-overlapping-src-dst-for-pixman_blt_mmx.patch \
file://0003-Support-of-overlapping-src-dst-for-pixman_blt_sse2.patch \
file://0004-Support-of-overlapping-src-dst-for-pixman_blt_neon.patch \
"
file://0008-Generic-C-implementation-of-pixman_blt-with-overlapp.patch \
"
SRC_URI[md5sum] = "3dd0d9ed05dbf0e5e75d526ebae42e11"
SRC_URI[sha256sum] = "51f4f26be030e476a1b481a8f76e6695b45d1dce084beae5251236c3bb2a1f89"
SRC_URI[md5sum] = "a2d0b120509bdccb10aa7f4bec3730e4"
SRC_URI[sha256sum] = "a5647c7158f103eedff5fba799018f4169f6b26b573ab7685812ebc9a1c5d2e4"
NEON = " --disable-arm-neon "
NEON_armv7a = " "