pixman 0.23.6: add initial version with NEON bilinear patches

The overlappet blit patches have been reduced to the generic C version

Signed-off-by: Koen Kooi <koen@dominion.thruhere.net>
This commit is contained in:
Koen Kooi
2011-10-11 18:47:55 +02:00
parent 8fcf92fed8
commit 6d792e9051
9 changed files with 1868 additions and 0 deletions
@@ -0,0 +1,129 @@
From f7d1d45e30b59b513d48294de50dc86af60ea68c Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Mon, 26 Sep 2011 17:03:54 +0900
Subject: [PATCH 1/8] ARM: NEON: Standard fast path src_n_8_8888
Performance numbers of before/after on cortex-a8 @ 1GHz
- before
L1: 32.39 L2: 31.79 M: 30.84 ( 13.77%) HT: 21.58 VT: 19.75 R: 18.83 RT: 10.46 ( 106Kops/s)
- after
L1: 516.25 L2: 372.00 M:193.49 ( 85.59%) HT:136.93 VT:109.10 R:104.48 RT: 34.77 ( 253Kops/s)
---
pixman/pixman-arm-neon-asm.S | 73 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-neon.c | 7 ++++
2 files changed, 80 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 3fcd07d..1db02db 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1219,6 +1219,79 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_src_n_8_8888_process_pixblock_head
+ /* expecting solid source in {d0, d1, d2, d3} */
+ /* mask is in d24 (d25, d26, d27 are unused) */
+
+ /* in */
+ vmull.u8 q8, d24, d0
+ vmull.u8 q9, d24, d1
+ vmull.u8 q10, d24, d2
+ vmull.u8 q11, d24, d3
+ vrsra.u16 q8, q8, #8
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail
+ vrshrn.u16 d28, q8, #8
+ vrshrn.u16 d29, q9, #8
+ vrshrn.u16 d30, q10, #8
+ vrshrn.u16 d31, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
+ fetch_mask_pixblock
+ PF add PF_X, PF_X, #8
+ vrshrn.u16 d28, q8, #8
+ PF tst PF_CTL, #0x0F
+ vrshrn.u16 d29, q9, #8
+ PF addne PF_X, PF_X, #8
+ vrshrn.u16 d30, q10, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vrshrn.u16 d31, q11, #8
+ PF cmp PF_X, ORIG_W
+ vmull.u8 q8, d24, d0
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ vmull.u8 q9, d24, d1
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q10, d24, d2
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q11, d24, d3
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vst4.8 {d28, d29, d30, d31}, [DST_W :128]!
+ vrsra.u16 q8, q8, #8
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[1]
+ vdup.8 d2, d3[2]
+ vdup.8 d3, d3[3]
+.endm
+
+.macro pixman_composite_src_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8888_init, \
+ pixman_composite_src_n_8_8888_cleanup, \
+ pixman_composite_src_n_8_8888_process_pixblock_head, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
+
+/******************************************************************************/
+
.macro pixman_composite_over_n_8_8888_process_pixblock_head
/* expecting deinterleaved source data in {d8, d9, d10, d11} */
/* d8 - blue, d9 - green, d10 - red, d11 - alpha */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index effb50b..3db9adf 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -90,6 +90,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
@@ -289,6 +291,11 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8b8g8r8, neon_composite_src_rpixbuf_8888),
PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8r8g8b8, neon_composite_src_rpixbuf_8888),
PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8b8g8r8, neon_composite_src_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888),
+
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, neon_composite_over_n_8_0565),
--
1.6.6.1
@@ -0,0 +1,118 @@
From fc92ad56c5218157a097f6ed0c06196be9f74906 Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Mon, 26 Sep 2011 18:33:27 +0900
Subject: [PATCH 2/8] ARM: NEON: Standard fast path src_n_8_8
Performance numbers of before/after on cortex-a8 @ 1GHz
- before
L1: 28.05 L2: 28.26 M: 26.97 ( 4.48%) HT: 19.79 VT: 19.14 R: 17.61 RT: 9.88 ( 101Kops/s)
- after
L1:1430.28 L2:1252.10 M:421.93 ( 75.48%) HT:170.16 VT:138.03 R:145.86 RT: 35.51 ( 255Kops/s)
---
pixman/pixman-arm-neon-asm.S | 66 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-neon.c | 3 ++
2 files changed, 69 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 1db02db..da8f054 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1292,6 +1292,72 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_src_n_8_8_process_pixblock_head
+ vmull.u8 q0, d24, d16
+ vmull.u8 q1, d25, d16
+ vmull.u8 q2, d26, d16
+ vmull.u8 q3, d27, d16
+ vrsra.u16 q0, q0, #8
+ vrsra.u16 q1, q1, #8
+ vrsra.u16 q2, q2, #8
+ vrsra.u16 q3, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail
+ vrshrn.u16 d28, q0, #8
+ vrshrn.u16 d29, q1, #8
+ vrshrn.u16 d30, q2, #8
+ vrshrn.u16 d31, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
+ fetch_mask_pixblock
+ PF add PF_X, PF_X, #8
+ vrshrn.u16 d28, q0, #8
+ PF tst PF_CTL, #0x0F
+ vrshrn.u16 d29, q1, #8
+ PF addne PF_X, PF_X, #8
+ vrshrn.u16 d30, q2, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vrshrn.u16 d31, q3, #8
+ PF cmp PF_X, ORIG_W
+ vmull.u8 q0, d24, d16
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ vmull.u8 q1, d25, d16
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q2, d26, d16
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q3, d27, d16
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vrsra.u16 q0, q0, #8
+ vrsra.u16 q1, q1, #8
+ vrsra.u16 q2, q2, #8
+ vrsra.u16 q3, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d16[0]}, [DUMMY]
+ vdup.8 d16, d16[3]
+.endm
+
+.macro pixman_composite_src_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_WRITEONLY, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8_init, \
+ pixman_composite_src_n_8_8_cleanup, \
+ pixman_composite_src_n_8_8_process_pixblock_head, \
+ pixman_composite_src_n_8_8_process_pixblock_tail, \
+ pixman_composite_src_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
.macro pixman_composite_over_n_8_8888_process_pixblock_head
/* expecting deinterleaved source data in {d8, d9, d10, d11} */
/* d8 - blue, d9 - green, d10 - red, d11 - alpha */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 3db9adf..ca139de 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -92,6 +92,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8,
+ uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
@@ -295,6 +297,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, neon_composite_src_n_8_8888),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8, neon_composite_src_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
--
1.6.6.1
@@ -0,0 +1,331 @@
From ed7580525054e6a543694088c561dee525b4ae28 Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Tue, 20 Sep 2011 19:46:25 +0900
Subject: [PATCH 3/8] ARM: NEON: Some cleanup of bilinear scanline functions
Use STRIDE and initial horizontal weight update is done before
entering interpolation loop. Cache preload for mask and dst.
---
pixman/pixman-arm-neon-asm-bilinear.S | 128 +++++++++++++++++----------------
1 files changed, 67 insertions(+), 61 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 3c7fe0f..c5ba929 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -44,10 +44,6 @@
* All temp registers can be used freely outside the code block.
* Assume that symbol(register .req) OUT and MASK are defined at caller of these macro blocks.
*
- * TODOs
- * Support 0565 pixel format
- * Optimization for two and last pixel cases
- *
* Remarks
* There can be lots of pipeline stalls inside code block and between code blocks.
* Further optimizations will be done by new macro templates using head/tail_head/tail scheme.
@@ -92,21 +88,19 @@ fname:
*/
.macro bilinear_load_8888 reg1, reg2, tmp
- mov TMP2, X, asr #16
+ mov TMP1, X, asr #16
add X, X, UX
- add TMP1, TOP, TMP2, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {reg1}, [TMP1]
- vld1.32 {reg2}, [TMP2]
+ add TMP1, TOP, TMP1, asl #2
+ vld1.32 {reg1}, [TMP1], STRIDE
+ vld1.32 {reg2}, [TMP1]
.endm
.macro bilinear_load_0565 reg1, reg2, tmp
- mov TMP2, X, asr #16
+ mov TMP1, X, asr #16
add X, X, UX
- add TMP1, TOP, TMP2, asl #1
- add TMP2, BOTTOM, TMP2, asl #1
- vld1.32 {reg2[0]}, [TMP1]
- vld1.32 {reg2[1]}, [TMP2]
+ add TMP1, TOP, TMP1, asl #1
+ vld1.32 {reg2[0]}, [TMP1], STRIDE
+ vld1.32 {reg2[1]}, [TMP1]
convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
.endm
@@ -134,18 +128,16 @@ fname:
.macro bilinear_load_and_vertical_interpolate_two_0565 \
acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
- mov TMP2, X, asr #16
+ mov TMP1, X, asr #16
add X, X, UX
- mov TMP4, X, asr #16
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
add X, X, UX
- add TMP1, TOP, TMP2, asl #1
- add TMP2, BOTTOM, TMP2, asl #1
- add TMP3, TOP, TMP4, asl #1
- add TMP4, BOTTOM, TMP4, asl #1
- vld1.32 {acc2lo[0]}, [TMP1]
- vld1.32 {acc2hi[0]}, [TMP3]
- vld1.32 {acc2lo[1]}, [TMP2]
- vld1.32 {acc2hi[1]}, [TMP4]
+ add TMP2, TOP, TMP2, asl #1
+ vld1.32 {acc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {acc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {acc2lo[1]}, [TMP1]
+ vld1.32 {acc2hi[1]}, [TMP2]
convert_0565_to_x888 acc2, reg3, reg2, reg1
vzip.u8 reg1, reg3
vzip.u8 reg2, reg4
@@ -161,34 +153,30 @@ fname:
xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
- mov TMP2, X, asr #16
+ mov TMP1, X, asr #16
add X, X, UX
- mov TMP4, X, asr #16
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
add X, X, UX
- add TMP1, TOP, TMP2, asl #1
- add TMP2, BOTTOM, TMP2, asl #1
- add TMP3, TOP, TMP4, asl #1
- add TMP4, BOTTOM, TMP4, asl #1
- vld1.32 {xacc2lo[0]}, [TMP1]
- vld1.32 {xacc2hi[0]}, [TMP3]
- vld1.32 {xacc2lo[1]}, [TMP2]
- vld1.32 {xacc2hi[1]}, [TMP4]
+ add TMP2, TOP, TMP2, asl #1
+ vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {xacc2lo[1]}, [TMP1]
+ vld1.32 {xacc2hi[1]}, [TMP2]
convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
- mov TMP2, X, asr #16
+ mov TMP1, X, asr #16
add X, X, UX
- mov TMP4, X, asr #16
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
add X, X, UX
- add TMP1, TOP, TMP2, asl #1
- add TMP2, BOTTOM, TMP2, asl #1
- add TMP3, TOP, TMP4, asl #1
- add TMP4, BOTTOM, TMP4, asl #1
- vld1.32 {yacc2lo[0]}, [TMP1]
+ add TMP2, TOP, TMP2, asl #1
+ vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
vzip.u8 xreg1, xreg3
- vld1.32 {yacc2hi[0]}, [TMP3]
+ vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
vzip.u8 xreg2, xreg4
- vld1.32 {yacc2lo[1]}, [TMP2]
+ vld1.32 {yacc2lo[1]}, [TMP1]
vzip.u8 xreg3, xreg4
- vld1.32 {yacc2hi[1]}, [TMP4]
+ vld1.32 {yacc2hi[1]}, [TMP2]
vzip.u8 xreg1, xreg2
convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
vmull.u8 xacc1, xreg1, d28
@@ -252,6 +240,7 @@ fname:
.else
.error bilinear_load_mask_8 numpix is unsupported
.endif
+ pld [MASK, #prefetch_offset]
.endm
.macro bilinear_load_mask mask_fmt, numpix, mask
@@ -279,6 +268,7 @@ fname:
.else
.error bilinear_load_dst_8888 numpix is unsupported
.endif
+ pld [OUT, #(prefetch_offset * 4)]
.endm
.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
@@ -303,7 +293,7 @@ fname:
* For two pixel case
* (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
* (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
- * We can do some optimizations for this including one pixel cases.
+ * We can do some optimizations for this including last pixel cases.
*/
.macro bilinear_duplicate_mask_x numpix, mask
.endm
@@ -497,8 +487,7 @@ fname:
bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
vmull.u8 q1, d0, d28
vmlal.u8 q1, d1, d29
- vshr.u16 d30, d24, #8
- /* 4 cycles bubble */
+ /* 5 cycles bubble */
vshll.u16 q0, d2, #8
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
@@ -525,18 +514,18 @@ fname:
q1, q11, d0, d1, d20, d21, d22, d23
bilinear_load_mask mask_fmt, 2, d4
bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
- vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
vshll.u16 q0, d2, #8
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
vshll.u16 q10, d22, #8
vmlsl.u16 q10, d22, d31
vmlal.u16 q10, d23, d31
- vshrn.u32 d30, q0, #16
- vshrn.u32 d31, q10, #16
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q10, #16
bilinear_duplicate_mask mask_fmt, 2, d4
- vmovn.u16 d0, q15
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vmovn.u16 d0, q0
bilinear_interleave_src_dst \
mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
bilinear_apply_mask_to_src \
@@ -554,8 +543,7 @@ fname:
q1, q11, d0, d1, d20, d21, d22, d23 \
q3, q9, d4, d5, d16, d17, d18, d19
pld [TMP1, PF_OFFS]
- vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
+ sub TMP1, TMP1, STRIDE
vshll.u16 q0, d2, #8
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
@@ -567,9 +555,9 @@ fname:
vmlsl.u16 q2, d6, d30
vmlal.u16 q2, d7, d30
vshll.u16 q8, d18, #8
- bilinear_load_mask mask_fmt, 4, d30
+ bilinear_load_mask mask_fmt, 4, d22
bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
- pld [TMP2, PF_OFFS]
+ pld [TMP1, PF_OFFS]
vmlsl.u16 q8, d18, d31
vmlal.u16 q8, d19, d31
vadd.u16 q12, q12, q13
@@ -577,17 +565,19 @@ fname:
vshrn.u32 d1, q10, #16
vshrn.u32 d4, q2, #16
vshrn.u32 d5, q8, #16
- bilinear_duplicate_mask mask_fmt, 4, d30
+ bilinear_duplicate_mask mask_fmt, 4, d22
+ vshr.u16 q15, q12, #8
vmovn.u16 d0, q0
vmovn.u16 d1, q2
+ vadd.u16 q12, q12, q13
bilinear_interleave_src_dst \
mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
bilinear_apply_mask_to_src \
- mask_fmt, 4, d0, d1, q0, d30, \
+ mask_fmt, 4, d0, d1, q0, d22, \
q3, q8, q9, q10
bilinear_combine \
op, 4, d0, d1, q0, d2, d3, q1, \
- q3, q8, q9, q10, d22
+ q3, q8, q9, q10, d23
bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
bilinear_store_&dst_fmt 4, q2, q3
.endm
@@ -610,6 +600,7 @@ pixman_asm_function fname
PF_OFFS .req r7
TMP3 .req r8
TMP4 .req r9
+ STRIDE .req r2
mov ip, sp
push {r4, r5, r6, r7, r8, r9}
@@ -617,6 +608,11 @@ pixman_asm_function fname
ldmia ip, {WB, X, UX, WIDTH}
mul PF_OFFS, PF_OFFS, UX
+ .set prefetch_offset, prefetch_distance
+
+ sub STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
cmp WIDTH, #0
ble 3f
@@ -626,6 +622,8 @@ pixman_asm_function fname
vdup.u8 d29, WB
vadd.u16 d25, d25, d26
vadd.u16 q13, q13, q13
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
subs WIDTH, WIDTH, #4
blt 1f
@@ -648,7 +646,6 @@ pixman_asm_function fname
.unreq OUT
.unreq TOP
- .unreq BOTTOM
.unreq WT
.unreq WB
.unreq X
@@ -659,6 +656,7 @@ pixman_asm_function fname
.unreq PF_OFFS
.unreq TMP3
.unreq TMP4
+ .unreq STRIDE
.endfunc
.endm
@@ -682,6 +680,7 @@ pixman_asm_function fname
PF_OFFS .req r8
TMP3 .req r9
TMP4 .req r10
+ STRIDE .req r3
mov ip, sp
push {r4, r5, r6, r7, r8, r9, r10, ip}
@@ -689,6 +688,11 @@ pixman_asm_function fname
ldmia ip, {WT, WB, X, UX, WIDTH}
mul PF_OFFS, PF_OFFS, UX
+ .set prefetch_offset, prefetch_distance
+
+ sub STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
cmp WIDTH, #0
ble 3f
@@ -698,6 +702,8 @@ pixman_asm_function fname
vdup.u8 d29, WB
vadd.u16 d25, d25, d26
vadd.u16 q13, q13, q13
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
subs WIDTH, WIDTH, #4
blt 1f
@@ -720,7 +726,6 @@ pixman_asm_function fname
.unreq OUT
.unreq TOP
- .unreq BOTTOM
.unreq WT
.unreq WB
.unreq X
@@ -732,6 +737,7 @@ pixman_asm_function fname
.unreq PF_OFFS
.unreq TMP3
.unreq TMP4
+ .unreq STRIDE
.endfunc
.endm
--
1.6.6.1
@@ -0,0 +1,235 @@
From 524d1cc7acb753167fffdd08d8c10bf71e0634ba Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Tue, 20 Sep 2011 21:32:35 +0900
Subject: [PATCH 4/8] ARM: NEON: Bilinear macro template for instruction scheduling
This macro template takes 6 code blocks.
1. process_last_pixel
2. process_two_pixels
3. process_four_pixels
4. process_pixblock_head
5. process_pixblock_tail
6. process_pixblock_tail_head
process_last_pixel does not need to update horizontal weight. This
is done by the template. two and four code block should update
horizontal weight inside of them. head/tail/tail_head blocks
consist unrolled core loop. You can apply instruction scheduling
to the tail_head blocks.
You can also specify size of the pixel block. Supported size is 4
and 8. If you want to use mask, give BILINEAR_FLAG_USE_MASK flags
to the template, then you can use register MASK. When using d8~d15
registers, give BILINEAR_FLAG_USE_ALL_NEON_REGS to make sure
registers are properly saved on the stack and later restored.
---
pixman/pixman-arm-neon-asm-bilinear.S | 195 +++++++++++++++++++++++++++++++++
1 files changed, 195 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index c5ba929..784e5df 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -773,3 +773,198 @@ generate_bilinear_scanline_func_src_a8_dst \
generate_bilinear_scanline_func_src_a8_dst \
pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
8888, 8888, add, 2, 28
+
+.set BILINEAR_FLAG_USE_MASK, 1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline functions.
+ *
+ * Bilinear scanline generator macro take folling arguments:
+ * fname - name of the function to generate
+ * src_fmt - source color format (8888 or 0565)
+ * dst_fmt - destination color format (8888 or 0565)
+ * src/dst_bpp_shift - (1 << bpp_shift) is the size of src/dst pixel in bytes
+ * process_last_pixel - code block that interpolate one pixel and does not
+ * update horizontal weight
+ * process_two_pixels - code block that interpolate two pixels and update
+ * horizontal weight
+ * process_four_pixels - code block that interpolate four pixels and update
+ * horizontal weight
+ * process_pixblock_head - head part of middle loop
+ * process_pixblock_tail - tail part of middle loop
+ * process_pixblock_tail_head - tail_head of middle loop
+ * pixblock_size - number of pixels processed in a single middle loop
+ * prefetch_distance - prefetch in the source image by that many pixels ahead
+ */
+
+.macro generate_bilinear_scanline_func \
+ fname, \
+ src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
+ bilinear_process_last_pixel, \
+ bilinear_process_two_pixels, \
+ bilinear_process_four_pixels, \
+ bilinear_process_pixblock_head, \
+ bilinear_process_pixblock_tail, \
+ bilinear_process_pixblock_tail_head, \
+ pixblock_size, \
+ prefetch_distance, \
+ flags
+
+pixman_asm_function fname
+.if pixblock_size == 8
+.elseif pixblock_size == 4
+.else
+ .error unsupported pixblock size
+.endif
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+ OUT .req r0
+ TOP .req r1
+ BOTTOM .req r2
+ WT .req r3
+ WB .req r4
+ X .req r5
+ UX .req r6
+ WIDTH .req ip
+ TMP1 .req r3
+ TMP2 .req r4
+ PF_OFFS .req r7
+ TMP3 .req r8
+ TMP4 .req r9
+ STRIDE .req r2
+
+ mov ip, sp
+ push {r4, r5, r6, r7, r8, r9}
+ mov PF_OFFS, #prefetch_distance
+ ldmia ip, {WB, X, UX, WIDTH}
+.else
+ OUT .req r0
+ MASK .req r1
+ TOP .req r2
+ BOTTOM .req r3
+ WT .req r4
+ WB .req r5
+ X .req r6
+ UX .req r7
+ WIDTH .req ip
+ TMP1 .req r4
+ TMP2 .req r5
+ PF_OFFS .req r8
+ TMP3 .req r9
+ TMP4 .req r10
+ STRIDE .req r3
+
+ mov ip, sp
+ push {r4, r5, r6, r7, r8, r9, r10, ip}
+ mov PF_OFFS, #prefetch_distance
+ ldmia ip, {WT, WB, X, UX, WIDTH}
+.endif
+
+ mul PF_OFFS, PF_OFFS, UX
+
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+ vpush {d8-d15}
+.endif
+
+ sub STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
+ cmp WIDTH, #0
+ ble 3f
+
+ vdup.u16 q12, X
+ vdup.u16 q13, UX
+ vdup.u8 d28, WT
+ vdup.u8 d29, WB
+ vadd.u16 d25, d25, d26
+
+ /* ensure good destination alignment */
+ cmp WIDTH, #1
+ blt 0f
+ tst OUT, #(1 << dst_bpp_shift)
+ beq 0f
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ bilinear_process_last_pixel
+ sub WIDTH, WIDTH, #1
+0:
+ vadd.u16 q13, q13, q13
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+
+ cmp WIDTH, #2
+ blt 0f
+ tst OUT, #(1 << (dst_bpp_shift + 1))
+ beq 0f
+ bilinear_process_two_pixels
+ sub WIDTH, WIDTH, #2
+0:
+.if pixblock_size == 8
+ cmp WIDTH, #4
+ blt 0f
+ tst OUT, #(1 << (dst_bpp_shift + 2))
+ beq 0f
+ bilinear_process_four_pixels
+ sub WIDTH, WIDTH, #4
+0:
+.endif
+ subs WIDTH, WIDTH, #pixblock_size
+ blt 1f
+ mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+ bilinear_process_pixblock_head
+ subs WIDTH, WIDTH, #pixblock_size
+ blt 5f
+0:
+ bilinear_process_pixblock_tail_head
+ subs WIDTH, WIDTH, #pixblock_size
+ bge 0b
+5:
+ bilinear_process_pixblock_tail
+1:
+.if pixblock_size == 8
+ tst WIDTH, #4
+ beq 2f
+ bilinear_process_four_pixels
+2:
+.endif
+ /* handle the remaining trailing pixels */
+ tst WIDTH, #2
+ beq 2f
+ bilinear_process_two_pixels
+2:
+ tst WIDTH, #1
+ beq 3f
+ bilinear_process_last_pixel
+3:
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+ vpop {d8-d15}
+.endif
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+ pop {r4, r5, r6, r7, r8, r9}
+.else
+ pop {r4, r5, r6, r7, r8, r9, r10, ip}
+.endif
+ bx lr
+
+ .unreq OUT
+ .unreq TOP
+ .unreq WT
+ .unreq WB
+ .unreq X
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq TMP2
+ .unreq PF_OFFS
+ .unreq TMP3
+ .unreq TMP4
+ .unreq STRIDE
+.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
+ .unreq MASK
+.endif
+
+.endfunc
+
+.endm
--
1.6.6.1
@@ -0,0 +1,520 @@
From 10b257b46f379d9c79483acd55c9a13fff130843 Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Fri, 23 Sep 2011 00:03:22 +0900
Subject: [PATCH 5/8] ARM: NEON: Replace old bilinear scanline generator with new template
Bilinear scanline functions in pixman-arm-neon-asm-bilinear.S can
be replaced with new template just by wrapping existing macros.
---
pixman/pixman-arm-neon-asm-bilinear.S | 484 ++++++++++++++++++++-------------
1 files changed, 292 insertions(+), 192 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 784e5df..25bcb24 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -582,198 +582,6 @@ fname:
bilinear_store_&dst_fmt 4, q2, q3
.endm
-.macro generate_bilinear_scanline_func_src_dst \
- fname, src_fmt, dst_fmt, op, \
- bpp_shift, prefetch_distance
-
-pixman_asm_function fname
- OUT .req r0
- TOP .req r1
- BOTTOM .req r2
- WT .req r3
- WB .req r4
- X .req r5
- UX .req r6
- WIDTH .req ip
- TMP1 .req r3
- TMP2 .req r4
- PF_OFFS .req r7
- TMP3 .req r8
- TMP4 .req r9
- STRIDE .req r2
-
- mov ip, sp
- push {r4, r5, r6, r7, r8, r9}
- mov PF_OFFS, #prefetch_distance
- ldmia ip, {WB, X, UX, WIDTH}
- mul PF_OFFS, PF_OFFS, UX
-
- .set prefetch_offset, prefetch_distance
-
- sub STRIDE, BOTTOM, TOP
- .unreq BOTTOM
-
- cmp WIDTH, #0
- ble 3f
-
- vdup.u16 q12, X
- vdup.u16 q13, UX
- vdup.u8 d28, WT
- vdup.u8 d29, WB
- vadd.u16 d25, d25, d26
- vadd.u16 q13, q13, q13
- vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
-
- subs WIDTH, WIDTH, #4
- blt 1f
- mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
-0:
- bilinear_interpolate_four_pixels src_fmt, x, dst_fmt, op
- subs WIDTH, WIDTH, #4
- bge 0b
-1:
- tst WIDTH, #2
- beq 2f
- bilinear_interpolate_two_pixels src_fmt, x, dst_fmt, op
-2:
- tst WIDTH, #1
- beq 3f
- bilinear_interpolate_last_pixel src_fmt, x, dst_fmt, op
-3:
- pop {r4, r5, r6, r7, r8, r9}
- bx lr
-
- .unreq OUT
- .unreq TOP
- .unreq WT
- .unreq WB
- .unreq X
- .unreq UX
- .unreq WIDTH
- .unreq TMP1
- .unreq TMP2
- .unreq PF_OFFS
- .unreq TMP3
- .unreq TMP4
- .unreq STRIDE
-.endfunc
-
-.endm
-
-.macro generate_bilinear_scanline_func_src_a8_dst \
- fname, src_fmt, dst_fmt, op, \
- bpp_shift, prefetch_distance
-
-pixman_asm_function fname
- OUT .req r0
- MASK .req r1
- TOP .req r2
- BOTTOM .req r3
- WT .req r4
- WB .req r5
- X .req r6
- UX .req r7
- WIDTH .req ip
- TMP1 .req r4
- TMP2 .req r5
- PF_OFFS .req r8
- TMP3 .req r9
- TMP4 .req r10
- STRIDE .req r3
-
- mov ip, sp
- push {r4, r5, r6, r7, r8, r9, r10, ip}
- mov PF_OFFS, #prefetch_distance
- ldmia ip, {WT, WB, X, UX, WIDTH}
- mul PF_OFFS, PF_OFFS, UX
-
- .set prefetch_offset, prefetch_distance
-
- sub STRIDE, BOTTOM, TOP
- .unreq BOTTOM
-
- cmp WIDTH, #0
- ble 3f
-
- vdup.u16 q12, X
- vdup.u16 q13, UX
- vdup.u8 d28, WT
- vdup.u8 d29, WB
- vadd.u16 d25, d25, d26
- vadd.u16 q13, q13, q13
- vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
-
- subs WIDTH, WIDTH, #4
- blt 1f
- mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
-0:
- bilinear_interpolate_four_pixels src_fmt, 8, dst_fmt, op
- subs WIDTH, WIDTH, #4
- bge 0b
-1:
- tst WIDTH, #2
- beq 2f
- bilinear_interpolate_two_pixels src_fmt, 8, dst_fmt, op
-2:
- tst WIDTH, #1
- beq 3f
- bilinear_interpolate_last_pixel src_fmt, 8, dst_fmt, op
-3:
- pop {r4, r5, r6, r7, r8, r9, r10, ip}
- bx lr
-
- .unreq OUT
- .unreq TOP
- .unreq WT
- .unreq WB
- .unreq X
- .unreq UX
- .unreq WIDTH
- .unreq MASK
- .unreq TMP1
- .unreq TMP2
- .unreq PF_OFFS
- .unreq TMP3
- .unreq TMP4
- .unreq STRIDE
-.endfunc
-
-.endm
-
-generate_bilinear_scanline_func_src_dst \
- pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
- 8888, 8888, over, 2, 28
-
-generate_bilinear_scanline_func_src_dst \
- pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
- 8888, 8888, add, 2, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
- 8888, 8888, src, 2, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
- 8888, 0565, src, 2, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
- 0565, 8888, src, 1, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
- 0565, 0565, src, 1, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
- 8888, 8888, over, 2, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
- 8888, 8888, add, 2, 28
-
.set BILINEAR_FLAG_USE_MASK, 1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
@@ -855,6 +663,8 @@ pixman_asm_function fname
TMP4 .req r10
STRIDE .req r3
+ .set prefetch_offset, prefetch_distance
+
mov ip, sp
push {r4, r5, r6, r7, r8, r9, r10, ip}
mov PF_OFFS, #prefetch_distance
@@ -968,3 +778,293 @@ pixman_asm_function fname
.endfunc
.endm
+
+/* src_8888_8_8888 */
+.macro bilinear_src_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_head
+ bilinear_src_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
+ bilinear_src_8888_8_8888_process_pixblock_tail
+ bilinear_src_8888_8_8888_process_pixblock_head
+.endm
+
+/* src_8888_8_0565 */
+.macro bilinear_src_8888_8_0565_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_head
+ bilinear_src_8888_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
+ bilinear_src_8888_8_0565_process_pixblock_tail
+ bilinear_src_8888_8_0565_process_pixblock_head
+.endm
+
+/* src_0565_8_x888 */
+.macro bilinear_src_0565_8_x888_process_last_pixel
+ bilinear_interpolate_last_pixel 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_two_pixels
+ bilinear_interpolate_two_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_four_pixels
+ bilinear_interpolate_four_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_head
+ bilinear_src_0565_8_x888_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
+ bilinear_src_0565_8_x888_process_pixblock_tail
+ bilinear_src_0565_8_x888_process_pixblock_head
+.endm
+
+/* src_0565_8_0565 */
+.macro bilinear_src_0565_8_0565_process_last_pixel
+ bilinear_interpolate_last_pixel 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_two_pixels
+ bilinear_interpolate_two_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_four_pixels
+ bilinear_interpolate_four_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_head
+ bilinear_src_0565_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
+ bilinear_src_0565_8_0565_process_pixblock_tail
+ bilinear_src_0565_8_0565_process_pixblock_head
+.endm
+
+/* over_8888_8888 */
+.macro bilinear_over_8888_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_head
+ bilinear_over_8888_8888_process_four_pixels
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_tail_head
+ bilinear_over_8888_8888_process_pixblock_tail
+ bilinear_over_8888_8888_process_pixblock_head
+.endm
+
+/* over_8888_8_8888 */
+.macro bilinear_over_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_head
+ bilinear_over_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
+ bilinear_over_8888_8_8888_process_pixblock_tail
+ bilinear_over_8888_8_8888_process_pixblock_head
+.endm
+
+/* add_8888_8888 */
+.macro bilinear_add_8888_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_head
+ bilinear_add_8888_8888_process_four_pixels
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_tail_head
+ bilinear_add_8888_8888_process_pixblock_tail
+ bilinear_add_8888_8888_process_pixblock_head
+.endm
+
+/* add_8888_8_8888 */
+.macro bilinear_add_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_head
+ bilinear_add_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
+ bilinear_add_8888_8_8888_process_pixblock_tail
+ bilinear_add_8888_8_8888_process_pixblock_head
+.endm
+
+
+/* Bilinear scanline functions */
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_src_8888_8_8888_process_last_pixel, \
+ bilinear_src_8888_8_8888_process_two_pixels, \
+ bilinear_src_8888_8_8888_process_four_pixels, \
+ bilinear_src_8888_8_8888_process_pixblock_head, \
+ bilinear_src_8888_8_8888_process_pixblock_tail, \
+ bilinear_src_8888_8_8888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
+ 8888, 0565, 2, 1, \
+ bilinear_src_8888_8_0565_process_last_pixel, \
+ bilinear_src_8888_8_0565_process_two_pixels, \
+ bilinear_src_8888_8_0565_process_four_pixels, \
+ bilinear_src_8888_8_0565_process_pixblock_head, \
+ bilinear_src_8888_8_0565_process_pixblock_tail, \
+ bilinear_src_8888_8_0565_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
+ 0565, 8888, 1, 2, \
+ bilinear_src_0565_8_x888_process_last_pixel, \
+ bilinear_src_0565_8_x888_process_two_pixels, \
+ bilinear_src_0565_8_x888_process_four_pixels, \
+ bilinear_src_0565_8_x888_process_pixblock_head, \
+ bilinear_src_0565_8_x888_process_pixblock_tail, \
+ bilinear_src_0565_8_x888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
+ 0565, 0565, 1, 1, \
+ bilinear_src_0565_8_0565_process_last_pixel, \
+ bilinear_src_0565_8_0565_process_two_pixels, \
+ bilinear_src_0565_8_0565_process_four_pixels, \
+ bilinear_src_0565_8_0565_process_pixblock_head, \
+ bilinear_src_0565_8_0565_process_pixblock_tail, \
+ bilinear_src_0565_8_0565_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_over_8888_8888_process_last_pixel, \
+ bilinear_over_8888_8888_process_two_pixels, \
+ bilinear_over_8888_8888_process_four_pixels, \
+ bilinear_over_8888_8888_process_pixblock_head, \
+ bilinear_over_8888_8888_process_pixblock_tail, \
+ bilinear_over_8888_8888_process_pixblock_tail_head, \
+ 4, 28, 0
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_over_8888_8_8888_process_last_pixel, \
+ bilinear_over_8888_8_8888_process_two_pixels, \
+ bilinear_over_8888_8_8888_process_four_pixels, \
+ bilinear_over_8888_8_8888_process_pixblock_head, \
+ bilinear_over_8888_8_8888_process_pixblock_tail, \
+ bilinear_over_8888_8_8888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_add_8888_8888_process_last_pixel, \
+ bilinear_add_8888_8888_process_two_pixels, \
+ bilinear_add_8888_8888_process_four_pixels, \
+ bilinear_add_8888_8888_process_pixblock_head, \
+ bilinear_add_8888_8888_process_pixblock_tail, \
+ bilinear_add_8888_8888_process_pixblock_tail_head, \
+ 4, 28, 0
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_add_8888_8_8888_process_last_pixel, \
+ bilinear_add_8888_8_8888_process_two_pixels, \
+ bilinear_add_8888_8_8888_process_four_pixels, \
+ bilinear_add_8888_8_8888_process_pixblock_head, \
+ bilinear_add_8888_8_8888_process_pixblock_tail, \
+ bilinear_add_8888_8_8888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
--
1.6.6.1
@@ -0,0 +1,186 @@
From c8f7edaebd510ba120d74102a93ad4d202b0e806 Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Wed, 21 Sep 2011 15:52:13 +0900
Subject: [PATCH 6/8] ARM: NEON: Instruction scheduling of bilinear over_8888_8888
Instructions are reordered to eliminate pipeline stalls and get
better memory access.
Performance of before/after on cortex-a8 @ 1GHz
<< 2000 x 2000 with scale factor close to 1.x >>
before : 50.43 Mpix/s
after : 61.09 Mpix/s
---
pixman/pixman-arm-neon-asm-bilinear.S | 149 ++++++++++++++++++++++++++++++++-
1 files changed, 146 insertions(+), 3 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 25bcb24..76937e0 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -893,15 +893,158 @@ pixman_asm_function fname
.endm
.macro bilinear_over_8888_8888_process_pixblock_head
- bilinear_over_8888_8888_process_four_pixels
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+
+ vld1.32 {d22}, [TMP1], STRIDE
+ vld1.32 {d23}, [TMP1]
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ vmull.u8 q8, d22, d28
+ vmlal.u8 q8, d23, d29
+
+ vld1.32 {d22}, [TMP2], STRIDE
+ vld1.32 {d23}, [TMP2]
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmull.u8 q9, d22, d28
+ vmlal.u8 q9, d23, d29
+
+ vld1.32 {d22}, [TMP3], STRIDE
+ vld1.32 {d23}, [TMP3]
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+
+ vshll.u16 q0, d16, #8
+ vmlsl.u16 q0, d16, d30
+ vmlal.u16 q0, d17, d30
+
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+
+ vshll.u16 q1, d18, #8
+ vmlsl.u16 q1, d18, d31
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
.endm
.macro bilinear_over_8888_8888_process_pixblock_tail
+ vshll.u16 q2, d20, #8
+ vmlsl.u16 q2, d20, d30
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #8
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2, d3}, [OUT, :128]
+ pld [OUT, PF_OFFS]
+ vshrn.u32 d4, q2, #16
+ vshr.u16 q15, q12, #8
+ vshrn.u32 d5, q3, #16
+ vmovn.u16 d6, q0
+ vmovn.u16 d7, q2
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vdup.32 d4, d7[1]
+ vmvn.8 d4, d4
+ vmull.u8 q11, d2, d4
+ vmull.u8 q2, d3, d4
+ vrshr.u16 q1, q11, #8
+ vrshr.u16 q10, q2, #8
+ vraddhn.u16 d2, q1, q11
+ vraddhn.u16 d3, q10, q2
+ vqadd.u8 q3, q1, q3
+ vuzp.8 d6, d7
+ vuzp.8 d6, d7
+ vadd.u16 q12, q12, q13
+ vst1.32 {d6, d7}, [OUT, :128]!
.endm
.macro bilinear_over_8888_8888_process_pixblock_tail_head
- bilinear_over_8888_8888_process_pixblock_tail
- bilinear_over_8888_8888_process_pixblock_head
+ vshll.u16 q2, d20, #8
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ vmlsl.u16 q2, d20, d30
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #8
+ vld1.32 {d20}, [TMP1], STRIDE
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vld1.32 {d21}, [TMP1]
+ vmull.u8 q8, d20, d28
+ vmlal.u8 q8, d21, d29
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2, d3}, [OUT, :128]
+ pld [OUT, PF_OFFS]
+ vshrn.u32 d4, q2, #16
+ vshr.u16 q15, q12, #8
+ vld1.32 {d22}, [TMP2], STRIDE
+ vshrn.u32 d5, q3, #16
+ vmovn.u16 d6, q0
+ vld1.32 {d23}, [TMP2]
+ vmull.u8 q9, d22, d28
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmlal.u8 q9, d23, d29
+ vmovn.u16 d7, q2
+ vld1.32 {d22}, [TMP3], STRIDE
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vdup.32 d4, d7[1]
+ vld1.32 {d23}, [TMP3]
+ vmvn.8 d4, d4
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+ vmull.u8 q11, d2, d4
+ vmull.u8 q2, d3, d4
+ vshll.u16 q0, d16, #8
+ vmlsl.u16 q0, d16, d30
+ vrshr.u16 q1, q11, #8
+ vmlal.u16 q0, d17, d30
+ vrshr.u16 q8, q2, #8
+ vraddhn.u16 d2, q1, q11
+ vraddhn.u16 d3, q8, q2
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vqadd.u8 q3, q1, q3
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+ vuzp.8 d6, d7
+ vshll.u16 q1, d18, #8
+ vuzp.8 d6, d7
+ vmlsl.u16 q1, d18, d31
+ vadd.u16 q12, q12, q13
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vst1.32 {d6, d7}, [OUT, :128]!
.endm
/* over_8888_8_8888 */
--
1.6.6.1
@@ -0,0 +1,206 @@
From 94585f9a618821a5c06c3a497902579b4a08b05f Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Mon, 26 Sep 2011 19:04:53 +0900
Subject: [PATCH 7/8] ARM: NEON: Instruction scheduling of bilinear over_8888_8_8888
Instructions are reordered to eliminate pipeline stalls and get
better memory access.
Performance of before/after on cortex-a8 @ 1GHz
<< 2000 x 2000 with scale factor close to 1.x >>
before : 40.53 Mpix/s
after : 50.76 Mpix/s
---
pixman/pixman-arm-neon-asm-bilinear.S | 162 ++++++++++++++++++++++++++++++++-
1 files changed, 158 insertions(+), 4 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 76937e0..4ab46e1 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -949,7 +949,7 @@ pixman_asm_function fname
vshrn.u32 d0, q0, #16
vshrn.u32 d1, q1, #16
vld1.32 {d2, d3}, [OUT, :128]
- pld [OUT, PF_OFFS]
+ pld [OUT, #(prefetch_offset * 4)]
vshrn.u32 d4, q2, #16
vshr.u16 q15, q12, #8
vshrn.u32 d5, q3, #16
@@ -1061,15 +1061,169 @@ pixman_asm_function fname
.endm
.macro bilinear_over_8888_8_8888_process_pixblock_head
- bilinear_over_8888_8_8888_process_four_pixels
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ vld1.32 {d0}, [TMP1], STRIDE
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vld1.32 {d1}, [TMP1]
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ vld1.32 {d2}, [TMP2], STRIDE
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vld1.32 {d3}, [TMP2]
+ vmull.u8 q2, d0, d28
+ vmull.u8 q3, d2, d28
+ vmlal.u8 q2, d1, d29
+ vmlal.u8 q3, d3, d29
+ vshll.u16 q0, d4, #8
+ vshll.u16 q1, d6, #8
+ vmlsl.u16 q0, d4, d30
+ vmlsl.u16 q1, d6, d31
+ vmlal.u16 q0, d5, d30
+ vmlal.u16 q1, d7, d31
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2}, [TMP3], STRIDE
+ vld1.32 {d3}, [TMP3]
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d4}, [TMP4], STRIDE
+ vld1.32 {d5}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q3, d2, d28
+ vmlal.u8 q3, d3, d29
+ vmull.u8 q1, d4, d28
+ vmlal.u8 q1, d5, d29
+ vshr.u16 q15, q12, #8
+ vld1.32 {d22[0]}, [MASK]!
+ pld [MASK, #prefetch_offset]
+ vadd.u16 q12, q12, q13
+ vmovn.u16 d16, q0
.endm
.macro bilinear_over_8888_8_8888_process_pixblock_tail
+ vshll.u16 q9, d6, #8
+ vshll.u16 q10, d2, #8
+ vmlsl.u16 q9, d6, d30
+ vmlsl.u16 q10, d2, d31
+ vmlal.u16 q9, d7, d30
+ vmlal.u16 q10, d3, d31
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vdup.32 d22, d22[0]
+ vshrn.u32 d18, q9, #16
+ vshrn.u32 d19, q10, #16
+ vmovn.u16 d17, q9
+ vld1.32 {d18, d19}, [OUT, :128]
+ pld [OUT, PF_OFFS]
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vmull.u8 q10, d16, d22
+ vmull.u8 q11, d17, d22
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+ vrshrn.u16 d16, q10, #8
+ vrshrn.u16 d17, q11, #8
+ vdup.32 d22, d17[1]
+ vmvn.8 d22, d22
+ vmull.u8 q10, d18, d22
+ vmull.u8 q11, d19, d22
+ vrshr.u16 q9, q10, #8
+ vrshr.u16 q0, q11, #8
+ vraddhn.u16 d18, q9, q10
+ vraddhn.u16 d19, q0, q11
+ vqadd.u8 q9, q8, q9
+ vuzp.8 d18, d19
+ vuzp.8 d18, d19
+ vst1.32 {d18, d19}, [OUT, :128]!
.endm
.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
- bilinear_over_8888_8_8888_process_pixblock_tail
- bilinear_over_8888_8_8888_process_pixblock_head
+ vshll.u16 q9, d6, #8
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ vshll.u16 q10, d2, #8
+ vld1.32 {d0}, [TMP1], STRIDE
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vmlsl.u16 q9, d6, d30
+ vmlsl.u16 q10, d2, d31
+ vld1.32 {d1}, [TMP1]
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ vmlal.u16 q9, d7, d30
+ vmlal.u16 q10, d3, d31
+ vld1.32 {d2}, [TMP2], STRIDE
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vld1.32 {d3}, [TMP2]
+ vdup.32 d22, d22[0]
+ vshrn.u32 d18, q9, #16
+ vshrn.u32 d19, q10, #16
+ vmull.u8 q2, d0, d28
+ vmull.u8 q3, d2, d28
+ vmovn.u16 d17, q9
+ vld1.32 {d18, d19}, [OUT, :128]
+ pld [OUT, #(prefetch_offset * 4)]
+ vmlal.u8 q2, d1, d29
+ vmlal.u8 q3, d3, d29
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vshll.u16 q0, d4, #8
+ vshll.u16 q1, d6, #8
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vmlsl.u16 q0, d4, d30
+ vmlsl.u16 q1, d6, d31
+ vmull.u8 q10, d16, d22
+ vmull.u8 q11, d17, d22
+ vmlal.u16 q0, d5, d30
+ vmlal.u16 q1, d7, d31
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vrshrn.u16 d16, q10, #8
+ vrshrn.u16 d17, q11, #8
+ vld1.32 {d2}, [TMP3], STRIDE
+ vdup.32 d22, d17[1]
+ vld1.32 {d3}, [TMP3]
+ vmvn.8 d22, d22
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d4}, [TMP4], STRIDE
+ vmull.u8 q10, d18, d22
+ vmull.u8 q11, d19, d22
+ vld1.32 {d5}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q3, d2, d28
+ vrshr.u16 q9, q10, #8
+ vrshr.u16 q15, q11, #8
+ vmlal.u8 q3, d3, d29
+ vmull.u8 q1, d4, d28
+ vraddhn.u16 d18, q9, q10
+ vraddhn.u16 d19, q15, q11
+ vmlal.u8 q1, d5, d29
+ vshr.u16 q15, q12, #8
+ vqadd.u8 q9, q8, q9
+ vld1.32 {d22[0]}, [MASK]!
+ vuzp.8 d18, d19
+ vadd.u16 q12, q12, q13
+ vuzp.8 d18, d19
+ vmovn.u16 d16, q0
+ vst1.32 {d18, d19}, [OUT, :128]!
.endm
/* add_8888_8888 */
--
1.6.6.1
@@ -0,0 +1,114 @@
From d65a08904857d87dcd74b87681c9b94390b76eff Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Tue, 16 Mar 2010 16:55:28 +0100
Subject: [PATCH 8/8] Generic C implementation of pixman_blt with overlapping support
Uses memcpy/memmove functions to copy pixels, can handle the
case when both source and destination areas are in the same
image (this is useful for scrolling).
It is assumed that copying direction is only important when
using the same image for both source and destination (and
src_stride == dst_stride). Copying direction is undefined
for the images with different source and destination stride
which happen to be in the overlapped areas (but this is an
unrealistic case anyway).
---
pixman/pixman-general.c | 21 ++++++++++++++++++---
pixman/pixman-private.h | 43 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 61 insertions(+), 3 deletions(-)
diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c
index 2ccdfcd..6f7bb34 100644
--- a/pixman/pixman-general.c
+++ b/pixman/pixman-general.c
@@ -227,9 +227,24 @@ general_blt (pixman_implementation_t *imp,
int width,
int height)
{
- /* We can't blit unless we have sse2 or mmx */
-
- return FALSE;
+ uint8_t *dst_bytes = (uint8_t *)dst_bits;
+ uint8_t *src_bytes = (uint8_t *)src_bits;
+ int bpp;
+
+ if (src_bpp != dst_bpp || src_bpp & 7)
+ return FALSE;
+
+ bpp = src_bpp >> 3;
+ width *= bpp;
+ src_stride *= 4;
+ dst_stride *= 4;
+ pixman_blt_helper (src_bytes + src_y * src_stride + src_x * bpp,
+ dst_bytes + dest_y * dst_stride + dest_x * bpp,
+ src_stride,
+ dst_stride,
+ width,
+ height);
+ return TRUE;
}
static pixman_bool_t
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index cbd48f3..c20d9f0 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -10,6 +10,7 @@
#include "pixman.h"
#include <time.h>
+#include <string.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
@@ -998,4 +999,46 @@ void pixman_timer_register (pixman_timer_t *timer);
#endif /* PIXMAN_TIMERS */
+/* a helper function, can blit 8-bit images with src/dst overlapping support */
+static inline void
+pixman_blt_helper (uint8_t *src_bytes,
+ uint8_t *dst_bytes,
+ int src_stride,
+ int dst_stride,
+ int width,
+ int height)
+{
+ /*
+ * The second part of this check is not strictly needed, but it prevents
+ * unnecessary upside-down processing of areas which belong to different
+ * images. Upside-down processing can be slower with fixed-distance-ahead
+ * prefetch and perceived as having more tearing.
+ */
+ if (src_bytes < dst_bytes + width &&
+ src_bytes + src_stride * height > dst_bytes)
+ {
+ src_bytes += src_stride * height - src_stride;
+ dst_bytes += dst_stride * height - dst_stride;
+ dst_stride = -dst_stride;
+ src_stride = -src_stride;
+ /* Horizontal scrolling to the left needs memmove */
+ if (src_bytes + width > dst_bytes)
+ {
+ while (--height >= 0)
+ {
+ memmove (dst_bytes, src_bytes, width);
+ dst_bytes += dst_stride;
+ src_bytes += src_stride;
+ }
+ return;
+ }
+ }
+ while (--height >= 0)
+ {
+ memcpy (dst_bytes, src_bytes, width);
+ dst_bytes += dst_stride;
+ src_bytes += src_stride;
+ }
+}
+
#endif /* PIXMAN_PRIVATE_H */
--
1.6.6.1
@@ -0,0 +1,29 @@
require pixman.inc
# Some artefacts observed in webkit scrolling, need to see if it's a regression or not
DEFAULT_PREFERENCE = "-1"
LICENSE = "MIT & MIT-style & Public Domain"
LIC_FILES_CHKSUM = "file://COPYING;md5=14096c769ae0cbb5fcb94ec468be11b3\
file://pixman/pixman-matrix.c;endline=25;md5=ba6e8769bfaaee2c41698755af04c4be \
file://pixman/pixman-arm-neon-asm.h;endline=24;md5=9a9cc1e51abbf1da58f4d9528ec9d49b \
"
PR = "${INC_PR}.0"
SRC_URI = "http://xorg.freedesktop.org/archive/individual/lib/${BPN}-${PV}.tar.gz \
file://0003-ARM-NEON-Some-cleanup-of-bilinear-scanline-functions.patch \
file://0004-ARM-NEON-Bilinear-macro-template-for-instruction-sch.patch \
file://0005-ARM-NEON-Replace-old-bilinear-scanline-generator-wit.patch \
file://0006-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch \
file://0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch \
file://0008-Generic-C-implementation-of-pixman_blt-with-overlapp.patch \
"
SRC_URI[md5sum] = "27eb7a0ec440c89cccd7c396c3581041"
SRC_URI[sha256sum] = "4e35f49474e78a9430d93caaaea8bbf7e30b65f0da33c31f15a988c25a3ac369"
NEON = " --disable-arm-neon "
NEON_armv7a = " "
EXTRA_OECONF = "${NEON} --disable-gtk"