/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * Copyright (c) 2018 RISC OS Open Ltd
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.eabi_attribute 10, 0 /* suppress Tag_FP_arch */
.eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
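
/*
 * Note: .object_arch and the .eabi_attribute overrides above keep ARMv4 /
 * no-FP / no-NEON build attributes on this object, so linking it does not
 * raise the requirements recorded for the final binary; the NEON code here
 * is expected to run only after runtime CPU feature detection.
 */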
.arm
.altmacro
.p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * performance of handling leading/trailing pixels for each scanline.
 * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
 * example on Linux, if unaligned memory accesses are not configured to
 * generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Set default prefetch type. There is a choice between the following options:
 *
 * PREFETCH_TYPE_NONE (may be useful for ARM cores where PLD is set to work
 * as a NOP, to work around some HW bugs, or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously, so that the extra
 * ARM instructions do not add (many) extra cycles, but improve prefetch
 * efficiency)
 *
 * Note: some types of function can't support advanced prefetch and fall back
 * to the simple one (those which handle 24bpp pixels).
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/******************************************************************************/

/* We can actually do significantly better than the Pixman macros, at least for
 * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
 * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
 */

.macro generate_fillrect_function name, bpp, log2Bpp
/*
 * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
 * On entry:
 * a1 = width, pixels
 * a2 = height, rows
 * a3 = pointer to top-left destination pixel
 * a4 = stride, pixels
 * [sp] = pixel value to fill with
 * Within the function:
 * v1 = width remaining
 * v2 = vst offset
 * v3 = alternate pointer
 * ip = data ARM register
 */
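/*
 * (a1-a4, v1-v3 and ip are the ATPCS register aliases accepted by GNU as:
 * a1-a4 = r0-r3, v1-v3 = r4-r6, ip = r12.)
 */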
pixman_asm_function name
    vld1.\bpp   {d0[],d1[]}, [sp]
    sub         a4, a1
    vld1.\bpp   {d2[],d3[]}, [sp]
    cmp         a1, #(15+64) >> \log2Bpp
    push        {v1-v3,lr}
    vmov        ip, s0
    blo         51f

/* Long-row case */
    mov         v2, #64
1:  mov         v1, a1
    ands        v3, a3, #15
    beq         2f
/* Leading pixels */
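/*
 * Store 1, 2, 4 and/or 8 bytes according to the bits of the leading byte
 * count in v3 until the destination is 16-byte aligned. The count is
 * bit-reversed so that "movs ... lsl #3" below drops the 4-byte bit into C
 * and the 8-byte bit into N, allowing conditional stores with no extra
 * compares.
 */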
    rsb         v3, v3, #16 /* number of leading bytes until 16-byte aligned */
    sub         v1, v1, v3, lsr #\log2Bpp
    rbit        v3, v3
.if bpp <= 16
.if bpp == 8
    tst         a3, #1 /* bit 0 unaffected by rsb so can avoid register interlock */
    strneb      ip, [a3], #1
    tst         v3, #1<<30
.else
    tst         a3, #2 /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
.endif
    strneh      ip, [a3], #2
.endif
    movs        v3, v3, lsl #3
    vstmcs      a3!, {s0}
    vstmmi      a3!, {d0}
2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
    add         v3, a3, #32
/* Inner loop */
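/*
 * Two 32-byte stores per iteration through two pointers 32 bytes apart, each
 * advancing by 64 (v2), so the stores in flight are independent; this is the
 * carefully scheduled inner loop referred to above.
 */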
3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
    subs        v1, v1, #64 >> \log2Bpp
    vst1.\bpp   {q0-q1}, [v3 :128], v2
    bhs         3b
/* Trailing pixels */
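/*
 * v1 was pre-decremented by 64 >> log2Bpp, so its low bits still hold the
 * remaining pixel count. Shifting it left moves those bits pairwise into the
 * C and N flags, so the 32/16/8/4 (and, for small bpp, 2/1) byte tails are
 * stored with conditional instructions and no further branches.
 */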
4:  movs        v1, v1, lsl #27 + \log2Bpp
    bcc         5f
    vst1.\bpp   {q0-q1}, [a3 :128]!
5:  bpl         6f
    vst1.\bpp   {q0}, [a3 :128]!
6:  movs        v1, v1, lsl #2
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
.if bpp <= 16
    movs        v1, v1, lsl #2
    strcsh      ip, [a3], #2
.if bpp == 8
    strmib      ip, [a3], #1
.endif
.endif
    subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp
    bhi         1b
    pop         {v1-v3,pc}

/* Short-row case */
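/*
 * Rows of fewer than (15+64) >> log2Bpp pixels: align the destination only to
 * 4 bytes (byte/halfword stores) and finish with unaligned NEON stores.
 */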
51: movs        v1, a1
.if bpp == 8
    tst         a3, #3
    beq         53f
52: subs        v1, v1, #1
    blo         57f
    strb        ip, [a3], #1
    tst         a3, #3
    bne         52b
.elseif bpp == 16
    tstne       a3, #2
    subne       v1, v1, #1
    strneh      ip, [a3], #2
.endif
53: cmp         v1, #32 >> \log2Bpp
    bcc         54f
    vst1.\bpp   {q0-q1}, [a3]!
    sub         v1, v1, #32 >> \log2Bpp
/* Trailing pixels */
54: movs        v1, v1, lsl #27 + \log2Bpp
    bcc         55f
    vst1.\bpp   {q0-q1}, [a3]!
55: bpl         56f
    vst1.\bpp   {q0}, [a3]!
56: movs        v1, v1, lsl #2
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
.if bpp <= 16
    movs        v1, v1, lsl #2
    strcsh      ip, [a3], #2
.if bpp == 8
    strmib      ip, [a3], #1
.endif
.endif
    subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp
    bhi         51b
57: pop         {v1-v3,pc}

.endfunc
.endm

generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
generate_fillrect_function FillRect8ARMNEONAsm, 8, 0

/******************************************************************************/
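
/*
 * "Over" blend of ARGB source pixels onto an ARGB destination, leaving the
 * destination alpha unchanged. Per channel the head block computes
 * src*alpha + dst*(255-alpha) as a 16-bit sum of products, then divides by
 * 255 using x/255 ~= (x + ((x + 128) >> 8) + 128) >> 8 (the vrshr/vraddhn
 * pair).
 */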

.macro RGBtoRGBPixelAlpha_process_pixblock_head
    vmvn        d30, d3 /* get inverted source alpha */
    vmov        d31, d7 /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d4, d30
    vmull.u8    q0, d1, d3
    vmlal.u8    q0, d5, d30
    vmull.u8    q1, d2, d3
    vmlal.u8    q1, d6, d30
    vrshr.u16   q2, q14, #8
    vrshr.u16   q3, q0, #8
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
.endm

.macro RGBtoRGBPixelAlpha_process_pixblock_tail
    /* nothing */
.endm
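
/*
 * The tail_head block interleaves the loads for the next 8-pixel block and
 * the store of the previous result with the blend arithmetic, together with
 * the PF-prefixed prefetch bookkeeping, which pixman-arm-neon-asm.h only
 * emits when advanced prefetch is selected.
 */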

.macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
        PF add PF_X, PF_X, #8
    vst4.8      {d28-d31}, [DST_W :128]!
        PF tst PF_CTL, #0xF
    vld4.8      {d4-d7}, [DST_R :128]!
        PF addne PF_X, PF_X, #8
    vmvn        d30, d3 /* get inverted source alpha */
    vmov        d31, d7 /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
        PF subne PF_CTL, PF_CTL, #1
    vmlal.u8    q14, d4, d30
        PF cmp PF_X, ORIG_W
    vmull.u8    q0, d1, d3
        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmlal.u8    q0, d5, d30
        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q1, d2, d3
        PF subge PF_X, PF_X, ORIG_W
    vmlal.u8    q1, d6, d30
        PF subges PF_CTL, PF_CTL, #0x10
    vrshr.u16   q2, q14, #8
        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vrshr.u16   q3, q0, #8
        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
.endm
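
/*
 * The arguments to generate_composite_function (defined in
 * pixman-arm-neon-asm.h) are: function name, source/mask/destination bits per
 * pixel (no mask is used here), flags, pixels per block, prefetch distance,
 * and the init/cleanup/pixel-block macros.
 */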

generate_composite_function \
    BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    RGBtoRGBPixelAlpha_process_pixblock_head, \
    RGBtoRGBPixelAlpha_process_pixblock_tail, \
    RGBtoRGBPixelAlpha_process_pixblock_tail_head

/******************************************************************************/
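
/*
 * "Over" blend of ARGB8888 source pixels onto an RGB565 destination. Source
 * channels and alpha are reduced to 5/6-bit precision and the destination is
 * unpacked from 565; each channel is blended as src*alpha + dst*(31-alpha)
 * with 5-bit alpha, divided by 31 via x/31 ~= (x + (x >> 5) + 16) >> 5
 * (vsra/vrshr), and repacked into 565 with vsli.
 */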

.macro ARGBto565PixelAlpha_process_pixblock_head
    vmvn        d6, d3
    vshr.u8     d1, #2
    vshr.u8     d3, #3
    vshr.u8     d0, #3
    vshrn.u16   d7, q2, #3
    vshrn.u16   d25, q2, #8
    vbic.i16    q2, #0xe0
    vshr.u8     d6, #3
    vshr.u8     d7, #2
    vshr.u8     d2, #3
    vmovn.u16   d24, q2
    vshr.u8     d25, #3
    vmull.u8    q13, d1, d3
    vmlal.u8    q13, d7, d6
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3
    vmlal.u8    q15, d25, d6
.endm

.macro ARGBto565PixelAlpha_process_pixblock_tail
    vsra.u16    q13, #5
    vsra.u16    q14, #5
    vsra.u16    q15, #5
    vrshr.u16   q13, #5
    vrshr.u16   q14, #5
    vrshr.u16   q15, #5
    vsli.u16    q14, q13, #5
    vsli.u16    q14, q15, #11
.endm

.macro ARGBto565PixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
        PF add PF_X, PF_X, #8
    vsra.u16    q13, #5
        PF tst PF_CTL, #0xF
    vsra.u16    q14, #5
        PF addne PF_X, PF_X, #8
    vsra.u16    q15, #5
        PF subne PF_CTL, PF_CTL, #1
    vrshr.u16   q13, #5
        PF cmp PF_X, ORIG_W
    vrshr.u16   q14, #5
        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vrshr.u16   q15, #5
        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vld1.8      {d4-d5}, [DST_R]!
        PF subge PF_X, PF_X, ORIG_W
    vsli.u16    q14, q13, #5
        PF subges PF_CTL, PF_CTL, #0x10
    vsli.u16    q14, q15, #11
        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vst1.8      {q14}, [DST_W :128]!
    vmvn        d6, d3
    vshr.u8     d1, #2
    vshr.u8     d3, #3
    vshr.u8     d0, #3
    vshrn.u16   d7, q2, #3
    vshrn.u16   d25, q2, #8
    vbic.i16    q2, #0xe0
        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vshr.u8     d6, #3
    vshr.u8     d7, #2
    vshr.u8     d2, #3
    vmovn.u16   d24, q2
    vshr.u8     d25, #3
    vmull.u8    q13, d1, d3
    vmlal.u8    q13, d7, d6
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3
    vmlal.u8    q15, d25, d6
.endm

generate_composite_function \
    BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    6, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    ARGBto565PixelAlpha_process_pixblock_head, \
    ARGBto565PixelAlpha_process_pixblock_tail, \
    ARGBto565PixelAlpha_process_pixblock_tail_head