From 2355eeb8f254a1c34dbb0241be5c70cdf6ed46d1 Mon Sep 17 00:00:00 2001
From: Arpad Panyik <Arpad.Panyik@arm.com>
Date: Tue, 25 Jun 2024 23:36:11 +0200
Subject: [PATCH] AArch64: Move constants of DotProd subpel filters to .rodata

The constants used for the subpel filters were placed in the .text
section for simplicity and peak performance, but this does not work on
systems with execute-only .text sections (e.g. OpenBSD). The
performance cost of moving the constants to the .rodata section is
small and mostly within the measurable noise.
---
 src/arm/64/mc_dotprod.S | 67 +++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 36 deletions(-)

diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S
index 16d457c1..1c789b8d 100644
--- a/src/arm/64/mc_dotprod.S
+++ b/src/arm/64/mc_dotprod.S
@@ -45,32 +45,33 @@ ENABLE_DOTPROD
 #define LOOP_ALIGN 2
 
 
-// Lookup table used to help conversion of shifted 32-bit values to 8-bit.
-        .align 4
-L(hv_tbl_neon_dotprod):
-        .byte  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
-
-// Shuffle indices to permute horizontal samples in preparation for input to
-// SDOT instructions. The 8-tap horizontal convolution uses sample indices in the
-// interval of [-3, 4] relative to the current sample position.
-        .align 4
-L(h_tbl_neon_dotprod):
+const h_tbl_neon_dotprod, align=4
+        // Shuffle indices to permute horizontal samples in preparation for
+        // input to SDOT instructions. The 8-tap horizontal convolution uses
+        // sample indices in the interval of [-3, 4] relative to the current
+        // sample position.
         .byte  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
         .byte  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
         .byte  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 
-// Vertical convolutions are also using SDOT instructions, where a 128-bit
-// register contains a transposed 4x4 matrix of values. Subsequent iterations of
-// the vertical convolution can reuse the 3x4 sub-matrix from the previous loop
-// iteration. These shuffle indices shift and merge this 4x4 matrix with the
-// values of a new line.
-        .align 4
-L(v_tbl_neon_dotprod):
+        // Lookup table used to help conversion of shifted 32-bit values to 8-bit.
+#define OFFSET_CVT_32_8 48
+        .byte  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+endconst
+
+const v_tbl_neon_dotprod, align=4
+        // Vertical convolutions are also using SDOT instructions, where a
+        // 128-bit register contains a transposed 4x4 matrix of values.
+        // Subsequent iterations of the vertical convolution can reuse the
+        // 3x4 sub-matrix from the previous loop iteration. These shuffle
+        // indices shift and merge this 4x4 matrix with the values of a new
+        // line.
         .byte  1,  2,  3, 16,  5,  6,  7, 20,  9, 10, 11, 24, 13, 14, 15, 28
         .byte  1,  2,  3, 16,  5,  6,  7, 17,  9, 10, 11, 18, 13, 14, 15, 19
         .byte  1,  2,  3, 20,  5,  6,  7, 21,  9, 10, 11, 22, 13, 14, 15, 23
         .byte  1,  2,  3, 24,  5,  6,  7, 25,  9, 10, 11, 26, 13, 14, 15, 27
         .byte  1,  2,  3, 28,  5,  6,  7, 29,  9, 10, 11, 30, 13, 14, 15, 31
+endconst
 
 
 .macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
@@ -109,7 +110,7 @@ function \type\()_8tap_\isa, align=FUNC_ALIGN
         .align JUMP_ALIGN
 L(\type\()_8tap_v_\isa):
         madd            \my, \my, w11, w10
-        ldr             q6, L(v_tbl_neon_dotprod)
+        movrel          x13, v_tbl_neon_dotprod
         sub             \src, \src, \s_strd
 .ifc \isa, neon_dotprod
 .ifc \type, prep
@@ -121,12 +122,12 @@ L(\type\()_8tap_v_\isa):
 .endif
         ubfx            w11, \my, #7, #7
         and             \my, \my, #0x7F
-        ldr             q28, L(v_tbl_neon_dotprod) + 16
+        ldp             q6, q28, [x13]
         cmp             \h, #4
         csel            \my, \my, w11, le
         sub             \src, \src, \s_strd, lsl #1    // src - s_strd * 3
         add             \xmy, x12, \xmy, lsl #3        // subpel V filter address
-        ldr             q29, L(v_tbl_neon_dotprod) + 32
+        ldr             q29, [x13, #32]
 .ifc \isa, neon_dotprod
         movi            v5.16b, #128
 .endif
@@ -137,8 +138,7 @@ L(\type\()_8tap_v_\isa):
 // .align JUMP_ALIGN
 // fallthrough
 160:    // V - 16xN+
-        ldr             q30, L(v_tbl_neon_dotprod) + 48
-        ldr             q31, L(v_tbl_neon_dotprod) + 64
+        ldp             q30, q31, [x13, #48]
 .ifc \type, prep
         add             \wd_strd, \w, \w
 .endif
@@ -676,12 +676,13 @@ L(\type\()_8tap_v_\isa):
 L(\type\()_8tap_h_hv_\isa):
         madd            \mx, \mx, w11, w9
         madd            w14, \my, w11, w10             // for HV
-        ldr             q28, L(h_tbl_neon_dotprod)
 .ifc \isa, neon_dotprod
         mov             w13, 0x2002                    // FILTER_WEIGHT * 128 + rounding
         dup             v27.4s, w13                    // put H overrides this
 .endif
+        movrel          x13, h_tbl_neon_dotprod
         sub             \src, \src, #3                 // src - 3
+        ldr             q28, [x13]
         ubfx            w9, \mx, #7, #7
         and             \mx, \mx, #0x7F
         ubfx            w11, w14, #7, #7               // for HV
@@ -702,8 +703,8 @@ L(\type\()_8tap_h_hv_\isa):
         mov             x15, x30
         ldr             d7, [\xmy]
 .ifc \type, put
-        ldr             q25, L(hv_tbl_neon_dotprod)
-.endif
+        ldr             q25, [x13, #(OFFSET_CVT_32_8)] // LUT to help conversion
+.endif                                                 // of 32b values to 8b
         sxtl            v7.8h, v7.8b
         cmp             w10, SHARP1
         b.ne            L(\type\()_6tap_hv_\isa)       // vertical != SHARP1
@@ -718,8 +719,7 @@ L(\type\()_8tap_h_hv_\isa):
 // .align JUMP_ALIGN
 // fallthrough
 80:     // HV8 - 8xN+
-        ldr             q29, L(h_tbl_neon_dotprod) + 16
-        ldr             q30, L(h_tbl_neon_dotprod) + 32
+        ldp             q29, q30, [x13, #16]
         ldr             d26, [\xmx]
 .ifc \type, prep
         add             \wd_strd, \w, \w
@@ -1005,13 +1005,11 @@ L(\type\()_6tap_hv_\isa):
 // .align JUMP_ALIGN
 // fallthrough
 80:     // HV6 - 8xN+
-        ldr             q29, L(h_tbl_neon_dotprod) + 16
-        ldr             q30, L(h_tbl_neon_dotprod) + 32
+        ldp             q29, q30, [x13, #16]
         ldr             d26, [\xmx]
 .ifc \type, prep
         add             \wd_strd, \w, \w
 .endif
-
         .align LOOP_ALIGN
 81:
         mov             \lsrc, \src
@@ -1370,8 +1368,7 @@ L(\type\()_8tap_h_\isa):
         .align JUMP_ALIGN
 80:     // H - 8xN
         AARCH64_VALID_JUMP_TARGET
-        ldr             q29, L(h_tbl_neon_dotprod) + 16
-        ldr             q30, L(h_tbl_neon_dotprod) + 32
+        ldp             q29, q30, [x13, #16]
         ldr             d26, [\xmx]
 
         .align LOOP_ALIGN
@@ -1436,8 +1433,7 @@ L(\type\()_8tap_h_\isa):
         .align JUMP_ALIGN
 160:    // H - 16xN
         AARCH64_VALID_JUMP_TARGET
-        ldr             q29, L(h_tbl_neon_dotprod) + 16
-        ldr             q30, L(h_tbl_neon_dotprod) + 32
+        ldp             q29, q30, [x13, #16]
         ldr             d26, [\xmx]
 
         .align LOOP_ALIGN
@@ -1501,8 +1497,7 @@ L(\type\()_8tap_h_\isa):
 640:
 1280:
         AARCH64_VALID_JUMP_TARGET
-        ldr             q29, L(h_tbl_neon_dotprod) + 16
-        ldr             q30, L(h_tbl_neon_dotprod) + 32
+        ldp             q29, q30, [x13, #16]
         ldr             d26, [\xmx]
 .ifc \type, put
         sub             \d_strd, \d_strd, \w, uxtw
-- 
GitLab
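
For readers unfamiliar with the addressing change this patch makes, the sketch
below contrasts the two schemes outside of dav1d's macro layer. It is
illustrative only: the labels and function names are hypothetical, and it
assumes an ELF target where dav1d's movrel macro (src/arm/asm.S) expands to an
adrp/add :lo12: pair and const/endconst switch to a read-only data section; the
exact expansions should be checked per platform.

        // Old scheme (removed by the patch): the table lives in .text and is
        // fetched with a PC-relative literal load, which needs a readable
        // code segment and faults when .text is mapped execute-only.
        .text
        .align  4
h_tbl_example_old:                      // hypothetical label
        .byte   0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
        .globl  load_tbl_old
load_tbl_old:
        ldr     q28, h_tbl_example_old  // LDR (literal, SIMD&FP) reads .text
        ret

        // New scheme (introduced by the patch): the table lives in .rodata,
        // its address is materialised into a register, and the vectors are
        // loaded through that register, so .text is never read as data.
        .section .rodata
        .align  4
h_tbl_example_new:                      // hypothetical label
        .byte   0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6

        .text
        .globl  load_tbl_new
load_tbl_new:
        adrp    x13, h_tbl_example_new              // page address of the table
        add     x13, x13, :lo12:h_tbl_example_new   // plus offset within the page
        ldr     q28, [x13]                          // plain data load from .rodata
        ret

A side effect of addressing the tables through a base register is that adjacent
16-byte rows can be fetched together, which is why the patch also merges the
back-to-back ldr loads at offsets +16/+32 and +48/+64 into single ldp loads from
[x13, #16] and [x13, #48].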