From 7ed753b10a61d0be95f683289dfb925b800b0676 Mon Sep 17 00:00:00 2001 From: Xiwei Gu <guxiwei-hf@loongson.cn> Date: Tue, 5 Mar 2024 14:35:43 +0800 Subject: [PATCH] loongarch: Enhance ultrafast encoding performance Using the following command, ultrafast encoding has improved from 182fps to 189fps: ./x264 --preset ultrafast -o out.mkv yuv_1920x1080.yuv --- common/loongarch/quant-a.S | 245 +++++++++++++++++++++++++++++++++++++ common/loongarch/quant.h | 12 ++ common/quant.c | 12 +- 3 files changed, 267 insertions(+), 2 deletions(-) diff --git a/common/loongarch/quant-a.S b/common/loongarch/quant-a.S index 279c83b6..a2f099d9 100644 --- a/common/loongarch/quant-a.S +++ b/common/loongarch/quant-a.S @@ -984,3 +984,248 @@ function_x264 decimate_score64_lsx jirl $r0, $r1, 0x0 .END_SCORE_64_LSX: endfunc_x264 + +/* + * int coeff_level_run16( dctcoef *dct, x264_run_level_t *runlevel ) + */ +function_x264 coeff_level_run16_lasx + addi.w t0, zero, 15 /* a0 = dct, a1 = runlevel; t0 = 15, the highest coefficient index */ + + xvld xr0, a0, 0 /* load 32 bytes, read below as sixteen 16-bit coefficients */ + xvldi xr2, 1 /* byte lanes holding 1, the threshold for the non-zero test */ + + xvssrlni.bu.h xr0, xr0, 0 /* narrow each halfword to an unsigned byte with saturation, so a non-zero coefficient stays non-zero */ + xvpermi.d xr1, xr0, 0xd8 /* collect the narrowed bytes of all 16 coefficients in the low 128 bits */ + xvsle.bu xr3, xr2, xr1 /* all-ones byte wherever the coefficient byte is at least 1 */ + xvsrlni.b.h xr3, xr3, 4 /* shrink each byte flag to a 4-bit flag */ + xvpickve2gr.du t8, xr3, 0 /* t8 = 64-bit map, one nibble per coefficient, index 15 in the top nibble */ + clz.d t1, t8 /* leading zero bits of the map */ + + srai.w t1, t1, 2 /* divided by 4: number of zero coefficients above the highest non-zero one */ + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + st.w t0, a1, 0x00 // Store runlevel->last + addi.d t3, a1, 23 + nor t2, zero, zero + addi.d t2, t2, -15 /* t2 = -16, the mask that rounds a1 plus 23 down to a 16-byte boundary */ + and t3, t3, t2 // runlevel->level + xor t4, t4, t4 // mask + xor t5, t5, t5 // total: number of non-zero elements + addi.w t6, zero, 1 // const 1 +.LOOP_COEFF_LEVEL_RUN16_LASX: + slli.w t7, t0, 1 /* byte offset of dct[t0] */ + ldx.h t2, a0, t7 + st.h t2, t3, 0 /* append dct[t0] to the level array */ + addi.d t3, t3, 2 + + addi.w t5, t5, 1 + sll.w t2, t6, t0 + or t4, t4, t2 /* set bit t0 in the mask */ + bge zero, t4, .END_COEFF_LEVEL_RUN16_LASX /* leave the loop when the mask is not positive; NOTE(review): looks like the guard for t0 = -1 on an all-zero block, where the shift sets bit 31 - confirm */ + + addi.w t0, t0, -1 + slli.w t1, t1, 2 + addi.w t1, t1, 4 + sll.d t8, t8, t1 /* shift out the zero nibbles already skipped and the nibble just consumed */ + clz.d t1, t8 + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + bge t0, zero, .LOOP_COEFF_LEVEL_RUN16_LASX +.END_COEFF_LEVEL_RUN16_LASX: + st.w t4, a1, 4 
+ move a0, t5 /* return the number of non-zero coefficients */ +endfunc_x264 + +function_x264 coeff_level_run15_lasx + addi.w t0, zero, 15 /* a0 = dct, a1 = runlevel; same walk as coeff_level_run16_lasx with element 15 forced to zero */ + + vld vr0, a0, 0 + vld vr1, a0, 16 + xvldi xr3, 1 + + vinsgr2vr.h vr1, zero, 7 /* clear halfword 7 of vr1, i.e. element 15; presumably only 15 coefficients are valid here - confirm */ + xvpermi.q xr1, xr0, 0x20 /* xr1 = vr0 in the low 128 bits, vr1 in the high 128 bits */ + + xvssrlni.bu.h xr1, xr1, 0 + xvpermi.d xr2, xr1, 0xd8 + xvsle.bu xr4, xr3, xr2 + xvsrlni.b.h xr4, xr4, 4 + xvpickve2gr.du t8, xr4, 0 /* t8 = 64-bit map, one nibble per coefficient */ + clz.d t1, t8 + + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + st.w t0, a1, 0x00 // Store runlevel->last + addi.d t3, a1, 23 + nor t2, zero, zero + addi.d t2, t2, -15 /* t2 = -16, the 16-byte alignment mask */ + and t3, t3, t2 // runlevel->level + xor t4, t4, t4 // mask + xor t5, t5, t5 // total: number of non-zero elements + addi.w t6, zero, 1 // const 1 +.LOOP_COEFF_LEVEL_RUN15_LASX: + slli.w t7, t0, 1 + ldx.h t2, a0, t7 + st.h t2, t3, 0 /* append dct[t0] to the level array */ + addi.d t3, t3, 2 + + addi.w t5, t5, 1 + sll.w t2, t6, t0 + or t4, t4, t2 /* set bit t0 in the mask */ + bge zero, t4, .END_COEFF_LEVEL_RUN15_LASX + + addi.w t0, t0, -1 + slli.w t1, t1, 2 + addi.w t1, t1, 4 + sll.d t8, t8, t1 /* shift out the skipped zero nibbles and the consumed nibble */ + clz.d t1, t8 + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + bge t0, zero, .LOOP_COEFF_LEVEL_RUN15_LASX +.END_COEFF_LEVEL_RUN15_LASX: + st.w t4, a1, 4 /* store the mask at offset 4 of runlevel */ + move a0, t5 /* return the number of non-zero coefficients */ +endfunc_x264 + +function_x264 coeff_level_run16_lsx + addi.w t0, zero, 15 /* a0 = dct, a1 = runlevel; t0 = 15, the highest coefficient index */ + vld vr0, a0, 0 + vld vr1, a0, 16 + vldi vr2, 1 + + vssrlni.bu.h vr0, vr0, 0 /* saturating narrow of elements 0-7 to unsigned bytes */ + vssrlni.bu.h vr1, vr1, 0 /* saturating narrow of elements 8-15 to unsigned bytes */ + vpermi.w vr1, vr0, 0x44 /* vr1 = bytes of elements 0-7 in the low 64 bits, elements 8-15 in the high 64 bits */ + vsle.bu vr3, vr2, vr1 + vsrlni.b.h vr3, vr3, 4 + vpickve2gr.du t8, vr3, 0 /* t8 = 64-bit map, one nibble per coefficient */ + clz.d t1, t8 + + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + st.w t0, a1, 0x00 // Store runlevel->last + addi.d t3, a1, 23 + nor t2, zero, zero + addi.d t2, t2, -15 /* t2 = -16, the 16-byte alignment mask */ + and t3, t3, t2 // runlevel->level + xor t4, t4, t4 // mask + xor t5, t5, t5 // total: number of non-zero elements + addi.w t6, zero, 1 // const 1 +.LOOP_COEFF_LEVEL_RUN16_LSX: + slli.w t7, t0, 1 + ldx.h t2, a0, t7 + st.h t2, t3, 0 /* append dct[t0] to the level array */ + addi.d t3, t3, 2 + + addi.w 
t5, t5, 1 + sll.w t2, t6, t0 + or t4, t4, t2 /* set bit t0 in the mask */ + bge zero, t4, .END_COEFF_LEVEL_RUN16_LSX + + addi.w t0, t0, -1 + slli.w t1, t1, 2 + addi.w t1, t1, 4 + sll.d t8, t8, t1 /* shift out the skipped zero nibbles and the consumed nibble */ + clz.d t1, t8 + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + bge t0, zero, .LOOP_COEFF_LEVEL_RUN16_LSX +.END_COEFF_LEVEL_RUN16_LSX: + st.w t4, a1, 4 /* store the mask at offset 4 of runlevel */ + move a0, t5 /* return the number of non-zero coefficients */ +endfunc_x264 + +function_x264 coeff_level_run15_lsx + addi.w t0, zero, 15 /* a0 = dct, a1 = runlevel; same walk as coeff_level_run16_lsx with element 15 forced to zero */ + vld vr0, a0, 0 + vld vr1, a0, 16 + vldi vr2, 1 + vinsgr2vr.h vr1, zero, 7 /* clear halfword 7 of vr1, i.e. element 15; presumably only 15 coefficients are valid here - confirm */ + + vssrlni.bu.h vr0, vr0, 0 + vssrlni.bu.h vr1, vr1, 0 + vpermi.w vr1, vr0, 0x44 /* vr1 = bytes of elements 0-7 in the low 64 bits, elements 8-15 in the high 64 bits */ + vsle.bu vr3, vr2, vr1 + vsrlni.b.h vr3, vr3, 4 + vpickve2gr.du t8, vr3, 0 + clz.d t1, t8 + + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + st.w t0, a1, 0x00 // Store runlevel->last + addi.d t3, a1, 23 + nor t2, zero, zero + addi.d t2, t2, -15 /* t2 = -16, the 16-byte alignment mask */ + and t3, t3, t2 // runlevel->level + xor t4, t4, t4 // mask + xor t5, t5, t5 // total: number of non-zero elements + addi.w t6, zero, 1 // const 1 +.LOOP_COEFF_LEVEL_RUN15_LSX: + slli.w t7, t0, 1 + ldx.h t2, a0, t7 + st.h t2, t3, 0 /* append dct[t0] to the level array */ + addi.d t3, t3, 2 + + addi.w t5, t5, 1 + sll.w t2, t6, t0 + or t4, t4, t2 /* set bit t0 in the mask */ + bge zero, t4, .END_COEFF_LEVEL_RUN15_LSX + + addi.w t0, t0, -1 + slli.w t1, t1, 2 + addi.w t1, t1, 4 + sll.d t8, t8, t1 /* shift out the skipped zero nibbles and the consumed nibble */ + clz.d t1, t8 + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + bge t0, zero, .LOOP_COEFF_LEVEL_RUN15_LSX +.END_COEFF_LEVEL_RUN15_LSX: + st.w t4, a1, 4 /* store the mask at offset 4 of runlevel */ + move a0, t5 /* return the number of non-zero coefficients */ +endfunc_x264 + +function_x264 coeff_level_run8_lsx + addi.w t0, zero, 15 /* starts from 15 although only one 16-byte vector is loaded: the upper eight flag positions come from the zeroed vr1, so the leading-zero count lowers the index accordingly */ + vld vr0, a0, 0 + vxor.v vr1, vr1, vr1 /* zero vr1 so that positions 8-15 of the map read as zero */ + vldi vr2, 1 + + vssrlni.bu.h vr0, vr0, 0 + vpermi.w vr1, vr0, 0x44 /* vr1 = bytes of elements 0-7 in the low 64 bits, zeros in the high 64 bits */ + vsle.bu vr3, vr2, vr1 + vsrlni.b.h vr3, vr3, 4 + vpickve2gr.du t8, vr3, 0 + clz.d t1, t8 + + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + st.w t0, a1, 0x00 // Store 
runlevel->last + addi.d t3, a1, 23 + nor t2, zero, zero + addi.d t2, t2, -15 /* t2 = -16, the 16-byte alignment mask */ + and t3, t3, t2 // runlevel->level + xor t4, t4, t4 // mask + xor t5, t5, t5 // total: number of non-zero elements + addi.w t6, zero, 1 // const 1 +.LOOP_COEFF_LEVEL_RUN8_LSX: + slli.w t7, t0, 1 + ldx.h t2, a0, t7 + st.h t2, t3, 0 /* append dct[t0] to the level array */ + addi.d t3, t3, 2 + + addi.w t5, t5, 1 + sll.w t2, t6, t0 + or t4, t4, t2 /* set bit t0 in the mask */ + bge zero, t4, .END_COEFF_LEVEL_RUN8_LSX + + addi.w t0, t0, -1 + slli.w t1, t1, 2 + addi.w t1, t1, 4 + sll.d t8, t8, t1 /* shift out the skipped zero nibbles and the consumed nibble */ + clz.d t1, t8 + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + bge t0, zero, .LOOP_COEFF_LEVEL_RUN8_LSX +.END_COEFF_LEVEL_RUN8_LSX: + st.w t4, a1, 4 /* store the mask at offset 4 of runlevel */ + move a0, t5 /* return the number of non-zero coefficients */ +endfunc_x264 diff --git a/common/loongarch/quant.h b/common/loongarch/quant.h index 36d88b4b..cc3a5399 100644 --- a/common/loongarch/quant.h +++ b/common/loongarch/quant.h @@ -81,4 +81,16 @@ void x264_dequant_8x8_lasx( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); #define x264_dequant_4x4_dc_lasx x264_template(dequant_4x4_dc_lasx) void x264_dequant_4x4_dc_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); +#define x264_coeff_level_run16_lasx x264_template(coeff_level_run16_lasx) +int x264_coeff_level_run16_lasx( dctcoef *, x264_run_level_t * ); /* LASX kernel added to quant-a.S by this patch; returns the non-zero count */ +#define x264_coeff_level_run15_lasx x264_template(coeff_level_run15_lasx) +int x264_coeff_level_run15_lasx( dctcoef *, x264_run_level_t * ); /* LASX kernel added to quant-a.S by this patch */ + +#define x264_coeff_level_run16_lsx x264_template(coeff_level_run16_lsx) +int x264_coeff_level_run16_lsx( dctcoef *, x264_run_level_t * ); /* LSX kernel added to quant-a.S by this patch */ +#define x264_coeff_level_run15_lsx x264_template(coeff_level_run15_lsx) +int x264_coeff_level_run15_lsx( dctcoef *, x264_run_level_t * ); /* LSX kernel added to quant-a.S by this patch */ +#define x264_coeff_level_run8_lsx x264_template(coeff_level_run8_lsx) +int x264_coeff_level_run8_lsx( dctcoef *, x264_run_level_t * ); /* LSX kernel; the patch adds no LASX counterpart for run8 */ + #endif/* X264_LOONGARCH_QUANT_H */ diff --git a/common/quant.c b/common/quant.c index 64e33216..262c5c52 100644 --- a/common/quant.c +++ 
b/common/quant.c @@ -848,11 +848,17 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf ) pf->dequant_4x4 = x264_dequant_4x4_lsx; pf->dequant_8x8 = x264_dequant_8x8_lsx; pf->dequant_4x4_dc = x264_dequant_4x4_dc_lsx; - pf->coeff_last4 = x264_coeff_last4_lsx; - pf->coeff_last8 = x264_coeff_last8_lsx; + pf->decimate_score15 = x264_decimate_score15_lsx; + pf->decimate_score16 = x264_decimate_score16_lsx; + pf->decimate_score64 = x264_decimate_score64_lsx; /* the three LSX decimate_score pointers are newly installed by this hunk */ + pf->coeff_last4 = x264_coeff_last4_lsx; + pf->coeff_last8 = x264_coeff_last8_lsx; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lsx; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lsx; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lsx; + pf->coeff_level_run8 = x264_coeff_level_run8_lsx; /* run8 has only an LSX kernel in this patch */ + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lsx; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lsx; } if( cpu&X264_CPU_LASX ) { @@ -863,6 +869,8 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf ) pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lasx; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lasx; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lasx; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lasx; /* presumably inside the X264_CPU_LASX branch opened in the previous hunk - confirm; replaces the LSX run15 pointer */ + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lasx; /* likewise replaces the LSX run16 pointer */ } #endif -- GitLab