From 7ed753b10a61d0be95f683289dfb925b800b0676 Mon Sep 17 00:00:00 2001
From: Xiwei Gu <guxiwei-hf@loongson.cn>
Date: Tue, 5 Mar 2024 14:35:43 +0800
Subject: [PATCH] loongarch: Enhance ultrafast encoding performance

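Add LSX implementations of coeff_level_run8/15/16 and LASX
implementations of coeff_level_run15/16, and register them, together
with the LSX decimate_score15/16/64 functions, in x264_quant_init().

With the following command, ultrafast encoding improves
from 182 fps to 189 fps:
./x264 --preset ultrafast -o out.mkv yuv_1920x1080.yuv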
---
 common/loongarch/quant-a.S | 245 +++++++++++++++++++++++++++++++++++++
 common/loongarch/quant.h   |  12 ++
 common/quant.c             |  12 +-
 3 files changed, 267 insertions(+), 2 deletions(-)

diff --git a/common/loongarch/quant-a.S b/common/loongarch/quant-a.S
index 279c83b6..a2f099d9 100644
--- a/common/loongarch/quant-a.S
+++ b/common/loongarch/quant-a.S
@@ -984,3 +984,248 @@ function_x264 decimate_score64_lsx
     jirl            $r0,    $r1,   0x0
 .END_SCORE_64_LSX:
 endfunc_x264
+
+/*
+ * int coeff_level_run16( dctcoef *dct, x264_run_level_t *runlevel )
+ */
+function_x264 coeff_level_run16_lasx
+    addi.w          t0,     zero,  15 // t0 = 15, index of the topmost coefficient
+    // Build a map with one 4-bit flag per coefficient (flag set = coefficient non-zero)
+    xvld            xr0,    a0,    0 // dct[0..15] as 16 halfwords
+    xvldi           xr2,    1 // Every byte = 1, threshold for the non-zero test
+
+    xvssrlni.bu.h   xr0,    xr0,   0 // Saturate halfwords to unsigned bytes; non-zero stays non-zero
+    xvpermi.d       xr1,    xr0,   0xd8 // Bring the bytes of coefficients 0..15 into the low 128 bits
+    xvsle.bu        xr3,    xr2,   xr1 // Byte = 0xff where the coefficient is non-zero
+    xvsrlni.b.h     xr3,    xr3,   4 // Shrink every byte flag to a 4-bit nibble
+    xvpickve2gr.du  t8,     xr3,   0 // t8 = 16 nibble flags, coefficient 15 in the top nibble
+    clz.d           t1,     t8 // Leading zero bits of the flag map
+
+    srai.w          t1,     t1,    2 // t1 = number of zero coefficients above the last non-zero one
+    sub.w           t0,     t0,    t1 // t0 = index of the highest non-zero coefficient
+    st.w            t0,     a1,    0x00 // Store runlevel->last
+    addi.d          t3,     a1,    23
+    nor             t2,     zero,  zero // t2 = -1
+    addi.d          t2,     t2,    -15 // t2 = -16, i.e. a mask clearing the low 4 bits
+    and             t3,     t3,    t2 // t3 = (a1 + 23) & -16: runlevel->level
+    xor             t4,     t4,    t4 // mask
+    xor             t5,     t5,    t5 // total: number of non-zero elements
+    addi.w          t6,     zero,  1  // const 1
+.LOOP_COEFF_LEVEL_RUN16_LASX:
+    slli.w          t7,     t0,    1 // Byte offset of dct[t0]
+    ldx.h           t2,     a0,    t7 // t2 = dct[t0]
+    st.h            t2,     t3,    0 // Append it to runlevel->level
+    addi.d          t3,     t3,    2 // Advance the level write pointer
+
+    addi.w          t5,     t5,    1 // total++
+    sll.w           t2,     t6,    t0 // t2 = 1 << t0
+    or              t4,     t4,    t2 // mask |= 1 << t0
+    bge             zero,   t4,    .END_COEFF_LEVEL_RUN16_LASX // Leave if mask <= 0 (bit 31 set, which needs a negative t0; presumably the all-zero block case)
+    // Drop the flags consumed so far and find the next lower non-zero coefficient
+    addi.w          t0,     t0,    -1 // Step below the coefficient just stored
+    slli.w          t1,     t1,    2
+    addi.w          t1,     t1,    4 // t1 = bits to discard: skipped zero nibbles + the one just used
+    sll.d           t8,     t8,    t1
+    clz.d           t1,     t8
+    srai.w          t1,     t1,    2 // t1 = zero coefficients before the next non-zero one
+    sub.w           t0,     t0,    t1 // t0 = index of the next non-zero coefficient (negative if none)
+    bge             t0,     zero,  .LOOP_COEFF_LEVEL_RUN16_LASX // Continue while a non-zero coefficient remains
+.END_COEFF_LEVEL_RUN16_LASX:
+    st.w            t4,     a1,    4 // Store the mask at runlevel + 4
+    move            a0,     t5 // a0 = total
+endfunc_x264
+
+function_x264 coeff_level_run15_lasx
+    addi.w          t0,     zero,  15 // t0 = 15, index of the (always empty) top map slot
+    // Build a map with one 4-bit flag per coefficient without reading beyond dct[14]
+    vld             vr0,    a0,    0 // vr0 = dct[0..7]
+    vld             vr1,    a0,    14 // vr1 = dct[7..14], the last 16 bytes of the 15-coefficient block
+    xvldi           xr3,    1 // Every byte = 1, threshold for the non-zero test
+
+    vbsrl.v         vr1,    vr1,   2 // Shift out dct[7]: vr1 = dct[8..14] followed by a zero halfword
+    xvpermi.q       xr1,    xr0,   0x20 // xr1 = { low 128 bits: vr0, high 128 bits: vr1 }
+
+    xvssrlni.bu.h   xr1,    xr1,   0 // Saturate halfwords to unsigned bytes; non-zero stays non-zero
+    xvpermi.d       xr2,    xr1,   0xd8 // Bring the bytes of slots 0..15 into the low 128 bits
+    xvsle.bu        xr4,    xr3,   xr2 // Byte = 0xff where the coefficient is non-zero
+    xvsrlni.b.h     xr4,    xr4,   4 // Shrink every byte flag to a 4-bit nibble
+    xvpickve2gr.du  t8,     xr4,   0 // t8 = 16 nibble flags, slot 15 (always 0) in the top nibble
+    clz.d           t1,     t8 // Leading zero bits of the flag map
+
+    srai.w          t1,     t1,    2 // t1 = number of empty slots above the last non-zero coefficient
+    sub.w           t0,     t0,    t1 // t0 = index of the highest non-zero coefficient
+    st.w            t0,     a1,    0x00 // Store runlevel->last
+    addi.d          t3,     a1,    23
+    nor             t2,     zero,  zero // t2 = -1
+    addi.d          t2,     t2,    -15 // t2 = -16, i.e. a mask clearing the low 4 bits
+    and             t3,     t3,    t2 // t3 = (a1 + 23) & -16: runlevel->level
+    xor             t4,     t4,    t4 // mask
+    xor             t5,     t5,    t5 // total: number of non-zero elements
+    addi.w          t6,     zero,  1  // const 1
+.LOOP_COEFF_LEVEL_RUN15_LASX:
+    slli.w          t7,     t0,    1 // Byte offset of dct[t0]
+    ldx.h           t2,     a0,    t7 // t2 = dct[t0]
+    st.h            t2,     t3,    0 // Append it to runlevel->level
+    addi.d          t3,     t3,    2 // Advance the level write pointer
+
+    addi.w          t5,     t5,    1 // total++
+    sll.w           t2,     t6,    t0 // t2 = 1 << t0
+    or              t4,     t4,    t2 // mask |= 1 << t0
+    bge             zero,   t4,    .END_COEFF_LEVEL_RUN15_LASX // Leave if mask <= 0 (bit 31 set, which needs a negative t0; presumably the all-zero block case)
+    // Drop the flags consumed so far and find the next lower non-zero coefficient
+    addi.w          t0,     t0,    -1 // Step below the coefficient just stored
+    slli.w          t1,     t1,    2
+    addi.w          t1,     t1,    4 // t1 = bits to discard: skipped zero nibbles + the one just used
+    sll.d           t8,     t8,    t1
+    clz.d           t1,     t8
+    srai.w          t1,     t1,    2 // t1 = zero coefficients before the next non-zero one
+    sub.w           t0,     t0,    t1 // t0 = index of the next non-zero coefficient (negative if none)
+    bge             t0,     zero,  .LOOP_COEFF_LEVEL_RUN15_LASX // Continue while a non-zero coefficient remains
+.END_COEFF_LEVEL_RUN15_LASX:
+    st.w            t4,     a1,    4 // Store the mask at runlevel + 4
+    move            a0,     t5 // a0 = total
+endfunc_x264
+
+function_x264 coeff_level_run16_lsx
+    addi.w          t0,     zero,  15 // t0 = 15, index of the topmost coefficient
+    vld             vr0,    a0,    0 // vr0 = dct[0..7]
+    vld             vr1,    a0,    16 // vr1 = dct[8..15]
+    vldi            vr2,    1 // Every byte = 1, threshold for the non-zero test
+    // Build a map with one 4-bit flag per coefficient (flag set = coefficient non-zero)
+    vssrlni.bu.h    vr0,    vr0,   0 // Saturate halfwords to unsigned bytes; non-zero stays non-zero
+    vssrlni.bu.h    vr1,    vr1,   0 // Same for coefficients 8..15
+    vpermi.w        vr1,    vr0,   0x44 // vr1 = bytes of coefficients 0..7 (from vr0), then 8..15
+    vsle.bu         vr3,    vr2,   vr1 // Byte = 0xff where the coefficient is non-zero
+    vsrlni.b.h      vr3,    vr3,   4 // Shrink every byte flag to a 4-bit nibble
+    vpickve2gr.du   t8,     vr3,   0 // t8 = 16 nibble flags, coefficient 15 in the top nibble
+    clz.d           t1,     t8 // Leading zero bits of the flag map
+
+    srai.w          t1,     t1,    2 // t1 = number of zero coefficients above the last non-zero one
+    sub.w           t0,     t0,    t1 // t0 = index of the highest non-zero coefficient
+    st.w            t0,     a1,    0x00 // Store runlevel->last
+    addi.d          t3,     a1,    23
+    nor             t2,     zero,  zero // t2 = -1
+    addi.d          t2,     t2,    -15 // t2 = -16, i.e. a mask clearing the low 4 bits
+    and             t3,     t3,    t2 // t3 = (a1 + 23) & -16: runlevel->level
+    xor             t4,     t4,    t4 // mask
+    xor             t5,     t5,    t5 // total: number of non-zero elements
+    addi.w          t6,     zero,  1  // const 1
+.LOOP_COEFF_LEVEL_RUN16_LSX:
+    slli.w          t7,     t0,    1 // Byte offset of dct[t0]
+    ldx.h           t2,     a0,    t7 // t2 = dct[t0]
+    st.h            t2,     t3,    0 // Append it to runlevel->level
+    addi.d          t3,     t3,    2 // Advance the level write pointer
+
+    addi.w          t5,     t5,    1 // total++
+    sll.w           t2,     t6,    t0 // t2 = 1 << t0
+    or              t4,     t4,    t2 // mask |= 1 << t0
+    bge             zero,   t4,    .END_COEFF_LEVEL_RUN16_LSX // Leave if mask <= 0 (bit 31 set, which needs a negative t0; presumably the all-zero block case)
+    // Drop the flags consumed so far and find the next lower non-zero coefficient
+    addi.w          t0,     t0,    -1 // Step below the coefficient just stored
+    slli.w          t1,     t1,    2
+    addi.w          t1,     t1,    4 // t1 = bits to discard: skipped zero nibbles + the one just used
+    sll.d           t8,     t8,    t1
+    clz.d           t1,     t8
+    srai.w          t1,     t1,    2 // t1 = zero coefficients before the next non-zero one
+    sub.w           t0,     t0,    t1 // t0 = index of the next non-zero coefficient (negative if none)
+    bge             t0,     zero,  .LOOP_COEFF_LEVEL_RUN16_LSX // Continue while a non-zero coefficient remains
+.END_COEFF_LEVEL_RUN16_LSX:
+    st.w            t4,     a1,    4 // Store the mask at runlevel + 4
+    move            a0,     t5 // a0 = total
+endfunc_x264
+
+function_x264 coeff_level_run15_lsx
+    addi.w          t0,     zero,  15 // t0 = 15, index of the (always empty) top map slot
+    vld             vr0,    a0,    0 // vr0 = dct[0..7]
+    vld             vr1,    a0,    14 // vr1 = dct[7..14], the last 16 bytes of the 15-coefficient block
+    vldi            vr2,    1 // Every byte = 1, threshold for the non-zero test
+    vbsrl.v         vr1,    vr1,   2 // Shift out dct[7]: vr1 = dct[8..14] followed by a zero halfword
+    // Build a map with one 4-bit flag per coefficient; nothing beyond dct[14] has been read
+    vssrlni.bu.h    vr0,    vr0,   0 // Saturate halfwords to unsigned bytes; non-zero stays non-zero
+    vssrlni.bu.h    vr1,    vr1,   0 // Same for slots 8..15
+    vpermi.w        vr1,    vr0,   0x44 // vr1 = bytes of coefficients 0..7 (from vr0), then slots 8..15
+    vsle.bu         vr3,    vr2,   vr1 // Byte = 0xff where the coefficient is non-zero
+    vsrlni.b.h      vr3,    vr3,   4 // Shrink every byte flag to a 4-bit nibble
+    vpickve2gr.du   t8,     vr3,   0 // t8 = 16 nibble flags, slot 15 (always 0) in the top nibble
+    clz.d           t1,     t8 // Leading zero bits of the flag map
+
+    srai.w          t1,     t1,    2 // t1 = number of empty slots above the last non-zero coefficient
+    sub.w           t0,     t0,    t1 // t0 = index of the highest non-zero coefficient
+    st.w            t0,     a1,    0x00 // Store runlevel->last
+    addi.d          t3,     a1,    23
+    nor             t2,     zero,  zero // t2 = -1
+    addi.d          t2,     t2,    -15 // t2 = -16, i.e. a mask clearing the low 4 bits
+    and             t3,     t3,    t2 // t3 = (a1 + 23) & -16: runlevel->level
+    xor             t4,     t4,    t4 // mask
+    xor             t5,     t5,    t5 // total: number of non-zero elements
+    addi.w          t6,     zero,  1  // const 1
+.LOOP_COEFF_LEVEL_RUN15_LSX:
+    slli.w          t7,     t0,    1 // Byte offset of dct[t0]
+    ldx.h           t2,     a0,    t7 // t2 = dct[t0]
+    st.h            t2,     t3,    0 // Append it to runlevel->level
+    addi.d          t3,     t3,    2 // Advance the level write pointer
+
+    addi.w          t5,     t5,    1 // total++
+    sll.w           t2,     t6,    t0 // t2 = 1 << t0
+    or              t4,     t4,    t2 // mask |= 1 << t0
+    bge             zero,   t4,    .END_COEFF_LEVEL_RUN15_LSX // Leave if mask <= 0 (bit 31 set, which needs a negative t0; presumably the all-zero block case)
+    // Drop the flags consumed so far and find the next lower non-zero coefficient
+    addi.w          t0,     t0,    -1 // Step below the coefficient just stored
+    slli.w          t1,     t1,    2
+    addi.w          t1,     t1,    4 // t1 = bits to discard: skipped zero nibbles + the one just used
+    sll.d           t8,     t8,    t1
+    clz.d           t1,     t8
+    srai.w          t1,     t1,    2 // t1 = zero coefficients before the next non-zero one
+    sub.w           t0,     t0,    t1 // t0 = index of the next non-zero coefficient (negative if none)
+    bge             t0,     zero,  .LOOP_COEFF_LEVEL_RUN15_LSX // Continue while a non-zero coefficient remains
+.END_COEFF_LEVEL_RUN15_LSX:
+    st.w            t4,     a1,    4 // Store the mask at runlevel + 4
+    move            a0,     t5 // a0 = total
+endfunc_x264
+
+function_x264 coeff_level_run8_lsx
+    addi.w          t0,     zero,  15 // t0 = 15, top slot of the 16-slot flag map (slots 8..15 stay empty)
+    vld             vr0,    a0,    0 // vr0 = dct[0..7]
+    vxor.v          vr1,    vr1,   vr1 // vr1 = 0, stands in for the missing upper eight coefficients
+    vldi            vr2,    1 // Every byte = 1, threshold for the non-zero test
+    // Build a map with one 4-bit flag per coefficient (flag set = coefficient non-zero)
+    vssrlni.bu.h    vr0,    vr0,   0 // Saturate halfwords to unsigned bytes; non-zero stays non-zero
+    vpermi.w        vr1,    vr0,   0x44 // vr1 = bytes of coefficients 0..7 in the low half, zeros in the high half
+    vsle.bu         vr3,    vr2,   vr1 // Byte = 0xff where the coefficient is non-zero
+    vsrlni.b.h      vr3,    vr3,   4 // Shrink every byte flag to a 4-bit nibble
+    vpickve2gr.du   t8,     vr3,   0 // t8 = 8 nibble flags in the low 32 bits, upper 32 bits are zero
+    clz.d           t1,     t8 // Leading zero bits of the flag map (at least 32 here)
+
+    srai.w          t1,     t1,    2 // t1 = number of empty slots above the last non-zero coefficient
+    sub.w           t0,     t0,    t1 // t0 = index of the highest non-zero coefficient (at most 7)
+    st.w            t0,     a1,    0x00 // Store runlevel->last
+    addi.d          t3,     a1,    23
+    nor             t2,     zero,  zero // t2 = -1
+    addi.d          t2,     t2,    -15 // t2 = -16, i.e. a mask clearing the low 4 bits
+    and             t3,     t3,    t2 // t3 = (a1 + 23) & -16: runlevel->level
+    xor             t4,     t4,    t4 // mask
+    xor             t5,     t5,    t5 // total: number of non-zero elements
+    addi.w          t6,     zero,  1  // const 1
+.LOOP_COEFF_LEVEL_RUN8_LSX:
+    slli.w          t7,     t0,    1 // Byte offset of dct[t0]
+    ldx.h           t2,     a0,    t7 // t2 = dct[t0]
+    st.h            t2,     t3,    0 // Append it to runlevel->level
+    addi.d          t3,     t3,    2 // Advance the level write pointer
+
+    addi.w          t5,     t5,    1 // total++
+    sll.w           t2,     t6,    t0 // t2 = 1 << t0
+    or              t4,     t4,    t2 // mask |= 1 << t0
+    bge             zero,   t4,    .END_COEFF_LEVEL_RUN8_LSX // Leave if mask <= 0 (bit 31 set, which needs a negative t0; presumably the all-zero block case)
+    // Drop the flags consumed so far and find the next lower non-zero coefficient
+    addi.w          t0,     t0,    -1 // Step below the coefficient just stored
+    slli.w          t1,     t1,    2
+    addi.w          t1,     t1,    4 // t1 = bits to discard: skipped zero nibbles + the one just used
+    sll.d           t8,     t8,    t1
+    clz.d           t1,     t8
+    srai.w          t1,     t1,    2 // t1 = zero coefficients before the next non-zero one
+    sub.w           t0,     t0,    t1 // t0 = index of the next non-zero coefficient (negative if none)
+    bge             t0,     zero,  .LOOP_COEFF_LEVEL_RUN8_LSX // Continue while a non-zero coefficient remains
+.END_COEFF_LEVEL_RUN8_LSX:
+    st.w            t4,     a1,    4 // Store the mask at runlevel + 4
+    move            a0,     t5 // a0 = total
+endfunc_x264
diff --git a/common/loongarch/quant.h b/common/loongarch/quant.h
index 36d88b4b..cc3a5399 100644
--- a/common/loongarch/quant.h
+++ b/common/loongarch/quant.h
@@ -81,4 +81,16 @@ void x264_dequant_8x8_lasx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
 #define x264_dequant_4x4_dc_lasx x264_template(dequant_4x4_dc_lasx)
 void x264_dequant_4x4_dc_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
 
+#define x264_coeff_level_run16_lasx x264_template(coeff_level_run16_lasx)
+int x264_coeff_level_run16_lasx( dctcoef *, x264_run_level_t * ); /* LASX, 16 coefficients; returns the non-zero count */
+#define x264_coeff_level_run15_lasx x264_template(coeff_level_run15_lasx)
+int x264_coeff_level_run15_lasx( dctcoef *, x264_run_level_t * ); /* LASX, 15 coefficients; returns the non-zero count */
+
+#define x264_coeff_level_run16_lsx x264_template(coeff_level_run16_lsx)
+int x264_coeff_level_run16_lsx( dctcoef *, x264_run_level_t * ); /* LSX, 16 coefficients; returns the non-zero count */
+#define x264_coeff_level_run15_lsx x264_template(coeff_level_run15_lsx)
+int x264_coeff_level_run15_lsx( dctcoef *, x264_run_level_t * ); /* LSX, 15 coefficients; returns the non-zero count */
+#define x264_coeff_level_run8_lsx x264_template(coeff_level_run8_lsx)
+int x264_coeff_level_run8_lsx( dctcoef *, x264_run_level_t * ); /* LSX, 8 coefficients; returns the non-zero count */
+
 #endif/* X264_LOONGARCH_QUANT_H */
diff --git a/common/quant.c b/common/quant.c
index 64e33216..262c5c52 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -848,11 +848,17 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
         pf->dequant_4x4    = x264_dequant_4x4_lsx;
         pf->dequant_8x8    = x264_dequant_8x8_lsx;
         pf->dequant_4x4_dc = x264_dequant_4x4_dc_lsx;
-        pf->coeff_last4    = x264_coeff_last4_lsx;
-        pf->coeff_last8    = x264_coeff_last8_lsx;
+        pf->decimate_score15 = x264_decimate_score15_lsx;
+        pf->decimate_score16 = x264_decimate_score16_lsx;
+        pf->decimate_score64 = x264_decimate_score64_lsx;
+        pf->coeff_last4              = x264_coeff_last4_lsx;
+        pf->coeff_last8              = x264_coeff_last8_lsx;
         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lsx;
         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lsx;
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lsx;
+        pf->coeff_level_run8         = x264_coeff_level_run8_lsx;
+        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lsx;
+        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lsx;
     }
     if( cpu&X264_CPU_LASX )
     {
@@ -863,6 +869,8 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lasx;
         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lasx;
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lasx;
+        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lasx;
+        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lasx;
     }
 #endif
 
-- 
GitLab