From 3d98a242a055438ca76020434a530ebe074fa892 Mon Sep 17 00:00:00 2001
From: Henrik Gramner <gramner@twoorioles.com>
Date: Fri, 22 Mar 2024 10:41:48 +0100
Subject: [PATCH] x86: Add 6-tap variants of high bit-depth mc AVX2 functions
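
Regular and smooth subpel filters have zero coefficients in their two
outermost taps, so filter combinations that do not involve the sharp
filter can be evaluated as 6-tap convolutions instead of 8-tap ones.
The FN macro gains an optional fifth argument naming the function to
jump to, letting the regular/smooth put/prep wrappers dispatch to the
new put_6tap_16bpc/prep_6tap_16bpc entry points (which load their
coefficients from subpel_filters+1), while combinations involving the
sharp filter keep jumping to the 8-tap code.

As a rough scalar illustration (plain C, not part of the asm; the
function name and signature below are made up for this note), dropping
the two zero taps trims each output sample from four paired
multiply-accumulates to three, which is what the switch from four
pshufd'd coefficient pairs (q0000..q3333) to three (q0000..q2222)
reflects in the SIMD code:

    #include <stdint.h>

    /* f points at subpel_filters+1, i.e. taps 1..6 of an 8-tap filter
     * whose taps 0 and 7 are zero (regular/smooth combinations). */
    static int filter_6tap(const uint16_t *src, const int8_t *f)
    {
        int sum = 0;
        for (int k = 0; k < 6; k++)
            sum += src[k] * f[k]; /* rounding and shifting omitted */
        return sum;
    }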

---
 src/x86/mc16_avx2.asm | 1220 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 1075 insertions(+), 145 deletions(-)

diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm
index e1247557..6b442494 100644
--- a/src/x86/mc16_avx2.asm
+++ b/src/x86/mc16_avx2.asm
@@ -1222,7 +1222,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
 %assign FILTER_SMOOTH  (1*15 << 16) | 4*15
 %assign FILTER_SHARP   (2*15 << 16) | 3*15
 
-%macro FN 4 ; prefix, type, type_h, type_v
+%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
 cglobal %1_%2_16bpc
     mov                 t0d, FILTER_%3
 %ifidn %3, %4
@@ -1230,8 +1230,8 @@ cglobal %1_%2_16bpc
 %else
     mov                 t1d, FILTER_%4
 %endif
-%ifnidn %2, regular ; skip the jump in the last filter
-    jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX)
+%if %0 == 5 ; skip the jump in the last filter
+    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
 %endif
 %endmacro
 
@@ -1242,22 +1242,17 @@ DECLARE_REG_TMP 7, 8
 %endif
 
 %define PUT_8TAP_FN FN put_8tap,
-PUT_8TAP_FN sharp,          SHARP,   SHARP
-PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
-PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
-PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH
-PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR
-PUT_8TAP_FN regular_sharp,  REGULAR, SHARP
-PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR
-PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
+PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
 PUT_8TAP_FN regular,        REGULAR, REGULAR
 
-cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
+cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
 %define base r8-put_avx2
     imul                mxd, mxm, 0x010101
-    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
+    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
     imul                myd, mym, 0x010101
-    add                 myd, t1d ; 8tap_v, my, 4tap_v
+    add                 myd, t1d ; 6tap_v, my, 4tap_v
     lea                  r8, [put_avx2]
     movifnidn            wd, wm
     movifnidn            hd, hm
@@ -1265,6 +1260,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
     jnz .h
     test                myd, 0xf00
     jnz .v
+.put:
     tzcnt                wd, wd
     movzx                wd, word [r8+wq*2+table_offset(put,)]
     add                  wq, r8
@@ -1337,43 +1333,36 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
     cmp                  wd, 4
     je .h_w4
     jl .h_w2
-    WIN64_SPILL_XMM      13
+    WIN64_SPILL_XMM      11
     shr                 mxd, 16
-    sub                srcq, 6
-    vpbroadcastq         m0, [base+subpel_filters+mxq*8]
-    vbroadcasti128       m6, [subpel_h_shufA]
-    vbroadcasti128       m7, [subpel_h_shufB]
+    sub                srcq, 4
+    vpbroadcastq         m0, [base+subpel_filters+1+mxq*8]
+    vbroadcasti128       m6, [base+subpel_h_shufA]
     punpcklbw            m0, m0
     psraw                m0, 8 ; sign-extend
-    pshufd               m8, m0, q0000
-    pshufd               m9, m0, q1111
-    pshufd              m10, m0, q2222
-    pshufd              m11, m0, q3333
+    pshufd               m7, m0, q0000
+    pshufd               m8, m0, q1111
+    pshufd               m9, m0, q2222
     sub                  wd, 16
     jge .h_w16
 .h_w8:
-%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
-    pshufb              m%4, m%1, m7   ; 2 3 3 4 4 5 5 6
-    pshufb              m%1, m6        ; 0 1 1 2 2 3 3 4
-    pmaddwd             m%5, m9, m%4   ; abcd1
-    pmaddwd             m%1, m8        ; abcd0
-    pshufb              m%2, m7        ; 6 7 7 8 8 9 9 a
-    shufpd              m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
-    paddd               m%5, m4
-    paddd               m%1, m%5
-    pmaddwd             m%5, m11, m%2  ; abcd3
-    paddd               m%1, m%5
-    pmaddwd             m%5, m10, m%4  ; abcd2
-    pshufb              m%3, m7        ; a b b c c d d e
-    pmaddwd             m%4, m8        ; efgh0
-    paddd               m%1, m%5
-    pmaddwd             m%5, m9, m%2   ; efgh1
-    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
-    pmaddwd             m%3, m11       ; efgh3
-    pmaddwd             m%2, m10       ; efgh2
+%macro PUT_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+    pshufb              m%1, m6        ; 01 12 23 34
+    pshufb              m%2, m6        ; 45 56 67 78
+    pmaddwd             m%4, m7, m%1   ; a0
+    pshufb              m%3, m6        ; 89 9a ab bc
+    pmaddwd             m%5, m9, m%2   ; a2
+    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
+    paddd               m%4, m%5       ; a0+a2
+    pmaddwd             m%5, m7, m%2   ; b0
+    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
+    pmaddwd             m%3, m9        ; b2
+    pmaddwd             m%1, m8        ; a1
+    pmaddwd             m%2, m8        ; b1
+    paddd               m%3, m%5       ; b0+b2
     paddd               m%4, m4
-    paddd               m%4, m%5
-    paddd               m%3, m%4
+    paddd               m%3, m4
+    paddd               m%1, m%4
     paddd               m%2, m%3
     psrad               m%1, 6
     psrad               m%2, 6
@@ -1384,9 +1373,9 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
     vinserti128          m0, [srcq+ssq*1+ 0], 1
     movu                xm2, [srcq+ssq*0+16]
     vinserti128          m2, [srcq+ssq*1+16], 1
-    lea                srcq, [srcq+ssq*2]
     shufpd               m1, m0, m2, 0x05
-    PUT_8TAP_H            0, 1, 2, 3, 12
+    lea                srcq, [srcq+ssq*2]
+    PUT_6TAP_H            0, 1, 2, 3, 10
     mova         [dstq+dsq*0], xm0
     vextracti128 [dstq+dsq*1], m0, 1
     lea                dstq, [dstq+dsq*2]
@@ -1399,7 +1388,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
     movu                 m0, [srcq+r6*2+ 0]
     movu                 m1, [srcq+r6*2+ 8]
     movu                 m2, [srcq+r6*2+16]
-    PUT_8TAP_H            0, 1, 2, 3, 12
+    PUT_6TAP_H            0, 1, 2, 3, 10
     mova        [dstq+r6*2], m0
     sub                 r6d, 16
     jge .h_w16_loop
@@ -1408,6 +1397,445 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
     dec                  hd
     jg .h_w16
     RET
+.v:
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovs               myd, mxd
+    vpbroadcastq         m0, [base+subpel_filters+1+myq*8]
+    WIN64_SPILL_XMM      10, 12
+    vpbroadcastd         m5, [pd_32]
+    vpbroadcastw         m6, r8m
+    punpcklbw            m0, m0
+    mov                  r6, ssq
+    psraw                m0, 8 ; sign-extend
+    neg                  r6
+    pshufd               m7, m0, q0000
+    pshufd               m8, m0, q1111
+    pshufd               m9, m0, q2222
+    cmp                  wd, 4
+    jg .v_w8
+    je .v_w4
+.v_w2:
+    movd                xm2, [srcq+r6 *2]
+    pinsrd              xm2, [srcq+r6 *1], 1
+    pinsrd              xm2, [srcq+ssq*0], 2
+    pinsrd              xm2, [srcq+ssq*1], 3 ; 0 1 2 3
+    lea                srcq, [srcq+ssq*2]
+    movd                xm0, [srcq+ssq*0]
+    palignr             xm3, xm0, xm2, 4     ; 1 2 3 4
+    punpcklwd           xm1, xm2, xm3        ; 01 12
+    punpckhwd           xm2, xm3             ; 23 34
+.v_w2_loop:
+    movd                xm3, [srcq+ssq*1]
+    pmaddwd             xm4, xm7, xm1        ; a0 b0
+    mova                xm1, xm2
+    pmaddwd             xm2, xm8             ; a1 b1
+    lea                srcq, [srcq+ssq*2]
+    paddd               xm4, xm2
+    punpckldq           xm2, xm0, xm3        ; 4 5
+    movd                xm0, [srcq+ssq*0]
+    punpckldq           xm3, xm0             ; 5 6
+    punpcklwd           xm2, xm3             ; 45 56
+    pmaddwd             xm3, xm9, xm2        ; a2 b2
+    paddd               xm4, xm5
+    paddd               xm4, xm3
+    psrad               xm4, 6
+    packusdw            xm4, xm4
+    pminsw              xm4, xm6
+    movd       [dstq+dsq*0], xm4
+    pextrd     [dstq+dsq*1], xm4, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w2_loop
+    RET
+.v_w4:
+    movq                xm1, [srcq+r6 *2]
+    vpbroadcastq         m3, [srcq+r6 *1]
+    vpbroadcastq         m2, [srcq+ssq*0]
+    vpbroadcastq         m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vpbroadcastq         m0, [srcq+ssq*0]
+    vpblendd             m1, m3, 0x30
+    vpblendd             m3, m2, 0x30
+    punpcklwd            m1, m3     ; 01 12
+    vpblendd             m2, m4, 0x30
+    vpblendd             m4, m0, 0x30
+    punpcklwd            m2, m4     ; 23 34
+.v_w4_loop:
+    vpbroadcastq         m3, [srcq+ssq*1]
+    pmaddwd              m4, m7, m1 ; a0 b0
+    mova                 m1, m2
+    pmaddwd              m2, m8     ; a1 b1
+    lea                srcq, [srcq+ssq*2]
+    paddd                m4, m2
+    vpblendd             m2, m0, m3, 0x30
+    vpbroadcastq         m0, [srcq+ssq*0]
+    vpblendd             m3, m0, 0x30
+    punpcklwd            m2, m3     ; 45 56
+    pmaddwd              m3, m9, m2 ; a2 b2
+    paddd                m4, m5
+    paddd                m4, m3
+    psrad                m4, 6
+    vextracti128        xm3, m4, 1
+    packusdw            xm4, xm3
+    pminsw              xm4, xm6
+    movq       [dstq+dsq*0], xm4
+    movhps     [dstq+dsq*1], xm4
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w4_loop
+    RET
+.v_w8:
+    shl                  wd, 5
+    WIN64_PUSH_XMM       12
+    lea                  wd, [hq+wq-256]
+.v_w8_loop0:
+    vbroadcasti128       m3, [srcq+r6 *2]
+    vbroadcasti128       m4, [srcq+r6 *1]
+    lea                  r7, [srcq+ssq*2]
+    vbroadcasti128       m0, [srcq+ssq*0]
+    vbroadcasti128       m1, [srcq+ssq*1]
+    mov                  r8, dstq
+    vbroadcasti128       m2, [r7+ssq*0]
+    shufpd               m3, m0, 0x0c
+    shufpd               m4, m1, 0x0c
+    punpcklwd            m1, m3, m4 ; 01
+    punpckhwd            m3, m4     ; 23
+    shufpd               m0, m2, 0x0c
+    punpcklwd            m2, m4, m0 ; 12
+    punpckhwd            m4, m0     ; 34
+.v_w8_loop:
+    vbroadcasti128       m5, [r7+ssq*1]
+    pmaddwd             m10, m7, m1 ; a0
+    lea                  r7, [r7+ssq*2]
+    pmaddwd             m11, m7, m2 ; b0
+    mova                 m1, m3
+    pmaddwd              m3, m8     ; a1
+    mova                 m2, m4
+    pmaddwd              m4, m8     ; b1
+    paddd               m10, m3
+    vbroadcasti128       m3, [r7+ssq*0]
+    paddd               m11, m4
+    shufpd               m4, m0, m5, 0x0d
+    shufpd               m0, m5, m3, 0x0c
+    punpcklwd            m3, m4, m0 ; 45
+    punpckhwd            m4, m0     ; 56
+    pmaddwd              m5, m9, m3 ; a2
+    paddd               m10, m5
+    pmaddwd              m5, m9, m4 ; b2
+    paddd                m5, m11
+    psrad               m10, 5
+    psrad                m5, 5
+    packusdw            m10, m5
+    pxor                 m5, m5
+    pavgw                m5, m10
+    pminsw               m5, m6
+    vpermq               m5, m5, q3120
+    mova         [r8+dsq*0], xm5
+    vextracti128 [r8+dsq*1], m5, 1
+    lea                  r8, [r8+dsq*2]
+    sub                  hd, 2
+    jg .v_w8_loop
+    add                srcq, 16
+    add                dstq, 16
+    movzx                hd, wb
+    sub                  wd, 1<<8
+    jg .v_w8_loop0
+    RET
+.hv:
+    WIN64_SPILL_XMM      12, 16
+    vpbroadcastd        m10, [pd_512]
+    vpbroadcastw        m11, r8m
+    cmp                  wd, 4
+    jg .hv_w8
+    movzx               mxd, mxb
+    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovs               myd, mxd
+    vpbroadcastq         m1, [base+subpel_filters+1+myq*8]
+    mov                  r6, ssq
+    sub                srcq, 2
+    neg                  r6
+    pxor                 m6, m6
+    punpcklbw            m6, m0
+    punpcklbw            m1, m1
+    psraw                m1, 8 ; sign-extend
+    test          dword r8m, 0x800
+    jz .hv_10bit
+    psraw                m6, 2
+    psllw                m1, 2
+.hv_10bit:
+    pshufd               m7, m1, q0000
+    pshufd               m8, m1, q1111
+    pshufd               m9, m1, q2222
+    cmp                  wd, 4
+    je .hv_w4
+    vbroadcasti128       m5, [subpel_h_shuf2]
+    vbroadcasti128       m0, [srcq+ssq*0]
+    vinserti128          m2, m0, [srcq+r6*2], 1 ; 2 0
+    movu                xm1, [srcq+ssq*1]
+    vinserti128          m1, [srcq+r6 *1], 1    ; 3 1
+    lea                srcq, [srcq+ssq*2]
+    vinserti128          m0, [srcq+ssq*0], 0    ; 4 2
+    REPX    {pshufb  x, m5}, m2, m1, m0
+    REPX    {pmaddwd x, m6}, m2, m1, m0
+    phaddd               m2, m1
+    phaddd               m1, m0
+    paddd                m2, m10
+    paddd                m1, m10
+    psrad                m2, 10
+    psrad                m1, 10
+    packssdw             m2, m1       ; 2 3 3 4   0 1 1 2
+    punpckhqdq           m0, m2, m2
+    punpcklwd            m2, m0       ; 23 34
+    vextracti128        xm1, m2, 1    ; 01 12
+.hv_w2_loop:
+    movu                xm3, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movu                xm4, [srcq+ssq*0]
+    pshufb              xm3, xm5
+    pshufb              xm4, xm5
+    pmaddwd             xm3, xm6
+    pmaddwd             xm4, xm6
+    phaddd              xm3, xm4
+    pmaddwd             xm4, xm7, xm1 ; a0 b0
+    mova                xm1, xm2
+    pmaddwd             xm2, xm8      ; a1 b1
+    paddd               xm4, xm2
+    paddd               xm3, xm10
+    psrad               xm3, 10
+    packssdw            xm3, xm3
+    palignr             xm2, xm3, xm0, 12
+    mova                xm0, xm3
+    punpcklwd           xm2, xm0      ; 45 56
+    pmaddwd             xm3, xm9, xm2 ; a2 b2
+    paddd               xm4, xm10
+    paddd               xm4, xm3
+    psrad               xm4, 10
+    packusdw            xm4, xm4
+    pminsw              xm4, xm11
+    movd       [dstq+dsq*0], xm4
+    pextrd     [dstq+dsq*1], xm4, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w2_loop
+    RET
+.hv_w4:
+    WIN64_PUSH_XMM       14
+    vbroadcasti128      m12, [subpel_h_shufA]
+    pshufd               m5, m6, q0000
+    vbroadcasti128      m13, [subpel_h_shufB]
+    pshufd               m6, m6, q1111
+    movu                xm2, [srcq+r6 *2]
+    vinserti128          m2, [srcq+r6 *1], 1 ; 0 1
+    movu                xm0, [srcq+ssq*0]
+    vinserti128          m0, [srcq+ssq*1], 1 ; 2 3
+    lea                srcq, [srcq+ssq*2]
+    movu                xm3, [srcq+ssq*0]    ; 4
+    pshufb               m1, m2, m12
+    pmaddwd              m1, m5
+    pshufb               m2, m13
+    pmaddwd              m2, m6
+    pshufb               m4, m0, m12
+    pmaddwd              m4, m5
+    pshufb               m0, m13
+    pmaddwd              m0, m6
+    paddd                m2, m1
+    pshufb              xm1, xm3, xm12
+    pmaddwd             xm1, xm5
+    pshufb              xm3, xm13
+    pmaddwd             xm3, xm6
+    paddd                m0, m4
+    paddd                m2, m10
+    paddd               xm1, xm10
+    paddd                m0, m10
+    paddd               xm3, xm1
+    REPX      {psrad x, 10}, m2, m0, xm3
+    packssdw             m2, m0     ; 0 2   1 3
+    packssdw            xm0, xm3    ; 2 4
+    vperm2i128           m0, m2, 0x03
+    punpcklwd            m1, m2, m0 ; 01 12
+    punpckhwd            m2, m0     ; 23 34
+.hv_w4_loop:
+    movu                xm3, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vinserti128          m3, [srcq+ssq*0], 1
+    pmaddwd              m4, m7, m1 ; a0 b0
+    mova                 m1, m2
+    pmaddwd              m2, m8     ; a1 b1
+    paddd                m4, m2
+    pshufb               m2, m3, m12
+    pmaddwd              m2, m5
+    pshufb               m3, m13
+    pmaddwd              m3, m6
+    paddd                m2, m10
+    paddd                m3, m2
+    psrad                m3, 10
+    packssdw             m3, m3     ; 5 5   6 6
+    vperm2i128           m2, m0, m3, 0x21
+    mova                 m0, m3
+    punpckhwd            m2, m3     ; 45 56
+    pmaddwd              m3, m9, m2 ; a2 b2
+    paddd                m4, m10
+    paddd                m4, m3
+    psrad                m4, 10
+    vextracti128        xm3, m4, 1
+    packusdw            xm4, xm3
+    pminsw              xm4, xm11
+    movq       [dstq+dsq*0], xm4
+    movhps     [dstq+dsq*1], xm4
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w4_loop
+    RET
+.hv_w8:
+    WIN64_PUSH_XMM       16, 12
+    shr                 mxd, 16
+    vbroadcasti128      m12, [subpel_h_shufA]
+    vpbroadcastq         m2, [base+subpel_filters+1+mxq*8]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovs               myd, mxd
+    pmovsxbw            xm1, [base+subpel_filters+1+myq*8]
+    shl                  wd, 5
+    mov                  r6, ssq
+    sub                srcq, 4
+    pxor                 m0, m0
+    neg                  r6
+    punpcklbw            m0, m2
+    lea                  wd, [hq+wq-256]
+    test          dword r8m, 0x800
+    jz .hv_w8_10bit
+    psraw                m0, 2
+    psllw               xm1, 2
+.hv_w8_10bit:
+    pshufd               m7, m0, q0000
+    pshufd               m8, m0, q1111
+%if WIN64
+    %define v_mul (rsp+stack_offset+40) ; r4m
+%else
+    %define v_mul (rsp+stack_offset+ 8) ; r6m
+%endif
+    mova            [v_mul], xm1
+    pshufd               m9, m0, q2222
+.hv_w8_loop0:
+    vbroadcasti128       m0, [srcq+ssq*0+ 0]
+    vinserti128          m3, m0, [srcq+r6*2+ 0], 0
+    lea                  r7, [srcq+ssq*2]
+    vbroadcasti128       m2, [srcq+ssq*0+16]
+    vinserti128          m1, m2, [srcq+r6*2+16], 0
+    mov                  r8, dstq
+    vinserti128          m0, [r7  +ssq*0+ 0], 1
+    vinserti128          m2, [r7  +ssq*0+16], 1
+    shufpd               m4, m3, m1, 0x05
+%macro PUT_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+    pshufb              m%1, m12       ; 01 12 23 34
+    pshufb              m%2, m12       ; 45 56 67 78
+    pmaddwd             m%4, m7, m%1   ; a0
+    pshufb              m%3, m12       ; 89 9a ab bc
+    pmaddwd             m%5, m9, m%2   ; a2
+    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
+    paddd               m%4, m%5       ; a0+a2
+    pmaddwd             m%5, m7, m%2   ; b0
+    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
+    pmaddwd             m%3, m9        ; b2
+    pmaddwd             m%1, m8        ; a1
+    pmaddwd             m%2, m8        ; b1
+    paddd               m%3, m%5       ; b0+b2
+    paddd               m%4, m10
+    paddd               m%3, m10
+    paddd               m%1, m%4
+    paddd               m%2, m%3
+    psrad               m%1, 10
+    psrad               m%2, 10
+    packssdw            m%1, m%2
+%endmacro
+    PUT_6TAP_HV_H         3, 4, 1, 5, 6  ; 0 2
+    movu                xm4, [srcq+r6 *1+ 0]
+    vinserti128          m4, [srcq+ssq*1+ 0], 1
+    shufpd               m1, m0, m2, 0x05
+    PUT_6TAP_HV_H         0, 1, 2, 5, 6  ; 2 4
+    movu                xm2, [srcq+r6 *1+16]
+    vinserti128          m2, [srcq+ssq*1+16], 1
+    shufpd               m1, m4, m2, 0x05
+    PUT_6TAP_HV_H         4, 1, 2, 5, 6  ; 1 3
+    vpermq               m3, m3, q3120
+    vpermq               m4, m4, q3120
+    vpermq               m0, m0, q3120
+    punpcklwd            m1, m3, m4      ; 01
+    punpckhwd            m3, m4          ; 23
+    punpcklwd            m2, m4, m0      ; 12
+    punpckhwd            m4, m0          ; 34
+.hv_w8_loop:
+    vpbroadcastd        m15, [v_mul+4*0]
+    vpbroadcastd        m13, [v_mul+4*1]
+    movu                xm5, [r7+ssq*1+ 0]
+    movu                xm6, [r7+ssq*1+16]
+    lea                  r7, [r7+ssq*2]
+    pmaddwd             m14, m15, m1     ; a0
+    pmaddwd             m15, m2          ; b0
+    vinserti128          m5, [r7+ssq*0+ 0], 1
+    vinserti128          m6, [r7+ssq*0+16], 1
+    mova                 m1, m3
+    pmaddwd              m3, m13         ; a1
+    mova                 m2, m4
+    pmaddwd              m4, m13         ; b1
+    paddd               m14, m3
+    shufpd               m3, m5, m6, 0x05
+    paddd               m15, m4
+    PUT_6TAP_HV_H         5, 3, 6, 4, 13 ; 5 6
+    vpbroadcastd         m6, [v_mul+4*2]
+    vpermq               m5, m5, q3120
+    shufpd               m4, m0, m5, 0x05
+    mova                 m0, m5
+    punpcklwd            m3, m4, m5      ; 45
+    punpckhwd            m4, m5          ; 56
+    pmaddwd              m5, m6, m3      ; a2
+    pmaddwd              m6, m4          ; b2
+    paddd               m14, m10
+    paddd               m15, m10
+    paddd                m5, m14
+    paddd                m6, m15
+    psrad                m5, 10
+    psrad                m6, 10
+    packusdw             m5, m6
+    pminsw               m5, m11
+    vpermq               m5, m5, q3120
+    mova         [r8+dsq*0], xm5
+    vextracti128 [r8+dsq*1], m5, 1
+    lea                  r8, [r8+dsq*2]
+    sub                  hd, 2
+    jg .hv_w8_loop
+    add                srcq, 16
+    add                dstq, 16
+    movzx                hd, wb
+    sub                  wd, 1<<8
+    jg .hv_w8_loop0
+    RET
+
+PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_16bpc
+PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_16bpc
+PUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_16bpc
+PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_16bpc
+PUT_8TAP_FN sharp,          SHARP,   SHARP
+
+cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
+%define base r8-put_avx2
+    imul                mxd, mxm, 0x010101
+    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
+    imul                myd, mym, 0x010101
+    add                 myd, t1d ; 8tap_v, my, 4tap_v
+    lea                  r8, [put_avx2]
+    movifnidn            wd, wm
+    movifnidn            hd, hm
+    test                mxd, 0xf00
+    jnz .h
+    test                myd, 0xf00
+    jz mangle(private_prefix %+ _put_6tap_16bpc_avx2).put
 .v:
     movzx               mxd, myb
     shr                 myd, 16
@@ -1585,27 +2013,109 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
     sub                  wd, 1<<8
     jg .v_w8_loop0
     RET
-.hv:
-    WIN64_SPILL_XMM      16
-    vpbroadcastw        m15, r8m
+.h:
+    RESET_STACK_STATE
+    test                myd, 0xf00
+    jnz .hv
+    mov                 r7d, r8m
+    vpbroadcastw         m5, r8m
+    shr                 r7d, 11
+    vpbroadcastd         m4, [base+put_8tap_h_rnd+r7*4]
     cmp                  wd, 4
-    jg .hv_w8
-    movzx               mxd, mxb
-    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2]
-    movzx               mxd, myb
-    shr                 myd, 16
-    cmp                  hd, 6
-    cmovs               myd, mxd
-    vpbroadcastq         m1, [base+subpel_filters+myq*8]
-    vpbroadcastd         m6, [pd_512]
-    lea                  r6, [ssq*3]
-    sub                srcq, 2
-    sub                srcq, r6
-    pxor                 m7, m7
-    punpcklbw            m7, m0
-    punpcklbw            m1, m1
-    psraw                m1, 8 ; sign-extend
-    test          dword r8m, 0x800
+    jl mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w2
+    je mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w4
+    WIN64_SPILL_XMM      13
+    shr                 mxd, 16
+    sub                srcq, 6
+    vpbroadcastq         m0, [base+subpel_filters+mxq*8]
+    vbroadcasti128       m6, [subpel_h_shufA]
+    vbroadcasti128       m7, [subpel_h_shufB]
+    punpcklbw            m0, m0
+    psraw                m0, 8 ; sign-extend
+    pshufd               m8, m0, q0000
+    pshufd               m9, m0, q1111
+    pshufd              m10, m0, q2222
+    pshufd              m11, m0, q3333
+    sub                  wd, 16
+    jge .h_w16
+.h_w8:
+%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+    pshufb              m%4, m%1, m7   ; 2 3 3 4 4 5 5 6
+    pshufb              m%1, m6        ; 0 1 1 2 2 3 3 4
+    pmaddwd             m%5, m9, m%4   ; abcd1
+    pmaddwd             m%1, m8        ; abcd0
+    pshufb              m%2, m7        ; 6 7 7 8 8 9 9 a
+    shufpd              m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+    paddd               m%5, m4
+    paddd               m%1, m%5
+    pmaddwd             m%5, m11, m%2  ; abcd3
+    paddd               m%1, m%5
+    pmaddwd             m%5, m10, m%4  ; abcd2
+    pshufb              m%3, m7        ; a b b c c d d e
+    pmaddwd             m%4, m8        ; efgh0
+    paddd               m%1, m%5
+    pmaddwd             m%5, m9, m%2   ; efgh1
+    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
+    pmaddwd             m%3, m11       ; efgh3
+    pmaddwd             m%2, m10       ; efgh2
+    paddd               m%4, m4
+    paddd               m%4, m%5
+    paddd               m%3, m%4
+    paddd               m%2, m%3
+    psrad               m%1, 6
+    psrad               m%2, 6
+    packusdw            m%1, m%2
+    pminsw              m%1, m5
+%endmacro
+    movu                xm0, [srcq+ssq*0+ 0]
+    vinserti128          m0, [srcq+ssq*1+ 0], 1
+    movu                xm2, [srcq+ssq*0+16]
+    vinserti128          m2, [srcq+ssq*1+16], 1
+    lea                srcq, [srcq+ssq*2]
+    shufpd               m1, m0, m2, 0x05
+    PUT_8TAP_H            0, 1, 2, 3, 12
+    mova         [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], m0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w8
+    RET
+.h_w16:
+    mov                 r6d, wd
+.h_w16_loop:
+    movu                 m0, [srcq+r6*2+ 0]
+    movu                 m1, [srcq+r6*2+ 8]
+    movu                 m2, [srcq+r6*2+16]
+    PUT_8TAP_H            0, 1, 2, 3, 12
+    mova        [dstq+r6*2], m0
+    sub                 r6d, 16
+    jge .h_w16_loop
+    add                srcq, ssq
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w16
+    RET
+.hv:
+    WIN64_SPILL_XMM      16
+    vpbroadcastw        m15, r8m
+    cmp                  wd, 4
+    jg .hv_w8
+    movzx               mxd, mxb
+    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovs               myd, mxd
+    vpbroadcastq         m1, [base+subpel_filters+myq*8]
+    vpbroadcastd         m6, [pd_512]
+    lea                  r6, [ssq*3]
+    sub                srcq, 2
+    sub                srcq, r6
+    pxor                 m7, m7
+    punpcklbw            m7, m0
+    punpcklbw            m1, m1
+    psraw                m1, 8 ; sign-extend
+    test          dword r8m, 0x800
     jz .hv_10bit
     psraw                m7, 2
     psllw                m1, 2
@@ -1788,14 +2298,9 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
 .hv_w8_10bit:
     pshufd              m11, m0, q0000
     pshufd              m12, m0, q1111
+    mova            [v_mul], xm1
     pshufd              m13, m0, q2222
     pshufd              m14, m0, q3333
-%if WIN64
-    %define v_mul (rsp+stack_offset+40) ; r4m
-%else
-    %define v_mul (rsp+stack_offset+8) ; r6m
-%endif
-    mova            [v_mul], xm1
 .hv_w8_loop0:
 %macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
     pshufb               m2, m%1, m9   ; 2 3 3 4 4 5 5 6
@@ -1954,28 +2459,24 @@ DECLARE_REG_TMP 6, 7
 %endif
 
 %define PREP_8TAP_FN FN prep_8tap,
-PREP_8TAP_FN sharp,          SHARP,   SHARP
-PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
-PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
-PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH
-PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR
-PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
-PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR
-PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_16bpc
+PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_16bpc
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_16bpc
 PREP_8TAP_FN regular,        REGULAR, REGULAR
 
-cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
+cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
 %define base r7-prep_avx2
     imul                mxd, mxm, 0x010101
-    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
+    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
     imul                myd, mym, 0x010101
-    add                 myd, t1d ; 8tap_v, my, 4tap_v
+    add                 myd, t1d ; 6tap_v, my, 4tap_v
     lea                  r7, [prep_avx2]
     movifnidn            hd, hm
     test                mxd, 0xf00
     jnz .h
     test                myd, 0xf00
     jnz .v
+.prep:
     tzcnt                wd, wd
     mov                 r6d, r7m ; bitdepth_max
     movzx                wd, word [r7+wq*2+table_offset(prep,)]
@@ -1983,7 +2484,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
     shr                 r6d, 11
     add                  wq, r7
     vpbroadcastd         m4, [base+prep_mul+r6*4]
-    lea                  r6, [strideq*3]
+    lea                  r6, [ssq*3]
 %if WIN64
     pop                  r7
 %endif
@@ -1993,6 +2494,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
     sub                srcq, 2
     pmovsxbw            xm0, [base+subpel_filters+mxq*8]
     vbroadcasti128       m3, [subpel_h_shufA]
+    lea                  r6, [ssq*3]
     vbroadcasti128       m4, [subpel_h_shufB]
     WIN64_SPILL_XMM       8
     pshufd              xm0, xm0, q2211
@@ -2003,11 +2505,11 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
     vpbroadcastq         m6, xm0
     vpermq               m7, m0, q1111
 .h_w4_loop:
-    movu                xm1, [srcq+strideq*0]
-    vinserti128          m1, [srcq+strideq*2], 1
-    movu                xm2, [srcq+strideq*1]
-    vinserti128          m2, [srcq+r6       ], 1
-    lea                srcq, [srcq+strideq*4]
+    movu                xm1, [srcq+ssq*0]
+    vinserti128          m1, [srcq+ssq*2], 1
+    movu                xm2, [srcq+ssq*1]
+    vinserti128          m2, [srcq+r6 *1], 1
+    lea                srcq, [srcq+ssq*4]
     pshufb               m0, m1, m3 ; 0 1 1 2 2 3 3 4
     pshufb               m1, m4     ; 2 3 3 4 4 5 5 6
     pmaddwd              m0, m6
@@ -2032,62 +2534,54 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
     test                myd, 0xf00
     jnz .hv
     vpbroadcastd         m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
-    lea                  r6, [strideq*3]
     cmp                  wd, 4
     je .h_w4
     shr                 mxd, 16
-    sub                srcq, 6
-    vpbroadcastq         m0, [base+subpel_filters+mxq*8]
-    WIN64_SPILL_XMM      12
+    sub                srcq, 4
+    vpbroadcastq         m0, [base+subpel_filters+1+mxq*8]
+    WIN64_SPILL_XMM      10
     vbroadcasti128       m6, [subpel_h_shufA]
-    vbroadcasti128       m7, [subpel_h_shufB]
     punpcklbw            m0, m0
     psraw                m0, 8 ; sign-extend
     test          dword r7m, 0x800
     jnz .h_12bpc
     psllw                m0, 2
 .h_12bpc:
-    pshufd               m8, m0, q0000
-    pshufd               m9, m0, q1111
-    pshufd              m10, m0, q2222
-    pshufd              m11, m0, q3333
+    pshufd               m7, m0, q0000
+    pshufd               m8, m0, q1111
+    pshufd               m9, m0, q2222
     cmp                  wd, 8
     jg .h_w16
 .h_w8:
-%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
-    pshufb              m%4, m%1, m7   ; 2 3 3 4 4 5 5 6
-    pshufb              m%1, m6        ; 0 1 1 2 2 3 3 4
-    pmaddwd             m%5, m9, m%4   ; abcd1
-    pmaddwd             m%1, m8        ; abcd0
-    pshufb              m%2, m7        ; 6 7 7 8 8 9 9 a
-    shufpd              m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
-    paddd               m%5, m5
-    paddd               m%1, m%5
-    pmaddwd             m%5, m11, m%2  ; abcd3
-    paddd               m%1, m%5
-    pmaddwd             m%5, m10, m%4  ; abcd2
-    pshufb              m%3, m7        ; a b b c c d d e
-    pmaddwd             m%4, m8        ; efgh0
-    paddd               m%1, m%5
-    pmaddwd             m%5, m9, m%2   ; efgh1
-    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
-    pmaddwd             m%3, m11       ; efgh3
-    pmaddwd             m%2, m10       ; efgh2
+    movu                xm0, [srcq+ssq*0+ 0]
+    vinserti128          m0, [srcq+ssq*1+ 0], 1
+    movu                xm2, [srcq+ssq*0+16]
+    vinserti128          m2, [srcq+ssq*1+16], 1
+    lea                srcq, [srcq+ssq*2]
+    shufpd               m1, m0, m2, 0x05
+%macro PREP_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+    pshufb              m%1, m6        ; 01 12 23 34
+    pshufb              m%2, m6        ; 45 56 67 78
+    pmaddwd             m%4, m7, m%1   ; a0
+    pshufb              m%3, m6        ; 89 9a ab bc
+    pmaddwd             m%5, m9, m%2   ; a2
+    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
+    paddd               m%4, m%5       ; a0+a2
+    pmaddwd             m%5, m7, m%2   ; b0
+    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
+    pmaddwd             m%3, m9        ; b2
+    pmaddwd             m%1, m8        ; a1
+    pmaddwd             m%2, m8        ; b1
+    paddd               m%3, m%5       ; b0+b2
     paddd               m%4, m5
-    paddd               m%4, m%5
-    paddd               m%3, m%4
+    paddd               m%3, m5
+    paddd               m%1, m%4
     paddd               m%2, m%3
     psrad               m%1, 4
     psrad               m%2, 4
     packssdw            m%1, m%2
 %endmacro
-    movu                xm0, [srcq+strideq*0+ 0]
-    vinserti128          m0, [srcq+strideq*1+ 0], 1
-    movu                xm2, [srcq+strideq*0+16]
-    vinserti128          m2, [srcq+strideq*1+16], 1
-    lea                srcq, [srcq+strideq*2]
-    shufpd               m1, m0, m2, 0x05
-    PREP_8TAP_H           0, 1, 2, 3, 4
+    PREP_6TAP_H           0, 1, 2, 3, 4
     mova             [tmpq], m0
     add                tmpq, 32
     sub                  hd, 2
@@ -2101,15 +2595,370 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
     movu                 m0, [srcq+r6-32]
     movu                 m1, [srcq+r6-24]
     movu                 m2, [srcq+r6-16]
-    PREP_8TAP_H           0, 1, 2, 3, 4
+    PREP_6TAP_H           0, 1, 2, 3, 4
     mova       [tmpq+r6-32], m0
     sub                 r6d, 32
     jg .h_w16_loop
-    add                srcq, strideq
+    add                srcq, ssq
     add                tmpq, wq
     dec                  hd
     jg .h_w16_loop0
     RET
+.v:
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 4
+    cmove               myd, mxd
+    vpbroadcastq         m0, [base+subpel_filters+1+myq*8]
+    WIN64_SPILL_XMM       9, 12
+    vpbroadcastd         m5, [prep_8tap_1d_rnd]
+    mov                  r6, ssq
+    punpcklbw            m0, m0
+    neg                  r6
+    psraw                m0, 8 ; sign-extend
+    test          dword r7m, 0x800
+    jnz .v_12bpc
+    psllw                m0, 2
+.v_12bpc:
+    pshufd               m6, m0, q0000
+    pshufd               m7, m0, q1111
+    pshufd               m8, m0, q2222
+    cmp                  wd, 4
+    jg .v_w8
+.v_w4:
+    movq                xm1, [srcq+r6 *2]
+    vpbroadcastq         m3, [srcq+r6 *1]
+    vpbroadcastq         m2, [srcq+ssq*0]
+    vpbroadcastq         m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vpbroadcastq         m0, [srcq+ssq*0]
+    vpblendd             m1, m3, 0x30
+    vpblendd             m3, m2, 0x30
+    punpcklwd            m1, m3     ; 01 12
+    vpblendd             m2, m4, 0x30
+    vpblendd             m4, m0, 0x30
+    punpcklwd            m2, m4     ; 23 34
+.v_w4_loop:
+    vpbroadcastq         m3, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pmaddwd              m4, m6, m1 ; a0 b0
+    mova                 m1, m2
+    pmaddwd              m2, m7     ; a1 b1
+    paddd                m4, m2
+    vpblendd             m2, m0, m3, 0x30
+    vpbroadcastq         m0, [srcq+ssq*0]
+    vpblendd             m3, m0, 0x30
+    punpcklwd            m2, m3     ; 45 56
+    pmaddwd              m3, m8, m2 ; a2 b2
+    paddd                m4, m5
+    paddd                m4, m3
+    psrad                m4, 4
+    vextracti128        xm3, m4, 1
+    packssdw            xm4, xm3
+    mova             [tmpq], xm4
+    add                tmpq, 16
+    sub                  hd, 2
+    jg .v_w4_loop
+    RET
+.v_w8:
+    WIN64_PUSH_XMM       12
+%if WIN64
+    push                 r8
+%endif
+    mov                 r8d, wd
+    shl                  wd, 5
+    lea                  wd, [hq+wq-256]
+.v_w8_loop0:
+    vbroadcasti128       m3, [srcq+r6 *2]
+    vbroadcasti128       m4, [srcq+r6 *1]
+    lea                  r5, [srcq+ssq*2]
+    vbroadcasti128       m0, [srcq+ssq*0]
+    vbroadcasti128       m1, [srcq+ssq*1]
+    mov                  r7, tmpq
+    vbroadcasti128       m2, [r5+ssq*0]
+    shufpd               m3, m0, 0x0c
+    shufpd               m4, m1, 0x0c
+    punpcklwd            m1, m3, m4 ; 01
+    punpckhwd            m3, m4     ; 23
+    shufpd               m0, m2, 0x0c
+    punpcklwd            m2, m4, m0 ; 12
+    punpckhwd            m4, m0     ; 34
+.v_w8_loop:
+    vbroadcasti128       m9, [r5+ssq*1]
+    pmaddwd             m10, m6, m1 ; a0
+    lea                  r5, [r5+ssq*2]
+    pmaddwd             m11, m6, m2 ; b0
+    mova                 m1, m3
+    pmaddwd              m3, m7     ; a1
+    mova                 m2, m4
+    pmaddwd              m4, m7     ; b1
+    paddd               m10, m5
+    paddd               m11, m5
+    paddd               m10, m3
+    vbroadcasti128       m3, [r5+ssq*0]
+    paddd               m11, m4
+    shufpd               m4, m0, m9, 0x0d
+    shufpd               m0, m9, m3, 0x0c
+    punpcklwd            m3, m4, m0 ; 45
+    punpckhwd            m4, m0     ; 56
+    pmaddwd              m9, m8, m3 ; a2
+    paddd               m10, m9
+    pmaddwd              m9, m8, m4 ; b2
+    paddd               m11, m9
+    psrad               m10, 4
+    psrad               m11, 4
+    packssdw            m10, m11
+    vpermq              m10, m10, q3120
+    mova          [r7+r8*0], xm10
+    vextracti128  [r7+r8*2], m10, 1
+    lea                  r7, [r7+r8*4]
+    sub                  hd, 2
+    jg .v_w8_loop
+    add                srcq, 16
+    add                tmpq, 16
+    movzx                hd, wb
+    sub                  wd, 1<<8
+    jg .v_w8_loop0
+%if WIN64
+    pop                  r8
+%endif
+    RET
+.hv:
+    WIN64_SPILL_XMM      13, 15
+    vpbroadcastd         m7, [prep_8tap_2d_rnd]
+    vbroadcasti128       m8, [subpel_h_shufA]
+    cmp                  wd, 4
+    jg .hv_w8
+    movzx               mxd, mxb
+    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 4
+    cmove               myd, mxd
+    vpbroadcastq         m1, [base+subpel_filters+1+myq*8]
+    mov                  r6, ssq
+    sub                srcq, 2
+    pxor                 m6, m6
+    neg                  r6
+    punpcklbw            m6, m0
+    punpcklbw            m1, m1
+    psraw                m6, 4
+    psraw                m1, 8
+    test          dword r7m, 0x800
+    jz .hv_w4_10bit
+    psraw                m6, 2
+.hv_w4_10bit:
+    pshufd              m10, m1, q0000
+    pshufd              m11, m1, q1111
+    pshufd              m12, m1, q2222
+.hv_w4:
+    movu                xm2, [srcq+r6 *2]
+    vinserti128          m2, [srcq+r6 *1], 1 ; 0 1
+    pshufd               m5, m6, q0000
+    vbroadcasti128       m9, [base+subpel_h_shufB]
+    movu                xm0, [srcq+ssq*0]
+    pshufd               m6, m6, q1111
+    vinserti128          m0, [srcq+ssq*1], 1 ; 2 3
+    lea                srcq, [srcq+ssq*2]
+    movu                xm3, [srcq+ssq*0]    ; 4
+    pshufb               m1, m2, m8
+    pmaddwd              m1, m5
+    pshufb               m2, m9
+    pmaddwd              m2, m6
+    pshufb               m4, m0, m8
+    pmaddwd              m4, m5
+    pshufb               m0, m9
+    pmaddwd              m0, m6
+    paddd                m2, m1
+    pshufb              xm1, xm3, xm8
+    pmaddwd             xm1, xm5
+    pshufb              xm3, xm9
+    pmaddwd             xm3, xm6
+    paddd                m0, m4
+    paddd                m2, m7
+    paddd               xm1, xm7
+    paddd                m0, m7
+    paddd               xm3, xm1
+    REPX       {psrad x, 6}, m2, m0, xm3
+    packssdw             m2, m0      ; 0 2   1 3
+    packssdw            xm0, xm3     ; 2 4
+    vperm2i128           m0, m2, 0x03
+    punpcklwd            m1, m2, m0  ; 01 12
+    punpckhwd            m2, m0      ; 23 34
+.hv_w4_loop:
+    movu                xm3, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vinserti128          m3, [srcq+ssq*0], 1
+    pmaddwd              m4, m10, m1 ; a0 b0
+    mova                 m1, m2
+    pmaddwd              m2, m11     ; a1 b1
+    paddd                m4, m2
+    pshufb               m2, m3, m8
+    pmaddwd              m2, m5
+    pshufb               m3, m9
+    pmaddwd              m3, m6
+    paddd                m2, m7
+    paddd                m3, m2
+    psrad                m3, 6
+    packssdw             m3, m3      ; 5 5   6 6
+    vperm2i128           m2, m0, m3, 0x21
+    mova                 m0, m3
+    punpckhwd            m2, m3      ; 45 56
+    pmaddwd              m3, m12, m2 ; a2 b2
+    paddd                m4, m7
+    paddd                m4, m3
+    psrad                m4, 6
+    vextracti128        xm3, m4, 1
+    packssdw            xm4, xm3
+    mova             [tmpq], xm4
+    add                tmpq, 16
+    sub                  hd, 2
+    jg .hv_w4_loop
+    RET
+.hv_w8:
+    shr                 mxd, 16
+    vpbroadcastq         m2, [base+subpel_filters+1+mxq*8]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 4
+    cmove               myd, mxd
+    pmovsxbw            xm1, [base+subpel_filters+1+myq*8]
+    WIN64_PUSH_XMM       15
+%if WIN64
+    PUSH                 r8
+%endif
+    mov                 r8d, wd
+    shl                  wd, 5
+    mov                  r6, ssq
+    sub                srcq, 4
+    neg                  r6
+    lea                  wd, [hq+wq-256]
+    pxor                 m0, m0
+    punpcklbw            m0, m2
+    psraw                m0, 4
+    test          dword r7m, 0x800
+    jz .hv_w8_10bit
+    psraw                m0, 2
+.hv_w8_10bit:
+    pshufd              m10, m0, q0000
+    pshufd              m11, m0, q1111
+    mova            [v_mul], xm1
+    pshufd              m12, m0, q2222
+.hv_w8_loop0:
+    vbroadcasti128       m0, [srcq+ssq*0+ 0]
+    vinserti128          m3, m0, [srcq+r6*2+ 0], 0
+    lea                  r5, [srcq+ssq*2]
+    vbroadcasti128       m2, [srcq+ssq*0+16]
+    vinserti128          m1, m2, [srcq+r6*2+16], 0
+    mov                  r7, tmpq
+    vinserti128          m0, [r5  +ssq*0+ 0], 1
+    vinserti128          m2, [r5  +ssq*0+16], 1
+    shufpd               m4, m3, m1, 0x05
+%macro PREP_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+    pshufb              m%1, m8        ; 01 12 23 34
+    pshufb              m%2, m8        ; 45 56 67 78
+    pmaddwd             m%4, m10, m%1  ; a0
+    pshufb              m%3, m8        ; 89 9a ab bc
+    pmaddwd             m%5, m12, m%2  ; a2
+    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
+    paddd               m%4, m%5       ; a0+a2
+    pmaddwd             m%5, m10, m%2  ; b0
+    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
+    pmaddwd             m%3, m12       ; b2
+    pmaddwd             m%1, m11       ; a1
+    pmaddwd             m%2, m11       ; b1
+    paddd               m%3, m%5       ; b0+b2
+    paddd               m%4, m7
+    paddd               m%3, m7
+    paddd               m%1, m%4
+    paddd               m%2, m%3
+    psrad               m%1, 6
+    psrad               m%2, 6
+    packssdw            m%1, m%2
+%endmacro
+    PREP_6TAP_HV_H        3, 4, 1, 5, 6  ; 0 2
+    movu                xm4, [srcq+r6 *1+ 0]
+    vinserti128          m4, [srcq+ssq*1+ 0], 1
+    shufpd               m1, m0, m2, 0x05
+    PREP_6TAP_HV_H        0, 1, 2, 5, 6  ; 2 4
+    movu                xm2, [srcq+r6 *1+16]
+    vinserti128          m2, [srcq+ssq*1+16], 1
+    shufpd               m1, m4, m2, 0x05
+    PREP_6TAP_HV_H        4, 1, 2, 5, 6  ; 1 3
+    vpermq               m3, m3, q3120
+    vpermq               m4, m4, q3120
+    vpermq               m0, m0, q3120
+    punpcklwd            m1, m3, m4     ; 01
+    punpckhwd            m3, m4         ; 23
+    punpcklwd            m2, m4, m0     ; 12
+    punpckhwd            m4, m0         ; 34
+.hv_w8_loop:
+    vpbroadcastd        m14, [v_mul+4*0]
+    vpbroadcastd         m9, [v_mul+4*1]
+    movu                xm5, [r5+ssq*1+ 0]
+    movu                xm6, [r5+ssq*1+16]
+    lea                  r5, [r5+ssq*2]
+    pmaddwd             m13, m14, m1    ; a0
+    pmaddwd             m14, m2         ; b0
+    vinserti128          m5, [r5+ssq*0+ 0], 1
+    vinserti128          m6, [r5+ssq*0+16], 1
+    mova                 m1, m3
+    pmaddwd              m3, m9         ; a1
+    mova                 m2, m4
+    pmaddwd              m4, m9         ; b1
+    paddd               m13, m3
+    shufpd               m3, m5, m6, 0x05
+    paddd               m14, m4
+    PREP_6TAP_HV_H        5, 3, 6, 4, 9 ; 5 6
+    vpbroadcastd         m6, [v_mul+4*2]
+    vpermq               m5, m5, q3120
+    shufpd               m4, m0, m5, 0x05
+    mova                 m0, m5
+    punpcklwd            m3, m4, m5     ; 45
+    punpckhwd            m4, m5         ; 56
+    pmaddwd              m5, m6, m3     ; a2
+    pmaddwd              m6, m4         ; b2
+    paddd               m13, m7
+    paddd               m14, m7
+    paddd                m5, m13
+    paddd                m6, m14
+    psrad                m5, 6
+    psrad                m6, 6
+    packssdw             m5, m6
+    vpermq               m5, m5, q3120
+    mova          [r7+r8*0], xm5
+    vextracti128  [r7+r8*2], m5, 1
+    lea                  r7, [r7+r8*4]
+    sub                  hd, 2
+    jg .hv_w8_loop
+    add                srcq, 16
+    add                tmpq, 16
+    movzx                hd, wb
+    sub                  wd, 1<<8
+    jg .hv_w8_loop0
+%if WIN64
+    POP                  r8
+%endif
+    RET
+
+PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_16bpc
+PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_16bpc
+PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_16bpc
+PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_16bpc
+PREP_8TAP_FN sharp,          SHARP,   SHARP
+
+cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
+%define base r7-prep_avx2
+    imul                mxd, mxm, 0x010101
+    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
+    imul                myd, mym, 0x010101
+    add                 myd, t1d ; 8tap_v, my, 4tap_v
+    lea                  r7, [prep_avx2]
+    movifnidn            hd, hm
+    test                mxd, 0xf00
+    jnz .h
+    test                myd, 0xf00
+    jz mangle(private_prefix %+ _prep_6tap_16bpc_avx2).prep
 .v:
     movzx               mxd, myb
     shr                 myd, 16
@@ -2251,6 +3100,87 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
     pop                  r8
 %endif
     RET
+.h:
+    test                myd, 0xf00
+    jnz .hv
+    vpbroadcastd         m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
+    cmp                  wd, 4
+    je mangle(private_prefix %+ _prep_6tap_16bpc_avx2).h_w4
+    shr                 mxd, 16
+    sub                srcq, 6
+    vpbroadcastq         m0, [base+subpel_filters+mxq*8]
+    WIN64_SPILL_XMM      12
+    vbroadcasti128       m6, [subpel_h_shufA]
+    vbroadcasti128       m7, [subpel_h_shufB]
+    punpcklbw            m0, m0
+    psraw                m0, 8 ; sign-extend
+    test          dword r7m, 0x800
+    jnz .h_12bpc
+    psllw                m0, 2
+.h_12bpc:
+    pshufd               m8, m0, q0000
+    pshufd               m9, m0, q1111
+    pshufd              m10, m0, q2222
+    pshufd              m11, m0, q3333
+    cmp                  wd, 8
+    jg .h_w16
+.h_w8:
+%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+    pshufb              m%4, m%1, m7   ; 2 3 3 4 4 5 5 6
+    pshufb              m%1, m6        ; 0 1 1 2 2 3 3 4
+    pmaddwd             m%5, m9, m%4   ; abcd1
+    pmaddwd             m%1, m8        ; abcd0
+    pshufb              m%2, m7        ; 6 7 7 8 8 9 9 a
+    shufpd              m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+    paddd               m%5, m5
+    paddd               m%1, m%5
+    pmaddwd             m%5, m11, m%2  ; abcd3
+    paddd               m%1, m%5
+    pmaddwd             m%5, m10, m%4  ; abcd2
+    pshufb              m%3, m7        ; a b b c c d d e
+    pmaddwd             m%4, m8        ; efgh0
+    paddd               m%1, m%5
+    pmaddwd             m%5, m9, m%2   ; efgh1
+    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
+    pmaddwd             m%3, m11       ; efgh3
+    pmaddwd             m%2, m10       ; efgh2
+    paddd               m%4, m5
+    paddd               m%4, m%5
+    paddd               m%3, m%4
+    paddd               m%2, m%3
+    psrad               m%1, 4
+    psrad               m%2, 4
+    packssdw            m%1, m%2
+%endmacro
+    movu                xm0, [srcq+strideq*0+ 0]
+    vinserti128          m0, [srcq+strideq*1+ 0], 1
+    movu                xm2, [srcq+strideq*0+16]
+    vinserti128          m2, [srcq+strideq*1+16], 1
+    lea                srcq, [srcq+strideq*2]
+    shufpd               m1, m0, m2, 0x05
+    PREP_8TAP_H           0, 1, 2, 3, 4
+    mova             [tmpq], m0
+    add                tmpq, 32
+    sub                  hd, 2
+    jg .h_w8
+    RET
+.h_w16:
+    add                  wd, wd
+.h_w16_loop0:
+    mov                 r6d, wd
+.h_w16_loop:
+    movu                 m0, [srcq+r6-32]
+    movu                 m1, [srcq+r6-24]
+    movu                 m2, [srcq+r6-16]
+    PREP_8TAP_H           0, 1, 2, 3, 4
+    mova       [tmpq+r6-32], m0
+    sub                 r6d, 32
+    jg .h_w16_loop
+    add                srcq, strideq
+    add                tmpq, wq
+    dec                  hd
+    jg .h_w16_loop0
+    RET
 .hv:
     WIN64_SPILL_XMM      16
     vpbroadcastd        m15, [prep_8tap_2d_rnd]
@@ -4213,14 +5143,14 @@ DECLARE_REG_TMP 6, 8
 
 %define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
 BILIN_SCALED_FN put
-PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP
-PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH
-PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP
-PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH
-PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR
-PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
-PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR
-PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_16bpc
 PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
 MC_8TAP_SCALED put
 
@@ -4232,14 +5162,14 @@ DECLARE_REG_TMP 6, 7
 
 %define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
 BILIN_SCALED_FN prep
-PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP
-PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH
-PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP
-PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH
-PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR
-PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
-PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR
-PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_16bpc
 PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
 MC_8TAP_SCALED prep
 
-- 
GitLab