diff --git a/NEWS b/NEWS
index 16825ff87e4b0b73f25ff10ad99b708c9e49a00c..0302484c1b69bed02c5d0fcd53c186c8c2ddee9f 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,27 @@
+Changes for 1.5.0 'Road Runner':
+--------------------------------
+
+1.5.0 is a major release of dav1d that:
+ - WARNING: we removed some of the SSE2 optimizations, so if you care about
+            systems without SSSE3, you should be careful when updating!
+ - Add Arm OpenBSD run-time CPU feature detection
+ - Optimize index offset calculations for decode_coefs
+ - picture: copy HDR10+ and T35 metadata only to visible frames
+ - New SSSE3 optimizations for 6-tap filters (8-bit and HBD)
+ - AArch64/SVE: Add HBD subpel filters using 128-bit SVE2
+ - AArch64: Add USMMLA implementation for 6-tap H/HV
+ - AArch64: Optimize Armv8.0 NEON for HBD horizontal filters and 6-tap filters
+ - Allow playing videos in full-screen mode in dav1dplay
+
+
+Changes for 1.4.3 'Road Runner':
+--------------------------------
+
+1.4.3 is a small release focused on security issues:
+ - AArch64: Fix potential out of bounds access in DotProd H/HV filters
+ - cli: Prevent buffer over-read
+
+
 Changes for 1.4.2 'Road Runner':
 --------------------------------
 
diff --git a/examples/dav1dplay.c b/examples/dav1dplay.c
index 9cca8e8472b132522b5dc33c6b8126407735ee95..1f649444830f507e1c7419ac09398ada81bc0ced 100644
--- a/examples/dav1dplay.c
+++ b/examples/dav1dplay.c
@@ -120,6 +120,7 @@ static void dp_settings_print_usage(const char *const app,
             " --highquality:        enable high quality rendering\n"
             " --zerocopy/-z:        enable zero copy upload path\n"
             " --gpugrain/-g:        enable GPU grain synthesis\n"
+            " --fullscreen/-f:      enable full screen mode\n"
             " --version/-v:         print version and exit\n"
             " --renderer/-r:        select renderer backend (default: auto)\n");
     exit(1);
@@ -144,7 +145,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
     Dav1dSettings *lib_settings = &rd_ctx->lib_settings;
 
     // Short options
-    static const char short_opts[] = "i:vuzgr:";
+    static const char short_opts[] = "i:vuzgfr:";
 
     enum {
         ARG_THREADS = 256,
@@ -162,6 +163,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
         { "highquality",    0, NULL, ARG_HIGH_QUALITY },
         { "zerocopy",       0, NULL, 'z' },
         { "gpugrain",       0, NULL, 'g' },
+        { "fullscreen",     0, NULL, 'f'},
         { "renderer",       0, NULL, 'r'},
         { NULL,             0, NULL, 0 },
     };
@@ -186,6 +188,9 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
             case 'g':
                 settings->gpugrain = true;
                 break;
+            case 'f':
+                settings->fullscreen = true;
+                break;
             case 'r':
                 settings->renderer_name = optarg;
                 break;
@@ -240,35 +245,37 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv)
         return NULL;
     }
 
+    // Parse and validate arguments
+    dav1d_default_settings(&rd_ctx->lib_settings);
+    memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings));
+    dp_rd_ctx_parse_args(rd_ctx, argc, argv);
+
+    // Init SDL2 library
+    if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) < 0) {
+        fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
+        goto fail;
+    }
+
     // Register a custom event to notify our SDL main thread
     // about new frames
     rd_ctx->event_types = SDL_RegisterEvents(3);
     if (rd_ctx->event_types == UINT32_MAX) {
         fprintf(stderr, "Failure to create custom SDL event types!\n");
-        free(rd_ctx);
-        return NULL;
+        goto fail;
     }
 
     rd_ctx->fifo = dp_fifo_create(5);
     if (rd_ctx->fifo == NULL) {
         fprintf(stderr, "Failed to create FIFO for output pictures!\n");
-        free(rd_ctx);
-        return NULL;
+        goto fail;
     }
 
     rd_ctx->lock = SDL_CreateMutex();
     if (rd_ctx->lock == NULL) {
         fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError());
-        dp_fifo_destroy(rd_ctx->fifo);
-        free(rd_ctx);
-        return NULL;
+        goto fail;
     }
 
-    // Parse and validate arguments
-    dav1d_default_settings(&rd_ctx->lib_settings);
-    memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings));
-    dp_rd_ctx_parse_args(rd_ctx, argc, argv);
-
     // Select renderer
     renderer_info = dp_get_renderer(rd_ctx->settings.renderer_name);
 
@@ -279,15 +286,21 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv)
         printf("Using %s renderer\n", renderer_info->name);
     }
 
-    rd_ctx->rd_priv = (renderer_info) ? renderer_info->create_renderer() : NULL;
+    rd_ctx->rd_priv = (renderer_info) ? renderer_info->create_renderer(&rd_ctx->settings) : NULL;
     if (rd_ctx->rd_priv == NULL) {
-        SDL_DestroyMutex(rd_ctx->lock);
-        dp_fifo_destroy(rd_ctx->fifo);
-        free(rd_ctx);
-        return NULL;
+        goto fail;
     }
 
     return rd_ctx;
+
+fail:
+    if (rd_ctx->lock)
+        SDL_DestroyMutex(rd_ctx->lock);
+    if (rd_ctx->fifo)
+        dp_fifo_destroy(rd_ctx->fifo);
+    free(rd_ctx);
+    SDL_Quit();
+    return NULL;
 }
 
 /**
@@ -662,10 +675,6 @@ int main(int argc, char **argv)
         return 1;
     }
 
-    // Init SDL2 library
-    if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) < 0)
-        return 10;
-
     // Create render context
     Dav1dPlayRenderContext *rd_ctx = dp_rd_ctx_create(argc, argv);
     if (rd_ctx == NULL) {
@@ -711,9 +720,7 @@ int main(int argc, char **argv)
             if (e->type == SDL_QUIT) {
                 dp_rd_ctx_request_shutdown(rd_ctx);
                 dp_fifo_flush(rd_ctx->fifo, destroy_pic);
-                SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME);
-                SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_SEEK_FRAME);
-                num_frame_events = 0;
+                goto out;
             } else if (e->type == SDL_WINDOWEVENT) {
                 if (e->window.event == SDL_WINDOWEVENT_SIZE_CHANGED) {
                     // TODO: Handle window resizes
@@ -724,6 +731,10 @@ int main(int argc, char **argv)
                 SDL_KeyboardEvent *kbde = (SDL_KeyboardEvent *)e;
                 if (kbde->keysym.sym == SDLK_SPACE) {
                     dp_rd_ctx_toggle_pause(rd_ctx);
+                } else if (kbde->keysym.sym == SDLK_ESCAPE) {
+                    dp_rd_ctx_request_shutdown(rd_ctx);
+                    dp_fifo_flush(rd_ctx->fifo, destroy_pic);
+                    goto out;
                 } else if (kbde->keysym.sym == SDLK_LEFT ||
                            kbde->keysym.sym == SDLK_RIGHT)
                 {
@@ -776,5 +787,6 @@ out:;
     int decoder_ret = 0;
     SDL_WaitThread(decoder_thread, &decoder_ret);
     dp_rd_ctx_destroy(rd_ctx);
+    SDL_Quit();
     return decoder_ret;
 }
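
The dav1dplay changes above consolidate the error paths of dp_rd_ctx_create() behind a single fail: label, so every failure releases exactly the resources that were already set up, and SDL_Quit() now pairs with the SDL_Init() call that moved into the same function. A minimal sketch of that goto-cleanup pattern, using hypothetical resources rather than the real SDL/FIFO handles:

    #include <stdlib.h>

    /* Hypothetical context standing in for Dav1dPlayRenderContext. */
    typedef struct {
        int *fifo;   /* stand-in for the picture FIFO */
        int *lock;   /* stand-in for the SDL mutex    */
    } ctx_t;

    static ctx_t *ctx_create(void) {
        ctx_t *c = calloc(1, sizeof(*c));  /* zero-init so cleanup can test each field */
        if (!c)
            return NULL;

        c->fifo = malloc(64);
        if (!c->fifo)
            goto fail;

        c->lock = malloc(64);
        if (!c->lock)
            goto fail;

        return c;

    fail:
        /* Single cleanup path: frees only what was actually allocated. */
        free(c->lock);
        free(c->fifo);
        free(c);
        return NULL;
    }

    int main(void) {
        ctx_t *c = ctx_create();
        if (c) {
            free(c->lock);
            free(c->fifo);
            free(c);
        }
        return 0;
    }
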
diff --git a/examples/dp_renderer.h b/examples/dp_renderer.h
index 354e140a48376303622872b174780c5f6f32c352..513e2ad61fb5fbf1c7faa1fcdf62671e6e47fbeb 100644
--- a/examples/dp_renderer.h
+++ b/examples/dp_renderer.h
@@ -30,22 +30,32 @@
 #include "dav1d/dav1d.h"
 
 #include <SDL.h>
-#ifdef HAVE_PLACEBO
+#if HAVE_PLACEBO
 # include <libplacebo/config.h>
 #endif
 
 // Check libplacebo Vulkan rendering
-#if defined(HAVE_VULKAN) && defined(SDL_VIDEO_VULKAN)
+#if HAVE_VULKAN && defined(SDL_VIDEO_VULKAN)
 # if defined(PL_HAVE_VULKAN) && PL_HAVE_VULKAN
-#  define HAVE_RENDERER_PLACEBO
-#  define HAVE_PLACEBO_VULKAN
+#  define HAVE_RENDERER_PLACEBO 1
+#  define HAVE_PLACEBO_VULKAN 1
 # endif
 #endif
 
 // Check libplacebo OpenGL rendering
 #if defined(PL_HAVE_OPENGL) && PL_HAVE_OPENGL
-# define HAVE_RENDERER_PLACEBO
-# define HAVE_PLACEBO_OPENGL
+# define HAVE_RENDERER_PLACEBO 1
+# define HAVE_PLACEBO_OPENGL 1
+#endif
+
+#ifndef HAVE_RENDERER_PLACEBO
+#define HAVE_RENDERER_PLACEBO 0
+#endif
+#ifndef HAVE_PLACEBO_VULKAN
+#define HAVE_PLACEBO_VULKAN 0
+#endif
+#ifndef HAVE_PLACEBO_OPENGL
+#define HAVE_PLACEBO_OPENGL 0
 #endif
 
 /**
@@ -61,6 +71,7 @@ typedef struct {
     int untimed;
     int zerocopy;
     int gpugrain;
+    int fullscreen;
 } Dav1dPlaySettings;
 
 #define WINDOW_WIDTH  910
@@ -82,7 +93,7 @@ typedef struct rdr_info
     // Cookie passed to the renderer implementation callbacks
     void *cookie;
     // Callback to create the renderer
-    void* (*create_renderer)(void);
+    void* (*create_renderer)(const Dav1dPlaySettings *settings);
     // Callback to destroy the renderer
     void (*destroy_renderer)(void *cookie);
     // Callback to the render function that renders a prevously sent frame
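
dp_renderer.h now always defines HAVE_RENDERER_PLACEBO, HAVE_PLACEBO_VULKAN and HAVE_PLACEBO_OPENGL to 0 or 1, matching the meson change below that passes -DHAVE_PLACEBO=0/1, so feature tests use #if rather than #ifdef and a misspelled or missing macro can be diagnosed (for example with -Wundef) instead of silently disabling the feature. A small sketch of the convention with made-up feature names:

    /* Made-up feature macros following the same always-defined 0/1 convention. */
    #ifndef HAVE_FEATURE_FOO
    #define HAVE_FEATURE_FOO 0   /* not detected: defined, but off */
    #endif
    #ifndef HAVE_FEATURE_BAR
    #define HAVE_FEATURE_BAR 1   /* detected: defined and on */
    #endif

    #include <stdio.h>

    int main(void) {
    #if HAVE_FEATURE_FOO          /* value test, not existence test */
        puts("foo path");
    #else
        puts("generic path");
    #endif
    #if HAVE_FEATURE_BAR
        puts("bar path");
    #endif
        return 0;
    }
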
diff --git a/examples/dp_renderer_placebo.c b/examples/dp_renderer_placebo.c
index 4ab1415f44af4febeb8a6a5c263a2282d6860e1a..972cc576883d0cd69f78686e4c9aa189a3a1b0ff 100644
--- a/examples/dp_renderer_placebo.c
+++ b/examples/dp_renderer_placebo.c
@@ -26,17 +26,17 @@
 
 #include "dp_renderer.h"
 
-#ifdef HAVE_RENDERER_PLACEBO
+#if HAVE_RENDERER_PLACEBO
 #include <assert.h>
 
 #include <libplacebo/renderer.h>
 #include <libplacebo/utils/dav1d.h>
 
-#ifdef HAVE_PLACEBO_VULKAN
+#if HAVE_PLACEBO_VULKAN
 # include <libplacebo/vulkan.h>
 # include <SDL_vulkan.h>
 #endif
-#ifdef HAVE_PLACEBO_OPENGL
+#if HAVE_PLACEBO_OPENGL
 # include <libplacebo/opengl.h>
 # include <SDL_opengl.h>
 #endif
@@ -53,7 +53,7 @@ typedef struct renderer_priv_ctx
     pl_log log;
     // Placebo renderer
     pl_renderer renderer;
-#ifdef HAVE_PLACEBO_VULKAN
+#if HAVE_PLACEBO_VULKAN
     // Placebo Vulkan handle
     pl_vulkan vk;
     // Placebo Vulkan instance
@@ -61,9 +61,11 @@ typedef struct renderer_priv_ctx
     // Vulkan surface
     VkSurfaceKHR surf;
 #endif
-#ifdef HAVE_PLACEBO_OPENGL
+#if HAVE_PLACEBO_OPENGL
     // Placebo OpenGL handle
     pl_opengl gl;
+    // SDL OpenGL context
+    SDL_GLContext gl_context;
 #endif
     // Placebo GPU
     pl_gpu gpu;
@@ -77,13 +79,18 @@ typedef struct renderer_priv_ctx
 } Dav1dPlayRendererPrivateContext;
 
 static Dav1dPlayRendererPrivateContext*
-    placebo_renderer_create_common(int window_flags)
+    placebo_renderer_create_common(const Dav1dPlaySettings *settings, int window_flags)
 {
+    if (settings->fullscreen)
+        window_flags |= SDL_WINDOW_FULLSCREEN_DESKTOP;
+
     // Create Window
     SDL_Window *sdlwin = dp_create_sdl_window(window_flags | SDL_WINDOW_RESIZABLE);
     if (sdlwin == NULL)
         return NULL;
 
+    SDL_ShowCursor(0);
+
     // Alloc
     Dav1dPlayRendererPrivateContext *const rd_priv_ctx =
         calloc(1, sizeof(Dav1dPlayRendererPrivateContext));
@@ -118,24 +125,25 @@ static Dav1dPlayRendererPrivateContext*
     return rd_priv_ctx;
 }
 
-#ifdef HAVE_PLACEBO_OPENGL
-static void *placebo_renderer_create_gl(void)
+#if HAVE_PLACEBO_OPENGL
+static void *placebo_renderer_create_gl(const Dav1dPlaySettings *settings)
 {
     SDL_Window *sdlwin = NULL;
     SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG);
 
     // Common init
     Dav1dPlayRendererPrivateContext *rd_priv_ctx =
-        placebo_renderer_create_common(SDL_WINDOW_OPENGL);
+        placebo_renderer_create_common(settings, SDL_WINDOW_OPENGL);
 
     if (rd_priv_ctx == NULL)
         return NULL;
     sdlwin = rd_priv_ctx->win;
 
-    SDL_GLContext glcontext = SDL_GL_CreateContext(sdlwin);
-    SDL_GL_MakeCurrent(sdlwin, glcontext);
+    rd_priv_ctx->gl_context = SDL_GL_CreateContext(sdlwin);
+    SDL_GL_MakeCurrent(sdlwin, rd_priv_ctx->gl_context);
 
     rd_priv_ctx->gl = pl_opengl_create(rd_priv_ctx->log, pl_opengl_params(
+        .allow_software = true,
 #ifndef NDEBUG
         .debug = true,
 #endif
@@ -173,14 +181,14 @@ static void *placebo_renderer_create_gl(void)
 }
 #endif
 
-#ifdef HAVE_PLACEBO_VULKAN
-static void *placebo_renderer_create_vk(void)
+#if HAVE_PLACEBO_VULKAN
+static void *placebo_renderer_create_vk(const Dav1dPlaySettings *settings)
 {
     SDL_Window *sdlwin = NULL;
 
     // Common init
     Dav1dPlayRendererPrivateContext *rd_priv_ctx =
-        placebo_renderer_create_common(SDL_WINDOW_VULKAN);
+        placebo_renderer_create_common(settings, SDL_WINDOW_VULKAN);
 
     if (rd_priv_ctx == NULL)
         return NULL;
@@ -270,16 +278,18 @@ static void placebo_renderer_destroy(void *cookie)
     for (int i = 0; i < 3; i++)
         pl_tex_destroy(rd_priv_ctx->gpu, &(rd_priv_ctx->plane_tex[i]));
 
-#ifdef HAVE_PLACEBO_VULKAN
+#if HAVE_PLACEBO_VULKAN
     if (rd_priv_ctx->vk) {
         pl_vulkan_destroy(&(rd_priv_ctx->vk));
         vkDestroySurfaceKHR(rd_priv_ctx->vk_inst->instance, rd_priv_ctx->surf, NULL);
         pl_vk_inst_destroy(&(rd_priv_ctx->vk_inst));
     }
 #endif
-#ifdef HAVE_PLACEBO_OPENGL
+#if HAVE_PLACEBO_OPENGL
     if (rd_priv_ctx->gl)
         pl_opengl_destroy(&(rd_priv_ctx->gl));
+    if (rd_priv_ctx->gl_context)
+        SDL_GL_DeleteContext(rd_priv_ctx->gl_context);
 #endif
 
     SDL_DestroyWindow(rd_priv_ctx->win);
@@ -382,7 +392,7 @@ static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
     SDL_UnlockMutex(rd_priv_ctx->lock);
 }
 
-#ifdef HAVE_PLACEBO_VULKAN
+#if HAVE_PLACEBO_VULKAN
 const Dav1dPlayRenderInfo rdr_placebo_vk = {
     .name = "placebo-vk",
     .create_renderer = placebo_renderer_create_vk,
@@ -397,7 +407,7 @@ const Dav1dPlayRenderInfo rdr_placebo_vk = {
 const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL };
 #endif
 
-#ifdef HAVE_PLACEBO_OPENGL
+#if HAVE_PLACEBO_OPENGL
 const Dav1dPlayRenderInfo rdr_placebo_gl = {
     .name = "placebo-gl",
     .create_renderer = placebo_renderer_create_gl,
diff --git a/examples/dp_renderer_sdl.c b/examples/dp_renderer_sdl.c
index 735b0664d3313764810b14b81d17410080c67ef3..39e6ac8e00e7286d6f1c1d8b29f888d05fc88bbe 100644
--- a/examples/dp_renderer_sdl.c
+++ b/examples/dp_renderer_sdl.c
@@ -43,12 +43,18 @@ typedef struct renderer_priv_ctx
     SDL_Texture *tex;
 } Dav1dPlayRendererPrivateContext;
 
-static void *sdl_renderer_create(void)
+static void *sdl_renderer_create(const Dav1dPlaySettings *settings)
 {
-    SDL_Window *win = dp_create_sdl_window(0);
+    int window_flags = 0;
+    if (settings->fullscreen)
+        window_flags |= SDL_WINDOW_FULLSCREEN_DESKTOP;
+
+    SDL_Window *win = dp_create_sdl_window(window_flags);
     if (win == NULL)
         return NULL;
 
+    SDL_ShowCursor(0);
+
     // Alloc
     Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext));
     if (rd_priv_ctx == NULL) {
@@ -79,7 +85,9 @@ static void sdl_renderer_destroy(void *cookie)
     Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
     assert(rd_priv_ctx != NULL);
 
+    SDL_DestroyTexture(rd_priv_ctx->tex);
     SDL_DestroyRenderer(rd_priv_ctx->renderer);
+    SDL_DestroyWindow(rd_priv_ctx->win);
     SDL_DestroyMutex(rd_priv_ctx->lock);
     free(rd_priv_ctx);
 }
@@ -142,6 +150,7 @@ static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic,
     if (texture == NULL) {
         texture = SDL_CreateTexture(rd_priv_ctx->renderer, SDL_PIXELFORMAT_IYUV,
             SDL_TEXTUREACCESS_STREAMING, width, height);
+        SDL_RenderSetLogicalSize(rd_priv_ctx->renderer, width, height);
     }
 
     SDL_UpdateYUVTexture(texture, NULL,
diff --git a/examples/meson.build b/examples/meson.build
index 2b2b8bd5adc0a64432f31437cfe3536f6296915b..adbf85b7848203c435cff86eab1c0f73fcad3da7 100644
--- a/examples/meson.build
+++ b/examples/meson.build
@@ -48,19 +48,23 @@ if sdl2_dependency.found()
 
     placebo_dependency = dependency('libplacebo', version: '>= 4.160.0', required: false)
 
-    if placebo_dependency.found()
+    have_vulkan = false
+    have_placebo = placebo_dependency.found()
+    if have_placebo
         dav1dplay_deps += placebo_dependency
-        dav1dplay_cflags += '-DHAVE_PLACEBO'
 
         # If libplacebo is found, we might be able to use Vulkan
         # with it, in which case we need the Vulkan library too.
         vulkan_dependency = dependency('vulkan', required: false)
         if vulkan_dependency.found()
             dav1dplay_deps += vulkan_dependency
-            dav1dplay_cflags += '-DHAVE_VULKAN'
+            have_vulkan = true
         endif
     endif
 
+    dav1dplay_cflags += '-DHAVE_PLACEBO=' + (have_placebo ? '1' : '0')
+    dav1dplay_cflags += '-DHAVE_VULKAN=' + (have_vulkan ? '1' : '0')
+
     dav1dplay = executable('dav1dplay',
         dav1dplay_sources,
         rev_target,
diff --git a/include/common/attributes.h b/include/common/attributes.h
index cd058abf9b6c217dd9b3716f675e83b0569724c1..c8758c19aef74ca721c700ac1d4a995d01a9ab19 100644
--- a/include/common/attributes.h
+++ b/include/common/attributes.h
@@ -189,9 +189,13 @@ static inline int clzll(const unsigned long long mask) {
 #ifndef static_assert
 #define CHECK_OFFSET(type, field, name) \
     struct check_##type##_##field { int x[(name == offsetof(type, field)) ? 1 : -1]; }
+#define CHECK_SIZE(type, size) \
+    struct check_##type##_size { int x[(size == sizeof(type)) ? 1 : -1]; }
 #else
 #define CHECK_OFFSET(type, field, name) \
     static_assert(name == offsetof(type, field), #field)
+#define CHECK_SIZE(type, size) \
+    static_assert(size == sizeof(type), #type)
 #endif
 
 #ifdef _MSC_VER
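
CHECK_SIZE complements the existing CHECK_OFFSET macro: with C11 it expands to static_assert, and otherwise it falls back to the negative-array-size trick, so a struct whose layout is relied on elsewhere (for instance by hand-written assembly) fails to compile as soon as its size drifts. An illustrative use with an invented struct, not one from dav1d:

    #include <assert.h>   /* static_assert (C11) */
    #include <stddef.h>   /* offsetof */
    #include <stdint.h>

    #define CHECK_OFFSET(type, field, name) \
        static_assert(name == offsetof(type, field), #field)
    #define CHECK_SIZE(type, size) \
        static_assert(size == sizeof(type), #type)

    /* Invented parameter block whose layout some other code assumes. */
    typedef struct {
        int32_t seed;         /* offset 0 */
        int32_t scale_shift;  /* offset 4 */
        int16_t coeffs[24];   /* offset 8, 48 bytes */
    } example_params;

    CHECK_OFFSET(example_params, scale_shift, 4);
    CHECK_SIZE(example_params, 56);

    int main(void) { return 0; }
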
diff --git a/include/common/intops.h b/include/common/intops.h
index 2d21998b7100c323e7f75149ebed3f162a16c0bb..089da5e15ed016af736951820a33eaaf9fa2edf9 100644
--- a/include/common/intops.h
+++ b/include/common/intops.h
@@ -65,11 +65,11 @@ static inline int apply_sign64(const int v, const int64_t s) {
 }
 
 static inline int ulog2(const unsigned v) {
-    return 31 - clz(v);
+    return 31 ^ clz(v);
 }
 
 static inline int u64log2(const uint64_t v) {
-    return 63 - clzll(v);
+    return 63 ^ clzll(v);
 }
 
 static inline unsigned inv_recenter(const unsigned r, const unsigned v) {
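
The ulog2/u64log2 change replaces subtraction with xor: for any non-zero v, clz(v) lies in [0, 31] (or [0, 63]), and 31 - k == 31 ^ k for every k in that range because subtracting k <= 31 from 0b11111 never borrows; the xor form typically compiles to less code (a single bsr on x86, for example). A small self-contained check of the identity, using the GCC/Clang __builtin_clz as a stand-in for dav1d's clz helper:

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    int main(void) {
        /* Identity over the full range clz() can return for non-zero inputs. */
        for (int k = 0; k <= 31; k++)
            assert((31 - k) == (31 ^ k));

        /* Spot-check against an actual count-leading-zeros implementation. */
        const unsigned tests[] = { 1u, 2u, 3u, 0x80u, 0xffffu, 0x80000000u, 0xffffffffu };
        for (size_t i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
            const int lz = __builtin_clz(tests[i]);   /* GCC/Clang builtin */
            assert((31 - lz) == (31 ^ lz));
        }
        puts("31 - clz(v) == 31 ^ clz(v) for non-zero v");
        return 0;
    }
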
diff --git a/include/compat/getopt.h b/include/compat/getopt.h
index 930e002a139142c8600538cc6f7fbcd0a4a4bd68..ad597691ef4860e511b619d585415ee7208412e8 100644
--- a/include/compat/getopt.h
+++ b/include/compat/getopt.h
@@ -13,7 +13,9 @@
 #define __GETOPT_H__
 
 /* All the headers include this file. */
+#ifdef _WIN32
 #include <crtdefs.h>
+#endif
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/include/dav1d/meson.build b/include/dav1d/meson.build
index 68faaf9a3695dc7bb40ee3367046d917b5204908..dfb69a1c164192221240449842af6925db797134 100644
--- a/include/dav1d/meson.build
+++ b/include/dav1d/meson.build
@@ -22,24 +22,15 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-# installed version.h header generation
-version_h_data = configuration_data()
-version_h_data.set('DAV1D_API_VERSION_MAJOR', dav1d_api_version_major)
-version_h_data.set('DAV1D_API_VERSION_MINOR', dav1d_api_version_minor)
-version_h_data.set('DAV1D_API_VERSION_PATCH', dav1d_api_version_revision)
-version_h_target = configure_file(input: 'version.h.in',
-                                  output: 'version.h',
-                                  configuration: version_h_data)
-
 dav1d_api_headers = [
     'common.h',
     'data.h',
     'dav1d.h',
     'headers.h',
     'picture.h',
+    'version.h',
 ]
 
 # install headers
 install_headers(dav1d_api_headers,
-                version_h_target,
                 subdir : 'dav1d')
diff --git a/include/dav1d/version.h.in b/include/dav1d/version.h
similarity index 88%
rename from include/dav1d/version.h.in
rename to include/dav1d/version.h
index 4fa420ded31e977a49f05c574b68af2c3d33be37..43df60391531695037f582d64e19e912f6a14e40 100644
--- a/include/dav1d/version.h.in
+++ b/include/dav1d/version.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019-2024, VideoLAN and dav1d authors
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -31,9 +31,9 @@
 extern "C" {
 #endif
 
-#define DAV1D_API_VERSION_MAJOR @DAV1D_API_VERSION_MAJOR@
-#define DAV1D_API_VERSION_MINOR @DAV1D_API_VERSION_MINOR@
-#define DAV1D_API_VERSION_PATCH @DAV1D_API_VERSION_PATCH@
+#define DAV1D_API_VERSION_MAJOR 7
+#define DAV1D_API_VERSION_MINOR 0
+#define DAV1D_API_VERSION_PATCH 0
 
 /**
  * Extract version components from the value returned by
diff --git a/meson.build b/meson.build
index f5010ac4855e55d30764e6d721a7474ade037e35..798abc1deb7b8680c529ca3de1f3a8e37478af60 100644
--- a/meson.build
+++ b/meson.build
@@ -1,4 +1,4 @@
-# Copyright © 2018-2022, VideoLAN and dav1d authors
+# Copyright © 2018-2024, VideoLAN and dav1d authors
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -23,19 +23,13 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 project('dav1d', ['c'],
-    version: '1.4.2',
+    version: '1.5.0',
     default_options: ['c_std=c99',
                       'warning_level=2',
                       'buildtype=release',
                       'b_ndebug=if-release'],
     meson_version: '>= 0.49.0')
 
-dav1d_soname_version       = '7.0.0'
-dav1d_api_version_array    = dav1d_soname_version.split('.')
-dav1d_api_version_major    = dav1d_api_version_array[0]
-dav1d_api_version_minor    = dav1d_api_version_array[1]
-dav1d_api_version_revision = dav1d_api_version_array[2]
-
 dav1d_src_root = meson.current_source_dir()
 cc = meson.get_compiler('c')
 
@@ -48,7 +42,18 @@ cdata_asm = configuration_data()
 # Include directories
 dav1d_inc_dirs = include_directories(['.', 'include/dav1d', 'include'])
 
-
+dav1d_api_version_major    = cc.get_define('DAV1D_API_VERSION_MAJOR',
+                                           prefix: '#include "dav1d/version.h"',
+                                           include_directories: dav1d_inc_dirs).strip()
+dav1d_api_version_minor    = cc.get_define('DAV1D_API_VERSION_MINOR',
+                                           prefix: '#include "dav1d/version.h"',
+                                           include_directories: dav1d_inc_dirs).strip()
+dav1d_api_version_revision = cc.get_define('DAV1D_API_VERSION_PATCH',
+                                           prefix: '#include "dav1d/version.h"',
+                                           include_directories: dav1d_inc_dirs).strip()
+dav1d_soname_version       = '@0@.@1@.@2@'.format(dav1d_api_version_major,
+                                                  dav1d_api_version_minor,
+                                                  dav1d_api_version_revision)
 
 #
 # Option handling
@@ -98,6 +103,10 @@ if host_machine.system() in ['linux', 'gnu', 'emscripten']
     add_project_arguments('-D_GNU_SOURCE', language: 'c')
 endif
 
+have_clock_gettime = false
+have_posix_memalign = false
+have_memalign = false
+have_aligned_alloc = false
 if host_machine.system() == 'windows'
     cdata.set('_WIN32_WINNT',           '0x0601')
     cdata.set('UNICODE',                1) # Define to 1 for Unicode (Wide Chars) APIs
@@ -145,20 +154,25 @@ else
 
     rt_dependency = []
     if cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args)
-        cdata.set('HAVE_CLOCK_GETTIME', 1)
+        have_clock_gettime = true
     elif host_machine.system() not in ['darwin', 'ios', 'tvos']
         rt_dependency = cc.find_library('rt', required: false)
         if not cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args, dependencies : rt_dependency)
             error('clock_gettime not found')
         endif
-        cdata.set('HAVE_CLOCK_GETTIME', 1)
+        have_clock_gettime = true
     endif
 
-    if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
-        cdata.set('HAVE_POSIX_MEMALIGN', 1)
-    endif
+    have_posix_memalign = cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
+    have_memalign = cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
+    have_aligned_alloc = cc.has_function('aligned_alloc', prefix : '#include <stdlib.h>', args : test_args)
 endif
 
+cdata.set10('HAVE_CLOCK_GETTIME', have_clock_gettime)
+cdata.set10('HAVE_POSIX_MEMALIGN', have_posix_memalign)
+cdata.set10('HAVE_MEMALIGN', have_memalign)
+cdata.set10('HAVE_ALIGNED_ALLOC', have_aligned_alloc)
+
 # check for fseeko on android. It is not always available if _FILE_OFFSET_BITS is defined to 64
 have_fseeko = true
 if host_machine.system() == 'android'
@@ -175,12 +189,12 @@ if host_machine.system() == 'android'
 endif
 
 libdl_dependency = []
+have_dlsym = false
 if host_machine.system() == 'linux'
     libdl_dependency = cc.find_library('dl', required : false)
-    if cc.has_function('dlsym', prefix : '#include <dlfcn.h>', args : test_args, dependencies : libdl_dependency)
-        cdata.set('HAVE_DLSYM', 1)
-    endif
+    have_dlsym = cc.has_function('dlsym', prefix : '#include <dlfcn.h>', args : test_args, dependencies : libdl_dependency)
 endif
+cdata.set10('HAVE_DLSYM', have_dlsym)
 
 libm_dependency = cc.find_library('m', required: false)
 
@@ -209,19 +223,13 @@ if host_machine.cpu_family().startswith('wasm')
     stdatomic_dependencies += thread_dependency.partial_dependency(compile_args: true)
 endif
 
-if cc.check_header('unistd.h')
-    cdata.set('HAVE_UNISTD_H', 1)
-endif
-
-if cc.check_header('io.h')
-    cdata.set('HAVE_IO_H', 1)
-endif
-
-if cc.check_header('pthread_np.h')
-    cdata.set('HAVE_PTHREAD_NP_H', 1)
-    test_args += '-DHAVE_PTHREAD_NP_H'
-endif
+cdata.set10('HAVE_SYS_TYPES_H', cc.check_header('sys/types.h'))
+cdata.set10('HAVE_UNISTD_H', cc.check_header('unistd.h'))
+cdata.set10('HAVE_IO_H', cc.check_header('io.h'))
 
+have_pthread_np = cc.check_header('pthread_np.h')
+cdata.set10('HAVE_PTHREAD_NP_H', have_pthread_np)
+test_args += '-DHAVE_PTHREAD_NP_H=' + (have_pthread_np ? '1' : '0')
 
 # Function checks
 
@@ -234,35 +242,32 @@ else
     getopt_dependency = []
 endif
 
+have_getauxval = false
+have_elf_aux_info = false
 if (host_machine.cpu_family() == 'aarch64' or
     host_machine.cpu_family().startswith('arm') or
     host_machine.cpu_family().startswith('loongarch') or
     host_machine.cpu() == 'ppc64le' or
     host_machine.cpu_family().startswith('riscv'))
-    if cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
-        cdata.set('HAVE_GETAUXVAL', 1)
-    endif
-    if cc.has_function('elf_aux_info', prefix : '#include <sys/auxv.h>', args : test_args)
-        cdata.set('HAVE_ELF_AUX_INFO', 1)
-    endif
+    have_getauxval = cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
+    have_elf_aux_info = cc.has_function('elf_aux_info', prefix : '#include <sys/auxv.h>', args : test_args)
 endif
 
+cdata.set10('HAVE_GETAUXVAL', have_getauxval)
+cdata.set10('HAVE_ELF_AUX_INFO', have_elf_aux_info)
+
 pthread_np_prefix = '''
 #include <pthread.h>
-#ifdef HAVE_PTHREAD_NP_H
+#if HAVE_PTHREAD_NP_H
 #include <pthread_np.h>
 #endif
 '''
-if cc.has_function('pthread_getaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
-    cdata.set('HAVE_PTHREAD_GETAFFINITY_NP', 1)
-endif
-if cc.has_function('pthread_setaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
-    cdata.set('HAVE_PTHREAD_SETAFFINITY_NP', 1)
-endif
+cdata.set10('HAVE_PTHREAD_GETAFFINITY_NP', cc.has_function('pthread_getaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency))
+cdata.set10('HAVE_PTHREAD_SETAFFINITY_NP', cc.has_function('pthread_setaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency))
+cdata.set10('HAVE_PTHREAD_SETNAME_NP', cc.has_function('pthread_setname_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency))
+cdata.set10('HAVE_PTHREAD_SET_NAME_NP', cc.has_function('pthread_set_name_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency))
 
-if cc.compiles('int x = _Generic(0, default: 0);', name: '_Generic', args: test_args)
-    cdata.set('HAVE_C11_GENERIC', 1)
-endif
+cdata.set10('HAVE_C11_GENERIC', cc.compiles('int x = _Generic(0, default: 0);', name: '_Generic', args: test_args))
 
 # Compiler flag tests
 
@@ -343,6 +348,17 @@ endif
 
 cdata.set10('ARCH_AARCH64', host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64')
 cdata.set10('ARCH_ARM',     host_machine.cpu_family().startswith('arm') and host_machine.cpu() != 'arm64')
+
+have_as_func = false
+have_as_arch = false
+aarch64_extensions = {
+    'dotprod': 'udot v0.4s, v0.16b, v0.16b',
+    'i8mm':    'usdot v0.4s, v0.16b, v0.16b',
+    'sve':     'whilelt p0.s, x0, x1',
+    'sve2':    'sqrdmulh z0.s, z0.s, z0.s',
+}
+supported_aarch64_archexts = []
+supported_aarch64_instructions = []
 if (is_asm_enabled and
     (host_machine.cpu_family() == 'aarch64' or
      host_machine.cpu_family().startswith('arm')))
@@ -353,7 +369,6 @@ if (is_asm_enabled and
 );
 '''
     have_as_func = cc.compiles(as_func_code)
-    cdata.set10('HAVE_AS_FUNC', have_as_func)
 
     # fedora package build infrastructure uses a gcc specs file to enable
     # '-fPIE' by default. The chosen way only adds '-fPIE' to the C compiler
@@ -374,7 +389,6 @@ if (is_asm_enabled and
 
     if host_machine.cpu_family() == 'aarch64'
         have_as_arch = cc.compiles('''__asm__ (".arch armv8-a");''')
-        cdata.set10('HAVE_AS_ARCH_DIRECTIVE', have_as_arch)
         as_arch_str = ''
         if have_as_arch
             as_arch_level = 'armv8-a'
@@ -403,13 +417,7 @@ if (is_asm_enabled and
             cdata.set('AS_ARCH_LEVEL', as_arch_level)
             as_arch_str = '".arch ' + as_arch_level + '\\n"'
         endif
-        extensions = {
-            'dotprod': 'udot v0.4s, v0.16b, v0.16b',
-            'i8mm':    'usdot v0.4s, v0.16b, v0.16b',
-            'sve':     'whilelt p0.s, x0, x1',
-            'sve2':    'sqrdmulh z0.s, z0.s, z0.s',
-        }
-        foreach name, instr : extensions
+        foreach name, instr : aarch64_extensions
             # Test for support for the various extensions. First test if
             # the assembler supports the .arch_extension directive for
             # enabling/disabling the extension, then separately check whether
@@ -420,19 +428,27 @@ if (is_asm_enabled and
             code += '".arch_extension ' + name + '\\n"'
             code += ');'
             supports_archext = cc.compiles(code)
-            cdata.set10('HAVE_AS_ARCHEXT_' + name.to_upper() + '_DIRECTIVE', supports_archext)
             code = '__asm__ (' + as_arch_str
             if supports_archext
+                supported_aarch64_archexts += name
                 code += '".arch_extension ' + name + '\\n"'
             endif
             code += '"' + instr + '\\n"'
             code += ');'
-            supports_instr = cc.compiles(code, name: name.to_upper())
-            cdata.set10('HAVE_' + name.to_upper(), supports_instr)
+            if cc.compiles(code, name: name.to_upper())
+                supported_aarch64_instructions += name
+            endif
         endforeach
     endif
 endif
 
+cdata.set10('HAVE_AS_FUNC', have_as_func)
+cdata.set10('HAVE_AS_ARCH_DIRECTIVE', have_as_arch)
+foreach name, _ : aarch64_extensions
+    cdata.set10('HAVE_AS_ARCHEXT_' + name.to_upper() + '_DIRECTIVE', name in supported_aarch64_archexts)
+    cdata.set10('HAVE_' + name.to_upper(), name in supported_aarch64_instructions)
+endforeach
+
 cdata.set10('ARCH_X86', host_machine.cpu_family().startswith('x86'))
 cdata.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64')
 cdata.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86')
diff --git a/src/arm/32/util.S b/src/arm/32/util.S
index c3710d37670f441dc69279c3b5f8a263940f2c6d..38d63f855d0166dad05bb4d2e2c6e1ea3af5d529 100644
--- a/src/arm/32/util.S
+++ b/src/arm/32/util.S
@@ -31,18 +31,36 @@
 
 #include "config.h"
 #include "src/arm/asm.S"
+#include "src/arm/arm-arch.h"
+
+.macro v4bx rd
+#if __ARM_ARCH >= 5 || defined(__ARM_ARCH_4T__)
+        bx              \rd
+#else
+        mov             pc, \rd
+#endif
+.endm
+
+.macro v4blx rd
+#if __ARM_ARCH >= 5
+        blx             \rd
+#else
+        mov             lr,  pc
+        v4bx            \rd
+#endif
+.endm
 
 .macro movrel_local rd, val, offset=0
-#if defined(PIC)
+#if (__ARM_ARCH >= 7 || defined(__ARM_ARCH_6T2__)) && !defined(PIC)
+        movw            \rd, #:lower16:\val+\offset
+        movt            \rd, #:upper16:\val+\offset
+#else
         ldr             \rd,  90001f
         b               90002f
 90001:
         .word           \val + \offset - (90002f + 8 - 4 * CONFIG_THUMB)
 90002:
         add             \rd,  \rd,  pc
-#else
-        movw            \rd, #:lower16:\val+\offset
-        movt            \rd, #:upper16:\val+\offset
 #endif
 .endm
 
diff --git a/src/arm/64/filmgrain.S b/src/arm/64/filmgrain.S
index aa7f18bf39d53d824081550d1fc3ca39500dea1a..864ceba974a37450dc6f409baca1c49c4df03871 100644
--- a/src/arm/64/filmgrain.S
+++ b/src/arm/64/filmgrain.S
@@ -884,12 +884,12 @@ function generate_grain_\type\()_8bpc_neon, export=1
 .else
         add             x4,  x1,  #FGD_AR_COEFFS_UV
 .endif
-        adr             x16, L(gen_grain_\type\()_tbl)
+        movrel          x16, gen_grain_\type\()_tbl
         ldr             w17, [x1, #FGD_AR_COEFF_LAG]
         add             w9,  w9,  #4
-        ldrh            w17, [x16, w17, uxtw #1]
+        ldrsw           x17, [x16, w17, uxtw #2]
         dup             v31.8h,  w9    // 4 + data->grain_scale_shift
-        sub             x16, x16, w17, uxtw
+        add             x16, x16, x17
         neg             v31.8h,  v31.8h
 
 .ifc \type, uv_444
@@ -1075,13 +1075,14 @@ L(generate_grain_\type\()_lag3):
         ldp             x30, x19, [sp], #96
         AARCH64_VALIDATE_LINK_REGISTER
         ret
-
-L(gen_grain_\type\()_tbl):
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
 endfunc
+
+jumptable gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
+endjumptable
 .endm
 
 gen_grain_82 y
@@ -1118,12 +1119,12 @@ function generate_grain_\type\()_8bpc_neon, export=1
         ldr             w2,  [x1, #FGD_SEED]
         ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
         add             x4,  x1,  #FGD_AR_COEFFS_UV
-        adr             x16, L(gen_grain_\type\()_tbl)
+        movrel          x16, gen_grain_\type\()_tbl
         ldr             w17, [x1, #FGD_AR_COEFF_LAG]
         add             w9,  w9,  #4
-        ldrh            w17, [x16, w17, uxtw #1]
+        ldrsw           x17, [x16, w17, uxtw #2]
         dup             v31.8h,  w9    // 4 + data->grain_scale_shift
-        sub             x16, x16, w17, uxtw
+        add             x16, x16, x17
         neg             v31.8h,  v31.8h
 
         cmp             w13, #0
@@ -1272,13 +1273,14 @@ L(generate_grain_\type\()_lag3):
         ldp             x30, x19, [sp], #96
         AARCH64_VALIDATE_LINK_REGISTER
         ret
-
-L(gen_grain_\type\()_tbl):
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
 endfunc
+
+jumptable gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
+endjumptable
 .endm
 
 gen_grain_44 uv_420
@@ -1407,18 +1409,18 @@ function fgy_32x32_8bpc_neon, export=1
         add_offset      x5,  w6,  x10, x5,  x9
 
         ldr             w11, [sp, #24]         // type
-        adr             x13, L(fgy_loop_tbl)
+        movrel          x13, fgy_loop_tbl
 
         add             x4,  x12, #32          // grain_lut += FG_BLOCK_SIZE * bx
         add             x6,  x14, x9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
 
         tst             w11, #1
-        ldrh            w11, [x13, w11, uxtw #1]
+        ldrsw           x11, [x13, w11, uxtw #2]
 
         add             x8,  x16, x9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
         add             x8,  x8,  #32          // grain_lut += FG_BLOCK_SIZE * bx
 
-        sub             x11, x13, w11, uxtw
+        add             x11, x13, x11
 
         b.eq            1f
         // y overlap
@@ -1555,14 +1557,15 @@ L(loop_\ox\oy):
         fgy             0, 1
         fgy             1, 0
         fgy             1, 1
-
-L(fgy_loop_tbl):
-        .hword L(fgy_loop_tbl) - L(loop_00)
-        .hword L(fgy_loop_tbl) - L(loop_01)
-        .hword L(fgy_loop_tbl) - L(loop_10)
-        .hword L(fgy_loop_tbl) - L(loop_11)
 endfunc
 
+jumptable fgy_loop_tbl
+        .word L(loop_00) - fgy_loop_tbl
+        .word L(loop_01) - fgy_loop_tbl
+        .word L(loop_10) - fgy_loop_tbl
+        .word L(loop_11) - fgy_loop_tbl
+endjumptable
+
 // void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
 //                                     const pixel *const src,
 //                                     const ptrdiff_t stride,
@@ -1646,11 +1649,11 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
         ldr             w13, [sp, #64]         // type
 
         movrel          x16, overlap_coeffs_\sx
-        adr             x14, L(fguv_loop_sx\sx\()_tbl)
+        movrel          x14, fguv_loop_sx\sx\()_tbl
 
         ld1             {v27.8b, v28.8b}, [x16] // overlap_coeffs
         tst             w13, #1
-        ldrh            w13, [x14, w13, uxtw #1]
+        ldrsw           x13, [x14, w13, uxtw #2]
 
         b.eq            1f
         // y overlap
@@ -1658,7 +1661,7 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
         mov             w9,  #(2 >> \sy)
 
 1:
-        sub             x13, x14, w13, uxtw
+        add             x13, x14, x13
 
 .if \sy
         movi            v25.16b, #23
@@ -1848,18 +1851,19 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
         ldr             x30,      [sp], #32
         AARCH64_VALIDATE_LINK_REGISTER
         ret
-
-L(fguv_loop_sx0_tbl):
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
 endfunc
 
+jumptable fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl1_10) - fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl
+endjumptable
+
 function fguv_loop_sx1_neon
 .macro fguv_loop_sx1 csfl, ox, oy
 L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
@@ -1997,14 +2001,15 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
         ldr             x30,      [sp], #32
         AARCH64_VALIDATE_LINK_REGISTER
         ret
-
-L(fguv_loop_sx1_tbl):
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
 endfunc
+
+jumptable fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl
+endjumptable
diff --git a/src/arm/64/filmgrain16.S b/src/arm/64/filmgrain16.S
index 75252acfb1a8398650bff3e0a2f235fb219e77d7..aa6b75b171e05022a363f1b15bd3b9a5e23714e2 100644
--- a/src/arm/64/filmgrain16.S
+++ b/src/arm/64/filmgrain16.S
@@ -740,12 +740,12 @@ function generate_grain_\type\()_16bpc_neon, export=1
         add             x4,  x1,  #FGD_AR_COEFFS_UV
 .endif
         add             w9,  w9,  w15 // grain_scale_shift - bitdepth_min_8
-        adr             x16, L(gen_grain_\type\()_tbl)
+        movrel          x16, gen_grain_\type\()_tbl
         ldr             w17, [x1, #FGD_AR_COEFF_LAG]
         add             w9,  w9,  #4
-        ldrh            w17, [x16, w17, uxtw #1]
+        ldrsw           x17, [x16, w17, uxtw #2]
         dup             v31.8h,  w9    // 4 - bitdepth_min_8 + data->grain_scale_shift
-        sub             x16, x16, w17, uxtw
+        add             x16, x16, x17
         neg             v31.8h,  v31.8h
 
 .ifc \type, uv_444
@@ -945,13 +945,14 @@ L(generate_grain_\type\()_lag3):
         ldp             x30, x19, [sp], #96
         AARCH64_VALIDATE_LINK_REGISTER
         ret
-
-L(gen_grain_\type\()_tbl):
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
 endfunc
+
+jumptable gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
+endjumptable
 .endm
 
 gen_grain_82 y
@@ -991,12 +992,12 @@ function generate_grain_\type\()_16bpc_neon, export=1
         ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
         add             x4,  x1,  #FGD_AR_COEFFS_UV
         add             w9,  w9,  w15 // grain_scale_shift - bitdepth_min_8
-        adr             x16, L(gen_grain_\type\()_tbl)
+        movrel          x16, gen_grain_\type\()_tbl
         ldr             w17, [x1, #FGD_AR_COEFF_LAG]
         add             w9,  w9,  #4
-        ldrh            w17, [x16, w17, uxtw #1]
+        ldrsw           x17, [x16, w17, uxtw #2]
         dup             v31.8h,  w9    // 4 - bitdepth_min_8 + data->grain_scale_shift
-        sub             x16, x16, w17, uxtw
+        add             x16, x16, x17
         neg             v31.8h,  v31.8h
 
         cmp             w13, #0
@@ -1155,13 +1156,14 @@ L(generate_grain_\type\()_lag3):
         ldp             x30, x19, [sp], #96
         AARCH64_VALIDATE_LINK_REGISTER
         ret
-
-L(gen_grain_\type\()_tbl):
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
-        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
 endfunc
+
+jumptable gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
+        .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
+endjumptable
 .endm
 
 gen_grain_44 uv_420
@@ -1306,18 +1308,18 @@ function fgy_32x32_16bpc_neon, export=1
         add_offset      x5,  w6,  x10, x5,  x9
 
         ldr             w11, [sp, #88]         // type
-        adr             x13, L(fgy_loop_tbl)
+        movrel          x13, fgy_loop_tbl
 
         add             x4,  x12, #32*2        // grain_lut += FG_BLOCK_SIZE * bx
         add             x6,  x14, x9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
 
         tst             w11, #1
-        ldrh            w11, [x13, w11, uxtw #1]
+        ldrsw           x11, [x13, w11, uxtw #2]
 
         add             x8,  x16, x9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
         add             x8,  x8,  #32*2        // grain_lut += FG_BLOCK_SIZE * bx
 
-        sub             x11, x13, w11, uxtw
+        add             x11, x13, x11
 
         b.eq            1f
         // y overlap
@@ -1480,14 +1482,15 @@ L(loop_\ox\oy):
         fgy             0, 1
         fgy             1, 0
         fgy             1, 1
-
-L(fgy_loop_tbl):
-        .hword L(fgy_loop_tbl) - L(loop_00)
-        .hword L(fgy_loop_tbl) - L(loop_01)
-        .hword L(fgy_loop_tbl) - L(loop_10)
-        .hword L(fgy_loop_tbl) - L(loop_11)
 endfunc
 
+jumptable fgy_loop_tbl
+        .word L(loop_00) - fgy_loop_tbl
+        .word L(loop_01) - fgy_loop_tbl
+        .word L(loop_10) - fgy_loop_tbl
+        .word L(loop_11) - fgy_loop_tbl
+endjumptable
+
 // void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
 //                                      const pixel *const src,
 //                                      const ptrdiff_t stride,
@@ -1589,11 +1592,11 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1
         ldr             w13, [sp, #112]        // type
 
         movrel          x16, overlap_coeffs_\sx
-        adr             x14, L(fguv_loop_sx\sx\()_tbl)
+        movrel          x14, fguv_loop_sx\sx\()_tbl
 
         ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs
         tst             w13, #1
-        ldrh            w13, [x14, w13, uxtw #1]
+        ldrsw           x13, [x14, w13, uxtw #2]
 
         b.eq            1f
         // y overlap
@@ -1601,7 +1604,7 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1
         mov             w9,  #(2 >> \sy)
 
 1:
-        sub             x13, x14, w13, uxtw
+        add             x13, x14, x13
 
 .if \sy
         movi            v25.8h,  #23
@@ -1818,18 +1821,19 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
         ldr             x30,      [sp], #80
         AARCH64_VALIDATE_LINK_REGISTER
         ret
-
-L(fguv_loop_sx0_tbl):
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
-        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
 endfunc
 
+jumptable fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl1_10) - fguv_loop_sx0_tbl
+        .word L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl
+endjumptable
+
 function fguv_loop_sx1_neon
 .macro fguv_loop_sx1 csfl, ox, oy
 L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
@@ -1984,14 +1988,15 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
         ldr             x30,      [sp], #80
         AARCH64_VALIDATE_LINK_REGISTER
         ret
-
-L(fguv_loop_sx1_tbl):
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
-        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
 endfunc
+
+jumptable fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl
+        .word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl
+endjumptable
diff --git a/src/arm/64/ipred.S b/src/arm/64/ipred.S
index 709238e2f85474218578811c16746ef152b100d5..5a375d8dca06446435126d0860488fb36feb25ed 100644
--- a/src/arm/64/ipred.S
+++ b/src/arm/64/ipred.S
@@ -34,16 +34,17 @@
 //                             const int max_width, const int max_height);
 function ipred_dc_128_8bpc_neon, export=1
         clz             w3,  w3
-        adr             x5,  L(ipred_dc_128_tbl)
+        movrel          x5,  ipred_dc_128_tbl
         sub             w3,  w3,  #25
-        ldrh            w3,  [x5, w3, uxtw #1]
+        ldrsw           x3,  [x5, w3, uxtw #2]
         movi            v0.16b,  #128
-        sub             x5,  x5,  w3, uxtw
+        add             x5,  x5,  x3
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
-4:
+40:
         AARCH64_VALID_JUMP_TARGET
+4:
         st1             {v0.s}[0],  [x0], x1
         st1             {v0.s}[0],  [x6], x1
         subs            w4,  w4,  #4
@@ -51,8 +52,9 @@ function ipred_dc_128_8bpc_neon, export=1
         st1             {v0.s}[0],  [x6], x1
         b.gt            4b
         ret
-8:
+80:
         AARCH64_VALID_JUMP_TARGET
+8:
         st1             {v0.8b},  [x0], x1
         st1             {v0.8b},  [x6], x1
         subs            w4,  w4,  #4
@@ -60,8 +62,9 @@ function ipred_dc_128_8bpc_neon, export=1
         st1             {v0.8b},  [x6], x1
         b.gt            8b
         ret
-16:
+160:
         AARCH64_VALID_JUMP_TARGET
+16:
         st1             {v0.16b}, [x0], x1
         st1             {v0.16b}, [x6], x1
         subs            w4,  w4,  #4
@@ -93,26 +96,27 @@ function ipred_dc_128_8bpc_neon, export=1
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
         b.gt            64b
         ret
-
-L(ipred_dc_128_tbl):
-        .hword L(ipred_dc_128_tbl) - 640b
-        .hword L(ipred_dc_128_tbl) - 320b
-        .hword L(ipred_dc_128_tbl) -  16b
-        .hword L(ipred_dc_128_tbl) -   8b
-        .hword L(ipred_dc_128_tbl) -   4b
 endfunc
 
+jumptable ipred_dc_128_tbl
+        .word 640b - ipred_dc_128_tbl
+        .word 320b - ipred_dc_128_tbl
+        .word 160b - ipred_dc_128_tbl
+        .word 80b  - ipred_dc_128_tbl
+        .word 40b  - ipred_dc_128_tbl
+endjumptable
+
 // void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                        const pixel *const topleft,
 //                        const int width, const int height, const int a,
 //                        const int max_width, const int max_height);
 function ipred_v_8bpc_neon, export=1
         clz             w3,  w3
-        adr             x5,  L(ipred_v_tbl)
+        movrel          x5,  ipred_v_tbl
         sub             w3,  w3,  #25
-        ldrh            w3,  [x5, w3, uxtw #1]
+        ldrsw           x3,  [x5, w3, uxtw #2]
         add             x2,  x2,  #1
-        sub             x5,  x5,  w3, uxtw
+        add             x5,  x5,  x3
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -171,32 +175,34 @@ function ipred_v_8bpc_neon, export=1
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
         b.gt            64b
         ret
-
-L(ipred_v_tbl):
-        .hword L(ipred_v_tbl) - 640b
-        .hword L(ipred_v_tbl) - 320b
-        .hword L(ipred_v_tbl) - 160b
-        .hword L(ipred_v_tbl) -  80b
-        .hword L(ipred_v_tbl) -  40b
 endfunc
 
+jumptable ipred_v_tbl
+        .word 640b - ipred_v_tbl
+        .word 320b - ipred_v_tbl
+        .word 160b - ipred_v_tbl
+        .word 80b  - ipred_v_tbl
+        .word 40b  - ipred_v_tbl
+endjumptable
+
 // void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                        const pixel *const topleft,
 //                        const int width, const int height, const int a,
 //                        const int max_width, const int max_height);
 function ipred_h_8bpc_neon, export=1
         clz             w3,  w3
-        adr             x5,  L(ipred_h_tbl)
+        movrel          x5,  ipred_h_tbl
         sub             w3,  w3,  #25
-        ldrh            w3,  [x5, w3, uxtw #1]
+        ldrsw           x3,  [x5, w3, uxtw #2]
         sub             x2,  x2,  #4
-        sub             x5,  x5,  w3, uxtw
+        add             x5,  x5,  x3
         mov             x7,  #-4
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
-4:
+40:
         AARCH64_VALID_JUMP_TARGET
+4:
         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
         st1             {v3.s}[0],  [x0], x1
         st1             {v2.s}[0],  [x6], x1
@@ -205,8 +211,9 @@ function ipred_h_8bpc_neon, export=1
         st1             {v0.s}[0],  [x6], x1
         b.gt            4b
         ret
-8:
+80:
         AARCH64_VALID_JUMP_TARGET
+8:
         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
         st1             {v3.8b},  [x0], x1
         st1             {v2.8b},  [x6], x1
@@ -215,8 +222,9 @@ function ipred_h_8bpc_neon, export=1
         st1             {v0.8b},  [x6], x1
         b.gt            8b
         ret
-16:
+160:
         AARCH64_VALID_JUMP_TARGET
+16:
         ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
         st1             {v3.16b}, [x0], x1
         st1             {v2.16b}, [x6], x1
@@ -225,8 +233,9 @@ function ipred_h_8bpc_neon, export=1
         st1             {v0.16b}, [x6], x1
         b.gt            16b
         ret
-32:
+320:
         AARCH64_VALID_JUMP_TARGET
+32:
         ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
         str             q3,  [x0, #16]
         str             q2,  [x6, #16]
@@ -239,8 +248,9 @@ function ipred_h_8bpc_neon, export=1
         st1             {v0.16b}, [x6], x1
         b.gt            32b
         ret
-64:
+640:
         AARCH64_VALID_JUMP_TARGET
+64:
         ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
         str             q3,  [x0, #16]
         str             q2,  [x6, #16]
@@ -257,26 +267,27 @@ function ipred_h_8bpc_neon, export=1
         st1             {v0.16b}, [x6], x1
         b.gt            64b
         ret
-
-L(ipred_h_tbl):
-        .hword L(ipred_h_tbl) - 64b
-        .hword L(ipred_h_tbl) - 32b
-        .hword L(ipred_h_tbl) - 16b
-        .hword L(ipred_h_tbl) -  8b
-        .hword L(ipred_h_tbl) -  4b
 endfunc
 
+jumptable ipred_h_tbl
+        .word 640b - ipred_h_tbl
+        .word 320b - ipred_h_tbl
+        .word 160b - ipred_h_tbl
+        .word 80b  - ipred_h_tbl
+        .word 40b  - ipred_h_tbl
+endjumptable
+
 // void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                             const pixel *const topleft,
 //                             const int width, const int height, const int a,
 //                             const int max_width, const int max_height);
 function ipred_dc_top_8bpc_neon, export=1
         clz             w3,  w3
-        adr             x5,  L(ipred_dc_top_tbl)
+        movrel          x5,  ipred_dc_top_tbl
         sub             w3,  w3,  #25
-        ldrh            w3,  [x5, w3, uxtw #1]
+        ldrsw           x3,  [x5, w3, uxtw #2]
         add             x2,  x2,  #1
-        sub             x5,  x5,  w3, uxtw
+        add             x5,  x5,  x3
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -362,15 +373,16 @@ function ipred_dc_top_8bpc_neon, export=1
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
         b.gt            64b
         ret
-
-L(ipred_dc_top_tbl):
-        .hword L(ipred_dc_top_tbl) - 640b
-        .hword L(ipred_dc_top_tbl) - 320b
-        .hword L(ipred_dc_top_tbl) - 160b
-        .hword L(ipred_dc_top_tbl) -  80b
-        .hword L(ipred_dc_top_tbl) -  40b
 endfunc
 
+jumptable ipred_dc_top_tbl
+        .word 640b - ipred_dc_top_tbl
+        .word 320b - ipred_dc_top_tbl
+        .word 160b - ipred_dc_top_tbl
+        .word 80b  - ipred_dc_top_tbl
+        .word 40b  - ipred_dc_top_tbl
+endjumptable
+
 // void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                              const pixel *const topleft,
 //                              const int width, const int height, const int a,
@@ -379,13 +391,13 @@ function ipred_dc_left_8bpc_neon, export=1
         sub             x2,  x2,  w4, uxtw
         clz             w3,  w3
         clz             w7,  w4
-        adr             x5,  L(ipred_dc_left_tbl)
+        movrel          x5,  ipred_dc_left_tbl
         sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
         sub             w7,  w7,  #25
-        ldrh            w3,  [x5, w3, uxtw #1]
-        ldrh            w7,  [x5, w7, uxtw #1]
-        sub             x3,  x5,  w3, uxtw
-        sub             x5,  x5,  w7, uxtw
+        ldrsw           x3,  [x5, w3, uxtw #2]
+        ldrsw           x7,  [x5, w7, uxtw #2]
+        add             x3,  x5,  x3
+        add             x5,  x5,  x7
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -399,12 +411,13 @@ L(ipred_dc_left_h4):
         br              x3
 L(ipred_dc_left_w4):
         AARCH64_VALID_JUMP_TARGET
+1:
         st1             {v0.s}[0],  [x0], x1
         st1             {v0.s}[0],  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.s}[0],  [x0], x1
         st1             {v0.s}[0],  [x6], x1
-        b.gt            L(ipred_dc_left_w4)
+        b.gt            1b
         ret
 
 L(ipred_dc_left_h8):
@@ -416,12 +429,13 @@ L(ipred_dc_left_h8):
         br              x3
 L(ipred_dc_left_w8):
         AARCH64_VALID_JUMP_TARGET
+1:
         st1             {v0.8b},  [x0], x1
         st1             {v0.8b},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8b},  [x0], x1
         st1             {v0.8b},  [x6], x1
-        b.gt            L(ipred_dc_left_w8)
+        b.gt            1b
         ret
 
 L(ipred_dc_left_h16):
@@ -433,12 +447,13 @@ L(ipred_dc_left_h16):
         br              x3
 L(ipred_dc_left_w16):
         AARCH64_VALID_JUMP_TARGET
+1:
         st1             {v0.16b}, [x0], x1
         st1             {v0.16b}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.16b}, [x0], x1
         st1             {v0.16b}, [x6], x1
-        b.gt            L(ipred_dc_left_w16)
+        b.gt            1b
         ret
 
 L(ipred_dc_left_h32):
@@ -488,20 +503,21 @@ L(ipred_dc_left_w64):
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
         b.gt            1b
         ret
-
-L(ipred_dc_left_tbl):
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
 endfunc
 
+jumptable ipred_dc_left_tbl
+        .word L(ipred_dc_left_h64) - ipred_dc_left_tbl
+        .word L(ipred_dc_left_h32) - ipred_dc_left_tbl
+        .word L(ipred_dc_left_h16) - ipred_dc_left_tbl
+        .word L(ipred_dc_left_h8)  - ipred_dc_left_tbl
+        .word L(ipred_dc_left_h4)  - ipred_dc_left_tbl
+        .word L(ipred_dc_left_w64) - ipred_dc_left_tbl
+        .word L(ipred_dc_left_w32) - ipred_dc_left_tbl
+        .word L(ipred_dc_left_w16) - ipred_dc_left_tbl
+        .word L(ipred_dc_left_w8)  - ipred_dc_left_tbl
+        .word L(ipred_dc_left_w4)  - ipred_dc_left_tbl
+endjumptable
+
 // void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                         const pixel *const topleft,
 //                         const int width, const int height, const int a,
@@ -512,16 +528,16 @@ function ipred_dc_8bpc_neon, export=1
         clz             w3,  w3
         clz             w6,  w4
         dup             v16.8h, w7               // width + height
-        adr             x5,  L(ipred_dc_tbl)
+        movrel          x5,  ipred_dc_tbl
         rbit            w7,  w7                  // rbit(width + height)
         sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
         sub             w6,  w6,  #25
         clz             w7,  w7                  // ctz(width + height)
-        ldrh            w3,  [x5, w3, uxtw #1]
-        ldrh            w6,  [x5, w6, uxtw #1]
+        ldrsw           x3,  [x5, w3, uxtw #2]
+        ldrsw           x6,  [x5, w6, uxtw #2]
         neg             w7,  w7                  // -ctz(width + height)
-        sub             x3,  x5,  w3, uxtw
-        sub             x5,  x5,  w6, uxtw
+        add             x3,  x5,  x3
+        add             x5,  x5,  x6
         ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
         dup             v17.8h,  w7              // -ctz(width + height)
         add             x6,  x0,  x1
@@ -713,33 +729,34 @@ L(ipred_dc_w64):
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
         b.gt            2b
         ret
-
-L(ipred_dc_tbl):
-        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
 endfunc
 
+jumptable ipred_dc_tbl
+        .word L(ipred_dc_h64) - ipred_dc_tbl
+        .word L(ipred_dc_h32) - ipred_dc_tbl
+        .word L(ipred_dc_h16) - ipred_dc_tbl
+        .word L(ipred_dc_h8)  - ipred_dc_tbl
+        .word L(ipred_dc_h4)  - ipred_dc_tbl
+        .word L(ipred_dc_w64) - ipred_dc_tbl
+        .word L(ipred_dc_w32) - ipred_dc_tbl
+        .word L(ipred_dc_w16) - ipred_dc_tbl
+        .word L(ipred_dc_w8)  - ipred_dc_tbl
+        .word L(ipred_dc_w4)  - ipred_dc_tbl
+endjumptable
+
 // void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                            const pixel *const topleft,
 //                            const int width, const int height, const int a,
 //                            const int max_width, const int max_height);
 function ipred_paeth_8bpc_neon, export=1
         clz             w9,  w3
-        adr             x5,  L(ipred_paeth_tbl)
+        movrel          x5,  ipred_paeth_tbl
         sub             w9,  w9,  #25
-        ldrh            w9,  [x5, w9, uxtw #1]
+        ldrsw           x9,  [x5, w9, uxtw #2]
         ld1r            {v4.16b},  [x2]
         add             x8,  x2,  #1
         sub             x2,  x2,  #4
-        sub             x5,  x5,  w9, uxtw
+        add             x5,  x5,  x9
         mov             x7,  #-4
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
@@ -898,15 +915,16 @@ function ipred_paeth_8bpc_neon, export=1
         b               1b
 9:
         ret
-
-L(ipred_paeth_tbl):
-        .hword L(ipred_paeth_tbl) - 640b
-        .hword L(ipred_paeth_tbl) - 320b
-        .hword L(ipred_paeth_tbl) - 160b
-        .hword L(ipred_paeth_tbl) -  80b
-        .hword L(ipred_paeth_tbl) -  40b
 endfunc
 
+jumptable ipred_paeth_tbl
+        .word 640b - ipred_paeth_tbl
+        .word 320b - ipred_paeth_tbl
+        .word 160b - ipred_paeth_tbl
+        .word 80b  - ipred_paeth_tbl
+        .word 40b  - ipred_paeth_tbl
+endjumptable
+
 // void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                             const pixel *const topleft,
 //                             const int width, const int height, const int a,
@@ -916,13 +934,13 @@ function ipred_smooth_8bpc_neon, export=1
         add             x11, x10, w4, uxtw
         add             x10, x10, w3, uxtw
         clz             w9,  w3
-        adr             x5,  L(ipred_smooth_tbl)
+        movrel          x5,  ipred_smooth_tbl
         sub             x12, x2,  w4, uxtw
         sub             w9,  w9,  #25
-        ldrh            w9,  [x5, w9, uxtw #1]
+        ldrsw           x9,  [x5, w9, uxtw #2]
         ld1r            {v4.16b},  [x12] // bottom
         add             x8,  x2,  #1
-        sub             x5,  x5,  w9, uxtw
+        add             x5,  x5,  x9
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -1079,15 +1097,16 @@ function ipred_smooth_8bpc_neon, export=1
         b               1b
 9:
         ret
-
-L(ipred_smooth_tbl):
-        .hword L(ipred_smooth_tbl) - 640b
-        .hword L(ipred_smooth_tbl) - 320b
-        .hword L(ipred_smooth_tbl) - 160b
-        .hword L(ipred_smooth_tbl) -  80b
-        .hword L(ipred_smooth_tbl) -  40b
 endfunc
 
+jumptable ipred_smooth_tbl
+        .word 640b - ipred_smooth_tbl
+        .word 320b - ipred_smooth_tbl
+        .word 160b - ipred_smooth_tbl
+        .word 80b  - ipred_smooth_tbl
+        .word 40b  - ipred_smooth_tbl
+endjumptable
+
 // void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                               const pixel *const topleft,
 //                               const int width, const int height, const int a,
@@ -1096,13 +1115,13 @@ function ipred_smooth_v_8bpc_neon, export=1
         movrel          x7,  X(sm_weights)
         add             x7,  x7,  w4, uxtw
         clz             w9,  w3
-        adr             x5,  L(ipred_smooth_v_tbl)
+        movrel          x5,  ipred_smooth_v_tbl
         sub             x8,  x2,  w4, uxtw
         sub             w9,  w9,  #25
-        ldrh            w9,  [x5, w9, uxtw #1]
+        ldrsw           x9,  [x5, w9, uxtw #2]
         ld1r            {v4.16b},  [x8] // bottom
         add             x2,  x2,  #1
-        sub             x5,  x5,  w9, uxtw
+        add             x5,  x5,  x9
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -1220,15 +1239,16 @@ function ipred_smooth_v_8bpc_neon, export=1
         b               1b
 9:
         ret
-
-L(ipred_smooth_v_tbl):
-        .hword L(ipred_smooth_v_tbl) - 640b
-        .hword L(ipred_smooth_v_tbl) - 320b
-        .hword L(ipred_smooth_v_tbl) - 160b
-        .hword L(ipred_smooth_v_tbl) -  80b
-        .hword L(ipred_smooth_v_tbl) -  40b
 endfunc
 
+jumptable ipred_smooth_v_tbl
+        .word 640b - ipred_smooth_v_tbl
+        .word 320b - ipred_smooth_v_tbl
+        .word 160b - ipred_smooth_v_tbl
+        .word 80b  - ipred_smooth_v_tbl
+        .word 40b  - ipred_smooth_v_tbl
+endjumptable
+
 // void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                               const pixel *const topleft,
 //                               const int width, const int height, const int a,
@@ -1237,12 +1257,12 @@ function ipred_smooth_h_8bpc_neon, export=1
         movrel          x8,  X(sm_weights)
         add             x8,  x8,  w3, uxtw
         clz             w9,  w3
-        adr             x5,  L(ipred_smooth_h_tbl)
+        movrel          x5,  ipred_smooth_h_tbl
         add             x12, x2,  w3, uxtw
         sub             w9,  w9,  #25
-        ldrh            w9,  [x5, w9, uxtw #1]
+        ldrsw           x9,  [x5, w9, uxtw #2]
         ld1r            {v5.16b},  [x12] // right
-        sub             x5,  x5,  w9, uxtw
+        add             x5,  x5,  x9
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -1366,15 +1386,16 @@ function ipred_smooth_h_8bpc_neon, export=1
         b               1b
 9:
         ret
-
-L(ipred_smooth_h_tbl):
-        .hword L(ipred_smooth_h_tbl) - 640b
-        .hword L(ipred_smooth_h_tbl) - 320b
-        .hword L(ipred_smooth_h_tbl) - 160b
-        .hword L(ipred_smooth_h_tbl) -  80b
-        .hword L(ipred_smooth_h_tbl) -  40b
 endfunc
 
+jumptable ipred_smooth_h_tbl
+        .word 640b - ipred_smooth_h_tbl
+        .word 320b - ipred_smooth_h_tbl
+        .word 160b - ipred_smooth_h_tbl
+        .word 80b  - ipred_smooth_h_tbl
+        .word 40b  - ipred_smooth_h_tbl
+endjumptable
+
 const padding_mask_buf
         .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
         .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
@@ -1653,11 +1674,11 @@ endfunc
 //                               const int dx, const int max_base_x);
 function ipred_z1_fill1_8bpc_neon, export=1
         clz             w9,  w3
-        adr             x8,  L(ipred_z1_fill1_tbl)
+        movrel          x8,  ipred_z1_fill1_tbl
         sub             w9,  w9,  #25
-        ldrh            w9,  [x8, w9, uxtw #1]
+        ldrsw           x9,  [x8, w9, uxtw #2]
         add             x10, x2,  w6,  uxtw       // top[max_base_x]
-        sub             x8,  x8,  w9,  uxtw
+        add             x8,  x8,  x9
         ld1r            {v31.16b}, [x10]          // padding
         mov             w7,  w5
         mov             w15, #64
@@ -1815,15 +1836,16 @@ function ipred_z1_fill1_8bpc_neon, export=1
         add             x13, x13, x1
         mov             w3,  w12
         b               169b
-
-L(ipred_z1_fill1_tbl):
-        .hword L(ipred_z1_fill1_tbl) - 640b
-        .hword L(ipred_z1_fill1_tbl) - 320b
-        .hword L(ipred_z1_fill1_tbl) - 160b
-        .hword L(ipred_z1_fill1_tbl) -  80b
-        .hword L(ipred_z1_fill1_tbl) -  40b
 endfunc
 
+jumptable ipred_z1_fill1_tbl
+        .word 640b - ipred_z1_fill1_tbl
+        .word 320b - ipred_z1_fill1_tbl
+        .word 160b - ipred_z1_fill1_tbl
+        .word 80b  - ipred_z1_fill1_tbl
+        .word 40b  - ipred_z1_fill1_tbl
+endjumptable
+
 function ipred_z1_fill2_8bpc_neon, export=1
         cmp             w3,  #8
         add             x10, x2,  w6,  uxtw       // top[max_base_x]
@@ -1940,11 +1962,11 @@ endconst
 //                               const int dx, const int dy);
 function ipred_z2_fill1_8bpc_neon, export=1
         clz             w10, w4
-        adr             x9,  L(ipred_z2_fill1_tbl)
+        movrel          x9,  ipred_z2_fill1_tbl
         sub             w10, w10, #25
-        ldrh            w10, [x9, w10, uxtw #1]
+        ldrsw           x10, [x9, w10, uxtw #2]
         mov             w8,  #(1 << 6)            // xpos = 1 << 6
-        sub             x9,  x9,  w10, uxtw
+        add             x9,  x9,  x10
         sub             w8,  w8,  w6              // xpos -= dx
 
         movrel          x11, increments
@@ -2650,15 +2672,16 @@ function ipred_z2_fill1_8bpc_neon, export=1
         ldp             d10, d11, [sp, #0x10]
         ldp             d8,  d9,  [sp], 0x40
         ret
-
-L(ipred_z2_fill1_tbl):
-        .hword L(ipred_z2_fill1_tbl) - 640b
-        .hword L(ipred_z2_fill1_tbl) - 320b
-        .hword L(ipred_z2_fill1_tbl) - 160b
-        .hword L(ipred_z2_fill1_tbl) -  80b
-        .hword L(ipred_z2_fill1_tbl) -  40b
 endfunc
 
+jumptable ipred_z2_fill1_tbl
+        .word 640b - ipred_z2_fill1_tbl
+        .word 320b - ipred_z2_fill1_tbl
+        .word 160b - ipred_z2_fill1_tbl
+        .word 80b  - ipred_z2_fill1_tbl
+        .word 40b  - ipred_z2_fill1_tbl
+endjumptable
+
 function ipred_z2_fill2_8bpc_neon, export=1
         cmp             w4,  #8
         mov             w8,  #(2 << 6)            // xpos = 2 << 6
@@ -3160,11 +3183,11 @@ endfunc
 function ipred_z3_fill1_8bpc_neon, export=1
         cmp             w6,  #64
         clz             w9,  w3
-        adr             x8,  L(ipred_z3_fill1_tbl)
+        movrel          x8,  ipred_z3_fill1_tbl
         sub             w9,  w9,  #25
-        ldrh            w9,  [x8, w9, uxtw #1]
+        ldrsw           x9,  [x8, w9, uxtw #2]
         add             x10, x2,  w6,  uxtw       // left[max_base_y]
-        sub             x8,  x8,  w9,  uxtw
+        add             x8,  x8,  x9
         movrel          x11, increments
         ld1r            {v31.16b}, [x10]          // padding
         ld1             {v30.8h},  [x11]          // increments
@@ -3502,19 +3525,20 @@ L(ipred_z3_fill1_large_h16):
         b               1b
 9:
         ret
-
-L(ipred_z3_fill1_tbl):
-        .hword L(ipred_z3_fill1_tbl) - 640b
-        .hword L(ipred_z3_fill1_tbl) - 320b
-        .hword L(ipred_z3_fill1_tbl) - 160b
-        .hword L(ipred_z3_fill1_tbl) -  80b
-        .hword L(ipred_z3_fill1_tbl) -  40b
 endfunc
 
+jumptable ipred_z3_fill1_tbl
+        .word 640b - ipred_z3_fill1_tbl
+        .word 320b - ipred_z3_fill1_tbl
+        .word 160b - ipred_z3_fill1_tbl
+        .word 80b  - ipred_z3_fill1_tbl
+        .word 40b  - ipred_z3_fill1_tbl
+endjumptable
+
 function ipred_z3_fill_padding_neon, export=0
         cmp             w3,  #16
-        adr             x8,  L(ipred_z3_fill_padding_tbl)
-        b.gt            L(ipred_z3_fill_padding_wide)
+        movrel          x8,  ipred_z3_fill_padding_tbl
+        b.gt            ipred_z3_fill_padding_wide
         // w3 = remaining width, w4 = constant height
         mov             w12, w4
 
@@ -3524,12 +3548,13 @@ function ipred_z3_fill_padding_neon, export=0
         // power of two in the remaining width, and repeating.
         clz             w9,  w3
         sub             w9,  w9,  #25
-        ldrh            w9,  [x8, w9, uxtw #1]
-        sub             x9,  x8,  w9,  uxtw
+        ldrsw           x9,  [x8, w9, uxtw #2]
+        add             x9,  x8,  x9
         br              x9
 
-2:
+20:
         AARCH64_VALID_JUMP_TARGET
+2:
         st1             {v31.h}[0], [x0],  x1
         subs            w4,  w4,  #4
         st1             {v31.h}[0], [x13], x1
@@ -3547,8 +3572,9 @@ function ipred_z3_fill_padding_neon, export=0
         mov             w4,  w12
         b               1b
 
-4:
+40:
         AARCH64_VALID_JUMP_TARGET
+4:
         st1             {v31.s}[0], [x0],  x1
         subs            w4,  w4,  #4
         st1             {v31.s}[0], [x13], x1
@@ -3566,14 +3592,15 @@ function ipred_z3_fill_padding_neon, export=0
         mov             w4,  w12
         b               1b
 
-8:
+80:
         AARCH64_VALID_JUMP_TARGET
+8:
         st1             {v31.8b}, [x0],  x1
         subs            w4,  w4,  #4
         st1             {v31.8b}, [x13], x1
         st1             {v31.8b}, [x0],  x1
         st1             {v31.8b}, [x13], x1
-        b.gt            4b
+        b.gt            8b
         subs            w3,  w3,  #8
         lsr             x1,  x1,  #1
         msub            x0,  x1,  x12, x0         // ptr -= h * stride
@@ -3585,16 +3612,17 @@ function ipred_z3_fill_padding_neon, export=0
         mov             w4,  w12
         b               1b
 
-16:
-32:
-64:
+160:
+320:
+640:
         AARCH64_VALID_JUMP_TARGET
+16:
         st1             {v31.16b}, [x0],  x1
         subs            w4,  w4,  #4
         st1             {v31.16b}, [x13], x1
         st1             {v31.16b}, [x0],  x1
         st1             {v31.16b}, [x13], x1
-        b.gt            4b
+        b.gt            16b
         subs            w3,  w3,  #16
         lsr             x1,  x1,  #1
         msub            x0,  x1,  x12, x0         // ptr -= h * stride
@@ -3608,16 +3636,18 @@ function ipred_z3_fill_padding_neon, export=0
 
 9:
         ret
+endfunc
 
-L(ipred_z3_fill_padding_tbl):
-        .hword L(ipred_z3_fill_padding_tbl) - 64b
-        .hword L(ipred_z3_fill_padding_tbl) - 32b
-        .hword L(ipred_z3_fill_padding_tbl) - 16b
-        .hword L(ipred_z3_fill_padding_tbl) -  8b
-        .hword L(ipred_z3_fill_padding_tbl) -  4b
-        .hword L(ipred_z3_fill_padding_tbl) -  2b
+jumptable ipred_z3_fill_padding_tbl
+        .word 640b - ipred_z3_fill_padding_tbl
+        .word 320b - ipred_z3_fill_padding_tbl
+        .word 160b - ipred_z3_fill_padding_tbl
+        .word 80b  - ipred_z3_fill_padding_tbl
+        .word 40b  - ipred_z3_fill_padding_tbl
+        .word 20b  - ipred_z3_fill_padding_tbl
+endjumptable
 
-L(ipred_z3_fill_padding_wide):
+function ipred_z3_fill_padding_wide
         // Fill a WxH rectangle with padding, with W > 16.
         lsr             x1,  x1,  #1
         mov             w12, w3
@@ -3770,13 +3800,13 @@ function ipred_filter_8bpc_neon, export=1
         add             x6,  x6,  w5, uxtw
         ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
         clz             w9,  w3
-        adr             x5,  L(ipred_filter_tbl)
+        movrel          x5,  ipred_filter_tbl
         ld1             {v20.8b, v21.8b, v22.8b}, [x6]
         sub             w9,  w9,  #26
-        ldrh            w9,  [x5, w9, uxtw #1]
+        ldrsw           x9,  [x5, w9, uxtw #2]
         sxtl            v16.8h,  v16.8b
         sxtl            v17.8h,  v17.8b
-        sub             x5,  x5,  w9, uxtw
+        add             x5,  x5,  x9
         sxtl            v18.8h,  v18.8b
         sxtl            v19.8h,  v19.8b
         add             x6,  x0,  x1
@@ -3916,30 +3946,32 @@ function ipred_filter_8bpc_neon, export=1
         b               1b
 9:
         ret
-
-L(ipred_filter_tbl):
-        .hword L(ipred_filter_tbl) - 320b
-        .hword L(ipred_filter_tbl) - 160b
-        .hword L(ipred_filter_tbl) -  80b
-        .hword L(ipred_filter_tbl) -  40b
 endfunc
 
+jumptable ipred_filter_tbl
+        .word 320b - ipred_filter_tbl
+        .word 160b - ipred_filter_tbl
+        .word 80b  - ipred_filter_tbl
+        .word 40b  - ipred_filter_tbl
+endjumptable
+
 // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                         const pixel *const pal, const uint8_t *idx,
 //                         const int w, const int h);
 function pal_pred_8bpc_neon, export=1
         ld1             {v0.8b}, [x2]
         clz             w9,  w4
-        adr             x6,  L(pal_pred_tbl)
+        movrel          x6,  pal_pred_tbl
         sub             w9,  w9,  #25
         movi            v31.16b, #7
-        ldrh            w9,  [x6, w9, uxtw #1]
-        sub             x6,  x6,  w9, uxtw
+        ldrsw           x9,  [x6, w9, uxtw #2]
+        add             x6,  x6,  x9
         add             x2,  x0,  x1
         lsl             x1,  x1,  #1
         br              x6
-4:
+40:
         AARCH64_VALID_JUMP_TARGET
+4:
         ld1             {v1.8b}, [x3], #8
         subs            w5,  w5,  #4
         ushr            v3.8b,   v1.8b,   #4
@@ -3952,8 +3984,9 @@ function pal_pred_8bpc_neon, export=1
         st1             {v1.s}[3], [x2], x1
         b.gt            4b
         ret
-8:
+80:
         AARCH64_VALID_JUMP_TARGET
+8:
         ld1             {v1.16b}, [x3], #16
         subs            w5,  w5,  #4
         ushr            v4.16b,  v1.16b,  #4
@@ -3968,8 +4001,9 @@ function pal_pred_8bpc_neon, export=1
         st1             {v2.d}[1], [x2], x1
         b.gt            8b
         ret
-16:
+160:
         AARCH64_VALID_JUMP_TARGET
+16:
         ld1             {v1.16b, v2.16b}, [x3], #32
         subs            w5,  w5,  #4
         ushr            v5.16b,  v1.16b,  #4
@@ -3990,8 +4024,9 @@ function pal_pred_8bpc_neon, export=1
         st1             {v4.16b}, [x2], x1
         b.gt            16b
         ret
-32:
+320:
         AARCH64_VALID_JUMP_TARGET
+32:
         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
         subs            w5,  w5,  #4
         ushr            v21.16b, v16.16b, #4
@@ -4024,8 +4059,9 @@ function pal_pred_8bpc_neon, export=1
         st1             {v22.16b, v23.16b}, [x2], x1
         b.gt            32b
         ret
-64:
+640:
         AARCH64_VALID_JUMP_TARGET
+64:
         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
         subs            w5,  w5,  #2
         ushr            v21.16b, v16.16b, #4
@@ -4056,32 +4092,34 @@ function pal_pred_8bpc_neon, export=1
         st1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1
         b.gt            64b
         ret
-
-L(pal_pred_tbl):
-        .hword L(pal_pred_tbl) - 64b
-        .hword L(pal_pred_tbl) - 32b
-        .hword L(pal_pred_tbl) - 16b
-        .hword L(pal_pred_tbl) -  8b
-        .hword L(pal_pred_tbl) -  4b
 endfunc
 
+jumptable pal_pred_tbl
+        .word 640b - pal_pred_tbl
+        .word 320b - pal_pred_tbl
+        .word 160b - pal_pred_tbl
+        .word 80b  - pal_pred_tbl
+        .word 40b  - pal_pred_tbl
+endjumptable
+
 // void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                              const pixel *const topleft,
 //                              const int width, const int height,
 //                              const int16_t *ac, const int alpha);
 function ipred_cfl_128_8bpc_neon, export=1
         clz             w9,  w3
-        adr             x7,  L(ipred_cfl_128_tbl)
+        movrel          x7,  ipred_cfl_128_tbl
         sub             w9,  w9,  #26
-        ldrh            w9,  [x7, w9, uxtw #1]
+        ldrsw           x9,  [x7, w9, uxtw #2]
         movi            v0.8h,   #128 // dc
         dup             v1.8h,   w6   // alpha
-        sub             x7,  x7,  w9, uxtw
+        add             x7,  x7,  x9
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x7
 L(ipred_cfl_splat_w4):
         AARCH64_VALID_JUMP_TARGET
+1:
         ld1             {v2.8h, v3.8h}, [x5], #32
         mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
         mul             v3.8h,   v3.8h,   v1.8h
@@ -4100,10 +4138,11 @@ L(ipred_cfl_splat_w4):
         subs            w4,  w4,  #4
         st1             {v3.s}[0],  [x0], x1
         st1             {v3.s}[1],  [x6], x1
-        b.gt            L(ipred_cfl_splat_w4)
+        b.gt            1b
         ret
 L(ipred_cfl_splat_w8):
         AARCH64_VALID_JUMP_TARGET
+1:
         ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64
         mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
         mul             v3.8h,   v3.8h,   v1.8h
@@ -4134,7 +4173,7 @@ L(ipred_cfl_splat_w8):
         subs            w4,  w4,  #4
         st1             {v4.8b},  [x0], x1
         st1             {v5.8b},  [x6], x1
-        b.gt            L(ipred_cfl_splat_w8)
+        b.gt            1b
         ret
 L(ipred_cfl_splat_w16):
         AARCH64_VALID_JUMP_TARGET
@@ -4180,27 +4219,28 @@ L(ipred_cfl_splat_w16):
         mov             w3,  w9
         b.gt            1b
         ret
-
-L(ipred_cfl_128_tbl):
-L(ipred_cfl_splat_tbl):
-        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
-        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
-        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
-        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
 endfunc
 
+jumptable ipred_cfl_128_tbl
+ipred_cfl_splat_tbl:
+        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
+        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
+        .word L(ipred_cfl_splat_w8)  - ipred_cfl_128_tbl
+        .word L(ipred_cfl_splat_w4)  - ipred_cfl_128_tbl
+endjumptable
+
 // void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                              const pixel *const topleft,
 //                              const int width, const int height,
 //                              const int16_t *ac, const int alpha);
 function ipred_cfl_top_8bpc_neon, export=1
         clz             w9,  w3
-        adr             x7,  L(ipred_cfl_top_tbl)
+        movrel          x7,  ipred_cfl_top_tbl
         sub             w9,  w9,  #26
-        ldrh            w9,  [x7, w9, uxtw #1]
+        ldrsw           x9,  [x7, w9, uxtw #2]
         dup             v1.8h,   w6   // alpha
         add             x2,  x2,  #1
-        sub             x7,  x7,  w9, uxtw
+        add             x7,  x7,  x9
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x7
@@ -4234,14 +4274,15 @@ function ipred_cfl_top_8bpc_neon, export=1
         urshr           v2.4h,   v2.4h,   #5
         dup             v0.8h,   v2.h[0]
         b               L(ipred_cfl_splat_w16)
-
-L(ipred_cfl_top_tbl):
-        .hword L(ipred_cfl_top_tbl) - 32b
-        .hword L(ipred_cfl_top_tbl) - 16b
-        .hword L(ipred_cfl_top_tbl) -  8b
-        .hword L(ipred_cfl_top_tbl) -  4b
 endfunc
 
+jumptable ipred_cfl_top_tbl
+        .word 32b - ipred_cfl_top_tbl
+        .word 16b - ipred_cfl_top_tbl
+        .word 8b  - ipred_cfl_top_tbl
+        .word 4b  - ipred_cfl_top_tbl
+endjumptable
+
 // void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                               const pixel *const topleft,
 //                               const int width, const int height,
@@ -4250,15 +4291,15 @@ function ipred_cfl_left_8bpc_neon, export=1
         sub             x2,  x2,  w4, uxtw
         clz             w9,  w3
         clz             w8,  w4
-        adr             x10, L(ipred_cfl_splat_tbl)
-        adr             x7,  L(ipred_cfl_left_tbl)
+        movrel          x10, ipred_cfl_splat_tbl
+        movrel          x7,  ipred_cfl_left_tbl
         sub             w9,  w9,  #26
         sub             w8,  w8,  #26
-        ldrh            w9,  [x10, w9, uxtw #1]
-        ldrh            w8,  [x7,  w8, uxtw #1]
+        ldrsw           x9,  [x10, w9, uxtw #2]
+        ldrsw           x8,  [x7,  w8, uxtw #2]
         dup             v1.8h,   w6   // alpha
-        sub             x9,  x10, w9, uxtw
-        sub             x7,  x7,  w8, uxtw
+        add             x9,  x10, x9
+        add             x7,  x7,  x8
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x7
@@ -4296,14 +4337,15 @@ L(ipred_cfl_left_h32):
         urshr           v2.4h,   v2.4h,   #5
         dup             v0.8h,   v2.h[0]
         br              x9
-
-L(ipred_cfl_left_tbl):
-        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
-        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
-        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
-        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
 endfunc
 
+jumptable ipred_cfl_left_tbl
+        .word L(ipred_cfl_left_h32) - ipred_cfl_left_tbl
+        .word L(ipred_cfl_left_h16) - ipred_cfl_left_tbl
+        .word L(ipred_cfl_left_h8)  - ipred_cfl_left_tbl
+        .word L(ipred_cfl_left_h4)  - ipred_cfl_left_tbl
+endjumptable
+
 // void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                          const pixel *const topleft,
 //                          const int width, const int height,
@@ -4315,16 +4357,16 @@ function ipred_cfl_8bpc_neon, export=1
         clz             w9,  w3
         clz             w6,  w4
         dup             v16.8h, w8               // width + height
-        adr             x7,  L(ipred_cfl_tbl)
+        movrel          x7,  ipred_cfl_tbl
         rbit            w8,  w8                  // rbit(width + height)
         sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
         sub             w6,  w6,  #26
         clz             w8,  w8                  // ctz(width + height)
-        ldrh            w9,  [x7, w9, uxtw #1]
-        ldrh            w6,  [x7, w6, uxtw #1]
+        ldrsw           x9,  [x7, w9, uxtw #2]
+        ldrsw           x6,  [x7, w6, uxtw #2]
         neg             w8,  w8                  // -ctz(width + height)
-        sub             x9,  x7,  w9, uxtw
-        sub             x7,  x7,  w6, uxtw
+        add             x9,  x7,  x9
+        add             x7,  x7,  x6
         ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
         dup             v17.8h,  w8              // -ctz(width + height)
         add             x6,  x0,  x1
@@ -4440,32 +4482,33 @@ L(ipred_cfl_w32):
 1:
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w16)
-
-L(ipred_cfl_tbl):
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
 endfunc
 
+jumptable ipred_cfl_tbl
+        .word L(ipred_cfl_h32) - ipred_cfl_tbl
+        .word L(ipred_cfl_h16) - ipred_cfl_tbl
+        .word L(ipred_cfl_h8)  - ipred_cfl_tbl
+        .word L(ipred_cfl_h4)  - ipred_cfl_tbl
+        .word L(ipred_cfl_w32) - ipred_cfl_tbl
+        .word L(ipred_cfl_w16) - ipred_cfl_tbl
+        .word L(ipred_cfl_w8)  - ipred_cfl_tbl
+        .word L(ipred_cfl_w4)  - ipred_cfl_tbl
+endjumptable
+
 // void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
 //                           const ptrdiff_t stride, const int w_pad,
 //                           const int h_pad, const int cw, const int ch);
 function ipred_cfl_ac_420_8bpc_neon, export=1
         clz             w8,  w5
         lsl             w4,  w4,  #2
-        adr             x7,  L(ipred_cfl_ac_420_tbl)
+        movrel          x7,  ipred_cfl_ac_420_tbl
         sub             w8,  w8,  #27
-        ldrh            w8,  [x7, w8, uxtw #1]
+        ldrsw           x8,  [x7, w8, uxtw #2]
         movi            v16.8h,  #0
         movi            v17.8h,  #0
         movi            v18.8h,  #0
         movi            v19.8h,  #0
-        sub             x7,  x7,  w8, uxtw
+        add             x7,  x7,  x8
         sub             w8,  w6,  w4         // height - h_pad
         rbit            w9,  w5              // rbit(width)
         rbit            w10, w6              // rbit(height)
@@ -4604,9 +4647,9 @@ L(ipred_cfl_ac_420_w8_subtract_dc):
 
 L(ipred_cfl_ac_420_w16):
         AARCH64_VALID_JUMP_TARGET
-        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
-        ldrh            w3,  [x7, w3, uxtw #1]
-        sub             x7,  x7,  w3, uxtw
+        movrel          x7,  ipred_cfl_ac_420_w16_tbl
+        ldrsw           x3,  [x7, w3, uxtw #2]
+        add             x7,  x7,  x3
         br              x7
 
 L(ipred_cfl_ac_420_w16_wpad0):
@@ -4762,34 +4805,35 @@ L(ipred_cfl_ac_420_w16_hpad):
         // Double the height and reuse the w8 summing/subtracting
         lsl             w6,  w6,  #1
         b               L(ipred_cfl_ac_420_w8_calc_subtract_dc)
-
-L(ipred_cfl_ac_420_tbl):
-        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
-        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
-        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
-        .hword 0
-
-L(ipred_cfl_ac_420_w16_tbl):
-        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
-        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
-        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
-        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
 endfunc
 
+jumptable ipred_cfl_ac_420_tbl
+        .word L(ipred_cfl_ac_420_w16) - ipred_cfl_ac_420_tbl
+        .word L(ipred_cfl_ac_420_w8)  - ipred_cfl_ac_420_tbl
+        .word L(ipred_cfl_ac_420_w4)  - ipred_cfl_ac_420_tbl
+endjumptable
+
+jumptable ipred_cfl_ac_420_w16_tbl
+        .word L(ipred_cfl_ac_420_w16_wpad0) - ipred_cfl_ac_420_w16_tbl
+        .word L(ipred_cfl_ac_420_w16_wpad1) - ipred_cfl_ac_420_w16_tbl
+        .word L(ipred_cfl_ac_420_w16_wpad2) - ipred_cfl_ac_420_w16_tbl
+        .word L(ipred_cfl_ac_420_w16_wpad3) - ipred_cfl_ac_420_w16_tbl
+endjumptable
+
 // void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
 //                           const ptrdiff_t stride, const int w_pad,
 //                           const int h_pad, const int cw, const int ch);
 function ipred_cfl_ac_422_8bpc_neon, export=1
         clz             w8,  w5
         lsl             w4,  w4,  #2
-        adr             x7,  L(ipred_cfl_ac_422_tbl)
+        movrel          x7,  ipred_cfl_ac_422_tbl
         sub             w8,  w8,  #27
-        ldrh            w8,  [x7, w8, uxtw #1]
+        ldrsw           x8,  [x7, w8, uxtw #2]
         movi            v16.8h,  #0
         movi            v17.8h,  #0
         movi            v18.8h,  #0
         movi            v19.8h,  #0
-        sub             x7,  x7,  w8, uxtw
+        add             x7,  x7,  x8
         sub             w8,  w6,  w4         // height - h_pad
         rbit            w9,  w5              // rbit(width)
         rbit            w10, w6              // rbit(height)
@@ -4880,9 +4924,9 @@ L(ipred_cfl_ac_422_w8_wpad):
 
 L(ipred_cfl_ac_422_w16):
         AARCH64_VALID_JUMP_TARGET
-        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
-        ldrh            w3,  [x7, w3, uxtw #1]
-        sub             x7,  x7,  w3, uxtw
+        movrel          x7,  ipred_cfl_ac_422_w16_tbl
+        ldrsw           x3,  [x7, w3, uxtw #2]
+        add             x7,  x7,  x3
         br              x7
 
 L(ipred_cfl_ac_422_w16_wpad0):
@@ -4984,34 +5028,35 @@ L(ipred_cfl_ac_422_w16_wpad3):
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
-
-L(ipred_cfl_ac_422_tbl):
-        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
-        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
-        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
-        .hword 0
-
-L(ipred_cfl_ac_422_w16_tbl):
-        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
-        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
-        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
-        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
 endfunc
 
+jumptable ipred_cfl_ac_422_tbl
+        .word L(ipred_cfl_ac_422_w16) - ipred_cfl_ac_422_tbl
+        .word L(ipred_cfl_ac_422_w8)  - ipred_cfl_ac_422_tbl
+        .word L(ipred_cfl_ac_422_w4)  - ipred_cfl_ac_422_tbl
+endjumptable
+
+jumptable ipred_cfl_ac_422_w16_tbl
+        .word L(ipred_cfl_ac_422_w16_wpad0) - ipred_cfl_ac_422_w16_tbl
+        .word L(ipred_cfl_ac_422_w16_wpad1) - ipred_cfl_ac_422_w16_tbl
+        .word L(ipred_cfl_ac_422_w16_wpad2) - ipred_cfl_ac_422_w16_tbl
+        .word L(ipred_cfl_ac_422_w16_wpad3) - ipred_cfl_ac_422_w16_tbl
+endjumptable
+
 // void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
 //                           const ptrdiff_t stride, const int w_pad,
 //                           const int h_pad, const int cw, const int ch);
 function ipred_cfl_ac_444_8bpc_neon, export=1
         clz             w8,  w5
         lsl             w4,  w4,  #2
-        adr             x7,  L(ipred_cfl_ac_444_tbl)
+        movrel          x7,  ipred_cfl_ac_444_tbl
         sub             w8,  w8,  #26
-        ldrh            w8,  [x7, w8, uxtw #1]
+        ldrsw           x8,  [x7, w8, uxtw #2]
         movi            v16.8h,  #0
         movi            v17.8h,  #0
         movi            v18.8h,  #0
         movi            v19.8h,  #0
-        sub             x7,  x7,  w8, uxtw
+        add             x7,  x7,  x8
         sub             w8,  w6,  w4         // height - h_pad
         rbit            w9,  w5              // rbit(width)
         rbit            w10, w6              // rbit(height)
@@ -5132,9 +5177,10 @@ L(ipred_cfl_ac_444_w16_wpad):
 
 L(ipred_cfl_ac_444_w32):
         AARCH64_VALID_JUMP_TARGET
-        adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
-        ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1
-        sub             x7,  x7,  w3, uxtw
+        movrel          x7,  ipred_cfl_ac_444_w32_tbl
+        lsr             w3,  w3,  #1
+        ldrsw           x3,  [x7, w3, uxtw #2]
+        add             x7,  x7,  x3
         br              x7
 
 L(ipred_cfl_ac_444_w32_wpad0):
@@ -5279,16 +5325,18 @@ L(ipred_cfl_ac_444_w32_hpad):
         urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
         dup             v4.8h,   v4.h[0]
         b               L(ipred_cfl_ac_420_w8_subtract_dc)
-
-L(ipred_cfl_ac_444_tbl):
-        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
-        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
-        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
-        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
-
-L(ipred_cfl_ac_444_w32_tbl):
-        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
-        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
-        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
-        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
 endfunc
+
+jumptable ipred_cfl_ac_444_tbl
+        .word L(ipred_cfl_ac_444_w32) - ipred_cfl_ac_444_tbl
+        .word L(ipred_cfl_ac_444_w16) - ipred_cfl_ac_444_tbl
+        .word L(ipred_cfl_ac_444_w8)  - ipred_cfl_ac_444_tbl
+        .word L(ipred_cfl_ac_444_w4)  - ipred_cfl_ac_444_tbl
+endjumptable
+
+jumptable ipred_cfl_ac_444_w32_tbl
+        .word L(ipred_cfl_ac_444_w32_wpad0) - ipred_cfl_ac_444_w32_tbl
+        .word L(ipred_cfl_ac_444_w32_wpad2) - ipred_cfl_ac_444_w32_tbl
+        .word L(ipred_cfl_ac_444_w32_wpad4) - ipred_cfl_ac_444_w32_tbl
+        .word L(ipred_cfl_ac_444_w32_wpad6) - ipred_cfl_ac_444_w32_tbl
+endjumptable
diff --git a/src/arm/64/ipred16.S b/src/arm/64/ipred16.S
index 3f8cff986932d9189159305fec60ed64d3fbbe5d..2292a855655bb5c2c631c0b3afa0a5f81db991ae 100644
--- a/src/arm/64/ipred16.S
+++ b/src/arm/64/ipred16.S
@@ -36,17 +36,18 @@
 function ipred_dc_128_16bpc_neon, export=1
         ldr             w8,  [sp]
         clz             w3,  w3
-        adr             x5,  L(ipred_dc_128_tbl)
+        movrel          x5,  ipred_dc_128_tbl
         sub             w3,  w3,  #25
-        ldrh            w3,  [x5, w3, uxtw #1]
+        ldrsw           x3,  [x5, w3, uxtw #2]
         dup             v0.8h,   w8
-        sub             x5,  x5,  w3, uxtw
+        add             x5,  x5,  x3
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         urshr           v0.8h,   v0.8h,  #1
         br              x5
-4:
+40:
         AARCH64_VALID_JUMP_TARGET
+4:
         st1             {v0.4h},  [x0], x1
         st1             {v0.4h},  [x6], x1
         subs            w4,  w4,  #4
@@ -54,8 +55,9 @@ function ipred_dc_128_16bpc_neon, export=1
         st1             {v0.4h},  [x6], x1
         b.gt            4b
         ret
-8:
+80:
         AARCH64_VALID_JUMP_TARGET
+8:
         st1             {v0.8h},  [x0], x1
         st1             {v0.8h},  [x6], x1
         subs            w4,  w4,  #4
@@ -105,26 +107,27 @@ function ipred_dc_128_16bpc_neon, export=1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
         b.gt            64b
         ret
-
-L(ipred_dc_128_tbl):
-        .hword L(ipred_dc_128_tbl) - 640b
-        .hword L(ipred_dc_128_tbl) - 320b
-        .hword L(ipred_dc_128_tbl) - 160b
-        .hword L(ipred_dc_128_tbl) -   8b
-        .hword L(ipred_dc_128_tbl) -   4b
 endfunc
 
+jumptable ipred_dc_128_tbl
+        .word 640b - ipred_dc_128_tbl
+        .word 320b - ipred_dc_128_tbl
+        .word 160b - ipred_dc_128_tbl
+        .word 80b  - ipred_dc_128_tbl
+        .word 40b  - ipred_dc_128_tbl
+endjumptable
+
 // void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                         const pixel *const topleft,
 //                         const int width, const int height, const int a,
 //                         const int max_width, const int max_height);
 function ipred_v_16bpc_neon, export=1
         clz             w3,  w3
-        adr             x5,  L(ipred_v_tbl)
+        movrel          x5,  ipred_v_tbl
         sub             w3,  w3,  #25
-        ldrh            w3,  [x5, w3, uxtw #1]
+        ldrsw           x3,  [x5, w3, uxtw #2]
         add             x2,  x2,  #2
-        sub             x5,  x5,  w3, uxtw
+        add             x5,  x5,  x3
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -189,32 +192,34 @@ function ipred_v_16bpc_neon, export=1
         st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
         b.gt            64b
         ret
-
-L(ipred_v_tbl):
-        .hword L(ipred_v_tbl) - 640b
-        .hword L(ipred_v_tbl) - 320b
-        .hword L(ipred_v_tbl) - 160b
-        .hword L(ipred_v_tbl) -  80b
-        .hword L(ipred_v_tbl) -  40b
 endfunc
 
+jumptable ipred_v_tbl
+        .word 640b - ipred_v_tbl
+        .word 320b - ipred_v_tbl
+        .word 160b - ipred_v_tbl
+        .word 80b  - ipred_v_tbl
+        .word 40b  - ipred_v_tbl
+endjumptable
+
 // void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                         const pixel *const topleft,
 //                         const int width, const int height, const int a,
 //                         const int max_width, const int max_height);
 function ipred_h_16bpc_neon, export=1
         clz             w3,  w3
-        adr             x5,  L(ipred_h_tbl)
+        movrel          x5,  ipred_h_tbl
         sub             w3,  w3,  #25
-        ldrh            w3,  [x5, w3, uxtw #1]
+        ldrsw           x3,  [x5, w3, uxtw #2]
         sub             x2,  x2,  #8
-        sub             x5,  x5,  w3, uxtw
+        add             x5,  x5,  x3
         mov             x7,  #-8
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
-4:
+40:
         AARCH64_VALID_JUMP_TARGET
+4:
         ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
         st1             {v3.4h},  [x0], x1
         st1             {v2.4h},  [x6], x1
@@ -223,8 +228,9 @@ function ipred_h_16bpc_neon, export=1
         st1             {v0.4h},  [x6], x1
         b.gt            4b
         ret
-8:
+80:
         AARCH64_VALID_JUMP_TARGET
+8:
         ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
         st1             {v3.8h},  [x0], x1
         st1             {v2.8h},  [x6], x1
@@ -233,8 +239,9 @@ function ipred_h_16bpc_neon, export=1
         st1             {v0.8h},  [x6], x1
         b.gt            8b
         ret
-16:
+160:
         AARCH64_VALID_JUMP_TARGET
+16:
         ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
         str             q3,  [x0, #16]
         str             q2,  [x6, #16]
@@ -247,8 +254,9 @@ function ipred_h_16bpc_neon, export=1
         st1             {v0.8h}, [x6], x1
         b.gt            16b
         ret
-32:
+320:
         AARCH64_VALID_JUMP_TARGET
+32:
         ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
         str             q3,  [x0, #16]
         str             q2,  [x6, #16]
@@ -265,8 +273,9 @@ function ipred_h_16bpc_neon, export=1
         st1             {v0.8h}, [x6], x1
         b.gt            32b
         ret
-64:
+640:
         AARCH64_VALID_JUMP_TARGET
+64:
         ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
         str             q3,  [x0, #16]
         str             q2,  [x6, #16]
@@ -291,26 +300,27 @@ function ipred_h_16bpc_neon, export=1
         st1             {v0.8h}, [x6], x1
         b.gt            64b
         ret
-
-L(ipred_h_tbl):
-        .hword L(ipred_h_tbl) - 64b
-        .hword L(ipred_h_tbl) - 32b
-        .hword L(ipred_h_tbl) - 16b
-        .hword L(ipred_h_tbl) -  8b
-        .hword L(ipred_h_tbl) -  4b
 endfunc
 
+jumptable ipred_h_tbl
+        .word 640b - ipred_h_tbl
+        .word 320b - ipred_h_tbl
+        .word 160b - ipred_h_tbl
+        .word 80b  - ipred_h_tbl
+        .word 40b  - ipred_h_tbl
+endjumptable
+
 // void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                              const pixel *const topleft,
 //                              const int width, const int height, const int a,
 //                              const int max_width, const int max_height);
 function ipred_dc_top_16bpc_neon, export=1
         clz             w3,  w3
-        adr             x5,  L(ipred_dc_top_tbl)
+        movrel          x5,  ipred_dc_top_tbl
         sub             w3,  w3,  #25
-        ldrh            w3,  [x5, w3, uxtw #1]
+        ldrsw           x3,  [x5, w3, uxtw #2]
         add             x2,  x2,  #2
-        sub             x5,  x5,  w3, uxtw
+        add             x5,  x5,  x3
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -408,15 +418,16 @@ function ipred_dc_top_16bpc_neon, export=1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
         b.gt            64b
         ret
-
-L(ipred_dc_top_tbl):
-        .hword L(ipred_dc_top_tbl) - 640b
-        .hword L(ipred_dc_top_tbl) - 320b
-        .hword L(ipred_dc_top_tbl) - 160b
-        .hword L(ipred_dc_top_tbl) -  80b
-        .hword L(ipred_dc_top_tbl) -  40b
 endfunc
 
+jumptable ipred_dc_top_tbl
+        .word 640b - ipred_dc_top_tbl
+        .word 320b - ipred_dc_top_tbl
+        .word 160b - ipred_dc_top_tbl
+        .word 80b  - ipred_dc_top_tbl
+        .word 40b  - ipred_dc_top_tbl
+endjumptable
+
 // void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                               const pixel *const topleft,
 //                               const int width, const int height, const int a,
@@ -425,13 +436,13 @@ function ipred_dc_left_16bpc_neon, export=1
         sub             x2,  x2,  w4, uxtw #1
         clz             w3,  w3
         clz             w7,  w4
-        adr             x5,  L(ipred_dc_left_tbl)
+        movrel          x5,  ipred_dc_left_tbl
         sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
         sub             w7,  w7,  #25
-        ldrh            w3,  [x5, w3, uxtw #1]
-        ldrh            w7,  [x5, w7, uxtw #1]
-        sub             x3,  x5,  w3, uxtw
-        sub             x5,  x5,  w7, uxtw
+        ldrsw           x3,  [x5, w3, uxtw #2]
+        ldrsw           x7,  [x5, w7, uxtw #2]
+        add             x3,  x5,  x3
+        add             x5,  x5,  x7
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -445,12 +456,13 @@ L(ipred_dc_left_h4):
         br              x3
 L(ipred_dc_left_w4):
         AARCH64_VALID_JUMP_TARGET
+1:
         st1             {v0.4h},  [x0], x1
         st1             {v0.4h},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.4h},  [x0], x1
         st1             {v0.4h},  [x6], x1
-        b.gt            L(ipred_dc_left_w4)
+        b.gt            1b
         ret
 
 L(ipred_dc_left_h8):
@@ -462,12 +474,13 @@ L(ipred_dc_left_h8):
         br              x3
 L(ipred_dc_left_w8):
         AARCH64_VALID_JUMP_TARGET
+1:
         st1             {v0.8h},  [x0], x1
         st1             {v0.8h},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h},  [x0], x1
         st1             {v0.8h},  [x6], x1
-        b.gt            L(ipred_dc_left_w8)
+        b.gt            1b
         ret
 
 L(ipred_dc_left_h16):
@@ -549,20 +562,21 @@ L(ipred_dc_left_w64):
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
         b.gt            1b
         ret
-
-L(ipred_dc_left_tbl):
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
 endfunc
 
+jumptable ipred_dc_left_tbl
+        .word L(ipred_dc_left_h64) - ipred_dc_left_tbl
+        .word L(ipred_dc_left_h32) - ipred_dc_left_tbl
+        .word L(ipred_dc_left_h16) - ipred_dc_left_tbl
+        .word L(ipred_dc_left_h8)  - ipred_dc_left_tbl
+        .word L(ipred_dc_left_h4)  - ipred_dc_left_tbl
+        .word L(ipred_dc_left_w64) - ipred_dc_left_tbl
+        .word L(ipred_dc_left_w32) - ipred_dc_left_tbl
+        .word L(ipred_dc_left_w16) - ipred_dc_left_tbl
+        .word L(ipred_dc_left_w8)  - ipred_dc_left_tbl
+        .word L(ipred_dc_left_w4)  - ipred_dc_left_tbl
+endjumptable
+
 // void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                          const pixel *const topleft,
 //                          const int width, const int height, const int a,
@@ -573,16 +587,16 @@ function ipred_dc_16bpc_neon, export=1
         clz             w3,  w3
         clz             w6,  w4
         dup             v16.4s, w7               // width + height
-        adr             x5,  L(ipred_dc_tbl)
+        movrel          x5,  ipred_dc_tbl
         rbit            w7,  w7                  // rbit(width + height)
         sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
         sub             w6,  w6,  #25
         clz             w7,  w7                  // ctz(width + height)
-        ldrh            w3,  [x5, w3, uxtw #1]
-        ldrh            w6,  [x5, w6, uxtw #1]
+        ldrsw           x3,  [x5, w3, uxtw #2]
+        ldrsw           x6,  [x5, w6, uxtw #2]
         neg             w7,  w7                  // -ctz(width + height)
-        sub             x3,  x5,  w3, uxtw
-        sub             x5,  x5,  w6, uxtw
+        add             x3,  x5,  x3
+        add             x5,  x5,  x6
         ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
         dup             v17.4s,  w7              // -ctz(width + height)
         add             x6,  x0,  x1
@@ -794,33 +808,34 @@ L(ipred_dc_w64):
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
         b.gt            2b
         ret
-
-L(ipred_dc_tbl):
-        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
 endfunc
 
+jumptable ipred_dc_tbl
+        .word L(ipred_dc_h64) - ipred_dc_tbl
+        .word L(ipred_dc_h32) - ipred_dc_tbl
+        .word L(ipred_dc_h16) - ipred_dc_tbl
+        .word L(ipred_dc_h8)  - ipred_dc_tbl
+        .word L(ipred_dc_h4)  - ipred_dc_tbl
+        .word L(ipred_dc_w64) - ipred_dc_tbl
+        .word L(ipred_dc_w32) - ipred_dc_tbl
+        .word L(ipred_dc_w16) - ipred_dc_tbl
+        .word L(ipred_dc_w8)  - ipred_dc_tbl
+        .word L(ipred_dc_w4)  - ipred_dc_tbl
+endjumptable
+
 // void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                             const pixel *const topleft,
 //                             const int width, const int height, const int a,
 //                             const int max_width, const int max_height);
 function ipred_paeth_16bpc_neon, export=1
         clz             w9,  w3
-        adr             x5,  L(ipred_paeth_tbl)
+        movrel          x5,  ipred_paeth_tbl
         sub             w9,  w9,  #25
-        ldrh            w9,  [x5, w9, uxtw #1]
+        ldrsw           x9,  [x5, w9, uxtw #2]
         ld1r            {v4.8h},  [x2]
         add             x8,  x2,  #2
         sub             x2,  x2,  #8
-        sub             x5,  x5,  w9, uxtw
+        add             x5,  x5,  x9
         mov             x7,  #-8
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
@@ -933,15 +948,16 @@ function ipred_paeth_16bpc_neon, export=1
         b               1b
 9:
         ret
-
-L(ipred_paeth_tbl):
-        .hword L(ipred_paeth_tbl) - 640b
-        .hword L(ipred_paeth_tbl) - 320b
-        .hword L(ipred_paeth_tbl) - 160b
-        .hword L(ipred_paeth_tbl) -  80b
-        .hword L(ipred_paeth_tbl) -  40b
 endfunc
 
+jumptable ipred_paeth_tbl
+        .word 640b - ipred_paeth_tbl
+        .word 320b - ipred_paeth_tbl
+        .word 160b - ipred_paeth_tbl
+        .word 80b  - ipred_paeth_tbl
+        .word 40b  - ipred_paeth_tbl
+endjumptable
+
 // void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                              const pixel *const topleft,
 //                              const int width, const int height, const int a,
@@ -951,13 +967,13 @@ function ipred_smooth_16bpc_neon, export=1
         add             x11, x10, w4, uxtw
         add             x10, x10, w3, uxtw
         clz             w9,  w3
-        adr             x5,  L(ipred_smooth_tbl)
+        movrel          x5,  ipred_smooth_tbl
         sub             x12, x2,  w4, uxtw #1
         sub             w9,  w9,  #25
-        ldrh            w9,  [x5, w9, uxtw #1]
+        ldrsw           x9,  [x5, w9, uxtw #2]
         ld1r            {v4.8h},  [x12] // bottom
         add             x8,  x2,  #2
-        sub             x5,  x5,  w9, uxtw
+        add             x5,  x5,  x9
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -1137,15 +1153,16 @@ function ipred_smooth_16bpc_neon, export=1
         b               1b
 9:
         ret
-
-L(ipred_smooth_tbl):
-        .hword L(ipred_smooth_tbl) - 640b
-        .hword L(ipred_smooth_tbl) - 320b
-        .hword L(ipred_smooth_tbl) - 160b
-        .hword L(ipred_smooth_tbl) -  80b
-        .hword L(ipred_smooth_tbl) -  40b
 endfunc
 
+jumptable ipred_smooth_tbl
+        .word 640b - ipred_smooth_tbl
+        .word 320b - ipred_smooth_tbl
+        .word 160b - ipred_smooth_tbl
+        .word 80b  - ipred_smooth_tbl
+        .word 40b  - ipred_smooth_tbl
+endjumptable
+
 // void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                                const pixel *const topleft,
 //                                const int width, const int height, const int a,
@@ -1154,13 +1171,13 @@ function ipred_smooth_v_16bpc_neon, export=1
         movrel          x7,  X(sm_weights)
         add             x7,  x7,  w4, uxtw
         clz             w9,  w3
-        adr             x5,  L(ipred_smooth_v_tbl)
+        movrel          x5,  ipred_smooth_v_tbl
         sub             x8,  x2,  w4, uxtw #1
         sub             w9,  w9,  #25
-        ldrh            w9,  [x5, w9, uxtw #1]
+        ldrsw           x9,  [x5, w9, uxtw #2]
         ld1r            {v4.8h},  [x8] // bottom
         add             x2,  x2,  #2
-        sub             x5,  x5,  w9, uxtw
+        add             x5,  x5,  x9
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -1264,15 +1281,16 @@ function ipred_smooth_v_16bpc_neon, export=1
         b               1b
 9:
         ret
-
-L(ipred_smooth_v_tbl):
-        .hword L(ipred_smooth_v_tbl) - 640b
-        .hword L(ipred_smooth_v_tbl) - 320b
-        .hword L(ipred_smooth_v_tbl) - 160b
-        .hword L(ipred_smooth_v_tbl) -  80b
-        .hword L(ipred_smooth_v_tbl) -  40b
 endfunc
 
+jumptable ipred_smooth_v_tbl
+        .word 640b - ipred_smooth_v_tbl
+        .word 320b - ipred_smooth_v_tbl
+        .word 160b - ipred_smooth_v_tbl
+        .word 80b  - ipred_smooth_v_tbl
+        .word 40b  - ipred_smooth_v_tbl
+endjumptable
+
 // void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                                const pixel *const topleft,
 //                                const int width, const int height, const int a,
@@ -1281,12 +1299,12 @@ function ipred_smooth_h_16bpc_neon, export=1
         movrel          x8,  X(sm_weights)
         add             x8,  x8,  w3, uxtw
         clz             w9,  w3
-        adr             x5,  L(ipred_smooth_h_tbl)
+        movrel          x5,  ipred_smooth_h_tbl
         add             x12, x2,  w3, uxtw #1
         sub             w9,  w9,  #25
-        ldrh            w9,  [x5, w9, uxtw #1]
+        ldrsw           x9,  [x5, w9, uxtw #2]
         ld1r            {v5.8h},  [x12] // right
-        sub             x5,  x5,  w9, uxtw
+        add             x5,  x5,  x9
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -1396,15 +1414,16 @@ function ipred_smooth_h_16bpc_neon, export=1
         b               1b
 9:
         ret
-
-L(ipred_smooth_h_tbl):
-        .hword L(ipred_smooth_h_tbl) - 640b
-        .hword L(ipred_smooth_h_tbl) - 320b
-        .hword L(ipred_smooth_h_tbl) - 160b
-        .hword L(ipred_smooth_h_tbl) -  80b
-        .hword L(ipred_smooth_h_tbl) -  40b
 endfunc
 
+jumptable ipred_smooth_h_tbl
+        .word 640b - ipred_smooth_h_tbl
+        .word 320b - ipred_smooth_h_tbl
+        .word 160b - ipred_smooth_h_tbl
+        .word 80b  - ipred_smooth_h_tbl
+        .word 40b  - ipred_smooth_h_tbl
+endjumptable
+
 const padding_mask_buf
         .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
         .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
@@ -1728,11 +1747,11 @@ endfunc
 //                                const int dx, const int max_base_x);
 function ipred_z1_fill1_16bpc_neon, export=1
         clz             w9,  w3
-        adr             x8,  L(ipred_z1_fill1_tbl)
+        movrel          x8,  ipred_z1_fill1_tbl
         sub             w9,  w9,  #25
-        ldrh            w9,  [x8, w9, uxtw #1]
+        ldrsw           x9,  [x8, w9, uxtw #2]
         add             x10, x2,  w6,  uxtw #1    // top[max_base_x]
-        sub             x8,  x8,  w9,  uxtw
+        add             x8,  x8,  x9
         ld1r            {v31.8h}, [x10]           // padding
         mov             w7,  w5
         mov             w15, #64
@@ -1916,15 +1935,16 @@ function ipred_z1_fill1_16bpc_neon, export=1
         add             x13, x13, x1
         mov             w3,  w12
         b               169b
-
-L(ipred_z1_fill1_tbl):
-        .hword L(ipred_z1_fill1_tbl) - 640b
-        .hword L(ipred_z1_fill1_tbl) - 320b
-        .hword L(ipred_z1_fill1_tbl) - 160b
-        .hword L(ipred_z1_fill1_tbl) -  80b
-        .hword L(ipred_z1_fill1_tbl) -  40b
 endfunc
 
+jumptable ipred_z1_fill1_tbl
+        .word 640b - ipred_z1_fill1_tbl
+        .word 320b - ipred_z1_fill1_tbl
+        .word 160b - ipred_z1_fill1_tbl
+        .word 80b  - ipred_z1_fill1_tbl
+        .word 40b  - ipred_z1_fill1_tbl
+endjumptable
+
 function ipred_z1_fill2_16bpc_neon, export=1
         cmp             w3,  #8
         add             x10, x2,  w6,  uxtw       // top[max_base_x]
@@ -2050,11 +2070,11 @@ endconst
 //                                const int dx, const int dy);
 function ipred_z2_fill1_16bpc_neon, export=1
         clz             w10, w4
-        adr             x9,  L(ipred_z2_fill1_tbl)
+        movrel          x9,  ipred_z2_fill1_tbl
         sub             w10, w10, #25
-        ldrh            w10, [x9, w10, uxtw #1]
+        ldrsw           x10, [x9, w10, uxtw #2]
         mov             w8,  #(1 << 6)            // xpos = 1 << 6
-        sub             x9,  x9,  w10, uxtw
+        add             x9,  x9,  x10
         sub             w8,  w8,  w6              // xpos -= dx
 
         movrel          x11, increments
@@ -2814,15 +2834,16 @@ function ipred_z2_fill1_16bpc_neon, export=1
         ldp             d10, d11, [sp, #0x10]
         ldp             d8,  d9,  [sp], 0x40
         ret
-
-L(ipred_z2_fill1_tbl):
-        .hword L(ipred_z2_fill1_tbl) - 640b
-        .hword L(ipred_z2_fill1_tbl) - 320b
-        .hword L(ipred_z2_fill1_tbl) - 160b
-        .hword L(ipred_z2_fill1_tbl) -  80b
-        .hword L(ipred_z2_fill1_tbl) -  40b
 endfunc
 
+jumptable ipred_z2_fill1_tbl
+        .word 640b - ipred_z2_fill1_tbl
+        .word 320b - ipred_z2_fill1_tbl
+        .word 160b - ipred_z2_fill1_tbl
+        .word 80b  - ipred_z2_fill1_tbl
+        .word 40b  - ipred_z2_fill1_tbl
+endjumptable
+
 function ipred_z2_fill2_16bpc_neon, export=1
         cmp             w4,  #8
         mov             w8,  #(2 << 6)            // xpos = 2 << 6
@@ -3432,11 +3453,11 @@ endfunc
 //                                const int dy, const int max_base_y);
 function ipred_z3_fill1_16bpc_neon, export=1
         clz             w9,  w4
-        adr             x8,  L(ipred_z3_fill1_tbl)
+        movrel          x8,  ipred_z3_fill1_tbl
         sub             w9,  w9,  #25
-        ldrh            w9,  [x8, w9, uxtw #1]
+        ldrsw           x9,  [x8, w9, uxtw #2]
         add             x10, x2,  w6,  uxtw #1    // left[max_base_y]
-        sub             x8,  x8,  w9,  uxtw
+        add             x8,  x8,  x9
         ld1r            {v31.8h}, [x10]           // padding
         mov             w7,  w5
         mov             w15, #64
@@ -3637,19 +3658,20 @@ function ipred_z3_fill1_16bpc_neon, export=1
         b               1b
 9:
         ret
-
-L(ipred_z3_fill1_tbl):
-        .hword L(ipred_z3_fill1_tbl) - 640b
-        .hword L(ipred_z3_fill1_tbl) - 320b
-        .hword L(ipred_z3_fill1_tbl) - 160b
-        .hword L(ipred_z3_fill1_tbl) -  80b
-        .hword L(ipred_z3_fill1_tbl) -  40b
 endfunc
 
+jumptable ipred_z3_fill1_tbl
+        .word 640b - ipred_z3_fill1_tbl
+        .word 320b - ipred_z3_fill1_tbl
+        .word 160b - ipred_z3_fill1_tbl
+        .word 80b  - ipred_z3_fill1_tbl
+        .word 40b  - ipred_z3_fill1_tbl
+endjumptable
+
 function ipred_z3_fill_padding_neon, export=0
         cmp             w3,  #8
-        adr             x8,  L(ipred_z3_fill_padding_tbl)
-        b.gt            L(ipred_z3_fill_padding_wide)
+        movrel          x8,  ipred_z3_fill_padding_tbl
+        b.gt            ipred_z3_fill_padding_wide
         // w3 = remaining width, w4 = constant height
         mov             w12, w4
 
@@ -3659,12 +3681,13 @@ function ipred_z3_fill_padding_neon, export=0
         // power of two in the remaining width, and repeating.
         clz             w9,  w3
         sub             w9,  w9,  #25
-        ldrh            w9,  [x8, w9, uxtw #1]
-        sub             x9,  x8,  w9,  uxtw
+        ldrsw           x9,  [x8, w9, uxtw #2]
+        add             x9,  x8,  x9
         br              x9
 
-2:
+20:
         AARCH64_VALID_JUMP_TARGET
+2:
         st1             {v31.s}[0], [x0],  x1
         subs            w4,  w4,  #4
         st1             {v31.s}[0], [x13], x1
@@ -3682,8 +3705,9 @@ function ipred_z3_fill_padding_neon, export=0
         mov             w4,  w12
         b               1b
 
-4:
+40:
         AARCH64_VALID_JUMP_TARGET
+4:
         st1             {v31.4h}, [x0],  x1
         subs            w4,  w4,  #4
         st1             {v31.4h}, [x13], x1
@@ -3701,17 +3725,18 @@ function ipred_z3_fill_padding_neon, export=0
         mov             w4,  w12
         b               1b
 
-8:
-16:
-32:
-64:
+80:
+160:
+320:
+640:
         AARCH64_VALID_JUMP_TARGET
+8:
         st1             {v31.8h}, [x0],  x1
         subs            w4,  w4,  #4
         st1             {v31.8h}, [x13], x1
         st1             {v31.8h}, [x0],  x1
         st1             {v31.8h}, [x13], x1
-        b.gt            4b
+        b.gt            8b
         subs            w3,  w3,  #8
         lsr             x1,  x1,  #1
         msub            x0,  x1,  x12, x0         // ptr -= h * stride
@@ -3725,16 +3750,18 @@ function ipred_z3_fill_padding_neon, export=0
 
 9:
         ret
+endfunc
 
-L(ipred_z3_fill_padding_tbl):
-        .hword L(ipred_z3_fill_padding_tbl) - 64b
-        .hword L(ipred_z3_fill_padding_tbl) - 32b
-        .hword L(ipred_z3_fill_padding_tbl) - 16b
-        .hword L(ipred_z3_fill_padding_tbl) -  8b
-        .hword L(ipred_z3_fill_padding_tbl) -  4b
-        .hword L(ipred_z3_fill_padding_tbl) -  2b
+jumptable ipred_z3_fill_padding_tbl
+        .word 640b - ipred_z3_fill_padding_tbl
+        .word 320b - ipred_z3_fill_padding_tbl
+        .word 160b - ipred_z3_fill_padding_tbl
+        .word 80b  - ipred_z3_fill_padding_tbl
+        .word 40b  - ipred_z3_fill_padding_tbl
+        .word 20b  - ipred_z3_fill_padding_tbl
+endjumptable
 
-L(ipred_z3_fill_padding_wide):
+function ipred_z3_fill_padding_wide
         // Fill a WxH rectangle with padding, with W > 8.
         lsr             x1,  x1,  #1
         mov             w12, w3
@@ -3883,13 +3910,13 @@ function ipred_filter_\bpc\()bpc_neon
         add             x6,  x6,  w5, uxtw
         ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
         clz             w9,  w3
-        adr             x5,  L(ipred_filter\bpc\()_tbl)
+        movrel          x5,  ipred_filter\bpc\()_tbl
         ld1             {v20.8b, v21.8b, v22.8b}, [x6]
         sub             w9,  w9,  #26
-        ldrh            w9,  [x5, w9, uxtw #1]
+        ldrsw           x9,  [x5, w9, uxtw #2]
         sxtl            v16.8h,  v16.8b
         sxtl            v17.8h,  v17.8b
-        sub             x5,  x5,  w9, uxtw
+        add             x5,  x5,  x9
         sxtl            v18.8h,  v18.8b
         sxtl            v19.8h,  v19.8b
         add             x6,  x0,  x1
@@ -4162,13 +4189,14 @@ function ipred_filter_\bpc\()bpc_neon
         b               1b
 9:
         ret
-
-L(ipred_filter\bpc\()_tbl):
-        .hword L(ipred_filter\bpc\()_tbl) - 320b
-        .hword L(ipred_filter\bpc\()_tbl) - 160b
-        .hword L(ipred_filter\bpc\()_tbl) -  80b
-        .hword L(ipred_filter\bpc\()_tbl) -  40b
 endfunc
+
+jumptable ipred_filter\bpc\()_tbl
+        .word 320b - ipred_filter\bpc\()_tbl
+        .word 160b - ipred_filter\bpc\()_tbl
+        .word 80b  - ipred_filter\bpc\()_tbl
+        .word 40b  - ipred_filter\bpc\()_tbl
+endjumptable
 .endm
 
 filter_fn 10
@@ -4187,12 +4215,12 @@ endfunc
 function pal_pred_16bpc_neon, export=1
         ld1             {v30.8h}, [x2]
         clz             w9,  w4
-        adr             x6,  L(pal_pred_tbl)
+        movrel          x6,  pal_pred_tbl
         sub             w9,  w9,  #25
         movi            v29.16b, #7
-        ldrh            w9,  [x6, w9, uxtw #1]
+        ldrsw           x9,  [x6, w9, uxtw #2]
         movi            v31.8h,  #1, lsl #8
-        sub             x6,  x6,  w9, uxtw
+        add             x6,  x6,  x9
         br              x6
 40:
         AARCH64_VALID_JUMP_TARGET
@@ -4391,15 +4419,16 @@ function pal_pred_16bpc_neon, export=1
         st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
         b.gt            64b
         ret
-
-L(pal_pred_tbl):
-        .hword L(pal_pred_tbl) - 640b
-        .hword L(pal_pred_tbl) - 320b
-        .hword L(pal_pred_tbl) - 160b
-        .hword L(pal_pred_tbl) -  80b
-        .hword L(pal_pred_tbl) -  40b
 endfunc
 
+jumptable pal_pred_tbl
+        .word 640b - pal_pred_tbl
+        .word 320b - pal_pred_tbl
+        .word 160b - pal_pred_tbl
+        .word 80b  - pal_pred_tbl
+        .word 40b  - pal_pred_tbl
+endjumptable
+
 // void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                               const pixel *const topleft,
 //                               const int width, const int height,
@@ -4408,18 +4437,19 @@ endfunc
 function ipred_cfl_128_16bpc_neon, export=1
         dup             v31.8h,  w7   // bitdepth_max
         clz             w9,  w3
-        adr             x7,  L(ipred_cfl_128_tbl)
+        movrel          x7,  ipred_cfl_128_tbl
         sub             w9,  w9,  #26
-        ldrh            w9,  [x7, w9, uxtw #1]
+        ldrsw           x9,  [x7, w9, uxtw #2]
         urshr           v0.8h,   v31.8h,  #1
         dup             v1.8h,   w6   // alpha
-        sub             x7,  x7,  w9, uxtw
+        add             x7,  x7,  x9
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         movi            v30.8h,  #0
         br              x7
 L(ipred_cfl_splat_w4):
         AARCH64_VALID_JUMP_TARGET
+1:
         ld1             {v4.8h, v5.8h}, [x5], #32
         subs            w4,  w4,  #4
         smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
@@ -4448,10 +4478,11 @@ L(ipred_cfl_splat_w4):
         st1             {v2.d}[1],  [x6], x1
         st1             {v3.d}[0],  [x0], x1
         st1             {v3.d}[1],  [x6], x1
-        b.gt            L(ipred_cfl_splat_w4)
+        b.gt            1b
         ret
 L(ipred_cfl_splat_w8):
         AARCH64_VALID_JUMP_TARGET
+1:
         ld1             {v4.8h, v5.8h}, [x5], #32
         subs            w4,  w4,  #2
         smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
@@ -4478,7 +4509,7 @@ L(ipred_cfl_splat_w8):
         smin            v3.8h,   v3.8h,   v31.8h
         st1             {v2.8h},  [x0], x1
         st1             {v3.8h},  [x6], x1
-        b.gt            L(ipred_cfl_splat_w8)
+        b.gt            1b
         ret
 L(ipred_cfl_splat_w16):
         AARCH64_VALID_JUMP_TARGET
@@ -4544,15 +4575,16 @@ L(ipred_cfl_splat_w16):
         mov             w3,  w9
         b.gt            1b
         ret
-
-L(ipred_cfl_128_tbl):
-L(ipred_cfl_splat_tbl):
-        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
-        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
-        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
-        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
 endfunc
 
+jumptable ipred_cfl_128_tbl
+ipred_cfl_splat_tbl:
+        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
+        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
+        .word L(ipred_cfl_splat_w8) - ipred_cfl_128_tbl
+        .word L(ipred_cfl_splat_w4) - ipred_cfl_128_tbl
+endjumptable
+
 // void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                               const pixel *const topleft,
 //                               const int width, const int height,
@@ -4561,12 +4593,12 @@ endfunc
 function ipred_cfl_top_16bpc_neon, export=1
         dup             v31.8h,  w7   // bitdepth_max
         clz             w9,  w3
-        adr             x7,  L(ipred_cfl_top_tbl)
+        movrel          x7,  ipred_cfl_top_tbl
         sub             w9,  w9,  #26
-        ldrh            w9,  [x7, w9, uxtw #1]
+        ldrsw           x9,  [x7, w9, uxtw #2]
         dup             v1.8h,   w6   // alpha
         add             x2,  x2,  #2
-        sub             x7,  x7,  w9, uxtw
+        add             x7,  x7,  x9
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         movi            v30.8h,  #0
@@ -4603,14 +4635,15 @@ function ipred_cfl_top_16bpc_neon, export=1
         rshrn           v0.4h,   v0.4s,   #5
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w16)
-
-L(ipred_cfl_top_tbl):
-        .hword L(ipred_cfl_top_tbl) - 32b
-        .hword L(ipred_cfl_top_tbl) - 16b
-        .hword L(ipred_cfl_top_tbl) -  8b
-        .hword L(ipred_cfl_top_tbl) -  4b
 endfunc
 
+jumptable ipred_cfl_top_tbl
+        .word 32b - ipred_cfl_top_tbl
+        .word 16b - ipred_cfl_top_tbl
+        .word 8b  - ipred_cfl_top_tbl
+        .word 4b  - ipred_cfl_top_tbl
+endjumptable
+
 // void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                                const pixel *const topleft,
 //                                const int width, const int height,
@@ -4621,15 +4654,15 @@ function ipred_cfl_left_16bpc_neon, export=1
         sub             x2,  x2,  w4, uxtw #1
         clz             w9,  w3
         clz             w8,  w4
-        adr             x10, L(ipred_cfl_splat_tbl)
-        adr             x7,  L(ipred_cfl_left_tbl)
+        movrel          x10, ipred_cfl_splat_tbl
+        movrel          x7,  ipred_cfl_left_tbl
         sub             w9,  w9,  #26
         sub             w8,  w8,  #26
-        ldrh            w9,  [x10, w9, uxtw #1]
-        ldrh            w8,  [x7,  w8, uxtw #1]
+        ldrsw           x9,  [x10, w9, uxtw #2]
+        ldrsw           x8,  [x7,  w8, uxtw #2]
         dup             v1.8h,   w6   // alpha
-        sub             x9,  x10, w9, uxtw
-        sub             x7,  x7,  w8, uxtw
+        add             x9,  x10, x9
+        add             x7,  x7,  x8
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         movi            v30.8h,  #0
@@ -4670,14 +4703,15 @@ L(ipred_cfl_left_h32):
         rshrn           v0.4h,   v0.4s,   #5
         dup             v0.8h,   v0.h[0]
         br              x9
-
-L(ipred_cfl_left_tbl):
-        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
-        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
-        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
-        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
 endfunc
 
+jumptable ipred_cfl_left_tbl
+        .word L(ipred_cfl_left_h32) - ipred_cfl_left_tbl
+        .word L(ipred_cfl_left_h16) - ipred_cfl_left_tbl
+        .word L(ipred_cfl_left_h8)  - ipred_cfl_left_tbl
+        .word L(ipred_cfl_left_h4)  - ipred_cfl_left_tbl
+endjumptable
+
 // void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                           const pixel *const topleft,
 //                           const int width, const int height,
@@ -4691,16 +4725,16 @@ function ipred_cfl_16bpc_neon, export=1
         clz             w9,  w3
         clz             w6,  w4
         dup             v16.4s, w8               // width + height
-        adr             x7,  L(ipred_cfl_tbl)
+        movrel          x7,  ipred_cfl_tbl
         rbit            w8,  w8                  // rbit(width + height)
         sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
         sub             w6,  w6,  #26
         clz             w8,  w8                  // ctz(width + height)
-        ldrh            w9,  [x7, w9, uxtw #1]
-        ldrh            w6,  [x7, w6, uxtw #1]
+        ldrsw           x9,  [x7, w9, uxtw #2]
+        ldrsw           x6,  [x7, w6, uxtw #2]
         neg             w8,  w8                  // -ctz(width + height)
-        sub             x9,  x7,  w9, uxtw
-        sub             x7,  x7,  w6, uxtw
+        add             x9,  x7,  x9
+        add             x7,  x7,  x6
         ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
         dup             v17.4s,  w8              // -ctz(width + height)
         add             x6,  x0,  x1
@@ -4823,32 +4857,33 @@ L(ipred_cfl_w32):
 1:
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w16)
-
-L(ipred_cfl_tbl):
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
 endfunc
 
+jumptable ipred_cfl_tbl
+        .word L(ipred_cfl_h32) - ipred_cfl_tbl
+        .word L(ipred_cfl_h16) - ipred_cfl_tbl
+        .word L(ipred_cfl_h8)  - ipred_cfl_tbl
+        .word L(ipred_cfl_h4)  - ipred_cfl_tbl
+        .word L(ipred_cfl_w32) - ipred_cfl_tbl
+        .word L(ipred_cfl_w16) - ipred_cfl_tbl
+        .word L(ipred_cfl_w8)  - ipred_cfl_tbl
+        .word L(ipred_cfl_w4)  - ipred_cfl_tbl
+endjumptable
+
 // void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
 //                            const ptrdiff_t stride, const int w_pad,
 //                            const int h_pad, const int cw, const int ch);
 function ipred_cfl_ac_420_16bpc_neon, export=1
         clz             w8,  w5
         lsl             w4,  w4,  #2
-        adr             x7,  L(ipred_cfl_ac_420_tbl)
+        movrel          x7,  ipred_cfl_ac_420_tbl
         sub             w8,  w8,  #27
-        ldrh            w8,  [x7, w8, uxtw #1]
+        ldrsw           x8,  [x7, w8, uxtw #2]
         movi            v24.4s,  #0
         movi            v25.4s,  #0
         movi            v26.4s,  #0
         movi            v27.4s,  #0
-        sub             x7,  x7,  w8, uxtw
+        add             x7,  x7,  x8
         sub             w8,  w6,  w4         // height - h_pad
         rbit            w9,  w5              // rbit(width)
         rbit            w10, w6              // rbit(height)
@@ -4980,9 +5015,9 @@ L(ipred_cfl_ac_420_w8_hpad):
 
 L(ipred_cfl_ac_420_w16):
         AARCH64_VALID_JUMP_TARGET
-        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
-        ldrh            w3,  [x7, w3, uxtw #1]
-        sub             x7,  x7,  w3, uxtw
+        movrel          x7,  ipred_cfl_ac_420_w16_tbl
+        ldrsw           x3,  [x7, w3, uxtw #2]
+        add             x7,  x7,  x3
         br              x7
 
 L(ipred_cfl_ac_420_w16_wpad0):
@@ -5158,34 +5193,35 @@ L(ipred_cfl_ac_420_w16_hpad):
         // Quadruple the height and reuse the w4 summing/subtracting
         lsl             w6,  w6,  #2
         b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
-
-L(ipred_cfl_ac_420_tbl):
-        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
-        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
-        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
-        .hword 0
-
-L(ipred_cfl_ac_420_w16_tbl):
-        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
-        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
-        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
-        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
 endfunc
 
+jumptable ipred_cfl_ac_420_tbl
+        .word L(ipred_cfl_ac_420_w16) - ipred_cfl_ac_420_tbl
+        .word L(ipred_cfl_ac_420_w8)  - ipred_cfl_ac_420_tbl
+        .word L(ipred_cfl_ac_420_w4)  - ipred_cfl_ac_420_tbl
+endjumptable
+
+jumptable ipred_cfl_ac_420_w16_tbl
+        .word L(ipred_cfl_ac_420_w16_wpad0) - ipred_cfl_ac_420_w16_tbl
+        .word L(ipred_cfl_ac_420_w16_wpad1) - ipred_cfl_ac_420_w16_tbl
+        .word L(ipred_cfl_ac_420_w16_wpad2) - ipred_cfl_ac_420_w16_tbl
+        .word L(ipred_cfl_ac_420_w16_wpad3) - ipred_cfl_ac_420_w16_tbl
+endjumptable
+
 // void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
 //                            const ptrdiff_t stride, const int w_pad,
 //                            const int h_pad, const int cw, const int ch);
 function ipred_cfl_ac_422_16bpc_neon, export=1
         clz             w8,  w5
         lsl             w4,  w4,  #2
-        adr             x7,  L(ipred_cfl_ac_422_tbl)
+        movrel          x7,  ipred_cfl_ac_422_tbl
         sub             w8,  w8,  #27
-        ldrh            w8,  [x7, w8, uxtw #1]
+        ldrsw           x8,  [x7, w8, uxtw #2]
         movi            v24.4s,  #0
         movi            v25.4s,  #0
         movi            v26.4s,  #0
         movi            v27.4s,  #0
-        sub             x7,  x7,  w8, uxtw
+        add             x7,  x7,  x8
         sub             w8,  w6,  w4         // height - h_pad
         rbit            w9,  w5              // rbit(width)
         rbit            w10, w6              // rbit(height)
@@ -5286,9 +5322,9 @@ L(ipred_cfl_ac_422_w8_wpad):
 
 L(ipred_cfl_ac_422_w16):
         AARCH64_VALID_JUMP_TARGET
-        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
-        ldrh            w3,  [x7, w3, uxtw #1]
-        sub             x7,  x7,  w3, uxtw
+        movrel          x7,  ipred_cfl_ac_422_w16_tbl
+        ldrsw           x3,  [x7, w3, uxtw #2]
+        add             x7,  x7,  x3
         br              x7
 
 L(ipred_cfl_ac_422_w16_wpad0):
@@ -5406,34 +5442,35 @@ L(ipred_cfl_ac_422_w16_wpad3):
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
-
-L(ipred_cfl_ac_422_tbl):
-        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
-        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
-        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
-        .hword 0
-
-L(ipred_cfl_ac_422_w16_tbl):
-        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
-        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
-        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
-        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
 endfunc
 
+jumptable ipred_cfl_ac_422_tbl
+        .word L(ipred_cfl_ac_422_w16) - ipred_cfl_ac_422_tbl
+        .word L(ipred_cfl_ac_422_w8)  - ipred_cfl_ac_422_tbl
+        .word L(ipred_cfl_ac_422_w4)  - ipred_cfl_ac_422_tbl
+endjumptable
+
+jumptable ipred_cfl_ac_422_w16_tbl
+        .word L(ipred_cfl_ac_422_w16_wpad0) - ipred_cfl_ac_422_w16_tbl
+        .word L(ipred_cfl_ac_422_w16_wpad1) - ipred_cfl_ac_422_w16_tbl
+        .word L(ipred_cfl_ac_422_w16_wpad2) - ipred_cfl_ac_422_w16_tbl
+        .word L(ipred_cfl_ac_422_w16_wpad3) - ipred_cfl_ac_422_w16_tbl
+endjumptable
+
 // void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
 //                            const ptrdiff_t stride, const int w_pad,
 //                            const int h_pad, const int cw, const int ch);
 function ipred_cfl_ac_444_16bpc_neon, export=1
         clz             w8,  w5
         lsl             w4,  w4,  #2
-        adr             x7,  L(ipred_cfl_ac_444_tbl)
+        movrel          x7,  ipred_cfl_ac_444_tbl
         sub             w8,  w8,  #26
-        ldrh            w8,  [x7, w8, uxtw #1]
+        ldrsw           x8,  [x7, w8, uxtw #2]
         movi            v24.4s,  #0
         movi            v25.4s,  #0
         movi            v26.4s,  #0
         movi            v27.4s,  #0
-        sub             x7,  x7,  w8, uxtw
+        add             x7,  x7,  x8
         sub             w8,  w6,  w4         // height - h_pad
         rbit            w9,  w5              // rbit(width)
         rbit            w10, w6              // rbit(height)
@@ -5542,10 +5579,11 @@ L(ipred_cfl_ac_444_w16_wpad):
 
 L(ipred_cfl_ac_444_w32):
         AARCH64_VALID_JUMP_TARGET
-        adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
-        ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1
+        movrel          x7,  ipred_cfl_ac_444_w32_tbl
+        lsr             w3,  w3,  #1
+        ldrsw           x3,  [x7, w3, uxtw #2]
         lsr             x2,  x2,  #1 // Restore the stride to one line increments
-        sub             x7,  x7,  w3, uxtw
+        add             x7,  x7,  x3
         br              x7
 
 L(ipred_cfl_ac_444_w32_wpad0):
@@ -5659,16 +5697,18 @@ L(ipred_cfl_ac_444_w32_hpad):
         //  Multiply the height by eight and reuse the w4 subtracting
         lsl             w6,  w6,  #3
         b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
-
-L(ipred_cfl_ac_444_tbl):
-        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
-        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
-        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
-        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
-
-L(ipred_cfl_ac_444_w32_tbl):
-        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
-        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
-        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
-        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
 endfunc
+
+jumptable ipred_cfl_ac_444_tbl
+        .word L(ipred_cfl_ac_444_w32) - ipred_cfl_ac_444_tbl
+        .word L(ipred_cfl_ac_444_w16) - ipred_cfl_ac_444_tbl
+        .word L(ipred_cfl_ac_444_w8)  - ipred_cfl_ac_444_tbl
+        .word L(ipred_cfl_ac_444_w4)  - ipred_cfl_ac_444_tbl
+endjumptable
+
+jumptable ipred_cfl_ac_444_w32_tbl
+        .word L(ipred_cfl_ac_444_w32_wpad0) - ipred_cfl_ac_444_w32_tbl
+        .word L(ipred_cfl_ac_444_w32_wpad2) - ipred_cfl_ac_444_w32_tbl
+        .word L(ipred_cfl_ac_444_w32_wpad4) - ipred_cfl_ac_444_w32_tbl
+        .word L(ipred_cfl_ac_444_w32_wpad6) - ipred_cfl_ac_444_w32_tbl
+endjumptable
diff --git a/src/arm/64/looprestoration_common.S b/src/arm/64/looprestoration_common.S
index 745f6c20f49168be10d33292ec52c6b4b6b7e210..c10a9f3d7c44165aa2f42a999e62897de6955e3b 100644
--- a/src/arm/64/looprestoration_common.S
+++ b/src/arm/64/looprestoration_common.S
@@ -28,14 +28,77 @@
 #include "src/arm/asm.S"
 #include "util.S"
 
+// Series of LUTs for efficiently computing sgr's 1 - x/(x+1) table.
+// In the comments, let RefTable denote the original, reference table.
+const x_by_x_tables
+// RangeMins
+//
+// Min(RefTable[i*8:i*8+8])
+// First two values are zeroed.
+//
+// Lookup using RangeMins[(x >> 3)]
+        .byte 0,  0, 11,  8,  6,  5,  5,  4,  4,  3,  3,  3,  2,  2,  2,  2
+        .byte 2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0
+
+// DiffMasks
+//
+// This contains a bit pattern, indicating at which index positions the value of RefTable changes. For each range
+// in the RangeMins table (covering 8 RefTable entries), we have one byte; each bit indicates whether the value of
+// RefTable changes at that particular index.
+// Shifting the byte left by (x & 7) discards the changes that occur before index x; popcount of what remains then
+// counts the decreases between x and the end of its range. Adding that count to RangeMins[(x >> 3)] reconstructs
+// RefTable[x] (for x > 15).
+//
+// Lookup using DiffMasks[(x >> 3)]
+        .byte 0x00, 0x00, 0xD4, 0x44
+        .byte 0x42, 0x04, 0x00, 0x00
+        .byte 0x00, 0x80, 0x00, 0x00
+        .byte 0x04, 0x00, 0x00, 0x00
+        .byte 0x00, 0x00, 0x00, 0x00
+        .byte 0x00, 0x40, 0x00, 0x00
+        .byte 0x00, 0x00, 0x00, 0x00
+        .byte 0x00, 0x00, 0x00, 0x02
+// Binary form:
+// 0b00000000, 0b00000000, 0b11010100, 0b01000100
+// 0b01000010, 0b00000100, 0b00000000, 0b00000000
+// 0b00000000, 0b10000000, 0b00000000, 0b00000000
+// 0b00000100, 0b00000000, 0b00000000, 0b00000000
+// 0b00000000, 0b00000000, 0b00000000, 0b00000000
+// 0b00000000, 0b01000000, 0b00000000, 0b00000000
+// 0b00000000, 0b00000000, 0b00000000, 0b00000000
+// 0b00000000, 0b00000000, 0b00000000, 0b00000010
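+//
+// Worked example for the third range (x = 16..23): RangeMins[2] = 11 and
+// DiffMasks[2] = 0xD4 = 0b11010100. popcount((0xD4 << (x & 7)) & 0xff) gives
+// 4, 3, 2, 2, 1, 1, 0, 0 for x = 16..23, so RefTable[16..23] is reconstructed
+// as 15, 14, 13, 13, 12, 12, 11, 11.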
+
+// RefLo
+//
+// RefTable[0:16]
+//      i.e. First 16 elements of the original table.
+// Add this to the sum obtained from the RangeMins/DiffMasks logic above to cover the first 16 entries of RefTable.
+//
+// Lookup using RefLo[x] (tbl will replace x > 15 with 0)
+        .byte 255, 128,  85,  64,  51,  43,  37,  32, 28,  26,  23,  21,  20,  18,  17,  16
+
+// Pseudo assembly
+//
+// hi_bits = x >> 3
+// tbl             ref,    {RefLo}, x
+// tbl             diffs,  {DiffMasks[0:16], DiffMasks[16:32]}, hi_bits
+// tbl             min,    {RangeMins[0:16], RangeMins[16:32]}, hi_bits
+// lo_bits = x & 0x7
+// diffs = diffs << lo_bits
+// ref = ref + min
+// integral = popcnt(diffs)
+// ref = ref + integral
+// return ref
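+//
+// A rough scalar C equivalent of the lookup above (illustrative only, not part
+// of the build; it assumes <stdint.h> and the GCC/Clang __builtin_popcount, and
+// the table names refer to the byte blocks in this file):
+//
+//   static int ref_table(const uint8_t *range_mins, const uint8_t *diff_masks,
+//                        const uint8_t *ref_lo, int x)  // 0 <= x <= 255
+//   {
+//       int hi = x >> 3, lo = x & 7;
+//       int base = x < 16 ? ref_lo[x] : 0;                // tbl yields 0 for x > 15
+//       uint8_t diffs = (uint8_t)(diff_masks[hi] << lo);  // drop changes before x
+//       return base + range_mins[hi] + __builtin_popcount(diffs);
+//   }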
+endconst
+
 // void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
 //                               int32_t *AA, int16_t *BB,
 //                               const int w, const int s,
 //                               const int bitdepth_max);
 function sgr_box3_vert_neon, export=1
-        stp             d8,  d9,  [sp, #-0x30]!
+        stp             d8,  d9,  [sp, #-0x40]!
         stp             d10, d11, [sp, #0x10]
         stp             d12, d13, [sp, #0x20]
+        stp             d14, d15, [sp, #0x30]
 
         add             w4,  w4,  #2
         clz             w9,  w6        // bitdepth_max
@@ -49,93 +112,112 @@ function sgr_box3_vert_neon, export=1
         movi            v31.4s,   #9   // n
 
         sub             w9,  w9,  #24  // -bitdepth_min_8
-        movrel          x12, X(sgr_x_by_x)
+        movrel          x12, x_by_x_tables
         mov             w13, #455      // one_by_x
-        ld1             {v16.16b, v17.16b, v18.16b}, [x12]
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x12] // RangeMins, DiffMasks
+        movi            v22.16b, #0x7
+        ldr             q23, [x12, #64] // RefLo
         dup             v6.8h,    w9   // -bitdepth_min_8
-        movi            v19.16b,  #5
-        movi            v20.8b,   #55  // idx of last 5
-        movi            v21.8b,   #72  // idx of last 4
-        movi            v22.8b,   #101 // idx of last 3
-        movi            v23.8b,   #169 // idx of last 2
-        movi            v24.8b,   #254 // idx of last 1
         saddl           v7.4s,    v6.4h,   v6.4h  // -2*bitdepth_min_8
         movi            v29.8h,   #1, lsl #8
         dup             v30.4s,   w13  // one_by_x
 
-        sub             v16.16b, v16.16b, v19.16b
-        sub             v17.16b, v17.16b, v19.16b
-        sub             v18.16b, v18.16b, v19.16b
-
-        ld1             {v8.4s,  v9.4s},  [x5], #32
-        ld1             {v10.4s, v11.4s}, [x6], #32
-        ld1             {v12.8h},         [x7], #16
-        ld1             {v13.8h},         [x8], #16
-        ld1             {v0.4s, v1.4s},   [x0], #32
-        ld1             {v2.8h},          [x1], #16
+        ld1             {v8.4s,  v9.4s,  v10.4s, v11.4s}, [x5], #64
+        ld1             {v12.4s, v13.4s, v14.4s, v15.4s}, [x6], #64
+        ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
+        ld1             {v20.8h, v21.8h}, [x8], #32
+        ld1             {v0.8h,  v1.8h},  [x7], #32
 1:
-
-        add             v8.4s,   v8.4s,   v10.4s
-        add             v9.4s,   v9.4s,   v11.4s
-
-        add             v12.8h,  v12.8h,  v13.8h
-
-        subs            w4,  w4,  #8
-        add             v0.4s,   v0.4s,   v8.4s
-        add             v1.4s,   v1.4s,   v9.4s
-        add             v2.8h,   v2.8h,   v12.8h
-
-        srshl           v0.4s,   v0.4s,   v7.4s
-        srshl           v1.4s,   v1.4s,   v7.4s
-        srshl           v4.8h,   v2.8h,   v6.8h
-        mul             v0.4s,   v0.4s,   v31.4s // a * n
-        mul             v1.4s,   v1.4s,   v31.4s // a * n
-        umull           v3.4s,   v4.4h,   v4.4h  // b * b
-        umull2          v4.4s,   v4.8h,   v4.8h  // b * b
-        uqsub           v0.4s,   v0.4s,   v3.4s  // imax(a * n - b * b, 0)
-        uqsub           v1.4s,   v1.4s,   v4.4s  // imax(a * n - b * b, 0)
-        mul             v0.4s,   v0.4s,   v28.4s // p * s
-        mul             v1.4s,   v1.4s,   v28.4s // p * s
-        ld1             {v8.4s,  v9.4s},  [x5], #32
-        uqshrn          v0.4h,   v0.4s,   #16
-        uqshrn2         v0.8h,   v1.4s,   #16
-        ld1             {v10.4s, v11.4s}, [x6], #32
-        uqrshrn         v0.8b,   v0.8h,   #4     // imin(z, 255)
-
-        ld1             {v12.8h},         [x7], #16
-
-        cmhi            v25.8b,  v0.8b,   v20.8b // = -1 if sgr_x_by_x[v0] < 5
-        cmhi            v26.8b,  v0.8b,   v21.8b // = -1 if sgr_x_by_x[v0] < 4
-        tbl             v1.8b, {v16.16b,  v17.16b, v18.16b}, v0.8b
-        cmhi            v27.8b,  v0.8b,   v22.8b // = -1 if sgr_x_by_x[v0] < 3
-        cmhi            v4.8b,   v0.8b,   v23.8b // = -1 if sgr_x_by_x[v0] < 2
-        add             v25.8b,  v25.8b,  v26.8b
-        cmhi            v5.8b,   v0.8b,   v24.8b // = -1 if sgr_x_by_x[v0] < 1
-        add             v27.8b,  v27.8b,  v4.8b
-        add             v5.8b,   v5.8b,   v19.8b
-        add             v25.8b,  v25.8b,  v27.8b
-        add             v5.8b,   v1.8b,   v5.8b
-        ld1             {v13.8h},         [x8], #16
-        add             v5.8b,   v5.8b,   v25.8b
-        ld1             {v0.4s, v1.4s},   [x0], #32
-        uxtl            v5.8h,   v5.8b           // x
-
-        umull           v3.4s,   v5.4h,   v2.4h  // x * BB[i]
-        umull2          v4.4s,   v5.8h,   v2.8h  // x * BB[i]
-        mul             v3.4s,   v3.4s,   v30.4s // x * BB[i] * sgr_one_by_x
-        mul             v4.4s,   v4.4s,   v30.4s // x * BB[i] * sgr_one_by_x
-        srshr           v3.4s,   v3.4s,   #12    // AA[i]
-        srshr           v4.4s,   v4.4s,   #12    // AA[i]
-        sub             v5.8h,   v29.8h,  v5.8h  // 256 - x
-        ld1             {v2.8h},          [x1], #16
-
-        st1             {v3.4s, v4.4s}, [x2], #32
-        st1             {v5.8h}, [x3], #16
+        ld1             {v2.8h,  v3.8h},   [x1], #32
+        add             v8.4s,   v8.4s,   v12.4s
+        add             v9.4s,   v9.4s,   v13.4s
+        add             v10.4s,  v10.4s,  v14.4s
+        add             v11.4s,  v11.4s,  v15.4s
+        add             v0.8h,   v0.8h,   v20.8h
+        add             v1.8h,   v1.8h,   v21.8h
+
+        add             v16.4s,  v16.4s,  v8.4s
+        add             v17.4s,  v17.4s,  v9.4s
+        add             v18.4s,  v18.4s,  v10.4s
+        add             v19.4s,  v19.4s,  v11.4s
+        add             v4.8h,   v2.8h,   v0.8h
+        add             v5.8h,   v3.8h,   v1.8h
+
+        srshl           v16.4s,  v16.4s,  v7.4s
+        srshl           v17.4s,  v17.4s,  v7.4s
+        srshl           v18.4s,  v18.4s,  v7.4s
+        srshl           v19.4s,  v19.4s,  v7.4s
+        srshl           v9.8h,   v4.8h,   v6.8h
+        srshl           v13.8h,  v5.8h,   v6.8h
+        mul             v16.4s,  v16.4s,  v31.4s // a * n
+        mul             v17.4s,  v17.4s,  v31.4s // a * n
+        mul             v18.4s,  v18.4s,  v31.4s // a * n
+        mul             v19.4s,  v19.4s,  v31.4s // a * n
+        umull           v8.4s,   v9.4h,   v9.4h  // b * b
+        umull2          v9.4s,   v9.8h,   v9.8h  // b * b
+        umull           v12.4s,  v13.4h,  v13.4h // b * b
+        umull2          v13.4s,  v13.8h,  v13.8h // b * b
+        uqsub           v16.4s,  v16.4s,  v8.4s  // imax(a * n - b * b, 0)
+        uqsub           v17.4s,  v17.4s,  v9.4s  // imax(a * n - b * b, 0)
+        uqsub           v18.4s,  v18.4s,  v12.4s // imax(a * n - b * b, 0)
+        uqsub           v19.4s,  v19.4s,  v13.4s // imax(a * n - b * b, 0)
+        mul             v16.4s,  v16.4s,  v28.4s // p * s
+        mul             v17.4s,  v17.4s,  v28.4s // p * s
+        mul             v18.4s,  v18.4s,  v28.4s // p * s
+        mul             v19.4s,  v19.4s,  v28.4s // p * s
+        uqshrn          v16.4h,  v16.4s,  #16
+        uqshrn2         v16.8h,  v17.4s,  #16
+        uqshrn          v18.4h,  v18.4s,  #16
+        uqshrn2         v18.8h,  v19.4s,  #16
+        uqrshrn         v1.8b,   v16.8h,  #4     // imin(z, 255)
+        uqrshrn2        v1.16b,  v18.8h,  #4     // imin(z, 255)
+
+        ld1             {v16.4s, v17.4s}, [x0], #32
+        subs            w4,  w4,  #16
+
+        ushr            v0.16b,  v1.16b,  #3
+        ld1             {v8.4s,  v9.4s}, [x5], #32
+        tbl             v2.16b,  {v26.16b, v27.16b}, v0.16b // DiffMasks
+        tbl             v0.16b,  {v24.16b, v25.16b}, v0.16b // RangeMins
+        tbl             v3.16b,  {v23.16b}, v1.16b          // RefLo
+        and             v1.16b,  v1.16b,   v22.16b
+        ld1             {v12.4s, v13.4s}, [x6], #32
+        ushl            v1.16b,  v2.16b,  v1.16b
+        ld1             {v20.8h, v21.8h}, [x8], #32
+        add             v3.16b,  v3.16b,  v0.16b
+        cnt             v1.16b,  v1.16b
+        ld1             {v18.4s, v19.4s}, [x0], #32
+        add             v3.16b,  v3.16b,  v1.16b
+        ld1             {v10.4s, v11.4s}, [x5], #32
+        uxtl            v0.8h,   v3.8b           // x
+        uxtl2           v1.8h,   v3.16b          // x
+
+        ld1             {v14.4s, v15.4s}, [x6], #32
+
+        umull           v2.4s,   v0.4h,   v4.4h // x * BB[i]
+        umull2          v3.4s,   v0.8h,   v4.8h // x * BB[i]
+        umull           v4.4s,   v1.4h,   v5.4h // x * BB[i]
+        umull2          v5.4s,   v1.8h,   v5.8h // x * BB[i]
+        sub             v0.8h,   v29.8h,  v0.8h // 256 - x
+        sub             v1.8h,   v29.8h,  v1.8h // 256 - x
+        mul             v2.4s,   v2.4s,  v30.4s // x * BB[i] * sgr_one_by_x
+        mul             v3.4s,   v3.4s,  v30.4s // x * BB[i] * sgr_one_by_x
+        mul             v4.4s,   v4.4s,  v30.4s // x * BB[i] * sgr_one_by_x
+        mul             v5.4s,   v5.4s,  v30.4s // x * BB[i] * sgr_one_by_x
+        st1             {v0.8h, v1.8h}, [x3], #32
+        ld1             {v0.8h, v1.8h}, [x7], #32
+        srshr           v2.4s,   v2.4s,  #12    // AA[i]
+        srshr           v3.4s,   v3.4s,  #12    // AA[i]
+        srshr           v4.4s,   v4.4s,  #12    // AA[i]
+        srshr           v5.4s,   v5.4s,  #12    // AA[i]
+
+        st1             {v2.4s, v3.4s, v4.4s, v5.4s}, [x2], #64
         b.gt            1b
 
+        ldp             d14, d15, [sp, #0x30]
         ldp             d12, d13, [sp, #0x20]
         ldp             d10, d11, [sp, #0x10]
-        ldp             d8,  d9,  [sp], 0x30
+        ldp             d8,  d9,  [sp], 0x40
         ret
 endfunc
 
@@ -144,10 +226,9 @@ endfunc
 //                               const int w, const int s,
 //                               const int bitdepth_max);
 function sgr_box5_vert_neon, export=1
-        stp             d8,  d9,  [sp, #-0x40]!
+        stp             d8,  d9,  [sp, #-0x30]!
         stp             d10, d11, [sp, #0x10]
         stp             d12, d13, [sp, #0x20]
-        stp             d14, d15, [sp, #0x30]
 
         add             w4,  w4,  #2
         clz             w15, w6        // bitdepth_max
@@ -163,24 +244,19 @@ function sgr_box5_vert_neon, export=1
         movi            v31.4s,   #25   // n
 
         sub             w15, w15, #24  // -bitdepth_min_8
-        movrel          x13, X(sgr_x_by_x)
-        mov             w14, #164      // one_by_x
-        ld1             {v16.16b, v17.16b, v18.16b}, [x13]
+        movrel          x13, x_by_x_tables
+        movi            v30.4s,  #164
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x13] // RangeMins, DiffMasks
         dup             v6.8h,   w15  // -bitdepth_min_8
-        movi            v19.16b, #5
-        movi            v24.8b,  #254 // idx of last 1
+        movi            v19.8b,  #0x7
+        ldr             q18, [x13, #64] // RefLo
         saddl           v7.4s,   v6.4h,   v6.4h  // -2*bitdepth_min_8
         movi            v29.8h,  #1, lsl #8
-        dup             v30.4s,  w14  // one_by_x
-
-        sub             v16.16b, v16.16b, v19.16b
-        sub             v17.16b, v17.16b, v19.16b
-        sub             v18.16b, v18.16b, v19.16b
 
         ld1             {v8.4s,  v9.4s},  [x5], #32
         ld1             {v10.4s, v11.4s}, [x6], #32
         ld1             {v12.4s, v13.4s}, [x7], #32
-        ld1             {v14.4s, v15.4s}, [x8], #32
+        ld1             {v16.4s, v17.4s}, [x8], #32
         ld1             {v20.8h},         [x9], #16
         ld1             {v21.8h},         [x10], #16
         ld1             {v22.8h},         [x11], #16
@@ -191,8 +267,8 @@ function sgr_box5_vert_neon, export=1
 1:
         add             v8.4s,   v8.4s,   v10.4s
         add             v9.4s,   v9.4s,   v11.4s
-        add             v12.4s,  v12.4s,  v14.4s
-        add             v13.4s,  v13.4s,  v15.4s
+        add             v12.4s,  v12.4s,  v16.4s
+        add             v13.4s,  v13.4s,  v17.4s
 
         add             v20.8h,  v20.8h,  v21.8h
         add             v22.8h,  v22.8h,  v23.8h
@@ -207,11 +283,6 @@ function sgr_box5_vert_neon, export=1
 
         subs            w4,  w4,  #8
 
-        movi            v20.8b,  #55  // idx of last 5
-        movi            v21.8b,  #72  // idx of last 4
-        movi            v22.8b,  #101 // idx of last 3
-        movi            v23.8b,  #169 // idx of last 2
-
         srshl           v0.4s,   v0.4s,   v7.4s
         srshl           v1.4s,   v1.4s,   v7.4s
         srshl           v4.8h,   v2.8h,   v6.8h
@@ -231,22 +302,19 @@ function sgr_box5_vert_neon, export=1
 
         ld1             {v12.4s, v13.4s}, [x7], #32
 
-        cmhi            v25.8b,  v0.8b,   v20.8b // = -1 if sgr_x_by_x[v0] < 5
-        cmhi            v26.8b,  v0.8b,   v21.8b // = -1 if sgr_x_by_x[v0] < 4
-        tbl             v1.8b, {v16.16b,  v17.16b, v18.16b}, v0.8b
-        cmhi            v27.8b,  v0.8b,   v22.8b // = -1 if sgr_x_by_x[v0] < 3
-        cmhi            v4.8b,   v0.8b,   v23.8b // = -1 if sgr_x_by_x[v0] < 2
-        ld1             {v14.4s, v15.4s}, [x8], #32
-        add             v25.8b,  v25.8b,  v26.8b
-        cmhi            v5.8b,   v0.8b,   v24.8b // = -1 if sgr_x_by_x[v0] < 1
-        add             v27.8b,  v27.8b,  v4.8b
+        ushr            v1.8b,   v0.8b,  #3
+        ld1             {v16.4s, v17.4s}, [x8], #32
+        tbl             v5.8b,   {v26.16b, v27.16b}, v1.8b // DiffMasks
+        tbl             v1.8b,   {v24.16b, v25.16b}, v1.8b // RangeMins
+        tbl             v4.8b,   {v18.16b}, v0.8b          // RefLo
+        and             v0.8b,   v0.8b,  v19.8b
         ld1             {v20.8h},         [x9], #16
-        add             v5.8b,   v5.8b,   v19.8b
-        add             v25.8b,  v25.8b,  v27.8b
+        ushl            v5.8b,   v5.8b,  v0.8b
+        add             v4.8b,   v4.8b,  v1.8b
         ld1             {v21.8h},         [x10], #16
-        add             v5.8b,   v1.8b,   v5.8b
+        cnt             v5.8b,   v5.8b
         ld1             {v22.8h},         [x11], #16
-        add             v5.8b,   v5.8b,   v25.8b
+        add             v5.8b,   v4.8b,  v5.8b
         ld1             {v23.8h},         [x12], #16
         uxtl            v5.8h,   v5.8b           // x
 
@@ -264,9 +332,8 @@ function sgr_box5_vert_neon, export=1
         st1             {v5.8h}, [x3], #16
         b.gt            1b
 
-        ldp             d14, d15, [sp, #0x30]
         ldp             d12, d13, [sp, #0x20]
         ldp             d10, d11, [sp, #0x10]
-        ldp             d8,  d9,  [sp], 0x40
+        ldp             d8,  d9,  [sp], 0x30
         ret
 endfunc
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
index 736b2bb4e699beece7f67b222de5b753583a3945..24ef4d298ae92c24f2d4680a78d1ff6fe49b60df 100644
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -79,11 +79,11 @@ function \type\()_8bpc_neon, export=1
 .ifc \type, mask
         movi            v31.16b, #256-2
 .endif
-        adr             x7,  L(\type\()_tbl)
+        movrel          x7,  \type\()_tbl
         sub             w4,  w4,  #24
-        ldrh            w4,  [x7, x4, lsl #1]
+        ldrsw           x4,  [x7, x4, lsl #2]
         \type           v4,  v0,  v1,  v2,  v3
-        sub             x7,  x7,  w4, uxtw
+        add             x7,  x7,  x4
         br              x7
 40:
         AARCH64_VALID_JUMP_TARGET
@@ -119,17 +119,18 @@ function \type\()_8bpc_neon, export=1
         add             x7,  x0,  x1
         lsl             x1,  x1,  #1
 8:
-        st1             {v4.d}[0],  [x0], x1
+        st1             {v4.8b},    [x0], x1
         \type           v5,  v0,  v1,  v2,  v3
         st1             {v4.d}[1],  [x7], x1
-        st1             {v5.d}[0],  [x0], x1
+        st1             {v5.8b},    [x0], x1
         subs            w5,  w5,  #4
         st1             {v5.d}[1],  [x7], x1
         b.le            0f
         \type           v4,  v0,  v1,  v2,  v3
         b               8b
-16:
+160:
         AARCH64_VALID_JUMP_TARGET
+16:
         \type           v5,  v0,  v1,  v2,  v3
         st1             {v4.16b}, [x0], x1
         \type           v6,  v0,  v1,  v2,  v3
@@ -192,14 +193,16 @@ function \type\()_8bpc_neon, export=1
         b               128b
 0:
         ret
-L(\type\()_tbl):
-        .hword L(\type\()_tbl) - 1280b
-        .hword L(\type\()_tbl) -  640b
-        .hword L(\type\()_tbl) -  320b
-        .hword L(\type\()_tbl) -   16b
-        .hword L(\type\()_tbl) -   80b
-        .hword L(\type\()_tbl) -   40b
 endfunc
+
+jumptable \type\()_tbl
+        .word 1280b - \type\()_tbl
+        .word 640b  - \type\()_tbl
+        .word 320b  - \type\()_tbl
+        .word 160b  - \type\()_tbl
+        .word 80b   - \type\()_tbl
+        .word 40b   - \type\()_tbl
+endjumptable
 .endm
 
 bidir_fn avg
@@ -210,10 +213,10 @@ bidir_fn mask
 .macro w_mask_fn type
 function w_mask_\type\()_8bpc_neon, export=1
         clz             w8,  w4
-        adr             x9,  L(w_mask_\type\()_tbl)
+        movrel          x9,  w_mask_\type\()_tbl
         sub             w8,  w8,  #24
-        ldrh            w8,  [x9,  x8,  lsl #1]
-        sub             x9,  x9,  w8,  uxtw
+        ldrsw           x8,  [x9,  x8,  lsl #2]
+        add             x9,  x9,  x8
         mov             w10, #6903
         dup             v0.8h,   w10
 .if \type == 444
@@ -230,8 +233,9 @@ function w_mask_\type\()_8bpc_neon, export=1
         add             x12,  x0,  x1
         lsl             x1,   x1,  #1
         br              x9
-4:
+40:
         AARCH64_VALID_JUMP_TARGET
+4:
         ld1             {v4.8h,   v5.8h},   [x2],  #32  // tmp1 (four rows at once)
         ld1             {v6.8h,   v7.8h},   [x3],  #32  // tmp2 (four rows at once)
         subs            w5,  w5,  #4
@@ -267,7 +271,7 @@ function w_mask_\type\()_8bpc_neon, export=1
         addp            v18.8h,   v24.8h,  v24.8h
         sub             v18.4h,   v3.4h,   v18.4h
         rshrn           v18.8b,   v18.8h,  #2
-        st1             {v18.s}[0],  [x6],  #4
+        str             s18,         [x6],  #4
 .endif
         st1             {v22.s}[0],  [x0],  x1
         st1             {v22.s}[1],  [x12], x1
@@ -275,8 +279,9 @@ function w_mask_\type\()_8bpc_neon, export=1
         st1             {v23.s}[1],  [x12], x1
         b.gt            4b
         ret
-8:
+80:
         AARCH64_VALID_JUMP_TARGET
+8:
         ld1             {v4.8h,   v5.8h},   [x2],  #32
         ld1             {v6.8h,   v7.8h},   [x3],  #32
         subs            w5,  w5,  #2
@@ -310,7 +315,7 @@ function w_mask_\type\()_8bpc_neon, export=1
         addp            v18.8h,  v18.8h,  v18.8h
         sub             v18.4h,  v3.4h,   v18.4h
         rshrn           v18.8b,  v18.8h,  #2
-        st1             {v18.s}[0],  [x6],  #4
+        str             s18,       [x6],  #4
 .endif
         st1             {v22.8b},  [x0],  x1
         st1             {v23.8b},  [x12], x1
@@ -413,14 +418,16 @@ function w_mask_\type\()_8bpc_neon, export=1
         add             x12, x12, x1
         b.gt            161b
         ret
-L(w_mask_\type\()_tbl):
-        .hword L(w_mask_\type\()_tbl) - 1280b
-        .hword L(w_mask_\type\()_tbl) -  640b
-        .hword L(w_mask_\type\()_tbl) -  320b
-        .hword L(w_mask_\type\()_tbl) -  160b
-        .hword L(w_mask_\type\()_tbl) -    8b
-        .hword L(w_mask_\type\()_tbl) -    4b
 endfunc
+
+jumptable w_mask_\type\()_tbl
+        .word 1280b - w_mask_\type\()_tbl
+        .word 640b  - w_mask_\type\()_tbl
+        .word 320b  - w_mask_\type\()_tbl
+        .word 160b  - w_mask_\type\()_tbl
+        .word 80b   - w_mask_\type\()_tbl
+        .word 40b   - w_mask_\type\()_tbl
+endjumptable
 .endm
 
 w_mask_fn 444
@@ -429,20 +436,21 @@ w_mask_fn 420
 
 
 function blend_8bpc_neon, export=1
-        adr             x6,  L(blend_tbl)
+        movrel          x6,  blend_tbl
         clz             w3,  w3
         sub             w3,  w3,  #26
-        ldrh            w3,  [x6,  x3,  lsl #1]
-        sub             x6,  x6,  w3,  uxtw
+        ldrsw           x3,  [x6,  x3,  lsl #2]
+        add             x6,  x6,  x3
         movi            v4.16b,  #64
         add             x8,  x0,  x1
         lsl             x1,  x1,  #1
         br              x6
-4:
+40:
         AARCH64_VALID_JUMP_TARGET
-        ld1             {v2.8b},     [x5],  #8
-        ld1             {v1.d}[0],   [x2],  #8
-        ld1             {v0.s}[0],   [x0]
+4:
+        ld1             {v2.8b},  [x5],  #8
+        ldr             d1,       [x2],  #8
+        ldr             s0,       [x0]
         subs            w4,  w4,  #2
         ld1             {v0.s}[1],   [x8]
         sub             v3.8b,   v4.8b,   v2.8b
@@ -453,12 +461,13 @@ function blend_8bpc_neon, export=1
         st1             {v6.s}[1],   [x8],  x1
         b.gt            4b
         ret
-8:
+80:
         AARCH64_VALID_JUMP_TARGET
+8:
         ld1             {v2.16b},  [x5],  #16
         ld1             {v1.16b},  [x2],  #16
-        ld1             {v0.d}[0],   [x0]
-        ld1             {v0.d}[1],   [x8]
+        ldr             d0,        [x0]
+        ld1             {v0.d}[1], [x8]
         sub             v3.16b,  v4.16b,  v2.16b
         subs            w4,  w4,  #2
         umull           v5.8h,   v1.8b,   v2.8b
@@ -466,13 +475,14 @@ function blend_8bpc_neon, export=1
         umull2          v6.8h,   v1.16b,  v2.16b
         umlal2          v6.8h,   v0.16b,  v3.16b
         rshrn           v7.8b,   v5.8h,   #6
-        rshrn2          v7.16b,  v6.8h,   #6
-        st1             {v7.d}[0],   [x0],  x1
-        st1             {v7.d}[1],   [x8],  x1
+        rshrn           v16.8b,  v6.8h,   #6
+        st1             {v7.8b},   [x0],  x1
+        st1             {v16.8b},  [x8],  x1
         b.gt            8b
         ret
-16:
+160:
         AARCH64_VALID_JUMP_TARGET
+16:
         ld1             {v1.16b,  v2.16b},  [x5],  #32
         ld1             {v5.16b,  v6.16b},  [x2],  #32
         ld1             {v0.16b},  [x0]
@@ -496,8 +506,9 @@ function blend_8bpc_neon, export=1
         st1             {v19.16b}, [x8],  x1
         b.gt            16b
         ret
-32:
+320:
         AARCH64_VALID_JUMP_TARGET
+32:
         ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5],  #64
         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
         ld1             {v20.16b, v21.16b}, [x0]
@@ -535,15 +546,17 @@ function blend_8bpc_neon, export=1
         st1             {v27.16b, v28.16b}, [x8],  x1
         b.gt            32b
         ret
-L(blend_tbl):
-        .hword L(blend_tbl) - 32b
-        .hword L(blend_tbl) - 16b
-        .hword L(blend_tbl) -  8b
-        .hword L(blend_tbl) -  4b
 endfunc
 
+jumptable blend_tbl
+        .word 320b - blend_tbl
+        .word 160b - blend_tbl
+        .word 80b  - blend_tbl
+        .word 40b  - blend_tbl
+endjumptable
+
 function blend_h_8bpc_neon, export=1
-        adr             x6,  L(blend_h_tbl)
+        movrel          x6,  blend_h_tbl
         movrel          x5,  X(obmc_masks)
         add             x5,  x5,  w4,  uxtw
         sub             w4,  w4,  w4,  lsr #2
@@ -552,15 +565,16 @@ function blend_h_8bpc_neon, export=1
         add             x8,  x0,  x1
         lsl             x1,  x1,  #1
         sub             w7,  w7,  #24
-        ldrh            w7,  [x6,  x7,  lsl #1]
-        sub             x6,  x6,  w7, uxtw
+        ldrsw           x7,  [x6,  x7,  lsl #2]
+        add             x6,  x6,  x7
         br              x6
-2:
+20:
         AARCH64_VALID_JUMP_TARGET
-        ld1             {v0.h}[0],   [x5],  #2
-        ld1             {v1.s}[0],   [x2],  #4
+2:
+        ldr             h0,  [x5],  #2
+        ldr             s1,  [x2],  #4
         subs            w4,  w4,  #2
-        ld1             {v2.h}[0],   [x0]
+        ldr             h2,  [x0]
         zip1            v0.8b,   v0.8b,   v0.8b
         sub             v3.8b,   v4.8b,   v0.8b
         ld1             {v2.h}[1],   [x8]
@@ -571,13 +585,14 @@ function blend_h_8bpc_neon, export=1
         st1             {v5.h}[1],   [x8],  x1
         b.gt            2b
         ret
-4:
+40:
         AARCH64_VALID_JUMP_TARGET
+4:
         ld2r            {v0.8b,   v1.8b},   [x5],  #2
         ld1             {v2.8b},   [x2],  #8
         subs            w4,  w4,  #2
         ext             v0.8b,   v0.8b,   v1.8b,   #4
-        ld1             {v3.s}[0],   [x0]
+        ldr             s3,          [x0]
         sub             v5.8b,   v4.8b,   v0.8b
         ld1             {v3.s}[1],   [x8]
         umull           v6.8h,   v2.8b,   v0.8b
@@ -587,27 +602,29 @@ function blend_h_8bpc_neon, export=1
         st1             {v6.s}[1],   [x8],  x1
         b.gt            4b
         ret
-8:
+80:
         AARCH64_VALID_JUMP_TARGET
+8:
         ld2r            {v0.16b,  v1.16b},  [x5],  #2
         ld1             {v2.16b},  [x2],  #16
-        ld1             {v3.d}[0],   [x0]
+        ldr             d3,        [x0]
         ext             v0.16b,  v0.16b,  v1.16b,  #8
         sub             v5.16b,  v4.16b,  v0.16b
-        ld1             {v3.d}[1],   [x8]
+        ld1             {v3.d}[1], [x8]
         subs            w4,  w4,  #2
         umull           v6.8h,   v0.8b,   v2.8b
         umlal           v6.8h,   v3.8b,   v5.8b
         umull2          v7.8h,   v0.16b,  v2.16b
         umlal2          v7.8h,   v3.16b,  v5.16b
         rshrn           v16.8b,  v6.8h,   #6
-        rshrn2          v16.16b, v7.8h,   #6
-        st1             {v16.d}[0],  [x0],  x1
-        st1             {v16.d}[1],  [x8],  x1
+        rshrn           v17.8b,  v7.8h,   #6
+        st1             {v16.8b},  [x0],  x1
+        st1             {v17.8b},  [x8],  x1
         b.gt            8b
         ret
-16:
+160:
         AARCH64_VALID_JUMP_TARGET
+16:
         ld2r            {v0.16b,  v1.16b},  [x5],  #2
         ld1             {v2.16b,  v3.16b},  [x2],  #32
         ld1             {v5.16b},  [x0]
@@ -682,18 +699,20 @@ function blend_h_8bpc_neon, export=1
         add             x7,  x7,  w3,  uxtw
         b.gt            321b
         ret
-L(blend_h_tbl):
-        .hword L(blend_h_tbl) - 1280b
-        .hword L(blend_h_tbl) -  640b
-        .hword L(blend_h_tbl) -  320b
-        .hword L(blend_h_tbl) -   16b
-        .hword L(blend_h_tbl) -    8b
-        .hword L(blend_h_tbl) -    4b
-        .hword L(blend_h_tbl) -    2b
 endfunc
 
+jumptable blend_h_tbl
+        .word 1280b - blend_h_tbl
+        .word 640b  - blend_h_tbl
+        .word 320b  - blend_h_tbl
+        .word 160b  - blend_h_tbl
+        .word 80b   - blend_h_tbl
+        .word 40b   - blend_h_tbl
+        .word 20b   - blend_h_tbl
+endjumptable
+
 function blend_v_8bpc_neon, export=1
-        adr             x6,  L(blend_v_tbl)
+        movrel          x6,  blend_v_tbl
         movrel          x5,  X(obmc_masks)
         add             x5,  x5,  w3,  uxtw
         clz             w3,  w3
@@ -701,16 +720,16 @@ function blend_v_8bpc_neon, export=1
         add             x8,  x0,  x1
         lsl             x1,  x1,  #1
         sub             w3,  w3,  #26
-        ldrh            w3,  [x6,  x3,  lsl #1]
-        sub             x6,  x6,  w3,  uxtw
+        ldrsw           x3,  [x6,  x3,  lsl #2]
+        add             x6,  x6,  x3
         br              x6
 20:
         AARCH64_VALID_JUMP_TARGET
         ld1r            {v0.8b},   [x5]
         sub             v1.8b,   v4.8b,   v0.8b
 2:
-        ld1             {v2.h}[0],   [x2],  #2
-        ld1             {v3.b}[0],   [x0]
+        ldr             h2,          [x2],  #2
+        ldr             b3,          [x0]
         subs            w4,  w4,  #2
         ld1             {v2.b}[1],   [x2]
         ld1             {v3.b}[1],   [x8]
@@ -729,13 +748,13 @@ function blend_v_8bpc_neon, export=1
         sub             v1.8b,   v4.8b,   v0.8b
 4:
         ld1             {v2.8b},   [x2],  #8
-        ld1             {v3.s}[0],   [x0]
+        ldr             s3,          [x0]
         ld1             {v3.s}[1],   [x8]
         subs            w4,  w4,  #2
         umull           v5.8h,   v2.8b,   v0.8b
         umlal           v5.8h,   v3.8b,   v1.8b
         rshrn           v5.8b,   v5.8h,   #6
-        st1             {v5.h}[0],   [x0],  #2
+        str             h5,          [x0],  #2
         st1             {v5.h}[2],   [x8],  #2
         st1             {v5.b}[2],   [x0],  x1
         st1             {v5.b}[6],   [x8],  x1
@@ -746,21 +765,22 @@ function blend_v_8bpc_neon, export=1
         ld1r            {v0.2d},   [x5]
         sub             x1,  x1,  #4
         sub             v1.16b,  v4.16b,  v0.16b
+        zip2            v16.2d,  v1.2d,   v1.2d
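+        // v16 = upper half of v1, so the second destination row (loaded into d4 below)
+        // can be accumulated with a plain umlal instead of umlal2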
 8:
         ld1             {v2.16b},  [x2],  #16
-        ld1             {v3.d}[0],   [x0]
-        ld1             {v3.d}[1],   [x8]
+        ldr             d3,          [x0]
+        ldr             d4,          [x8]
         subs            w4,  w4,  #2
         umull           v5.8h,  v0.8b,  v2.8b
         umlal           v5.8h,  v3.8b,  v1.8b
         umull2          v6.8h,  v0.16b, v2.16b
-        umlal2          v6.8h,  v3.16b, v1.16b
+        umlal           v6.8h,  v4.8b,  v16.8b
         rshrn           v7.8b,  v5.8h,  #6
-        rshrn2          v7.16b, v6.8h,  #6
-        st1             {v7.s}[0],   [x0],  #4
-        st1             {v7.s}[2],   [x8],  #4
+        rshrn           v17.8b, v6.8h,  #6
+        str             s7,          [x0],  #4
+        str             s17,         [x8],  #4
         st1             {v7.h}[2],   [x0],  x1
-        st1             {v7.h}[6],   [x8],  x1
+        st1             {v17.h}[2],  [x8],  x1
         b.gt            8b
         ret
 160:
@@ -826,21 +846,23 @@ function blend_v_8bpc_neon, export=1
         st1             {v27.8b},  [x8],  x1
         b.gt            32b
         ret
-L(blend_v_tbl):
-        .hword L(blend_v_tbl) - 320b
-        .hword L(blend_v_tbl) - 160b
-        .hword L(blend_v_tbl) -  80b
-        .hword L(blend_v_tbl) -  40b
-        .hword L(blend_v_tbl) -  20b
 endfunc
 
+jumptable blend_v_tbl
+        .word 320b - blend_v_tbl
+        .word 160b - blend_v_tbl
+        .word 80b  - blend_v_tbl
+        .word 40b  - blend_v_tbl
+        .word 20b  - blend_v_tbl
+endjumptable
+
 
 // This has got the same signature as the put_8tap functions,
 // and assumes that x8 is set to (clz(w)-24).
 function put_neon, export=1
-        adr             x9,  L(put_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
-        sub             x9,  x9,  x8
+        movrel          x9,  put_tbl
+        ldrsw           x8,  [x9, x8, lsl #2]
+        add             x9,  x9,  x8
         br              x9
 
 20:
@@ -933,34 +955,39 @@ function put_neon, export=1
         add             x0, x0, x1
         b.gt            128b
         ret
-
-L(put_tbl):
-        .hword L(put_tbl) - 1280b
-        .hword L(put_tbl) -  640b
-        .hword L(put_tbl) -  320b
-        .hword L(put_tbl) -  160b
-        .hword L(put_tbl) -   80b
-        .hword L(put_tbl) -   40b
-        .hword L(put_tbl) -   20b
 endfunc
 
+jumptable put_tbl
+        .word 1280b - put_tbl
+        .word 640b  - put_tbl
+        .word 320b  - put_tbl
+        .word 160b  - put_tbl
+        .word 80b   - put_tbl
+        .word 40b   - put_tbl
+        .word 20b   - put_tbl
+endjumptable
+
 
 // This has got the same signature as the prep_8tap functions,
 // and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
 function prep_neon, export=1
-        adr             x9,  L(prep_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
+        movrel          x9,  prep_tbl
+        ldrsw           x8,  [x9, x8, lsl #2]
         movi            v24.16b, #16
-        sub             x9,  x9,  x8
+        add             x9,  x9,  x8
         br              x9
 
 40:
         AARCH64_VALID_JUMP_TARGET
 4:
-        ld1             {v0.s}[0], [x1], x2
-        ld1             {v0.s}[1], [x1], x2
-        ld1             {v1.s}[0], [x1], x2
-        ld1             {v1.s}[1], [x1], x2
+        ldr             s0, [x1]
+        ldr             s2, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             s1, [x1]
+        ldr             s3, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        mov             v0.s[1], v2.s[0]
+        mov             v1.s[1], v3.s[0]
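+        // four scalar loads into separate registers, merged with mov, instead of four
+        // ld1 {}.s[lane] loads that each merge into (and so depend on) the same register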
         ushll           v0.8h, v0.8b, #4
         ushll           v1.8h, v1.8b, #4
         subs            w4, w4, #4
@@ -1092,16 +1119,17 @@ function prep_neon, export=1
         add             x0, x0, #256
         b.gt            128b
         ret
-
-L(prep_tbl):
-        .hword L(prep_tbl) - 1280b
-        .hword L(prep_tbl) -  640b
-        .hword L(prep_tbl) -  320b
-        .hword L(prep_tbl) -  160b
-        .hword L(prep_tbl) -   80b
-        .hword L(prep_tbl) -   40b
 endfunc
 
+jumptable prep_tbl
+        .word 1280b - prep_tbl
+        .word 640b  - prep_tbl
+        .word 320b  - prep_tbl
+        .word 160b  - prep_tbl
+        .word 80b   - prep_tbl
+        .word 40b   - prep_tbl
+endjumptable
+
 
 .macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
         ld1             {\d0\wd}[0], [\s0], \strd
@@ -1335,10 +1363,10 @@ endfunc
 .endif
 .endm
 .macro st_d strd, r0, r1
-        st1             {\r0\().d}[0], [x0], \strd
+        st1             {\r0\().8b},   [x0], \strd
         st1             {\r0\().d}[1], [x8], \strd
 .ifnb \r1
-        st1             {\r1\().d}[0], [x0], \strd
+        st1             {\r1\().8b},   [x0], \strd
         st1             {\r1\().d}[1], [x8], \strd
 .endif
 .endm
@@ -1439,16 +1467,15 @@ L(\type\()_\taps\()_h):
         add             \xmx, x10, \mx, uxtw #3
         b.ne            L(\type\()_\taps\()_hv)
 
-        adr             x9,  L(\type\()_\taps\()_h_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
-        sub             x9,  x9,  w8, uxtw
+        movrel          x9,  \type\()_\taps\()_h_tbl
+        ldrsw           x8,  [x9, x8, lsl #2]
+        add             x9,  x9,  x8
         br              x9
 
 20:     // 2xN h
         AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
-        add             \xmx,  \xmx,  #2
-        ld1             {v0.s}[0], [\xmx]
+        ldur            s0,  [\xmx, #2]
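+        // ldur folds the former "add \xmx, \xmx, #2" into the load's unscaled offset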
         sub             \src,  \src,  #1
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
@@ -1481,8 +1508,7 @@ L(\type\()_\taps\()_h):
 
 40:     // 4xN h
         AARCH64_VALID_JUMP_TARGET
-        add             \xmx,  \xmx,  #2
-        ld1             {v0.s}[0], [\xmx]
+        ldur            s0,  [\xmx, #2]
         sub             \src,  \src,  #1
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
@@ -1514,8 +1540,10 @@ L(\type\()_\taps\()_h):
 .ifc \type, put
         sqrshrun        v16.8b,  v16.8h,  #4
         sqrshrun        v20.8b,  v20.8h,  #4
-        st1             {v16.s}[0], [\dst], \d_strd
-        st1             {v20.s}[0], [\ds2], \d_strd
+        str             s16,  [\dst]
+        str             s20,  [\ds2]
+        add             \dst, \dst, \d_strd
+        add             \ds2, \ds2, \d_strd
 .else
         st1             {v16.4h}, [\dst], \d_strd
         st1             {v20.4h}, [\ds2], \d_strd
@@ -1526,7 +1554,11 @@ L(\type\()_\taps\()_h):
 80:     // 8xN h
         AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b}, [\xmx]
+.ifc \taps, 6tap
+        sub             \src,  \src,  #2
+.else
         sub             \src,  \src,  #3
+.endif
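+        // the 6-tap filter uses taps 1..6, so it needs one pixel less of left context
+        // than the 8-tap path (and the ext offsets below become #(2*\i-2))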
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \d_strd,  \d_strd,  #1
@@ -1541,25 +1573,23 @@ L(\type\()_\taps\()_h):
         uxtl            v21.8h,  v21.8b
 
 .ifc \taps, 6tap
-        ext             v19.16b, v16.16b, v17.16b, #2
-        ext             v23.16b, v20.16b, v21.16b, #2
-        mul             v18.8h,  v19.8h,  v0.h[1]
-        mul             v22.8h,  v23.8h,  v0.h[1]
-.irpc i, 23456
-        ext             v19.16b, v16.16b, v17.16b, #(2*\i)
-        ext             v23.16b, v20.16b, v21.16b, #(2*\i)
+        mul             v18.8h,  v16.8h,  v0.h[1]
+        mul             v22.8h,  v20.8h,  v0.h[1]
+    .irpc i, 23456
+        ext             v19.16b, v16.16b, v17.16b, #(2*\i-2)
+        ext             v23.16b, v20.16b, v21.16b, #(2*\i-2)
         mla             v18.8h,  v19.8h,  v0.h[\i]
         mla             v22.8h,  v23.8h,  v0.h[\i]
-.endr
+    .endr
 .else   // 8tap
         mul             v18.8h,  v16.8h,  v0.h[0]
         mul             v22.8h,  v20.8h,  v0.h[0]
-.irpc i, 1234567
+    .irpc i, 1234567
         ext             v19.16b, v16.16b, v17.16b, #(2*\i)
         ext             v23.16b, v20.16b, v21.16b, #(2*\i)
         mla             v18.8h,  v19.8h,  v0.h[\i]
         mla             v22.8h,  v23.8h,  v0.h[\i]
-.endr
+    .endr
 .endif
         subs            \h,  \h,  #2
         srshr           v18.8h,  v18.8h, #2
@@ -1581,7 +1611,11 @@ L(\type\()_\taps\()_h):
 1280:   // 16xN, 32xN, ... h
         AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b}, [\xmx]
+.ifc \taps, 6tap
+        sub             \src,  \src,  #2
+.else
         sub             \src,  \src,  #3
+.endif
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \s_strd,  \s_strd,  #1
@@ -1606,30 +1640,26 @@ L(\type\()_\taps\()_h):
 
 16:
 .ifc \taps, 6tap
-        ext             v28.16b, v16.16b, v17.16b, #2
-        ext             v29.16b, v17.16b, v18.16b, #2
-        ext             v30.16b, v20.16b, v21.16b, #2
-        ext             v31.16b, v21.16b, v22.16b, #2
-        mul             v24.8h,  v28.8h,  v0.h[1]
-        mul             v25.8h,  v29.8h,  v0.h[1]
-        mul             v26.8h,  v30.8h,  v0.h[1]
-        mul             v27.8h,  v31.8h,  v0.h[1]
-.irpc i, 23456
-        ext             v28.16b, v16.16b, v17.16b, #(2*\i)
-        ext             v29.16b, v17.16b, v18.16b, #(2*\i)
-        ext             v30.16b, v20.16b, v21.16b, #(2*\i)
-        ext             v31.16b, v21.16b, v22.16b, #(2*\i)
+        mul             v24.8h,  v16.8h,  v0.h[1]
+        mul             v25.8h,  v17.8h,  v0.h[1]
+        mul             v26.8h,  v20.8h,  v0.h[1]
+        mul             v27.8h,  v21.8h,  v0.h[1]
+    .irpc i, 23456
+        ext             v28.16b, v16.16b, v17.16b, #(2*\i-2)
+        ext             v29.16b, v17.16b, v18.16b, #(2*\i-2)
+        ext             v30.16b, v20.16b, v21.16b, #(2*\i-2)
+        ext             v31.16b, v21.16b, v22.16b, #(2*\i-2)
         mla             v24.8h,  v28.8h,  v0.h[\i]
         mla             v25.8h,  v29.8h,  v0.h[\i]
         mla             v26.8h,  v30.8h,  v0.h[\i]
         mla             v27.8h,  v31.8h,  v0.h[\i]
-.endr
+    .endr
 .else   // 8tap
         mul             v24.8h,  v16.8h,  v0.h[0]
         mul             v25.8h,  v17.8h,  v0.h[0]
         mul             v26.8h,  v20.8h,  v0.h[0]
         mul             v27.8h,  v21.8h,  v0.h[0]
-.irpc i, 1234567
+    .irpc i, 1234567
         ext             v28.16b, v16.16b, v17.16b, #(2*\i)
         ext             v29.16b, v17.16b, v18.16b, #(2*\i)
         ext             v30.16b, v20.16b, v21.16b, #(2*\i)
@@ -1638,7 +1668,7 @@ L(\type\()_\taps\()_h):
         mla             v25.8h,  v29.8h,  v0.h[\i]
         mla             v26.8h,  v30.8h,  v0.h[\i]
         mla             v27.8h,  v31.8h,  v0.h[\i]
-.endr
+    .endr
 .endif
         srshr           v24.8h,  v24.8h, #2
         srshr           v25.8h,  v25.8h, #2
@@ -1677,19 +1707,19 @@ L(\type\()_\taps\()_h):
         subs            \h,  \h,  #2
         b.gt            161b
         ret
+endfunc
 
-L(\type\()_\taps\()_h_tbl):
-        .hword L(\type\()_\taps\()_h_tbl) - 1280b
-        .hword L(\type\()_\taps\()_h_tbl) -  640b
-        .hword L(\type\()_\taps\()_h_tbl) -  320b
-        .hword L(\type\()_\taps\()_h_tbl) -  160b
-        .hword L(\type\()_\taps\()_h_tbl) -   80b
-        .hword L(\type\()_\taps\()_h_tbl) -   40b
-        .hword L(\type\()_\taps\()_h_tbl) -   20b
-        .hword 0
-
-
-L(\type\()_\taps\()_v):
+jumptable \type\()_\taps\()_h_tbl
+        .word 1280b - \type\()_\taps\()_h_tbl
+        .word 640b  - \type\()_\taps\()_h_tbl
+        .word 320b  - \type\()_\taps\()_h_tbl
+        .word 160b  - \type\()_\taps\()_h_tbl
+        .word 80b   - \type\()_\taps\()_h_tbl
+        .word 40b   - \type\()_\taps\()_h_tbl
+        .word 20b   - \type\()_\taps\()_h_tbl
+endjumptable
+
+function L(\type\()_\taps\()_v)
         cmp             \h,  #4
         ubfx            w9,  \my, #7, #7
         and             \my, \my, #0x7f
@@ -1698,9 +1728,9 @@ L(\type\()_\taps\()_v):
 4:
         add             \xmy, x10, \my, uxtw #3
 
-        adr             x9,  L(\type\()_\taps\()_v_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
-        sub             x9,  x9,  w8, uxtw
+        movrel          x9,  \type\()_\taps\()_v_tbl
+        ldrsw           x8,  [x9, x8, lsl #2]
+        add             x9,  x9,  x8
         br              x9
 
 20:     // 2xN v
@@ -1709,8 +1739,7 @@ L(\type\()_\taps\()_v):
         b.gt            28f
 
         cmp             \h,  #2
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
         sub             \src,  \src,  \s_strd
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
@@ -1789,8 +1818,7 @@ L(\type\()_\taps\()_v):
 
         // 4x2, 4x4 v
         cmp             \h,  #2
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
         sub             \src, \src, \s_strd
         add             \ds2, \dst, \d_strd
         add             \sr2, \src, \s_strd
@@ -1865,8 +1893,7 @@ L(\type\()_\taps\()_v):
 
         // 8x2, 8x4 v
         cmp             \h,  #2
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
         sub             \src, \src, \s_strd
         add             \ds2, \dst, \d_strd
         add             \sr2, \src, \s_strd
@@ -1964,8 +1991,7 @@ L(\type\()_\taps\()_v):
         b.gt            1680b
 
         // 16x2, 16x4 v
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
         sub             \src, \src, \s_strd
         add             \ds2, \dst, \d_strd
         add             \sr2, \src, \s_strd
@@ -2003,18 +2029,19 @@ L(\type\()_\taps\()_v):
         shift_store_16  \type, \d_strd, v1, v2, v3, v4
 0:
         ret
+endfunc
 
-L(\type\()_\taps\()_v_tbl):
-        .hword L(\type\()_\taps\()_v_tbl) - 1280b
-        .hword L(\type\()_\taps\()_v_tbl) -  640b
-        .hword L(\type\()_\taps\()_v_tbl) -  320b
-        .hword L(\type\()_\taps\()_v_tbl) -  160b
-        .hword L(\type\()_\taps\()_v_tbl) -   80b
-        .hword L(\type\()_\taps\()_v_tbl) -   40b
-        .hword L(\type\()_\taps\()_v_tbl) -   20b
-        .hword 0
-
-L(\type\()_\taps\()_hv):
+jumptable \type\()_\taps\()_v_tbl
+        .word 1280b - \type\()_\taps\()_v_tbl
+        .word 640b  - \type\()_\taps\()_v_tbl
+        .word 320b  - \type\()_\taps\()_v_tbl
+        .word 160b  - \type\()_\taps\()_v_tbl
+        .word 80b   - \type\()_\taps\()_v_tbl
+        .word 40b   - \type\()_\taps\()_v_tbl
+        .word 20b   - \type\()_\taps\()_v_tbl
+endjumptable
+
+function L(\type\()_\taps\()_hv)
         cmp             \h,  #4
         ubfx            w9,  \my, #7, #7
         and             \my, \my, #0x7f
@@ -2023,19 +2050,17 @@ L(\type\()_\taps\()_hv):
 4:
         add             \xmy,  x10, \my, uxtw #3
 
-        adr             x9,  L(\type\()_\taps\()_hv_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
-        sub             x9,  x9,  w8, uxtw
+        movrel          x9,  \type\()_\taps\()_hv_tbl
+        ldrsw           x8,  [x9, x8, lsl #2]
+        add             x9,  x9,  x8
         br              x9
 
 20:
         AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
-        add             \xmx,  \xmx,  #2
-        ld1             {v0.s}[0],  [\xmx]
+        ldur            s0,  [\xmx, #2]
         b.gt            280f
-        add             \xmy,  \xmy,  #2
-        ld1             {v1.s}[0],  [\xmy]
+        ldur            s1,  [\xmy, #2]
 
         // 2x2, 2x4 hv
         sub             \sr2, \src, #1
@@ -2169,11 +2194,9 @@ L(\type\()_\taps\()_filter_2):
 
 40:
         AARCH64_VALID_JUMP_TARGET
-        add             \xmx, \xmx, #2
-        ld1             {v0.s}[0],  [\xmx]
+        ldur            s0,  [\xmx, #2]
         b.gt            480f
-        add             \xmy, \xmy,  #2
-        ld1             {v1.s}[0],  [\xmy]
+        ldur            s1,  [\xmy, #2]
         sub             \sr2, \src, #1
         sub             \src, \sr2, \s_strd
         add             \ds2, \dst, \d_strd
@@ -2218,8 +2241,10 @@ L(\type\()_\taps\()_filter_2):
 .ifc \type, put
         sqxtun          v2.8b,  v2.8h
         sqxtun          v3.8b,  v3.8h
-        st1             {v2.s}[0], [\dst], \d_strd
-        st1             {v3.s}[0], [\ds2], \d_strd
+        str             s2,  [\dst]
+        str             s3,  [\ds2]
+        add             \dst, \dst, \d_strd
+        add             \ds2, \ds2, \d_strd
 .else
         st1             {v2.4h}, [\dst], \d_strd
         st1             {v3.4h}, [\ds2], \d_strd
@@ -2311,8 +2336,10 @@ L(\type\()_\taps\()_filter_2):
 .ifc \type, put
         sqxtun          v2.8b,  v2.8h
         sqxtun          v3.8b,  v3.8h
-        st1             {v2.s}[0], [\dst], \d_strd
-        st1             {v3.s}[0], [\ds2], \d_strd
+        str             s2,  [\dst]
+        str             s3,  [\ds2]
+        add             \dst, \dst, \d_strd
+        add             \ds2, \ds2, \d_strd
 .else
         st1             {v2.4h}, [\dst], \d_strd
         st1             {v3.4h}, [\ds2], \d_strd
@@ -2359,10 +2386,13 @@ L(\type\()_\taps\()_filter_4):
 320:
         AARCH64_VALID_JUMP_TARGET
         b.gt            880f
-        add             \xmy,  \xmy,  #2
         ld1             {v0.8b},  [\xmx]
-        ld1             {v1.s}[0],  [\xmy]
+        ldur            s1,  [\xmy, #2]
+.ifc \taps, 6tap
+        sub             \src,  \src,  #2
+.else
         sub             \src,  \src,  #3
+.endif
         sub             \src,  \src,  \s_strd
         sxtl            v0.8h,  v0.8b
         sxtl            v1.8h,  v1.8b
@@ -2440,8 +2470,10 @@ L(\type\()_\taps\()_filter_4):
         AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b},  [\xmx]
         ld1             {v1.8b},  [\xmy]
+.ifc \taps, 6tap
+        sub             \src,  \src,  #2
+.else
         sub             \src,  \src,  #3
-.ifc \taps, 8tap
         sub             \src,  \src,  \s_strd
 .endif
         sub             \src,  \src,  \s_strd, lsl #1
@@ -2585,17 +2617,15 @@ L(\type\()_\taps\()_filter_8_first):
         uxtl            v28.8h,  v28.8b
         uxtl            v29.8h,  v29.8b
 .ifc \taps, 6tap
-        ext             v24.16b, v28.16b, v29.16b, #(2*1)
-        ext             v25.16b, v28.16b, v29.16b, #(2*2)
-        ext             v26.16b, v28.16b, v29.16b, #(2*3)
-        ext             v27.16b, v28.16b, v29.16b, #(2*4)
-        mul             v16.8h,  v24.8h,  v0.h[1]
+        mul             v16.8h,  v28.8h,  v0.h[1]
+        ext             v25.16b, v28.16b, v29.16b, #(2*1)
+        ext             v26.16b, v28.16b, v29.16b, #(2*2)
+        ext             v27.16b, v28.16b, v29.16b, #(2*3)
         mla             v16.8h,  v25.8h,  v0.h[2]
         mla             v16.8h,  v26.8h,  v0.h[3]
         mla             v16.8h,  v27.8h,  v0.h[4]
-        ext             v24.16b, v28.16b, v29.16b, #(2*5)
-        ext             v25.16b, v28.16b, v29.16b, #(2*6)
-        ext             v26.16b, v28.16b, v29.16b, #(2*7)
+        ext             v24.16b, v28.16b, v29.16b, #(2*4)
+        ext             v25.16b, v28.16b, v29.16b, #(2*5)
         mla             v16.8h,  v24.8h,  v0.h[5]
         mla             v16.8h,  v25.8h,  v0.h[6]
 .else   // 8tap
@@ -2626,40 +2656,38 @@ L(\type\()_\taps\()_filter_8):
         uxtl            v30.8h,  v30.8b
         uxtl            v31.8h,  v31.8b
 .ifc \taps, 6tap
-        ext             v26.16b, v28.16b, v29.16b, #2
-        ext             v27.16b, v30.16b, v31.16b, #2
-        mul             v24.8h,  v26.8h,  v0.h[1]
-        mul             v25.8h,  v27.8h,  v0.h[1]
-.irpc i, 23456
-        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
-        ext             v27.16b, v30.16b, v31.16b, #(2*\i)
+        mul             v24.8h,  v28.8h,  v0.h[1]
+        mul             v25.8h,  v30.8h,  v0.h[1]
+    .irpc i, 23456
+        ext             v26.16b, v28.16b, v29.16b, #(2*\i-2)
+        ext             v27.16b, v30.16b, v31.16b, #(2*\i-2)
         mla             v24.8h,  v26.8h,  v0.h[\i]
         mla             v25.8h,  v27.8h,  v0.h[\i]
-.endr
+    .endr
 .else   // 8tap
         mul             v24.8h,  v28.8h,  v0.h[0]
         mul             v25.8h,  v30.8h,  v0.h[0]
-.irpc i, 1234567
+    .irpc i, 1234567
         ext             v26.16b, v28.16b, v29.16b, #(2*\i)
         ext             v27.16b, v30.16b, v31.16b, #(2*\i)
         mla             v24.8h,  v26.8h,  v0.h[\i]
         mla             v25.8h,  v27.8h,  v0.h[\i]
-.endr
+    .endr
 .endif
         srshr           v24.8h,  v24.8h, #2
         srshr           v25.8h,  v25.8h, #2
         ret
-
-L(\type\()_\taps\()_hv_tbl):
-        .hword L(\type\()_\taps\()_hv_tbl) - 1280b
-        .hword L(\type\()_\taps\()_hv_tbl) -  640b
-        .hword L(\type\()_\taps\()_hv_tbl) -  320b
-        .hword L(\type\()_\taps\()_hv_tbl) -  160b
-        .hword L(\type\()_\taps\()_hv_tbl) -   80b
-        .hword L(\type\()_\taps\()_hv_tbl) -   40b
-        .hword L(\type\()_\taps\()_hv_tbl) -   20b
-        .hword 0
 endfunc
+
+jumptable \type\()_\taps\()_hv_tbl
+        .word 1280b - \type\()_\taps\()_hv_tbl
+        .word 640b  - \type\()_\taps\()_hv_tbl
+        .word 320b  - \type\()_\taps\()_hv_tbl
+        .word 160b  - \type\()_\taps\()_hv_tbl
+        .word 80b   - \type\()_\taps\()_hv_tbl
+        .word 40b   - \type\()_\taps\()_hv_tbl
+        .word 20b   - \type\()_\taps\()_hv_tbl
+endjumptable
 .endm
 
 
@@ -2686,9 +2714,9 @@ function \type\()_bilin_8bpc_neon, export=1
 L(\type\()_bilin_h):
         cbnz            \my, L(\type\()_bilin_hv)
 
-        adr             x9,  L(\type\()_bilin_h_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
-        sub             x9,  x9,  w8, uxtw
+        movrel          x9,  \type\()_bilin_h_tbl
+        ldrsw           x8,  [x9, x8, lsl #2]
+        add             x9,  x9,  x8
         br              x9
 
 20:     // 2xN h
@@ -2699,8 +2727,8 @@ L(\type\()_bilin_h):
         lsl             \d_strd,  \d_strd,  #1
         lsl             \s_strd,  \s_strd,  #1
 2:
-        ld1             {v4.s}[0],  [\src], \s_strd
-        ld1             {v6.s}[0],  [\sr2], \s_strd
+        ld1r            {v4.4s},  [\src], \s_strd
+        ld1r            {v6.4s},  [\sr2], \s_strd
         ext             v5.8b,  v4.8b,  v4.8b, #1
         ext             v7.8b,  v6.8b,  v6.8b, #1
         trn1            v4.4h,  v4.4h,  v6.4h
@@ -2736,7 +2764,7 @@ L(\type\()_bilin_h):
         st1             {v4.s}[0], [\dst], \d_strd
         st1             {v4.s}[1], [\ds2], \d_strd
 .else
-        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.8b},   [\dst], \d_strd
         st1             {v4.d}[1], [\ds2], \d_strd
 .endif
         b.gt            4b
@@ -2831,23 +2859,24 @@ L(\type\()_bilin_h):
         subs            \h,  \h,  #2
         b.gt            161b
         ret
+endfunc
 
-L(\type\()_bilin_h_tbl):
-        .hword L(\type\()_bilin_h_tbl) - 1280b
-        .hword L(\type\()_bilin_h_tbl) -  640b
-        .hword L(\type\()_bilin_h_tbl) -  320b
-        .hword L(\type\()_bilin_h_tbl) -  160b
-        .hword L(\type\()_bilin_h_tbl) -   80b
-        .hword L(\type\()_bilin_h_tbl) -   40b
-        .hword L(\type\()_bilin_h_tbl) -   20b
-        .hword 0
+jumptable \type\()_bilin_h_tbl
+        .word 1280b - \type\()_bilin_h_tbl
+        .word 640b  - \type\()_bilin_h_tbl
+        .word 320b  - \type\()_bilin_h_tbl
+        .word 160b  - \type\()_bilin_h_tbl
+        .word 80b   - \type\()_bilin_h_tbl
+        .word 40b   - \type\()_bilin_h_tbl
+        .word 20b   - \type\()_bilin_h_tbl
+endjumptable
 
 
-L(\type\()_bilin_v):
+function L(\type\()_bilin_v)
         cmp             \h,  #4
-        adr             x9,  L(\type\()_bilin_v_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
-        sub             x9,  x9,  w8, uxtw
+        movrel          x9,  \type\()_bilin_v_tbl
+        ldrsw           x8,  [x9, x8, lsl #2]
+        add             x9,  x9,  x8
         br              x9
 
 20:     // 2xN v
@@ -2860,24 +2889,24 @@ L(\type\()_bilin_v):
         lsl             \d_strd,  \d_strd,  #1
 
         // 2x2 v
-        ld1             {v16.h}[0], [\src], \s_strd
+        ld1r            {v16.8h}, [\src], \s_strd
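+        // ld1r serves as a plain, non-merging scalar load here; the replicated
+        // upper lanes are don't-care, only the low lanes are consumed below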
         b.gt            24f
 22:
-        ld1             {v17.h}[0], [\sr2], \s_strd
-        ld1             {v18.h}[0], [\src], \s_strd
+        ld1r            {v17.8h}, [\sr2], \s_strd
+        ld1r            {v18.8h}, [\src], \s_strd
         trn1            v16.4h, v16.4h, v17.4h
         trn1            v17.4h, v17.4h, v18.4h
         umull           v4.8h,  v16.8b,  v2.8b
         umlal           v4.8h,  v17.8b,  v3.8b
         uqrshrn         v4.8b,  v4.8h,  #4
-        st1             {v4.h}[0], [\dst]
+        str             h4,        [\dst]
         st1             {v4.h}[1], [\ds2]
         ret
 24:     // 2x4, 2x6, 2x8, ... v
-        ld1             {v17.h}[0], [\sr2], \s_strd
-        ld1             {v18.h}[0], [\src], \s_strd
-        ld1             {v19.h}[0], [\sr2], \s_strd
-        ld1             {v20.h}[0], [\src], \s_strd
+        ld1r            {v17.8h}, [\sr2], \s_strd
+        ld1r            {v18.8h}, [\src], \s_strd
+        ld1r            {v19.8h}, [\sr2], \s_strd
+        ld1r            {v20.8h}, [\src], \s_strd
         sub             \h,  \h,  #4
         trn1            v16.4h, v16.4h, v17.4h
         trn1            v17.4h, v17.4h, v18.4h
@@ -2907,10 +2936,10 @@ L(\type\()_bilin_v):
         add             \sr2,  \src,  \s_strd
         lsl             \s_strd,  \s_strd,  #1
         lsl             \d_strd,  \d_strd,  #1
-        ld1             {v16.s}[0], [\src], \s_strd
+        ld1r            {v16.4s}, [\src], \s_strd
 4:
-        ld1             {v17.s}[0], [\sr2], \s_strd
-        ld1             {v18.s}[0], [\src], \s_strd
+        ld1r            {v17.4s}, [\sr2], \s_strd
+        ld1r            {v18.4s}, [\src], \s_strd
         trn1            v16.2s, v16.2s, v17.2s
         trn1            v17.2s, v17.2s, v18.2s
         umull           v4.8h,  v16.8b,  v2.8b
@@ -2921,7 +2950,7 @@ L(\type\()_bilin_v):
         st1             {v4.s}[0], [\dst], \d_strd
         st1             {v4.s}[1], [\ds2], \d_strd
 .else
-        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.8b},   [\dst], \d_strd
         st1             {v4.d}[1], [\ds2], \d_strd
 .endif
         b.le            0f
@@ -3017,23 +3046,24 @@ L(\type\()_bilin_v):
         b               1b
 0:
         ret
+endfunc
 
-L(\type\()_bilin_v_tbl):
-        .hword L(\type\()_bilin_v_tbl) - 1280b
-        .hword L(\type\()_bilin_v_tbl) -  640b
-        .hword L(\type\()_bilin_v_tbl) -  320b
-        .hword L(\type\()_bilin_v_tbl) -  160b
-        .hword L(\type\()_bilin_v_tbl) -   80b
-        .hword L(\type\()_bilin_v_tbl) -   40b
-        .hword L(\type\()_bilin_v_tbl) -   20b
-        .hword 0
-
-L(\type\()_bilin_hv):
+jumptable \type\()_bilin_v_tbl
+        .word 1280b - \type\()_bilin_v_tbl
+        .word 640b  - \type\()_bilin_v_tbl
+        .word 320b  - \type\()_bilin_v_tbl
+        .word 160b  - \type\()_bilin_v_tbl
+        .word 80b   - \type\()_bilin_v_tbl
+        .word 40b   - \type\()_bilin_v_tbl
+        .word 20b   - \type\()_bilin_v_tbl
+endjumptable
+
+function L(\type\()_bilin_hv)
         uxtl            v2.8h, v2.8b
         uxtl            v3.8h, v3.8b
-        adr             x9,  L(\type\()_bilin_hv_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
-        sub             x9,  x9,  w8, uxtw
+        movrel          x9,  \type\()_bilin_hv_tbl
+        ldrsw           x8,  [x9, x8, lsl #2]
+        add             x9,  x9,  x8
         br              x9
 
 20:     // 2xN hv
@@ -3044,14 +3074,14 @@ L(\type\()_bilin_hv):
         lsl             \s_strd, \s_strd, #1
         lsl             \d_strd, \d_strd, #1
 
-        ld1             {v28.s}[0],  [\src], \s_strd
+        ld1r            {v28.4s},  [\src], \s_strd
         ext             v29.8b, v28.8b, v28.8b, #1
         umull           v16.8h, v28.8b, v0.8b
         umlal           v16.8h, v29.8b, v1.8b
 
 2:
-        ld1             {v28.s}[0],  [\sr2], \s_strd
-        ld1             {v30.s}[0],  [\src], \s_strd
+        ld1r            {v28.4s},  [\sr2], \s_strd
+        ld1r            {v30.4s},  [\src], \s_strd
         ext             v29.8b, v28.8b, v28.8b, #1
         ext             v31.8b, v30.8b, v30.8b, #1
         trn1            v28.4h, v28.4h, v30.4h
@@ -3107,7 +3137,7 @@ L(\type\()_bilin_hv):
         st1             {v4.s}[1], [\ds2], \d_strd
 .else
         urshr           v4.8h,  v4.8h,  #4
-        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.8b},   [\dst], \d_strd
         st1             {v4.d}[1], [\ds2], \d_strd
 .endif
         b.le            0f
@@ -3182,17 +3212,17 @@ L(\type\()_bilin_hv):
         b               1b
 0:
         ret
-
-L(\type\()_bilin_hv_tbl):
-        .hword L(\type\()_bilin_hv_tbl) - 1280b
-        .hword L(\type\()_bilin_hv_tbl) -  640b
-        .hword L(\type\()_bilin_hv_tbl) -  320b
-        .hword L(\type\()_bilin_hv_tbl) -  160b
-        .hword L(\type\()_bilin_hv_tbl) -   80b
-        .hword L(\type\()_bilin_hv_tbl) -   40b
-        .hword L(\type\()_bilin_hv_tbl) -   20b
-        .hword 0
 endfunc
+
+jumptable \type\()_bilin_hv_tbl
+        .word 1280b - \type\()_bilin_hv_tbl
+        .word 640b  - \type\()_bilin_hv_tbl
+        .word 320b  - \type\()_bilin_hv_tbl
+        .word 160b  - \type\()_bilin_hv_tbl
+        .word 80b   - \type\()_bilin_hv_tbl
+        .word 40b   - \type\()_bilin_hv_tbl
+        .word 20b   - \type\()_bilin_hv_tbl
+endjumptable
 .endm
 
 make_8tap_fn    put,  regular_sharp,  REGULAR, SHARP,   8tap
diff --git a/src/arm/64/mc16.S b/src/arm/64/mc16.S
index 576fab158acca6e8bafec0f0dcbf4f1ad8704f3e..66cdfff744e70e4fe057013d14f68b772865707a 100644
--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -145,11 +145,11 @@ function \type\()_16bpc_neon, export=1
         dup             v27.4s,  w6
         neg             v27.4s,  v27.4s
 .endif
-        adr             x7,  L(\type\()_tbl)
+        movrel          x7,  \type\()_tbl
         sub             w4,  w4,  #24
         \type           v4,  v5,  v0,  v1,  v2,  v3
-        ldrh            w4,  [x7, x4, lsl #1]
-        sub             x7,  x7,  w4, uxtw
+        ldrsw           x4,  [x7, x4, lsl #2]
+        add             x7,  x7,  x4
         br              x7
 40:
         AARCH64_VALID_JUMP_TARGET
@@ -157,9 +157,9 @@ function \type\()_16bpc_neon, export=1
         lsl             x1,  x1,  #1
 4:
         subs            w5,  w5,  #4
-        st1             {v4.d}[0],  [x0], x1
+        st1             {v4.8b},    [x0], x1
         st1             {v4.d}[1],  [x7], x1
-        st1             {v5.d}[0],  [x0], x1
+        st1             {v5.8b},    [x0], x1
         st1             {v5.d}[1],  [x7], x1
         b.le            0f
         \type           v4,  v5,  v0,  v1,  v2,  v3
@@ -175,8 +175,9 @@ function \type\()_16bpc_neon, export=1
         b.le            0f
         \type           v4,  v5,  v0,  v1,  v2,  v3
         b               8b
-16:
+160:
         AARCH64_VALID_JUMP_TARGET
+16:
         \type           v6,  v7,  v0,  v1,  v2,  v3
         st1             {v4.8h, v5.8h}, [x0], x1
         subs            w5,  w5,  #2
@@ -184,8 +185,9 @@ function \type\()_16bpc_neon, export=1
         b.le            0f
         \type           v4,  v5,  v0,  v1,  v2,  v3
         b               16b
-32:
+320:
         AARCH64_VALID_JUMP_TARGET
+32:
         \type           v6,  v7,  v0,  v1,  v2,  v3
         subs            w5,  w5,  #1
         st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
@@ -228,14 +230,16 @@ function \type\()_16bpc_neon, export=1
         b               128b
 0:
         ret
-L(\type\()_tbl):
-        .hword L(\type\()_tbl) - 1280b
-        .hword L(\type\()_tbl) -  640b
-        .hword L(\type\()_tbl) -   32b
-        .hword L(\type\()_tbl) -   16b
-        .hword L(\type\()_tbl) -   80b
-        .hword L(\type\()_tbl) -   40b
 endfunc
+
+jumptable \type\()_tbl
+        .word 1280b - \type\()_tbl
+        .word 640b  - \type\()_tbl
+        .word 320b  - \type\()_tbl
+        .word 160b  - \type\()_tbl
+        .word 80b   - \type\()_tbl
+        .word 40b   - \type\()_tbl
+endjumptable
 .endm
 
 bidir_fn avg, w6
@@ -247,12 +251,12 @@ bidir_fn mask, w7
 function w_mask_\type\()_16bpc_neon, export=1
         ldr             w8,  [sp]
         clz             w9,  w4
-        adr             x10, L(w_mask_\type\()_tbl)
+        movrel          x10, w_mask_\type\()_tbl
         dup             v31.8h,  w8   // bitdepth_max
         sub             w9,  w9,  #24
         clz             w8,  w8       // clz(bitdepth_max)
-        ldrh            w9,  [x10,  x9,  lsl #1]
-        sub             x10, x10, w9,  uxtw
+        ldrsw           x9,  [x10,  x9,  lsl #2]
+        add             x10, x10, x9
         sub             w8,  w8,  #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
         mov             w9,  #PREP_BIAS*64
         neg             w8,  w8       // -sh
@@ -274,8 +278,9 @@ function w_mask_\type\()_16bpc_neon, export=1
         add             x12,  x0,  x1
         lsl             x1,   x1,  #1
         br              x10
-4:
+40:
         AARCH64_VALID_JUMP_TARGET
+4:
         ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
         ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
         subs            w5,  w5,  #4
@@ -331,16 +336,17 @@ function w_mask_\type\()_16bpc_neon, export=1
         addp            v20.8h,  v24.8h,  v24.8h  // (128 - m) + (128 - n) (column wise addition)
         sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
         rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
-        st1             {v20.s}[0], [x6], #4
+        str             s20,        [x6],  #4
 .endif
-        st1             {v4.d}[0],  [x0],  x1
+        st1             {v4.8b},    [x0],  x1
         st1             {v4.d}[1],  [x12], x1
-        st1             {v5.d}[0],  [x0],  x1
+        st1             {v5.8b},    [x0],  x1
         st1             {v5.d}[1],  [x12], x1
         b.gt            4b
         ret
-8:
+80:
         AARCH64_VALID_JUMP_TARGET
+8:
         ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1
         ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2
         subs            w5,  w5,  #2
@@ -394,7 +400,7 @@ function w_mask_\type\()_16bpc_neon, export=1
         addp            v20.8h,  v20.8h,  v20.8h  // (128 - m) + (128 - n) (column wise addition)
         sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
         rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
-        st1             {v20.s}[0], [x6], #4
+        str             s20,     [x6],  #4
 .endif
         st1             {v4.8h}, [x0],  x1
         st1             {v5.8h}, [x12], x1
@@ -541,14 +547,16 @@ function w_mask_\type\()_16bpc_neon, export=1
         add             x12, x12, x1
         b.gt            161b
         ret
-L(w_mask_\type\()_tbl):
-        .hword L(w_mask_\type\()_tbl) - 1280b
-        .hword L(w_mask_\type\()_tbl) -  640b
-        .hword L(w_mask_\type\()_tbl) -  320b
-        .hword L(w_mask_\type\()_tbl) -  160b
-        .hword L(w_mask_\type\()_tbl) -    8b
-        .hword L(w_mask_\type\()_tbl) -    4b
 endfunc
+
+jumptable w_mask_\type\()_tbl
+        .word 1280b - w_mask_\type\()_tbl
+        .word 640b  - w_mask_\type\()_tbl
+        .word 320b  - w_mask_\type\()_tbl
+        .word 160b  - w_mask_\type\()_tbl
+        .word 80b   - w_mask_\type\()_tbl
+        .word 40b   - w_mask_\type\()_tbl
+endjumptable
 .endm
 
 w_mask_fn 444
@@ -557,11 +565,11 @@ w_mask_fn 420
 
 
 function blend_16bpc_neon, export=1
-        adr             x6,  L(blend_tbl)
+        movrel          x6,  blend_tbl
         clz             w3,  w3
         sub             w3,  w3,  #26
-        ldrh            w3,  [x6,  x3,  lsl #1]
-        sub             x6,  x6,  w3,  uxtw
+        ldrsw           x3,  [x6,  x3,  lsl #2]
+        add             x6,  x6,  x3
         add             x8,  x0,  x1
         br              x6
 40:
@@ -570,7 +578,7 @@ function blend_16bpc_neon, export=1
 4:
         ld1             {v2.8b},   [x5], #8
         ld1             {v1.8h},   [x2], #16
-        ld1             {v0.d}[0], [x0]
+        ldr             d0,        [x0]
         neg             v2.8b,   v2.8b            // -m
         subs            w4,  w4,  #2
         ld1             {v0.d}[1], [x8]
@@ -579,7 +587,7 @@ function blend_16bpc_neon, export=1
         sub             v1.8h,   v0.8h,   v1.8h   // a - b
         sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
         add             v0.8h,   v0.8h,   v1.8h
-        st1             {v0.d}[0], [x0], x1
+        st1             {v0.8b},   [x0], x1
         st1             {v0.d}[1], [x8], x1
         b.gt            4b
         ret
@@ -642,8 +650,9 @@ function blend_16bpc_neon, export=1
         st1             {v2.8h, v3.8h}, [x8], x1
         b.gt            16b
         ret
-32:
+320:
         AARCH64_VALID_JUMP_TARGET
+32:
         ld1             {v16.16b, v17.16b},           [x5], #32
         ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
         subs            w4,  w4,  #1
@@ -673,15 +682,17 @@ function blend_16bpc_neon, export=1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
         b.gt            32b
         ret
-L(blend_tbl):
-        .hword L(blend_tbl) -  32b
-        .hword L(blend_tbl) - 160b
-        .hword L(blend_tbl) -  80b
-        .hword L(blend_tbl) -  40b
 endfunc
 
+jumptable blend_tbl
+        .word 320b - blend_tbl
+        .word 160b - blend_tbl
+        .word 80b  - blend_tbl
+        .word 40b  - blend_tbl
+endjumptable
+
 function blend_h_16bpc_neon, export=1
-        adr             x6,  L(blend_h_tbl)
+        movrel          x6,  blend_h_tbl
         movrel          x5,  X(obmc_masks)
         add             x5,  x5,  w4,  uxtw
         sub             w4,  w4,  w4,  lsr #2
@@ -689,17 +700,18 @@ function blend_h_16bpc_neon, export=1
         add             x8,  x0,  x1
         lsl             x1,  x1,  #1
         sub             w7,  w7,  #24
-        ldrh            w7,  [x6,  x7,  lsl #1]
-        sub             x6,  x6,  w7, uxtw
+        ldrsw           x7,  [x6,  x7,  lsl #2]
+        add             x6,  x6,  x7
         br              x6
-2:
+20:
         AARCH64_VALID_JUMP_TARGET
+2:
         ld2r            {v2.8b, v3.8b}, [x5], #2
         ld1             {v1.4h},        [x2], #8
         ext             v2.8b,   v2.8b,   v3.8b,   #6
         subs            w4,  w4,  #2
         neg             v2.8b,   v2.8b            // -m
-        ld1             {v0.s}[0], [x0]
+        ldr             s0,        [x0]
         ld1             {v0.s}[1], [x8]
         sxtl            v2.8h,   v2.8b
         shl             v2.4h,   v2.4h,   #9      // -m << 9
@@ -710,26 +722,28 @@ function blend_h_16bpc_neon, export=1
         st1             {v0.s}[1], [x8], x1
         b.gt            2b
         ret
-4:
+40:
         AARCH64_VALID_JUMP_TARGET
+4:
         ld2r            {v2.8b, v3.8b}, [x5], #2
         ld1             {v1.8h},        [x2], #16
         ext             v2.8b,   v2.8b,   v3.8b,   #4
         subs            w4,  w4,  #2
         neg             v2.8b,   v2.8b            // -m
-        ld1             {v0.d}[0],   [x0]
+        ldr             d0,          [x0]
         ld1             {v0.d}[1],   [x8]
         sxtl            v2.8h,   v2.8b
         shl             v2.8h,   v2.8h,   #9      // -m << 9
         sub             v1.8h,   v0.8h,   v1.8h   // a - b
         sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
         add             v0.8h,   v0.8h,   v1.8h
-        st1             {v0.d}[0], [x0], x1
+        st1             {v0.8b},   [x0], x1
         st1             {v0.d}[1], [x8], x1
         b.gt            4b
         ret
-8:
+80:
         AARCH64_VALID_JUMP_TARGET
+8:
         ld2r            {v4.8b, v5.8b}, [x5], #2
         ld1             {v2.8h, v3.8h}, [x2], #32
         neg             v4.8b,   v4.8b            // -m
@@ -751,8 +765,9 @@ function blend_h_16bpc_neon, export=1
         st1             {v1.8h}, [x8], x1
         b.gt            8b
         ret
-16:
+160:
         AARCH64_VALID_JUMP_TARGET
+16:
         ld2r            {v16.8b, v17.8b}, [x5], #2
         ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
         neg             v16.8b,  v16.8b           // -m
@@ -835,26 +850,28 @@ function blend_h_16bpc_neon, export=1
         add             x7,  x7,  w3,  uxtw #1
         b.gt            321b
         ret
-L(blend_h_tbl):
-        .hword L(blend_h_tbl) - 1280b
-        .hword L(blend_h_tbl) -  640b
-        .hword L(blend_h_tbl) -  320b
-        .hword L(blend_h_tbl) -   16b
-        .hword L(blend_h_tbl) -    8b
-        .hword L(blend_h_tbl) -    4b
-        .hword L(blend_h_tbl) -    2b
 endfunc
 
+jumptable blend_h_tbl
+        .word 1280b - blend_h_tbl
+        .word 640b  - blend_h_tbl
+        .word 320b  - blend_h_tbl
+        .word 160b  - blend_h_tbl
+        .word 80b   - blend_h_tbl
+        .word 40b   - blend_h_tbl
+        .word 20b   - blend_h_tbl
+endjumptable
+
 function blend_v_16bpc_neon, export=1
-        adr             x6,  L(blend_v_tbl)
+        movrel          x6,  blend_v_tbl
         movrel          x5,  X(obmc_masks)
         add             x5,  x5,  w3,  uxtw
         clz             w3,  w3
         add             x8,  x0,  x1
         lsl             x1,  x1,  #1
         sub             w3,  w3,  #26
-        ldrh            w3,  [x6,  x3,  lsl #1]
-        sub             x6,  x6,  w3,  uxtw
+        ldrsw           x3,  [x6,  x3,  lsl #2]
+        add             x6,  x6,  x3
         br              x6
 20:
         AARCH64_VALID_JUMP_TARGET
@@ -863,8 +880,8 @@ function blend_v_16bpc_neon, export=1
         sxtl            v2.8h,   v2.8b
         shl             v2.4h,   v2.4h,   #9      // -m << 9
 2:
-        ld1             {v1.s}[0], [x2], #4
-        ld1             {v0.h}[0], [x0]
+        ldr             s1,  [x2],  #4
+        ldr             h0,  [x0]
         subs            w4,  w4,  #2
         ld1             {v1.h}[1], [x2]
         ld1             {v0.h}[1], [x8]
@@ -885,13 +902,13 @@ function blend_v_16bpc_neon, export=1
         shl             v2.8h,   v2.8h,   #9      // -m << 9
 4:
         ld1             {v1.8h},   [x2], #16
-        ld1             {v0.d}[0], [x0]
+        ldr             d0,        [x0]
         ld1             {v0.d}[1], [x8]
         subs            w4,  w4,  #2
         sub             v1.8h,   v0.8h,   v1.8h   // a - b
         sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
         add             v0.8h,   v0.8h,   v1.8h
-        st1             {v0.s}[0], [x0], #4
+        str             s0,        [x0], #4
         st1             {v0.s}[2], [x8], #4
         st1             {v0.h}[2], [x0], x1
         st1             {v0.h}[6], [x8], x1
@@ -915,8 +932,8 @@ function blend_v_16bpc_neon, export=1
         sqrdmulh        v3.8h,   v3.8h,   v4.8h
         add             v0.8h,   v0.8h,   v2.8h
         add             v1.8h,   v1.8h,   v3.8h
-        st1             {v0.d}[0], [x0], #8
-        st1             {v1.d}[0], [x8], #8
+        str             d0,        [x0], #8
+        str             d1,        [x8], #8
         st1             {v0.s}[2], [x0], x1
         st1             {v1.s}[2], [x8], x1
         b.gt            8b
@@ -992,34 +1009,38 @@ function blend_v_16bpc_neon, export=1
         st1             {v4.8h, v5.8h, v6.8h}, [x8], x1
         b.gt            32b
         ret
-L(blend_v_tbl):
-        .hword L(blend_v_tbl) - 320b
-        .hword L(blend_v_tbl) - 160b
-        .hword L(blend_v_tbl) -  80b
-        .hword L(blend_v_tbl) -  40b
-        .hword L(blend_v_tbl) -  20b
 endfunc
 
+jumptable blend_v_tbl
+        .word 320b - blend_v_tbl
+        .word 160b - blend_v_tbl
+        .word 80b  - blend_v_tbl
+        .word 40b  - blend_v_tbl
+        .word 20b  - blend_v_tbl
+endjumptable
+
 
 // This has got the same signature as the put_8tap functions,
 // and assumes that x9 is set to (clz(w)-24).
-function put_neon
-        adr             x10, L(put_tbl)
-        ldrh            w9, [x10, x9, lsl #1]
-        sub             x10, x10, w9, uxtw
+function put_16bpc_neon, export=1
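+        // renamed with a bitdepth suffix: exporting this as put_neon would collide
+        // with the 8bpc global symbol of the same name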
+        movrel          x10, put_16bpc_tbl
+        ldrsw           x9, [x10, x9, lsl #2]
+        add             x10, x10, x9
         br              x10
 
-2:
+20:
         AARCH64_VALID_JUMP_TARGET
-        ld1             {v0.s}[0], [x2], x3
-        ld1             {v1.s}[0], [x2], x3
+2:
+        ld1r            {v0.4s},   [x2], x3
+        ld1r            {v1.4s},   [x2], x3
         subs            w5,  w5,  #2
         st1             {v0.s}[0], [x0], x1
         st1             {v1.s}[0], [x0], x1
         b.gt            2b
         ret
-4:
+40:
         AARCH64_VALID_JUMP_TARGET
+4:
         ld1             {v0.4h}, [x2], x3
         ld1             {v1.4h}, [x2], x3
         subs            w5,  w5,  #2
@@ -1041,8 +1062,9 @@ function put_neon
         st1             {v1.8h}, [x8], x1
         b.gt            8b
         ret
-16:
+160:
         AARCH64_VALID_JUMP_TARGET
+16:
         ldp             x6,  x7,  [x2]
         ldp             x8,  x9,  [x2, #16]
         stp             x6,  x7,  [x0]
@@ -1052,8 +1074,9 @@ function put_neon
         add             x0,  x0,  x1
         b.gt            16b
         ret
-32:
+320:
         AARCH64_VALID_JUMP_TARGET
+32:
         ldp             x6,  x7,  [x2]
         ldp             x8,  x9,  [x2, #16]
         stp             x6,  x7,  [x0]
@@ -1067,8 +1090,9 @@ function put_neon
         add             x0,  x0,  x1
         b.gt            32b
         ret
-64:
+640:
         AARCH64_VALID_JUMP_TARGET
+64:
         ldp             q0,  q1,  [x2]
         ldp             q2,  q3,  [x2, #32]
         stp             q0,  q1,  [x0]
@@ -1082,8 +1106,9 @@ function put_neon
         add             x0,  x0,  x1
         b.gt            64b
         ret
-128:
+1280:
         AARCH64_VALID_JUMP_TARGET
+128:
         ldp             q0,  q1,  [x2]
         ldp             q2,  q3,  [x2, #32]
         stp             q0,  q1,  [x0]
@@ -1105,27 +1130,28 @@ function put_neon
         add             x0,  x0,  x1
         b.gt            128b
         ret
-
-L(put_tbl):
-        .hword L(put_tbl) - 128b
-        .hword L(put_tbl) -  64b
-        .hword L(put_tbl) -  32b
-        .hword L(put_tbl) -  16b
-        .hword L(put_tbl) -  80b
-        .hword L(put_tbl) -   4b
-        .hword L(put_tbl) -   2b
 endfunc
 
+jumptable put_16bpc_tbl
+        .word 1280b - put_16bpc_tbl
+        .word 640b  - put_16bpc_tbl
+        .word 320b  - put_16bpc_tbl
+        .word 160b  - put_16bpc_tbl
+        .word 80b   - put_16bpc_tbl
+        .word 40b   - put_16bpc_tbl
+        .word 20b   - put_16bpc_tbl
+endjumptable
+
 
 // This has got the same signature as the prep_8tap functions,
 // and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
 // x8 to w*2.
-function prep_neon
-        adr             x10, L(prep_tbl)
-        ldrh            w9, [x10, x9, lsl #1]
+function prep_16bpc_neon
+        movrel          x10, prep_16bpc_tbl
+        ldrsw           x9, [x10, x9, lsl #2]
         dup             v31.8h,  w7   // intermediate_bits
         movi            v30.8h,  #(PREP_BIAS >> 8), lsl #8
-        sub             x10, x10, w9, uxtw
+        add             x10, x10, x9
         br              x10
 
 40:
@@ -1133,7 +1159,7 @@ function prep_neon
         add             x9,  x1,  x2
         lsl             x2,  x2,  #1
 4:
-        ld1             {v0.d}[0], [x1], x2
+        ld1             {v0.8b},   [x1], x2
         ld1             {v0.d}[1], [x9], x2
         subs            w4,  w4,  #2
         sshl            v0.8h,   v0.8h,   v31.8h
@@ -1156,8 +1182,9 @@ function prep_neon
         st1             {v0.8h, v1.8h}, [x0], #32
         b.gt            8b
         ret
-16:
+160:
         AARCH64_VALID_JUMP_TARGET
+16:
         ldp             q0,  q1,  [x1]
         add             x1,  x1,  x2
         sshl            v0.8h,   v0.8h,   v31.8h
@@ -1174,8 +1201,9 @@ function prep_neon
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
         b.gt            16b
         ret
-32:
+320:
         AARCH64_VALID_JUMP_TARGET
+32:
         ldp             q0,  q1,  [x1]
         sshl            v0.8h,   v0.8h,   v31.8h
         ldp             q2,  q3,  [x1, #32]
@@ -1191,8 +1219,9 @@ function prep_neon
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
         b.gt            32b
         ret
-64:
+640:
         AARCH64_VALID_JUMP_TARGET
+64:
         ldp             q0,  q1,  [x1]
         subs            w4,  w4,  #1
         sshl            v0.8h,   v0.8h,   v31.8h
@@ -1222,8 +1251,9 @@ function prep_neon
         add             x0,  x0,  x8
         b.gt            64b
         ret
-128:
+1280:
         AARCH64_VALID_JUMP_TARGET
+128:
         ldp             q0,  q1,  [x1]
         subs            w4,  w4,  #1
         sshl            v0.8h,   v0.8h,   v31.8h
@@ -1277,16 +1307,17 @@ function prep_neon
         add             x0,  x0,  x8
         b.gt            128b
         ret
-
-L(prep_tbl):
-        .hword L(prep_tbl) - 128b
-        .hword L(prep_tbl) -  64b
-        .hword L(prep_tbl) -  32b
-        .hword L(prep_tbl) -  16b
-        .hword L(prep_tbl) -  80b
-        .hword L(prep_tbl) -  40b
 endfunc
 
+jumptable prep_16bpc_tbl
+        .word 1280b - prep_16bpc_tbl
+        .word 640b  - prep_16bpc_tbl
+        .word 320b  - prep_16bpc_tbl
+        .word 160b  - prep_16bpc_tbl
+        .word 80b   - prep_16bpc_tbl
+        .word 40b   - prep_16bpc_tbl
+endjumptable
+
 
 .macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
         ld1             {\d0\wd}[0], [\s0], \strd
@@ -1455,10 +1486,10 @@ endfunc
 .endif
 .endm
 .macro st_d strd, r0, r1
-        st1             {\r0\().d}[0], [x0], \strd
+        st1             {\r0\().8b},   [x0], \strd
         st1             {\r0\().d}[1], [x9], \strd
 .ifnb \r1
-        st1             {\r1\().d}[0], [x0], \strd
+        st1             {\r1\().8b},   [x0], \strd
         st1             {\r1\().d}[1], [x9], \strd
 .endif
 .endm
@@ -1556,7 +1587,7 @@ function \type\()_\taps\()_neon
         b.ne            L(\type\()_\taps\()_h)
         tst             \my, #(0x7f << 14)
         b.ne            L(\type\()_\taps\()_v)
-        b               \type\()_neon
+        b               \type\()_16bpc_neon
 
 L(\type\()_\taps\()_h):
         cmp             \w,   #4
@@ -1569,26 +1600,25 @@ L(\type\()_\taps\()_h):
         add             \xmx, x11, \mx, uxtw #3
         b.ne            L(\type\()_\taps\()_hv)
 
-        adr             x10, L(\type\()_\taps\()_h_tbl)
-        dup             v30.4s,  w12           // 6 - intermediate_bits
-        ldrh            w9,  [x10, x9, lsl #1]
-        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
+        movrel          x10, \type\()_\taps\()_h_tbl
+        ldrsw           x9,  [x10, x9, lsl #2]
 .ifc \type, put
-        dup             v29.8h,  \bdmax        // intermediate_bits
+        mov             w12,  #34              // rounding for 10-bit
+        mov             w13,  #40              // rounding for 12-bit
+        cmp             \bdmax, #2             // 10-bit: 4, 12-bit: 2
+        csel            w12,  w12,  w13,  ne   // select rounding based on \bdmax
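+        // 34 = 32 + 2 (10-bit), 40 = 32 + 8 (12-bit): the rounding terms of the old
+        // two-stage shift (by 6-intermediate_bits, then by intermediate_bits) folded
+        // into the single sqshrun #6 used below.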
 .else
+        neg             w12,  w12              // -(6 - intermediate_bits)
         movi            v28.8h,  #(PREP_BIAS >> 8), lsl #8
 .endif
-        sub             x10, x10, w9, uxtw
-.ifc \type, put
-        neg             v29.8h,  v29.8h        // -intermediate_bits
-.endif
+        add             x10, x10, x9
+        dup             v30.4s,  w12           // rounding or shift amount
         br              x10
 
 20:     // 2xN h
         AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
-        add             \xmx,  \xmx,  #2
-        ld1             {v0.s}[0], [\xmx]
+        ldur            s0,  [\xmx, #2]
         sub             \src,  \src,  #2
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
@@ -1598,6 +1628,7 @@ L(\type\()_\taps\()_h):
 2:
         ld1             {v4.8h},  [\src], \s_strd
         ld1             {v6.8h},  [\sr2], \s_strd
+        mov             v2.16b,  v30.16b
         ext             v5.16b,  v4.16b,  v4.16b,  #2
         ext             v7.16b,  v6.16b,  v6.16b,  #2
         subs            \h,  \h,  #2
@@ -1605,24 +1636,21 @@ L(\type\()_\taps\()_h):
         trn2            v6.2s,   v4.2s,   v6.2s
         trn1            v4.2s,   v5.2s,   v7.2s
         trn2            v7.2s,   v5.2s,   v7.2s
-        smull           v3.4s,   v3.4h,   v0.h[0]
-        smlal           v3.4s,   v4.4h,   v0.h[1]
-        smlal           v3.4s,   v6.4h,   v0.h[2]
-        smlal           v3.4s,   v7.4h,   v0.h[3]
-        srshl           v3.4s,   v3.4s,   v30.4s // -(6-intermediate_bits)
-        sqxtun          v3.4h,   v3.4s
-        srshl           v3.4h,   v3.4h,   v29.4h // -intermediate_bits
-        umin            v3.4h,   v3.4h,   v31.4h
-        st1             {v3.s}[0], [\dst], \d_strd
-        st1             {v3.s}[1], [\ds2], \d_strd
+        smlal           v2.4s,   v3.4h,   v0.h[0]
+        smlal           v2.4s,   v4.4h,   v0.h[1]
+        smlal           v2.4s,   v6.4h,   v0.h[2]
+        smlal           v2.4s,   v7.4h,   v0.h[3]
+        sqshrun         v2.4h,   v2.4s,   #6
+        umin            v2.4h,   v2.4h,   v31.4h
+        st1             {v2.s}[0], [\dst], \d_strd
+        st1             {v2.s}[1], [\ds2], \d_strd
         b.gt            2b
         ret
 .endif
 
 40:     // 4xN h
         AARCH64_VALID_JUMP_TARGET
-        add             \xmx,  \xmx,  #2
-        ld1             {v0.s}[0], [\xmx]
+        ldur            s0,  [\xmx, #2]
         sub             \src,  \src,  #2
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
@@ -1632,6 +1660,10 @@ L(\type\()_\taps\()_h):
 4:
         ld1             {v16.8h}, [\src], \s_strd
         ld1             {v20.8h}, [\sr2], \s_strd
+.ifc \type, put
+        mov             v2.16b,  v30.16b
+        mov             v3.16b,  v30.16b
+.endif
         ext             v17.16b, v16.16b, v16.16b, #2
         ext             v18.16b, v16.16b, v16.16b, #4
         ext             v19.16b, v16.16b, v16.16b, #6
@@ -1639,26 +1671,33 @@ L(\type\()_\taps\()_h):
         ext             v22.16b, v20.16b, v20.16b, #4
         ext             v23.16b, v20.16b, v20.16b, #6
         subs            \h,  \h,  #2
-        smull           v16.4s,  v16.4h,  v0.h[0]
-        smlal           v16.4s,  v17.4h,  v0.h[1]
-        smlal           v16.4s,  v18.4h,  v0.h[2]
-        smlal           v16.4s,  v19.4h,  v0.h[3]
-        smull           v20.4s,  v20.4h,  v0.h[0]
-        smlal           v20.4s,  v21.4h,  v0.h[1]
-        smlal           v20.4s,  v22.4h,  v0.h[2]
-        smlal           v20.4s,  v23.4h,  v0.h[3]
-        srshl           v16.4s,  v16.4s,  v30.4s // -(6-intermediate_bits)
-        srshl           v20.4s,  v20.4s,  v30.4s // -(6-intermediate_bits)
 .ifc \type, put
-        sqxtun          v16.4h,  v16.4s
-        sqxtun2         v16.8h,  v20.4s
-        srshl           v16.8h,  v16.8h,  v29.8h // -intermediate_bits
+        smlal           v2.4s,   v16.4h,  v0.h[0]
+.else
+        smull           v2.4s,   v16.4h,  v0.h[0]
+.endif
+        smlal           v2.4s,   v17.4h,  v0.h[1]
+        smlal           v2.4s,   v18.4h,  v0.h[2]
+        smlal           v2.4s,   v19.4h,  v0.h[3]
+.ifc \type, put
+        smlal           v3.4s,   v20.4h,  v0.h[0]
+.else
+        smull           v3.4s,   v20.4h,  v0.h[0]
+.endif
+        smlal           v3.4s,   v21.4h,  v0.h[1]
+        smlal           v3.4s,   v22.4h,  v0.h[2]
+        smlal           v3.4s,   v23.4h,  v0.h[3]
+.ifc \type, put
+        sqshrun         v16.4h,  v2.4s,   #6
+        sqshrun2        v16.8h,  v3.4s,   #6
         umin            v16.8h,  v16.8h,  v31.8h
 .else
+        srshl           v16.4s,  v2.4s,   v30.4s // -(6-intermediate_bits)
+        srshl           v20.4s,  v3.4s,   v30.4s // -(6-intermediate_bits)
         uzp1            v16.8h,  v16.8h,  v20.8h // Same as xtn, xtn2
         sub             v16.8h,  v16.8h,  v28.8h // PREP_BIAS
 .endif
-        st1             {v16.d}[0], [\dst], \d_strd
+        st1             {v16.8b},   [\dst], \d_strd
         st1             {v16.d}[1], [\ds2], \d_strd
         b.gt            4b
         ret
@@ -1670,7 +1709,11 @@ L(\type\()_\taps\()_h):
 1280:   // 8xN, 16xN, 32xN, ... h
         AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b}, [\xmx]
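+        // 6-tap filters use taps 1-6 of the 8-tap coefficient layout, so the source
+        // only needs to start two samples to the left instead of three.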
+.ifc \taps, 6tap
+        sub             \src,  \src,  #4
+.else
         sub             \src,  \src,  #6
+.endif
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \s_strd,  \s_strd,  #1
@@ -1689,49 +1732,67 @@ L(\type\()_\taps\()_h):
 
 8:
 .ifc \taps, 6tap
-        ext             v24.16b, v16.16b, v17.16b, #2
-        ext             v25.16b, v20.16b, v21.16b, #2
-        smull           v18.4s,  v24.4h,  v0.h[1]
-        smull2          v19.4s,  v24.8h,  v0.h[1]
-        smull           v22.4s,  v25.4h,  v0.h[1]
-        smull2          v23.4s,  v25.8h,  v0.h[1]
-.irpc i, 23456
-        ext             v24.16b, v16.16b, v17.16b, #(2*\i)
-        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
+    .ifc \type, put
+        mov             v18.16b, v30.16b
+        mov             v19.16b, v30.16b
+        smlal           v18.4s,  v16.4h,  v0.h[1]
+        smlal2          v19.4s,  v16.8h,  v0.h[1]
+        mov             v22.16b, v30.16b
+        mov             v23.16b, v30.16b
+        smlal           v22.4s,  v20.4h,  v0.h[1]
+        smlal2          v23.4s,  v20.8h,  v0.h[1]
+    .else
+        smull           v18.4s,  v16.4h,  v0.h[1]
+        smull2          v19.4s,  v16.8h,  v0.h[1]
+        smull           v22.4s,  v20.4h,  v0.h[1]
+        smull2          v23.4s,  v20.8h,  v0.h[1]
+    .endif
+    .irpc i, 23456
+        ext             v24.16b, v16.16b, v17.16b, #(2*\i-2)
+        ext             v25.16b, v20.16b, v21.16b, #(2*\i-2)
         smlal           v18.4s,  v24.4h,  v0.h[\i]
         smlal2          v19.4s,  v24.8h,  v0.h[\i]
         smlal           v22.4s,  v25.4h,  v0.h[\i]
         smlal2          v23.4s,  v25.8h,  v0.h[\i]
-.endr
+    .endr
 .else   // 8tap
+    .ifc \type, put
+        mov             v18.16b, v30.16b
+        mov             v19.16b, v30.16b
+        smlal           v18.4s,  v16.4h,  v0.h[0]
+        smlal2          v19.4s,  v16.8h,  v0.h[0]
+        mov             v22.16b, v30.16b
+        mov             v23.16b, v30.16b
+        smlal           v22.4s,  v20.4h,  v0.h[0]
+        smlal2          v23.4s,  v20.8h,  v0.h[0]
+    .else
         smull           v18.4s,  v16.4h,  v0.h[0]
         smull2          v19.4s,  v16.8h,  v0.h[0]
         smull           v22.4s,  v20.4h,  v0.h[0]
         smull2          v23.4s,  v20.8h,  v0.h[0]
-.irpc i, 1234567
+    .endif
+    .irpc i, 1234567
         ext             v24.16b, v16.16b, v17.16b, #(2*\i)
         ext             v25.16b, v20.16b, v21.16b, #(2*\i)
         smlal           v18.4s,  v24.4h,  v0.h[\i]
         smlal2          v19.4s,  v24.8h,  v0.h[\i]
         smlal           v22.4s,  v25.4h,  v0.h[\i]
         smlal2          v23.4s,  v25.8h,  v0.h[\i]
-.endr
+    .endr
 .endif
         subs            \mx, \mx, #8
-        srshl           v18.4s,  v18.4s,  v30.4s // -(6-intermediate_bits)
-        srshl           v19.4s,  v19.4s,  v30.4s // -(6-intermediate_bits)
-        srshl           v22.4s,  v22.4s,  v30.4s // -(6-intermediate_bits)
-        srshl           v23.4s,  v23.4s,  v30.4s // -(6-intermediate_bits)
 .ifc \type, put
-        sqxtun          v18.4h,  v18.4s
-        sqxtun2         v18.8h,  v19.4s
-        sqxtun          v22.4h,  v22.4s
-        sqxtun2         v22.8h,  v23.4s
-        srshl           v18.8h,  v18.8h,  v29.8h // -intermediate_bits
-        srshl           v22.8h,  v22.8h,  v29.8h // -intermediate_bits
+        sqshrun         v18.4h,  v18.4s,  #6
+        sqshrun2        v18.8h,  v19.4s,  #6
+        sqshrun         v22.4h,  v22.4s,  #6
+        sqshrun2        v22.8h,  v23.4s,  #6
         umin            v18.8h,  v18.8h,  v31.8h
         umin            v22.8h,  v22.8h,  v31.8h
 .else
+        srshl           v18.4s,  v18.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v19.4s,  v19.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v22.4s,  v22.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v23.4s,  v23.4s,  v30.4s // -(6-intermediate_bits)
         uzp1            v18.8h,  v18.8h,  v19.8h // Same as xtn, xtn2
         uzp1            v22.8h,  v22.8h,  v23.8h // Ditto
         sub             v18.8h,  v18.8h,  v28.8h // PREP_BIAS
@@ -1756,19 +1817,20 @@ L(\type\()_\taps\()_h):
         subs            \h,  \h,  #2
         b.gt            81b
         ret
+endfunc
 
-L(\type\()_\taps\()_h_tbl):
-        .hword L(\type\()_\taps\()_h_tbl) - 1280b
-        .hword L(\type\()_\taps\()_h_tbl) -  640b
-        .hword L(\type\()_\taps\()_h_tbl) -  320b
-        .hword L(\type\()_\taps\()_h_tbl) -  160b
-        .hword L(\type\()_\taps\()_h_tbl) -   80b
-        .hword L(\type\()_\taps\()_h_tbl) -   40b
-        .hword L(\type\()_\taps\()_h_tbl) -   20b
-        .hword 0
+jumptable \type\()_\taps\()_h_tbl
+        .word 1280b - \type\()_\taps\()_h_tbl
+        .word 640b  - \type\()_\taps\()_h_tbl
+        .word 320b  - \type\()_\taps\()_h_tbl
+        .word 160b  - \type\()_\taps\()_h_tbl
+        .word 80b   - \type\()_\taps\()_h_tbl
+        .word 40b   - \type\()_\taps\()_h_tbl
+        .word 20b   - \type\()_\taps\()_h_tbl
+endjumptable
 
 
-L(\type\()_\taps\()_v):
+function L(\type\()_\taps\()_v)
         cmp             \h,  #4
         ubfx            w10, \my, #7, #7
         and             \my, \my, #0x7f
@@ -1781,12 +1843,12 @@ L(\type\()_\taps\()_v):
         dup             v30.4s,  w12           // 6 - intermediate_bits
         movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
 .endif
-        adr             x10, L(\type\()_\taps\()_v_tbl)
-        ldrh            w9,  [x10, x9, lsl #1]
+        movrel          x10, \type\()_\taps\()_v_tbl
+        ldrsw           x9,  [x10, x9, lsl #2]
 .ifc \type, prep
         neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
 .endif
-        sub             x10, x10, w9, uxtw
+        add             x10, x10, x9
         br              x10
 
 20:     // 2xN v
@@ -1795,8 +1857,7 @@ L(\type\()_\taps\()_v):
         b.gt            28f
 
         cmp             \h,  #2
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
         sub             \src,  \src,  \s_strd
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
@@ -1873,8 +1934,7 @@ L(\type\()_\taps\()_v):
 
         // 4x2, 4x4 v
         cmp             \h,  #2
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
         sub             \src, \src, \s_strd
         add             \ds2, \dst, \d_strd
         add             \sr2, \src, \s_strd
@@ -1938,8 +1998,7 @@ L(\type\()_\taps\()_v):
 
         // 8x2, 8x4 v
         cmp             \h,  #2
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
         sub             \src, \src, \s_strd
         add             \ds2, \dst, \d_strd
         add             \sr2, \src, \s_strd
@@ -2027,8 +2086,7 @@ L(\type\()_\taps\()_v):
         b.gt            1680b
 
         // 16x2, 16x4 v
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
         sub             \src, \src, \s_strd
         sxtl            v0.8h,   v0.8b
 
@@ -2051,18 +2109,19 @@ L(\type\()_\taps\()_v):
         b               16b
 0:
         ret
+endfunc
 
-L(\type\()_\taps\()_v_tbl):
-        .hword L(\type\()_\taps\()_v_tbl) - 1280b
-        .hword L(\type\()_\taps\()_v_tbl) -  640b
-        .hword L(\type\()_\taps\()_v_tbl) -  320b
-        .hword L(\type\()_\taps\()_v_tbl) -  160b
-        .hword L(\type\()_\taps\()_v_tbl) -   80b
-        .hword L(\type\()_\taps\()_v_tbl) -   40b
-        .hword L(\type\()_\taps\()_v_tbl) -   20b
-        .hword 0
-
-L(\type\()_\taps\()_hv):
+jumptable \type\()_\taps\()_v_tbl
+        .word 1280b - \type\()_\taps\()_v_tbl
+        .word 640b  - \type\()_\taps\()_v_tbl
+        .word 320b  - \type\()_\taps\()_v_tbl
+        .word 160b  - \type\()_\taps\()_v_tbl
+        .word 80b   - \type\()_\taps\()_v_tbl
+        .word 40b   - \type\()_\taps\()_v_tbl
+        .word 20b   - \type\()_\taps\()_v_tbl
+endjumptable
+
+function L(\type\()_\taps\()_hv)
         cmp             \h,  #4
         ubfx            w10, \my, #7, #7
         and             \my, \my, #0x7f
@@ -2071,16 +2130,16 @@ L(\type\()_\taps\()_hv):
 4:
         add             \xmy, x11, \my, uxtw #3
 
-        adr             x10, L(\type\()_\taps\()_hv_tbl)
+        movrel          x10, \type\()_\taps\()_hv_tbl
         dup             v30.4s,  w12           // 6 - intermediate_bits
-        ldrh            w9,  [x10, x9, lsl #1]
+        ldrsw           x9,  [x10, x9, lsl #2]
         neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
 .ifc \type, put
         dup             v29.4s,  w13           // 6 + intermediate_bits
 .else
         movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
 .endif
-        sub             x10, x10, w9, uxtw
+        add             x10, x10, x9
 .ifc \type, put
         neg             v29.4s,  v29.4s        // -(6+intermediate_bits)
 .endif
@@ -2089,11 +2148,9 @@ L(\type\()_\taps\()_hv):
 20:
         AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
-        add             \xmx,  \xmx,  #2
-        ld1             {v0.s}[0],  [\xmx]
+        ldur            s0,  [\xmx, #2]
         b.gt            280f
-        add             \xmy,  \xmy,  #2
-        ld1             {v1.s}[0],  [\xmy]
+        ldur            s1,  [\xmy, #2]
 
         // 2x2, 2x4 hv
         sub             \sr2, \src, #2
@@ -2236,11 +2293,9 @@ L(\type\()_\taps\()_filter_2):
 
 40:
         AARCH64_VALID_JUMP_TARGET
-        add             \xmx, \xmx, #2
-        ld1             {v0.s}[0],  [\xmx]
+        ldur            s0,  [\xmx, #2]
         b.gt            480f
-        add             \xmy, \xmy,  #2
-        ld1             {v1.s}[0],  [\xmy]
+        ldur            s1,  [\xmy, #2]
         sub             \sr2, \src, #2
         sub             \src, \sr2, \s_strd
         add             \ds2, \dst, \d_strd
@@ -2293,7 +2348,7 @@ L(\type\()_\taps\()_filter_2):
 .endif
         subs            \h,  \h,  #2
 
-        st1             {v2.d}[0], [\dst], \d_strd
+        st1             {v2.8b},   [\dst], \d_strd
         st1             {v2.d}[1], [\ds2], \d_strd
         b.le            0f
         mov             v16.8b,  v18.8b
@@ -2392,7 +2447,7 @@ L(\type\()_\taps\()_filter_2):
         sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
 .endif
         subs            \h,  \h,  #2
-        st1             {v3.d}[0], [\dst], \d_strd
+        st1             {v3.8b},   [\dst], \d_strd
         st1             {v3.d}[1], [\ds2], \d_strd
         b.le            0f
 .ifc \taps, 8tap
@@ -2436,10 +2491,13 @@ L(\type\()_\taps\()_filter_4):
 320:
         AARCH64_VALID_JUMP_TARGET
         b.gt            880f
-        add             \xmy,  \xmy,  #2
         ld1             {v0.8b},  [\xmx]
-        ld1             {v1.s}[0],  [\xmy]
+        ldur            s1,  [\xmy, #2]
+.ifc \taps, 6tap
+        sub             \src,  \src,  #4
+.else
         sub             \src,  \src,  #6
+.endif
         sub             \src,  \src,  \s_strd
         sxtl            v0.8h,   v0.8b
         sxtl            v1.8h,   v1.8b
@@ -2453,13 +2511,23 @@ L(\type\()_\taps\()_filter_4):
         lsl             \s_strd, \s_strd, #1
 
         ld1             {v27.8h, v28.8h},  [\src], \s_strd
+.ifc \taps, 6tap
+        smull           v24.4s,  v27.4h,  v0.h[1]
+        smull2          v25.4s,  v27.8h,  v0.h[1]
+    .irpc i, 23456
+        ext             v26.16b, v27.16b, v28.16b, #(2*\i-2)
+        smlal           v24.4s,  v26.4h,  v0.h[\i]
+        smlal2          v25.4s,  v26.8h,  v0.h[\i]
+    .endr
+.else
         smull           v24.4s,  v27.4h,  v0.h[0]
         smull2          v25.4s,  v27.8h,  v0.h[0]
-.irpc i, 1234567
+    .irpc i, 1234567
         ext             v26.16b, v27.16b, v28.16b, #(2*\i)
         smlal           v24.4s,  v26.4h,  v0.h[\i]
         smlal2          v25.4s,  v26.8h,  v0.h[\i]
-.endr
+    .endr
+.endif
         srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
         srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
         // The intermediates from the horizontal pass fit in 16 bit without
@@ -2537,8 +2605,10 @@ L(\type\()_\taps\()_filter_4):
         AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b},  [\xmx]
         ld1             {v1.8b},  [\xmy]
+.ifc \taps, 6tap
+        sub             \src,  \src,  #4
+.else
         sub             \src,  \src,  #6
-.ifc \taps, 8tap
         sub             \src,  \src,  \s_strd
 .endif
         sub             \src,  \src,  \s_strd, lsl #1
@@ -2555,22 +2625,21 @@ L(\type\()_\taps\()_filter_4):
 
         ld1             {v27.8h, v28.8h},  [\src], \s_strd
 .ifc \taps, 6tap
-        ext             v26.16b, v27.16b, v28.16b, #2
-        smull           v24.4s,  v26.4h,  v0.h[1]
-        smull2          v25.4s,  v26.8h,  v0.h[1]
-.irpc i, 23456
-        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
+        smull           v24.4s,  v27.4h,  v0.h[1]
+        smull2          v25.4s,  v27.8h,  v0.h[1]
+    .irpc i, 23456
+        ext             v26.16b, v27.16b, v28.16b, #(2*\i-2)
         smlal           v24.4s,  v26.4h,  v0.h[\i]
         smlal2          v25.4s,  v26.8h,  v0.h[\i]
-.endr
+    .endr
 .else   // 8tap
         smull           v24.4s,  v27.4h,  v0.h[0]
         smull2          v25.4s,  v27.8h,  v0.h[0]
-.irpc i, 1234567
+    .irpc i, 1234567
         ext             v26.16b, v27.16b, v28.16b, #(2*\i)
         smlal           v24.4s,  v26.4h,  v0.h[\i]
         smlal2          v25.4s,  v26.8h,  v0.h[\i]
-.endr
+    .endr
 .endif
         srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
         srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
@@ -2712,15 +2781,13 @@ L(\type\()_\taps\()_filter_8):
         ld1             {v4.8h, v5.8h},  [\sr2], \s_strd
         ld1             {v6.8h, v7.8h},  [\src], \s_strd
 .ifc \taps, 6tap
-        ext             v23.16b, v4.16b,  v5.16b,  #2
-        ext             v24.16b, v6.16b,  v7.16b,  #2
-        smull           v25.4s,  v23.4h,  v0.h[1]
-        smull2          v26.4s,  v23.8h,  v0.h[1]
-        smull           v27.4s,  v24.4h,  v0.h[1]
-        smull2          v28.4s,  v24.8h,  v0.h[1]
+        smull           v25.4s,  v4.4h,   v0.h[1]
+        smull2          v26.4s,  v4.8h,   v0.h[1]
+        smull           v27.4s,  v6.4h,   v0.h[1]
+        smull2          v28.4s,  v6.8h,   v0.h[1]
 .irpc i, 23456
-        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i)
-        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i)
+        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i-2)
+        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i-2)
         smlal           v25.4s,  v23.4h,  v0.h[\i]
         smlal2          v26.4s,  v23.8h,  v0.h[\i]
         smlal           v27.4s,  v24.4h,  v0.h[\i]
@@ -2747,17 +2814,17 @@ L(\type\()_\taps\()_filter_8):
         uzp1            v23.8h,  v25.8h,  v26.8h // Same as xtn, xtn2
         uzp1            v24.8h,  v27.8h,  v28.8h // Ditto
         ret
-
-L(\type\()_\taps\()_hv_tbl):
-        .hword L(\type\()_\taps\()_hv_tbl) - 1280b
-        .hword L(\type\()_\taps\()_hv_tbl) -  640b
-        .hword L(\type\()_\taps\()_hv_tbl) -  320b
-        .hword L(\type\()_\taps\()_hv_tbl) -  160b
-        .hword L(\type\()_\taps\()_hv_tbl) -   80b
-        .hword L(\type\()_\taps\()_hv_tbl) -   40b
-        .hword L(\type\()_\taps\()_hv_tbl) -   20b
-        .hword 0
 endfunc
+
+jumptable \type\()_\taps\()_hv_tbl
+        .word 1280b - \type\()_\taps\()_hv_tbl
+        .word 640b  - \type\()_\taps\()_hv_tbl
+        .word 320b  - \type\()_\taps\()_hv_tbl
+        .word 160b  - \type\()_\taps\()_hv_tbl
+        .word 80b   - \type\()_\taps\()_hv_tbl
+        .word 40b   - \type\()_\taps\()_hv_tbl
+        .word 20b   - \type\()_\taps\()_hv_tbl
+endjumptable
 .endm
 
 
@@ -2787,21 +2854,21 @@ function \type\()_bilin_16bpc_neon, export=1
         add             w12, \bdmax, #4   // 4 + intermediate_bits
         cbnz            \mx, L(\type\()_bilin_h)
         cbnz            \my, L(\type\()_bilin_v)
-        b               \type\()_neon
+        b               \type\()_16bpc_neon
 
 L(\type\()_bilin_h):
         cbnz            \my, L(\type\()_bilin_hv)
 
-        adr             x10, L(\type\()_bilin_h_tbl)
+        movrel          x10, \type\()_bilin_h_tbl
         dup             v31.8h,  w11      // 4 - intermediate_bits
-        ldrh            w9,  [x10, x9, lsl #1]
+        ldrsw           x9,  [x10, x9, lsl #2]
         neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
 .ifc \type, put
         dup             v30.8h,  \bdmax   // intermediate_bits
 .else
         movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
 .endif
-        sub             x10, x10, w9, uxtw
+        add             x10, x10, x9
 .ifc \type, put
         neg             v30.8h,  v30.8h   // -intermediate_bits
 .endif
@@ -2854,7 +2921,7 @@ L(\type\()_bilin_h):
 .else
         sub             v4.8h,   v4.8h,   v29.8h
 .endif
-        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.8b},   [\dst], \d_strd
         st1             {v4.d}[1], [\ds2], \d_strd
         b.gt            4b
         ret
@@ -2958,30 +3025,31 @@ L(\type\()_bilin_h):
         subs            \h,  \h,  #2
         b.gt            161b
         ret
+endfunc
 
-L(\type\()_bilin_h_tbl):
-        .hword L(\type\()_bilin_h_tbl) - 1280b
-        .hword L(\type\()_bilin_h_tbl) -  640b
-        .hword L(\type\()_bilin_h_tbl) -  320b
-        .hword L(\type\()_bilin_h_tbl) -  160b
-        .hword L(\type\()_bilin_h_tbl) -   80b
-        .hword L(\type\()_bilin_h_tbl) -   40b
-        .hword L(\type\()_bilin_h_tbl) -   20b
-        .hword 0
+jumptable \type\()_bilin_h_tbl
+        .word 1280b - \type\()_bilin_h_tbl
+        .word 640b  - \type\()_bilin_h_tbl
+        .word 320b  - \type\()_bilin_h_tbl
+        .word 160b  - \type\()_bilin_h_tbl
+        .word 80b   - \type\()_bilin_h_tbl
+        .word 40b   - \type\()_bilin_h_tbl
+        .word 20b   - \type\()_bilin_h_tbl
+endjumptable
 
 
-L(\type\()_bilin_v):
+function L(\type\()_bilin_v)
         cmp             \h,  #4
-        adr             x10, L(\type\()_bilin_v_tbl)
+        movrel          x10, \type\()_bilin_v_tbl
 .ifc \type, prep
         dup             v31.8h,  w11      // 4 - intermediate_bits
 .endif
-        ldrh            w9,  [x10, x9, lsl #1]
+        ldrsw           x9,  [x10, x9, lsl #2]
 .ifc \type, prep
         movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
         neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
 .endif
-        sub             x10, x10, w9, uxtw
+        add             x10, x10, x9
         br              x10
 
 20:     // 2xN v
@@ -2994,24 +3062,24 @@ L(\type\()_bilin_v):
         lsl             \d_strd,  \d_strd,  #1
 
         // 2x2 v
-        ld1             {v16.s}[0], [\src], \s_strd
+        ld1r            {v16.4s}, [\src], \s_strd
         b.gt            24f
 22:
-        ld1             {v17.s}[0], [\sr2], \s_strd
-        ld1             {v18.s}[0], [\src], \s_strd
+        ld1r            {v17.4s}, [\sr2], \s_strd
+        ld1r            {v18.4s}, [\src], \s_strd
         trn1            v16.2s,  v16.2s,  v17.2s
         trn1            v17.2s,  v17.2s,  v18.2s
         mul             v4.4h,   v16.4h,  v2.4h
         mla             v4.4h,   v17.4h,  v3.4h
         urshr           v4.8h,   v4.8h,   #4
-        st1             {v4.s}[0], [\dst]
+        str             s4,        [\dst]
         st1             {v4.s}[1], [\ds2]
         ret
 24:     // 2x4, 2x6, 2x8, ... v
-        ld1             {v17.s}[0], [\sr2], \s_strd
-        ld1             {v18.s}[0], [\src], \s_strd
-        ld1             {v19.s}[0], [\sr2], \s_strd
-        ld1             {v20.s}[0], [\src], \s_strd
+        ld1r            {v17.4s}, [\sr2], \s_strd
+        ld1r            {v18.4s}, [\src], \s_strd
+        ld1r            {v19.4s}, [\sr2], \s_strd
+        ld1r            {v20.4s}, [\src], \s_strd
         sub             \h,  \h,  #4
         trn1            v16.2s,  v16.2s,  v17.2s
         trn1            v17.2s,  v17.2s,  v18.2s
@@ -3056,7 +3124,7 @@ L(\type\()_bilin_v):
         urshl           v4.8h,   v4.8h,   v31.8h
         sub             v4.8h,   v4.8h,   v29.8h
 .endif
-        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.8b},   [\dst], \d_strd
         st1             {v4.d}[1], [\ds2], \d_strd
         b.le            0f
         mov             v16.8b,  v18.8b
@@ -3156,28 +3224,29 @@ L(\type\()_bilin_v):
         b               1b
 0:
         ret
+endfunc
 
-L(\type\()_bilin_v_tbl):
-        .hword L(\type\()_bilin_v_tbl) - 1280b
-        .hword L(\type\()_bilin_v_tbl) -  640b
-        .hword L(\type\()_bilin_v_tbl) -  320b
-        .hword L(\type\()_bilin_v_tbl) -  160b
-        .hword L(\type\()_bilin_v_tbl) -   80b
-        .hword L(\type\()_bilin_v_tbl) -   40b
-        .hword L(\type\()_bilin_v_tbl) -   20b
-        .hword 0
-
-L(\type\()_bilin_hv):
-        adr             x10, L(\type\()_bilin_hv_tbl)
+jumptable \type\()_bilin_v_tbl
+        .word 1280b - \type\()_bilin_v_tbl
+        .word 640b  - \type\()_bilin_v_tbl
+        .word 320b  - \type\()_bilin_v_tbl
+        .word 160b  - \type\()_bilin_v_tbl
+        .word 80b   - \type\()_bilin_v_tbl
+        .word 40b   - \type\()_bilin_v_tbl
+        .word 20b   - \type\()_bilin_v_tbl
+endjumptable
+
+function L(\type\()_bilin_hv)
+        movrel          x10, \type\()_bilin_hv_tbl
         dup             v31.8h,  w11      // 4 - intermediate_bits
-        ldrh            w9,  [x10, x9, lsl #1]
+        ldrsw           x9,  [x10, x9, lsl #2]
         neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
 .ifc \type, put
         dup             v30.4s,  w12      // 4 + intermediate_bits
 .else
         movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
 .endif
-        sub             x10, x10, w9, uxtw
+        add             x10, x10, x9
 .ifc \type, put
         neg             v30.4s,  v30.4s   // -(4+intermediate_bits)
 .endif
@@ -3264,7 +3333,7 @@ L(\type\()_bilin_hv):
         sub             v4.8h,   v4.8h,   v29.8h
 .endif
         subs            \h,  \h,  #2
-        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.8b},   [\dst], \d_strd
         st1             {v4.d}[1], [\ds2], \d_strd
         b.le            0f
         trn2            v16.2d,  v17.2d,  v17.2d
@@ -3350,17 +3419,17 @@ L(\type\()_bilin_hv):
         b               1b
 0:
         ret
-
-L(\type\()_bilin_hv_tbl):
-        .hword L(\type\()_bilin_hv_tbl) - 1280b
-        .hword L(\type\()_bilin_hv_tbl) -  640b
-        .hword L(\type\()_bilin_hv_tbl) -  320b
-        .hword L(\type\()_bilin_hv_tbl) -  160b
-        .hword L(\type\()_bilin_hv_tbl) -   80b
-        .hword L(\type\()_bilin_hv_tbl) -   40b
-        .hword L(\type\()_bilin_hv_tbl) -   20b
-        .hword 0
 endfunc
+
+jumptable \type\()_bilin_hv_tbl
+        .word 1280b - \type\()_bilin_hv_tbl
+        .word 640b  - \type\()_bilin_hv_tbl
+        .word 320b  - \type\()_bilin_hv_tbl
+        .word 160b  - \type\()_bilin_hv_tbl
+        .word 80b   - \type\()_bilin_hv_tbl
+        .word 40b   - \type\()_bilin_hv_tbl
+        .word 20b   - \type\()_bilin_hv_tbl
+endjumptable
 .endm
 
 make_8tap_fn    put,  regular_sharp,  REGULAR, SHARP,   8tap
diff --git a/src/arm/64/mc16_sve.S b/src/arm/64/mc16_sve.S
new file mode 100644
index 0000000000000000000000000000000000000000..9ebdb2187ec80e1a63cf6e0120a789b6bc9a3bc6
--- /dev/null
+++ b/src/arm/64/mc16_sve.S
@@ -0,0 +1,1649 @@
+/*
+ * Copyright © 2024, Arm Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define PREP_BIAS 32, lsl #8        // 8192
+#define PREP_BIAS_NEG 224, lsl #8   // -8192
+
+#if HAVE_SVE2
+ENABLE_SVE
+ENABLE_SVE2
+
+// No spaces in these expressions, due to gas-preprocessor. The constants are
+// biased by -1 so that no extra subtraction is needed when computing the
+// address into `mc_subpel_filters` (the subpel position is 1-based).
+#define REGULAR1        (((0*15-1)<<7)|(3*15-1))
+#define SMOOTH1         (((1*15-1)<<7)|(4*15-1))
+#define SHARP1          (((2*15-1)<<7)|(3*15-1))
+
+#define FUNC_ALIGN      2
+#define JUMP_ALIGN      2
+#define LOOP_ALIGN      2
+
+
+// Shuffle indices to permute horizontal samples in preparation for input to
+// 16-bit SDOT instructions. The 8-tap horizontal convolution uses sample
+// indices in the interval of [-3, 4] relative to the current sample position.
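+// Each 8-byte group of indices below gathers four consecutive 16-bit samples,
+// with the four groups advanced by 0, 1, 2 and 3 sample positions respectively.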
+const h_tbl_sve, align=4
+        .byte  0,  1,  2,  3,  4,  5,  6,  7,   2,  3,  4,  5,  6,  7,  8,  9
+        .byte  4,  5,  6,  7,  8,  9, 10, 11,   6,  7,  8,  9, 10, 11, 12, 13
+endconst
+
+// Vertical convolutions also use 16-bit SDOT instructions, where two 128-bit
+// registers contain a transposed 4x4 matrix of values. Subsequent iterations
+// of the vertical convolution can reuse the 3x4 sub-matrix from the previous
+// loop iteration. These shuffle indices shift and merge this 4x4 matrix with
+// the values of a new line.
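+// The first row below shifts such a 4x4 block down by one row, pulling the new
+// bottom row from the following history register; the remaining rows append the
+// two samples of a newly loaded line that belong to the corresponding columns.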
+const v_tbl_sve, align=4
+        .byte  2,  3,  4,  5,  6,  7, 16, 17,  10, 11, 12, 13, 14, 15, 24, 25
+        .byte  2,  3,  4,  5,  6,  7, 16, 17,  10, 11, 12, 13, 14, 15, 18, 19
+        .byte  2,  3,  4,  5,  6,  7, 20, 21,  10, 11, 12, 13, 14, 15, 22, 23
+        .byte  2,  3,  4,  5,  6,  7, 24, 25,  10, 11, 12, 13, 14, 15, 26, 27
+        .byte  2,  3,  4,  5,  6,  7, 28, 29,  10, 11, 12, 13, 14, 15, 30, 31
+endconst
+
+
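+// Entry points for put/prep_8tap_<type>_16bpc_<isa>: each one loads the packed
+// filter-type constants into x9/x10 and branches to (or, for the last one,
+// falls through into) the shared 8tap body below.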
+.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
+function \op\()_8tap_\type\()_16bpc_\isa, export=1, align=FUNC_ALIGN
+        mov             x9,  \type_h
+        mov             x10, \type_v
+    .if \jump
+        b               \op\()_8tap_\isa
+    .endif
+endfunc
+.endm
+
+.macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, xmx, xmy, ldst, lsrc, wd_strd, ws_strd
+make_8tap_fn \type, sharp,          SHARP1,   SHARP1,   \isa
+make_8tap_fn \type, sharp_smooth,   SHARP1,   SMOOTH1,  \isa
+make_8tap_fn \type, sharp_regular,  SHARP1,   REGULAR1, \isa
+make_8tap_fn \type, smooth_sharp,   SMOOTH1,  SHARP1,   \isa
+make_8tap_fn \type, smooth,         SMOOTH1,  SMOOTH1,  \isa
+make_8tap_fn \type, smooth_regular, SMOOTH1,  REGULAR1, \isa
+make_8tap_fn \type, regular_sharp,  REGULAR1, SHARP1,   \isa
+make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1,  \isa
+make_8tap_fn \type, regular,        REGULAR1, REGULAR1, \isa, jump=0
+
+function \type\()_8tap_\isa, align=FUNC_ALIGN
+        clz             w8, \w
+        mov             w11, #0x4081                    // (1<<14) | (1<<7) | 1
+        ptrue           p0.b, vl16
+        sub             w8, w8, #24                     // for jump tables
+        movrel          x12, X(mc_subpel_filters)
+        cbnz            \mx, L(\type\()_8tap_h_hv_\isa)
+.ifc \type, prep
+        cbz             \my, prep_sve
+.else   // put
+        cbnz            \my, L(\type\()_8tap_v_\isa)
+        mov             w9, w8
+        b               X(put_16bpc_neon)
+
+        .align JUMP_ALIGN
+.endif
+
+L(\type\()_8tap_v_\isa):
+        madd            \my, \my, w11, w10
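+        // \my * 0x4081 replicates the 7-bit subpel value into bits 0, 7 and 14;
+        // adding the packed type offsets in w10 yields the offset into
+        // mc_subpel_filters for small blocks (bits 0-6) and larger blocks (bits 7-13).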
+        movrel          x13, v_tbl_sve
+.ifc \bdmax, w8                                         // put: bitdepth_max is on the stack,
+        ld1r            {v5.8h}, [sp]                   // broadcast it straight into v5
+.endif
+        sub             \src, \src, \s_strd             // src - s_strd
+        ubfx            w11, \my, #7, #7
+        and             \my, \my, #0x7F
+        ldr             q6, [x13]
+        cmp             \h, #4
+        csel            \my, \my, w11, le
+        sub             \src, \src, \s_strd, lsl #1     // src - 3 * s_strd
+        add             \xmy, x12, \xmy, lsl #3         // subpel V filter address
+        ldp             q28, q29, [x13, #16]
+        ld1sb           {z7.h}, p0/z, [\xmy]
+.ifc \type, prep
+        clz             \bdmax, \bdmax
+        sub             \bdmax, \bdmax, #24
+        dup             v5.4s, \bdmax
+.endif
+        cmp             \w, #8
+        b.lt            40f
+
+        // .align JUMP_ALIGN   // fallthrough
+80:     // V - 8xN+
+        ldp             q30, q31, [x13, #48]
+.ifc \type, prep
+        add             \wd_strd, \w, \w                // d_strd = 2 * w
+.endif
+        .align LOOP_ALIGN
+81:
+        add             \lsrc, \src, \s_strd, lsl #1
+
+        ldr             q16, [\src]
+        ldr             q17, [\src, \s_strd]
+        ldr             q18, [\lsrc]
+        ldr             q19, [\lsrc, \s_strd]
+        add             \lsrc, \lsrc, \s_strd, lsl #1
+        mov             \ldst, \dst
+
+        ldr             q20, [\lsrc]
+        ldr             q21, [\lsrc, \s_strd]
+        add             \lsrc, \lsrc, \s_strd, lsl #1
+        ldr             q22, [\lsrc]
+        ldr             q23, [\lsrc, \s_strd]
+        add             \lsrc, \lsrc, \s_strd, lsl #1
+        sub             w8, \h, #1
+
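+        // Transpose the eight preloaded rows into 4x4 blocks: after the zips, each
+        // 64-bit lane holds four consecutive rows of one column, matching the
+        // 16-bit SDOT accumulation below.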
+        zip1            v0.8h, v16.8h, v17.8h
+        zip2            v1.8h, v16.8h, v17.8h
+        zip1            v2.8h, v18.8h, v19.8h
+        zip2            v3.8h, v18.8h, v19.8h
+
+        zip1            v18.8h, v20.8h, v21.8h
+        zip2            v21.8h, v20.8h, v21.8h
+        zip1            v24.8h, v22.8h, v23.8h
+        zip2            v27.8h, v22.8h, v23.8h
+
+        zip1            v16.4s, v0.4s, v2.4s
+        zip2            v19.4s, v0.4s, v2.4s
+        zip1            v22.4s, v1.4s, v3.4s
+        zip2            v25.4s, v1.4s, v3.4s
+
+        zip1            v17.4s, v18.4s, v24.4s
+        zip2            v20.4s, v18.4s, v24.4s
+        zip1            v23.4s, v21.4s, v27.4s
+        zip2            v26.4s, v21.4s, v27.4s
+
+        .align LOOP_ALIGN
+8:
+        ld1             {v18.16b}, [\lsrc], \s_strd
+
+        movi            v0.2d, #0
+        movi            v1.2d, #0
+        movi            v2.2d, #0
+        movi            v3.2d, #0
+        mov             v21.16b, v18.16b
+        mov             v24.16b, v18.16b
+        mov             v27.16b, v18.16b
+
+        sdot            z0.d, z16.h, z7.h[0]
+        tbl             v16.16b, {v16.16b, v17.16b}, v6.16b
+        sdot            z1.d, z19.h, z7.h[0]
+        tbl             v19.16b, {v19.16b, v20.16b}, v6.16b
+        sdot            z2.d, z22.h, z7.h[0]
+        tbl             v22.16b, {v22.16b, v23.16b}, v6.16b
+        subs            w8, w8, #1
+        sdot            z3.d, z25.h, z7.h[0]
+        tbl             v25.16b, {v25.16b, v26.16b}, v6.16b
+
+        sdot            z0.d, z17.h, z7.h[1]
+        tbl             v17.16b, {v17.16b, v18.16b}, v28.16b
+        sdot            z1.d, z20.h, z7.h[1]
+        tbl             v20.16b, {v20.16b, v21.16b}, v29.16b
+        sdot            z2.d, z23.h, z7.h[1]
+        tbl             v23.16b, {v23.16b, v24.16b}, v30.16b
+        sdot            z3.d, z26.h, z7.h[1]
+        tbl             v26.16b, {v26.16b, v27.16b}, v31.16b
+
+        uzp1            v0.4s, v0.4s, v1.4s
+        uzp1            v1.4s, v2.4s, v3.4s
+.ifc \type, prep
+        srshl           v0.4s, v0.4s, v5.4s
+        srshl           v1.4s, v1.4s, v5.4s
+        uzp1            v0.8h, v0.8h, v1.8h
+        sub             z0.h, z0.h, #PREP_BIAS
+.else   // put
+        sqrshrun        v0.4h, v0.4s, #6
+        sqrshrun2       v0.8h, v1.4s, #6
+        umin            v0.8h, v0.8h, v5.8h
+.endif
+        st1             {v0.16b}, [\ldst], \d_strd
+        b.gt            8b
+
+        movi            v0.2d, #0
+        movi            v1.2d, #0
+        movi            v2.2d, #0
+        movi            v3.2d, #0
+
+        sdot            z0.d, z16.h, z7.h[0]
+        sdot            z1.d, z19.h, z7.h[0]
+        sdot            z2.d, z22.h, z7.h[0]
+        sdot            z3.d, z25.h, z7.h[0]
+
+        sdot            z0.d, z17.h, z7.h[1]
+        sdot            z1.d, z20.h, z7.h[1]
+        sdot            z2.d, z23.h, z7.h[1]
+        sdot            z3.d, z26.h, z7.h[1]
+        subs            \w, \w, #8
+
+        uzp1            v0.4s, v0.4s, v1.4s
+        uzp1            v1.4s, v2.4s, v3.4s
+.ifc \type, prep
+        srshl           v0.4s, v0.4s, v5.4s
+        srshl           v1.4s, v1.4s, v5.4s
+        uzp1            v0.8h, v0.8h, v1.8h
+        sub             z0.h, z0.h, #PREP_BIAS
+.else   // put
+        sqrshrun        v0.4h, v0.4s, #6
+        sqrshrun2       v0.8h, v1.4s, #6
+        umin            v0.8h, v0.8h, v5.8h
+.endif
+        str             q0, [\ldst]
+
+        add             \dst, \dst, #16
+        add             \src, \src, #16
+        b.gt            81b
+        ret
+
+        .align JUMP_ALIGN
+40:     // V - 4xN, put only: 2xN
+.ifc \type, put
+        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
+        whilelt         p1.h, wzr, \w               // masking for writes
+.endif
+        cmp             \h, #4
+        b.le            44f
+
+        ldr             d16, [\src]
+        ldr             d17, [\src, \s_strd]
+        add             \src, \src, \s_strd, lsl #1
+        ldr             d18, [\src]
+        ldr             d19, [\src, \s_strd]
+        add             \src, \src, \s_strd, lsl #1
+
+        ldr             d20, [\src]
+        ldr             d21, [\src, \s_strd]
+        add             \src, \src, \s_strd, lsl #1
+        ldr             d22, [\src]
+        ldr             d23, [\src, \s_strd]
+        add             \src, \src, \s_strd, lsl #1
+        sub             \h, \h, #2
+
+        zip1            v0.8h, v16.8h, v17.8h
+        zip1            v2.8h, v18.8h, v19.8h
+        zip1            v18.8h, v20.8h, v21.8h
+        zip1            v24.8h, v22.8h, v23.8h
+
+        zip1            v16.4s, v0.4s, v2.4s
+        zip2            v19.4s, v0.4s, v2.4s
+        zip1            v17.4s, v18.4s, v24.4s
+        zip2            v20.4s, v18.4s, v24.4s
+
+        .align LOOP_ALIGN
+4:
+        ldr             d18, [\src]
+        ldr             d24, [\src, \s_strd]
+        add             \src, \src, \s_strd, lsl #1
+
+        movi            v0.2d, #0
+        movi            v1.2d, #0
+        movi            v2.2d, #0
+        movi            v3.2d, #0
+        mov             v21.16b, v18.16b
+        mov             v27.16b, v24.16b
+
+        sdot            z0.d, z16.h, z7.h[0]
+        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
+        sdot            z1.d, z19.h, z7.h[0]
+        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
+        sdot            z0.d, z17.h, z7.h[1]
+        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
+        sdot            z1.d, z20.h, z7.h[1]
+        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b
+        subs            \h, \h, #2
+
+        sdot            z2.d, z22.h, z7.h[0]
+        tbl             v16.16b, {v22.16b, v23.16b}, v6.16b
+        sdot            z3.d, z25.h, z7.h[0]
+        tbl             v19.16b, {v25.16b, v26.16b}, v6.16b
+        sdot            z2.d, z23.h, z7.h[1]
+        tbl             v17.16b, {v23.16b, v24.16b}, v28.16b
+        sdot            z3.d, z26.h, z7.h[1]
+        tbl             v20.16b, {v26.16b, v27.16b}, v29.16b
+
+        uzp1            v0.4s, v0.4s, v1.4s
+        uzp1            v1.4s, v2.4s, v3.4s
+.ifc \type, prep
+        srshl           v0.4s, v0.4s, v5.4s
+        srshl           v1.4s, v1.4s, v5.4s
+        uzp1            v0.8h, v0.8h, v1.8h
+        sub             z0.h, z0.h, #PREP_BIAS
+        str             q0, [\dst], #16
+.else   // put
+        sqrshrun        v0.4h, v0.4s, #6
+        sqrshrun        v1.4h, v1.4s, #6
+        umin            v0.4h, v0.4h, v5.4h
+        umin            v1.4h, v1.4h, v5.4h
+        st1h            {z0.h}, p1, [\dst]
+        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
+        add             \dst, \dst, \d_strd, lsl #2
+.endif
+        b.gt            4b
+
+        ldr             d18, [\src]
+
+        movi            v0.2d, #0
+        movi            v1.2d, #0
+        movi            v2.2d, #0
+        movi            v3.2d, #0
+        mov             v21.16b, v18.16b
+
+        sdot            z0.d, z16.h, z7.h[0]
+        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
+        sdot            z1.d, z19.h, z7.h[0]
+        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
+        sdot            z0.d, z17.h, z7.h[1]
+        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
+        sdot            z1.d, z20.h, z7.h[1]
+        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b
+
+        sdot            z2.d, z22.h, z7.h[0]
+        sdot            z3.d, z25.h, z7.h[0]
+        sdot            z2.d, z23.h, z7.h[1]
+        sdot            z3.d, z26.h, z7.h[1]
+
+        uzp1            v0.4s, v0.4s, v1.4s
+        uzp1            v1.4s, v2.4s, v3.4s
+.ifc \type, prep
+        srshl           v0.4s, v0.4s, v5.4s
+        srshl           v1.4s, v1.4s, v5.4s
+        uzp1            v0.8h, v0.8h, v1.8h
+        sub             z0.h, z0.h, #PREP_BIAS
+        str             q0, [\dst]
+.else   // put
+        sqrshrun        v0.4h, v0.4s, #6
+        sqrshrun        v1.4h, v1.4s, #6
+        umin            v0.4h, v0.4h, v5.4h
+        umin            v1.4h, v1.4h, v5.4h
+        st1h            {z0.h}, p1, [\dst]
+        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
+.endif
+        ret
+
+        .align JUMP_ALIGN
+44:     // V - 4x4, put only: 4x2, 2x4, 2x2
+        add             \src, \src, \s_strd, lsl #1     // src - s_strd
+        subs            \h, \h, #2
+
+        ldr             d16, [\src]
+        ldr             d17, [\src, \s_strd]
+        add             \src, \src, \s_strd, lsl #1
+        ldr             d18, [\src]
+        ldr             d19, [\src, \s_strd]
+        add             \src, \src, \s_strd, lsl #1
+
+        ext             v7.16b, v7.16b, v7.16b, #4      // [\xmy + 2 * 2]
+
+        zip1            v0.8h, v16.8h, v17.8h
+        zip1            v2.8h, v18.8h, v19.8h
+        zip1            v16.4s, v0.4s, v2.4s
+        zip2            v19.4s, v0.4s, v2.4s
+
+.ifc \type, put
+        b.eq            42f
+.endif
+        ldr             d17, [\src]
+        ldr             d23, [\src, \s_strd]
+        add             \src, \src, \s_strd, lsl #1
+
+        movi            v0.2d, #0
+        movi            v1.2d, #0
+        movi            v2.2d, #0
+        movi            v3.2d, #0
+        mov             v20.16b, v17.16b
+        mov             v26.16b, v23.16b
+
+        sdot            z0.d, z16.h, z7.h[0]
+        tbl             v22.16b, {v16.16b, v17.16b}, v28.16b
+        sdot            z1.d, z19.h, z7.h[0]
+        tbl             v25.16b, {v19.16b, v20.16b}, v29.16b
+        sdot            z2.d, z22.h, z7.h[0]
+        tbl             v16.16b, {v22.16b, v23.16b}, v28.16b
+        sdot            z3.d, z25.h, z7.h[0]
+        tbl             v19.16b, {v25.16b, v26.16b}, v29.16b
+
+        uzp1            v0.4s, v0.4s, v1.4s
+        uzp1            v1.4s, v2.4s, v3.4s
+.ifc \type, prep
+        srshl           v0.4s, v0.4s, v5.4s
+        srshl           v1.4s, v1.4s, v5.4s
+        uzp1            v0.8h, v0.8h, v1.8h
+        sub             z0.h, z0.h, #PREP_BIAS
+        str             q0, [\dst], #16
+.else   // put
+        sqrshrun        v0.4h, v0.4s, #6
+        sqrshrun        v1.4h, v1.4s, #6
+        umin            v0.4h, v0.4h, v5.4h
+        umin            v1.4h, v1.4h, v5.4h
+        st1h            {z0.h}, p1, [\dst]
+        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
+        add             \dst, \dst, \d_strd, lsl #2
+.endif
+
+.ifc \type, put
+        .align JUMP_ALIGN
+42:
+.endif
+        ldr             d17, [\src]
+
+        movi            v0.2d, #0
+        movi            v1.2d, #0
+        movi            v2.2d, #0
+        movi            v3.2d, #0
+        mov             v20.16b, v17.16b
+
+        sdot            z0.d, z16.h, z7.h[0]
+        tbl             v22.16b, {v16.16b, v17.16b}, v28.16b
+        sdot            z1.d, z19.h, z7.h[0]
+        tbl             v25.16b, {v19.16b, v20.16b}, v29.16b
+
+        sdot            z2.d, z22.h, z7.h[0]
+        sdot            z3.d, z25.h, z7.h[0]
+
+        uzp1            v0.4s, v0.4s, v1.4s
+        uzp1            v1.4s, v2.4s, v3.4s
+.ifc \type, prep
+        srshl           v0.4s, v0.4s, v5.4s
+        srshl           v1.4s, v1.4s, v5.4s
+        uzp1            v0.8h, v0.8h, v1.8h
+        sub             z0.h, z0.h, #PREP_BIAS
+        str             q0, [\dst]
+.else   // put
+        sqrshrun        v0.4h, v0.4s, #6
+        sqrshrun        v1.4h, v1.4s, #6
+        umin            v0.4h, v0.4h, v5.4h
+        umin            v1.4h, v1.4h, v5.4h
+        st1h            {z0.h}, p1, [\dst]
+        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
+.endif
+        ret
+
+        .align JUMP_ALIGN
+L(\type\()_8tap_h_hv_\isa):
+        madd            \mx, \mx, w11, w9
+        movrel          x13, h_tbl_sve
+        sub             \src, \src, #6              // src - 3 * 2
+        ubfx            w9, \mx, #7, #7
+        and             \mx, \mx, #0x7F
+        cmp             \w, #4
+        csel            \mx, \mx, w9, le
+        ldp             q30, q31, [x13]
+        add             \xmx, x12, \xmx, lsl #3     // subpel H filter address
+        cbz             \my, L(\type\()_8tap_h_\isa)
+
+        // HV cases
+        madd            w14, \my, w11, w10
+.ifc \bdmax, w8
+        ldr             \bdmax, [sp]
+.endif
+        ubfx            w11, w14, #7, #7
+        and             w14, w14, #0x7F
+        ld1sb           {z4.h}, p0/z, [\xmx]
+        cmp             \h, #4
+        csel            w14, w14, w11, le
+.ifc \type, put
+        dup             v29.8h, \bdmax
+.endif
+        clz             \bdmax, \bdmax
+        add             \xmy, x12, x14, lsl #3      // subpel V filter address
+        ld1sb           {z7.h}, p0/z, [\xmy]
+.ifc \type, put
+        mov             w9, #12
+        sub             w9, w9, \bdmax
+        dup             v6.4s, w9
+.endif
+        sub             \bdmax, \bdmax, #24
+        mov             x15, x30
+        sub             \src, \src, \s_strd         // src - s_strd - 3 * 2
+        dup             v5.4s, \bdmax
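+        // v5 = clz(bitdepth_max) - 24 = -(6 - intermediate_bits), the shift applied
+        // to the horizontal intermediates; for put, v6 = -(6 + intermediate_bits)
+        // for the final rounding shift.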
+        cmp             w10, SHARP1
+        b.ne            L(\type\()_6tap_hv_\isa)    // vertical != SHARP1
+
+        // HV 8-tap cases
+        cmp             \w, #4
+        b.le            40f
+
+        // .align JUMP_ALIGN    // fallthrough
+80:     // HV8 - 8xN+
+.ifc \type, prep
+        add             \wd_strd, \w, \w                // d_strd = 2 * w
+.endif
+        cmp             \h, #4
+        b.le            84f
+        sub             \src, \src, \s_strd, lsl #1     // src - 3 * s_strd - 3 * 2
+
+        .align LOOP_ALIGN
+81:
+        mov             \lsrc, \src
+        mov             \ldst, \dst
+        mov             w8, \h
+
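+        // Horizontally filter the seven rows of vertical history: each helper call
+        // returns one filtered row in v23/v24, which is packed into v16-v22.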
+        bl              L(\type\()_hv_filter8_\isa)
+        uzp1            v16.8h, v23.8h, v24.8h
+        bl              L(\type\()_hv_filter8_\isa)
+        uzp1            v17.8h, v23.8h, v24.8h
+        bl              L(\type\()_hv_filter8_\isa)
+        uzp1            v18.8h, v23.8h, v24.8h
+        bl              L(\type\()_hv_filter8_\isa)
+        uzp1            v19.8h, v23.8h, v24.8h
+        bl              L(\type\()_hv_filter8_\isa)
+        uzp1            v20.8h, v23.8h, v24.8h
+        bl              L(\type\()_hv_filter8_\isa)
+        uzp1            v21.8h, v23.8h, v24.8h
+        bl              L(\type\()_hv_filter8_\isa)
+        uzp1            v22.8h, v23.8h, v24.8h
+
+        .align LOOP_ALIGN
+8:
+        ldp             q24, q28, [\lsrc]
+        smull           v0.4s, v16.4h, v7.h[0]
+        smull2          v1.4s, v16.8h, v7.h[0]
+        mov             v16.16b, v17.16b
+
+        movi            v2.2d, #0
+        movi            v3.2d, #0
+        tbl             v23.16b, {v24.16b}, v30.16b
+        tbl             v24.16b, {v24.16b}, v31.16b
+
+        ldur            q26, [\lsrc, #8]
+        smlal           v0.4s, v17.4h, v7.h[1]
+        smlal2          v1.4s, v17.8h, v7.h[1]
+        mov             v17.16b, v18.16b
+        add             \lsrc, \lsrc, \s_strd
+
+        sdot            z2.d, z23.h, z4.h[0]
+        sdot            z3.d, z24.h, z4.h[0]
+        movi            v23.2d, #0
+        movi            v24.2d, #0
+        tbl             v25.16b, {v26.16b}, v30.16b
+        tbl             v26.16b, {v26.16b}, v31.16b
+        smlal           v0.4s, v18.4h, v7.h[2]
+        smlal2          v1.4s, v18.8h, v7.h[2]
+        mov             v18.16b, v19.16b
+
+        sdot            z23.d, z25.h, z4.h[0]
+        sdot            z24.d, z26.h, z4.h[0]
+        tbl             v27.16b, {v28.16b}, v30.16b
+        tbl             v28.16b, {v28.16b}, v31.16b
+        smlal           v0.4s, v19.4h, v7.h[3]
+        smlal2          v1.4s, v19.8h, v7.h[3]
+        mov             v19.16b, v20.16b
+
+        subs            w8, w8, #1
+        sdot            z2.d, z25.h, z4.h[1]
+        sdot            z3.d, z26.h, z4.h[1]
+        sdot            z23.d, z27.h, z4.h[1]
+        sdot            z24.d, z28.h, z4.h[1]
+
+        smlal           v0.4s, v20.4h, v7.h[4]
+        smlal2          v1.4s, v20.8h, v7.h[4]
+        mov             v20.16b, v21.16b
+
+        uzp1            v3.4s, v2.4s, v3.4s
+        uzp1            v24.4s, v23.4s, v24.4s
+        smlal           v0.4s, v21.4h, v7.h[5]
+        smlal2          v1.4s, v21.8h, v7.h[5]
+        mov             v21.16b, v22.16b
+
+        srshl           v23.4s, v3.4s, v5.4s
+        srshl           v24.4s, v24.4s, v5.4s
+        smlal           v0.4s, v22.4h, v7.h[6]
+        smlal2          v1.4s, v22.8h, v7.h[6]
+
+        uzp1            v22.8h, v23.8h, v24.8h
+        smlal           v0.4s, v22.4h, v7.h[7]
+        smlal2          v1.4s, v22.8h, v7.h[7]
+
+.ifc \type, prep
+        rshrn           v0.4h, v0.4s, #6
+        rshrn2          v0.8h, v1.4s, #6
+        sub             z0.h, z0.h, #PREP_BIAS
+.else   // put
+        srshl           v0.4s, v0.4s, v6.4s
+        srshl           v1.4s, v1.4s, v6.4s
+        sqxtun          v0.4h, v0.4s
+        sqxtun2         v0.8h, v1.4s
+        umin            v0.8h, v0.8h, v29.8h
+.endif
+        st1             {v0.8h}, [\ldst], \d_strd
+        b.gt            8b
+
+        subs            \w, \w, #8
+        add             \src, \src, #16
+        add             \dst, \dst, #16
+        b.gt            81b
+        ret             x15
+
+        .align JUMP_ALIGN
+40:     // HV8 - 4xN, put only: 2xN
+.ifc \type, put
+        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
+        whilelt         p1.h, wzr, \w               // masking for writes
+.endif
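+        // \d_strd is kept as a halfword count because the st1h stores and the
+        // pointer updates below re-apply `lsl #1`; p1 masks each store to \w
+        // lanes so the 2- and 4-pixel wide cases share this code.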
+        ext             v4.16b, v4.16b, v4.16b, #4  // [\xmx + 2 * 2]
+        add             \src, \src, #4
+
+        cmp             \h, #4
+        b.le            44f
+
+        sub             \src, \src, \s_strd, lsl #1 // src - 3 * s_strd - 3 * 2
+        bl              L(\type\()_hv_filter4_\isa)
+        xtn             v16.4h, v0.4s
+        bl              L(\type\()_hv_filter4_\isa)
+        xtn             v17.4h, v0.4s
+        bl              L(\type\()_hv_filter4_\isa)
+        xtn             v18.4h, v0.4s
+        bl              L(\type\()_hv_filter4_\isa)
+        xtn             v19.4h, v0.4s
+        bl              L(\type\()_hv_filter4_\isa)
+        xtn             v20.4h, v0.4s
+        bl              L(\type\()_hv_filter4_\isa)
+        xtn             v21.4h, v0.4s
+        bl              L(\type\()_hv_filter4_\isa)
+        xtn             v22.4h, v0.4s
+
+        .align LOOP_ALIGN
+4:
+        ld1             {v3.16b}, [\src], \s_strd
+
+        smull           v24.4s, v16.4h, v7.h[0]
+        smlal           v24.4s, v17.4h, v7.h[1]
+        tbl             v2.16b, {v3.16b}, v30.16b
+        tbl             v3.16b, {v3.16b}, v31.16b
+        movi            v0.2d, #0
+        movi            v1.2d, #0
+        mov             v16.16b, v17.16b
+        mov             v17.16b, v18.16b
+
+        smlal           v24.4s, v18.4h, v7.h[2]
+        smlal           v24.4s, v19.4h, v7.h[3]
+        sdot            z0.d, z2.h, z4.h[0]
+        sdot            z1.d, z3.h, z4.h[0]
+        mov             v18.16b, v19.16b
+        mov             v19.16b, v20.16b
+        uzp1            v0.4s, v0.4s, v1.4s
+
+        smlal           v24.4s, v20.4h, v7.h[4]
+        smlal           v24.4s, v21.4h, v7.h[5]
+        srshl           v0.4s, v0.4s, v5.4s
+        mov             v20.16b, v21.16b
+        mov             v21.16b, v22.16b
+
+        subs            \h, \h, #1
+        smlal           v24.4s, v22.4h, v7.h[6]
+        xtn             v22.4h, v0.4s
+        smlal           v24.4s, v22.4h, v7.h[7]
+
+.ifc \type, prep
+        rshrn           v0.4h, v24.4s, #6
+        sub             z0.h, z0.h, #PREP_BIAS
+        str             d0, [\dst], #8
+.else   // put
+        srshl           v0.4s, v24.4s, v6.4s
+        sqxtun          v0.4h, v0.4s
+        umin            v0.4h, v0.4h, v29.4h
+        st1h            {z0.h}, p1, [\dst]
+        add             \dst, \dst, \d_strd, lsl #1
+.endif
+        b.gt            4b
+        ret             x15
+
+        .align JUMP_ALIGN
+L(\type\()_6tap_hv_\isa):
+        cmp             \w, #4
+        b.le            46f
+
+        // .align JUMP_ALIGN    // fallthrough
+80:     // HV6 - 8xN+
+.ifc \type, prep
+        add             \wd_strd, \w, \w        // d_strd = 2 * w
+.endif
+        cmp             \h, #4
+        b.le            84f
+        sub             \src, \src, \s_strd     // src - 2 * s_strd - 3 * 2
+
+        .align LOOP_ALIGN
+81:
+        mov             \lsrc, \src
+        mov             \ldst, \dst
+        mov             w8, \h
+
+        bl              L(\type\()_hv_filter8_\isa)
+        uzp1            v16.8h, v23.8h, v24.8h
+        bl              L(\type\()_hv_filter8_\isa)
+        uzp1            v17.8h, v23.8h, v24.8h
+        bl              L(\type\()_hv_filter8_\isa)
+        uzp1            v18.8h, v23.8h, v24.8h
+        bl              L(\type\()_hv_filter8_\isa)
+        uzp1            v19.8h, v23.8h, v24.8h
+        bl              L(\type\()_hv_filter8_\isa)
+        uzp1            v20.8h, v23.8h, v24.8h
+
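+        // 6-tap variant: the non-SHARP1 vertical filter has zero outer taps,
+        // so only five previous rows (v16-v20) are kept and vertical taps 1-6
+        // of v7 are applied.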
+        .align LOOP_ALIGN
+8:
+        ldp             q24, q28, [\lsrc]
+
+        smull           v0.4s, v16.4h, v7.h[1]
+        smull2          v1.4s, v16.8h, v7.h[1]
+        mov             v16.16b, v17.16b
+
+        tbl             v23.16b, {v24.16b}, v30.16b
+        tbl             v24.16b, {v24.16b}, v31.16b
+        movi            v2.2d, #0
+        movi            v3.2d, #0
+
+        ldur            q26, [\lsrc, #8]
+        add             \lsrc, \lsrc, \s_strd
+
+        sdot            z2.d, z23.h, z4.h[0]
+        sdot            z3.d, z24.h, z4.h[0]
+        tbl             v25.16b, {v26.16b}, v30.16b
+        tbl             v26.16b, {v26.16b}, v31.16b
+        movi            v23.2d, #0
+        movi            v24.2d, #0
+
+        sdot            z23.d, z25.h, z4.h[0]
+        sdot            z24.d, z26.h, z4.h[0]
+        tbl             v27.16b, {v28.16b}, v30.16b
+        tbl             v28.16b, {v28.16b}, v31.16b
+        smlal           v0.4s, v17.4h, v7.h[2]
+        smlal2          v1.4s, v17.8h, v7.h[2]
+        mov             v17.16b, v18.16b
+
+        sdot            z2.d, z25.h, z4.h[1]
+        sdot            z3.d, z26.h, z4.h[1]
+        sdot            z23.d, z27.h, z4.h[1]
+        sdot            z24.d, z28.h, z4.h[1]
+
+        smlal           v0.4s, v18.4h, v7.h[3]
+        smlal2          v1.4s, v18.8h, v7.h[3]
+        mov             v18.16b, v19.16b
+
+        uzp1            v3.4s, v2.4s, v3.4s
+        uzp1            v24.4s, v23.4s, v24.4s
+        smlal           v0.4s, v19.4h, v7.h[4]
+        smlal2          v1.4s, v19.8h, v7.h[4]
+        mov             v19.16b, v20.16b
+
+        srshl           v23.4s, v3.4s, v5.4s
+        srshl           v24.4s, v24.4s, v5.4s
+        smlal           v0.4s, v20.4h, v7.h[5]
+        smlal2          v1.4s, v20.8h, v7.h[5]
+
+        subs            w8, w8, #1
+        uzp1            v20.8h, v23.8h, v24.8h
+        smlal           v0.4s, v20.4h, v7.h[6]
+        smlal2          v1.4s, v20.8h, v7.h[6]
+
+.ifc \type, prep
+        rshrn           v0.4h, v0.4s, #6
+        rshrn2          v0.8h, v1.4s, #6
+        sub             z0.h, z0.h, #PREP_BIAS
+.else   // put
+        srshl           v0.4s, v0.4s, v6.4s
+        srshl           v1.4s, v1.4s, v6.4s
+        sqxtun          v0.4h, v0.4s
+        sqxtun2         v0.8h, v1.4s
+        umin            v0.8h, v0.8h, v29.8h
+.endif
+        st1             {v0.8h}, [\ldst], \d_strd
+        b.gt            8b
+
+        add             \dst, \dst, #16
+        subs            \w, \w, #8
+        add             \src, \src, #16
+        b.gt            81b
+        ret             x15
+
+        .align LOOP_ALIGN
+84:     // HV4 - 8x4, 8x2
+        mov             \lsrc, \src
+        mov             \ldst, \dst
+        mov             w8, \h
+
+        bl              L(\type\()_hv_filter8_\isa)
+        uzp1            v17.8h, v23.8h, v24.8h
+        bl              L(\type\()_hv_filter8_\isa)
+        uzp1            v18.8h, v23.8h, v24.8h
+        bl              L(\type\()_hv_filter8_\isa)
+        uzp1            v19.8h, v23.8h, v24.8h
+
+        .align LOOP_ALIGN
+81:
+        ldp             q24, q28, [\lsrc]
+        ldur            q26, [\lsrc, #8]
+        add             \lsrc, \lsrc, \s_strd
+
+        tbl             v23.16b, {v24.16b}, v30.16b
+        tbl             v24.16b, {v24.16b}, v31.16b
+        movi            v2.2d, #0
+        movi            v3.2d, #0
+        sdot            z2.d, z23.h, z4.h[0]
+        sdot            z3.d, z24.h, z4.h[0]
+
+        tbl             v25.16b, {v26.16b}, v30.16b
+        tbl             v26.16b, {v26.16b}, v31.16b
+        movi            v23.2d, #0
+        movi            v24.2d, #0
+        sdot            z23.d, z25.h, z4.h[0]
+        sdot            z24.d, z26.h, z4.h[0]
+
+        tbl             v27.16b, {v28.16b}, v30.16b
+        tbl             v28.16b, {v28.16b}, v31.16b
+        sdot            z2.d, z25.h, z4.h[1]
+        sdot            z3.d, z26.h, z4.h[1]
+        sdot            z23.d, z27.h, z4.h[1]
+        sdot            z24.d, z28.h, z4.h[1]
+
+        smull           v0.4s, v17.4h, v7.h[2]
+        smull2          v1.4s, v17.8h, v7.h[2]
+        mov             v17.16b, v18.16b
+
+        subs            w8, w8, #1
+        uzp1            v3.4s, v2.4s, v3.4s
+        uzp1            v24.4s, v23.4s, v24.4s
+        smlal           v0.4s, v18.4h, v7.h[3]
+        smlal2          v1.4s, v18.8h, v7.h[3]
+        mov             v18.16b, v19.16b
+
+        srshl           v23.4s, v3.4s, v5.4s
+        srshl           v24.4s, v24.4s, v5.4s
+        smlal           v0.4s, v19.4h, v7.h[4]
+        smlal2          v1.4s, v19.8h, v7.h[4]
+
+        uzp1            v19.8h, v23.8h, v24.8h
+        smlal           v0.4s, v19.4h, v7.h[5]
+        smlal2          v1.4s, v19.8h, v7.h[5]
+
+.ifc \type, prep
+        rshrn           v0.4h, v0.4s, #6
+        rshrn2          v0.8h, v1.4s, #6
+        sub             z0.h, z0.h, #PREP_BIAS
+.else   // put
+        srshl           v0.4s, v0.4s, v6.4s
+        srshl           v1.4s, v1.4s, v6.4s
+        sqxtun          v0.4h, v0.4s
+        sqxtun2         v0.8h, v1.4s
+        umin            v0.8h, v0.8h, v29.8h
+.endif
+        st1             {v0.8h}, [\ldst], \d_strd
+        b.gt            81b
+
+        subs            \w, \w, #8
+        add             \dst, \dst, #16
+        add             \src, \src, #16
+        b.gt            84b
+        ret             x15
+
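+        // Horizontal helper: filters one 8-pixel row with SDOT and returns the
+        // two rounded 32-bit halves in v23/v24; callers pack them into one
+        // 8-halfword row with uzp1.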
+        .align FUNC_ALIGN
+L(\type\()_hv_filter8_\isa):
+        ldp             q24, q28, [\lsrc]
+        ldur            q26, [\lsrc, #8]
+        add             \lsrc, \lsrc, \s_strd
+
+        tbl             v23.16b, {v24.16b}, v30.16b
+        tbl             v24.16b, {v24.16b}, v31.16b
+        movi            v2.2d, #0
+        movi            v3.2d, #0
+        sdot            z2.d, z23.h, z4.h[0]
+        sdot            z3.d, z24.h, z4.h[0]
+
+        tbl             v25.16b, {v26.16b}, v30.16b
+        tbl             v26.16b, {v26.16b}, v31.16b
+        movi            v23.2d, #0
+        movi            v24.2d, #0
+        sdot            z23.d, z25.h, z4.h[0]
+        sdot            z24.d, z26.h, z4.h[0]
+
+        tbl             v27.16b, {v28.16b}, v30.16b
+        tbl             v28.16b, {v28.16b}, v31.16b
+        sdot            z2.d, z25.h, z4.h[1]
+        sdot            z3.d, z26.h, z4.h[1]
+        sdot            z23.d, z27.h, z4.h[1]
+        sdot            z24.d, z28.h, z4.h[1]
+
+        uzp1            v3.4s, v2.4s, v3.4s
+        uzp1            v24.4s, v23.4s, v24.4s
+        srshl           v23.4s, v3.4s, v5.4s
+        srshl           v24.4s, v24.4s, v5.4s
+        ret
+
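+        // Horizontal helper: filters one 4-pixel row with the 4-tap subset of
+        // the filter (the callers rotate z4 so taps 2-5 sit in its low half)
+        // and returns the rounded 32-bit result in v0.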
+        .align FUNC_ALIGN
+L(\type\()_hv_filter4_\isa):
+        ld1             {v3.16b}, [\src], \s_strd
+
+        tbl             v2.16b, {v3.16b}, v30.16b
+        tbl             v3.16b, {v3.16b}, v31.16b
+        movi            v0.2d, #0
+        movi            v1.2d, #0
+        sdot            z0.d, z2.h, z4.h[0]
+        sdot            z1.d, z3.h, z4.h[0]
+
+        uzp1            v0.4s, v0.4s, v1.4s
+        srshl           v0.4s, v0.4s, v5.4s
+        ret
+
+        .align JUMP_ALIGN
+46:     // H4V6 - 4xN, put only: 2xN
+.ifc \type, put
+        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
+        whilelt         p1.h, wzr, \w               // masking for writes
+.endif
+        ext             v4.16b, v4.16b, v4.16b, #4  // [\xmx + 2 * 2]
+        add             \src, \src, #4
+
+        cmp             \h, #4
+        b.le            44f
+
+        sub             \src, \src, \s_strd         // src - 2 * s_strd - 3 * 2
+        bl              L(\type\()_hv_filter4_\isa)
+        xtn             v16.4h, v0.4s
+        bl              L(\type\()_hv_filter4_\isa)
+        xtn             v17.4h, v0.4s
+        bl              L(\type\()_hv_filter4_\isa)
+        xtn             v18.4h, v0.4s
+        bl              L(\type\()_hv_filter4_\isa)
+        xtn             v19.4h, v0.4s
+        bl              L(\type\()_hv_filter4_\isa)
+        xtn             v20.4h, v0.4s
+
+        .align LOOP_ALIGN
+4:
+        ld1             {v3.16b}, [\src], \s_strd
+        smull           v24.4s, v16.4h, v7.h[1]
+        smlal           v24.4s, v17.4h, v7.h[2]
+
+        tbl             v2.16b, {v3.16b}, v30.16b
+        tbl             v3.16b, {v3.16b}, v31.16b
+        movi            v0.2d, #0
+        movi            v1.2d, #0
+        sdot            z0.d, z2.h, z4.h[0]
+        sdot            z1.d, z3.h, z4.h[0]
+
+        mov             v16.16b, v17.16b
+        mov             v17.16b, v18.16b
+        smlal           v24.4s, v18.4h, v7.h[3]
+        smlal           v24.4s, v19.4h, v7.h[4]
+        uzp1            v0.4s, v0.4s, v1.4s
+
+        mov             v18.16b, v19.16b
+        mov             v19.16b, v20.16b
+        subs            \h, \h, #1
+        srshl           v0.4s, v0.4s, v5.4s
+        smlal           v24.4s, v20.4h, v7.h[5]
+        xtn             v20.4h, v0.4s
+        smlal           v24.4s, v20.4h, v7.h[6]
+
+.ifc \type, prep
+        rshrn           v0.4h, v24.4s, #6
+        sub             z0.h, z0.h, #PREP_BIAS
+        str             d0, [\dst], #8
+.else   // put
+        srshl           v0.4s, v24.4s, v6.4s
+        sqxtun          v0.4h, v0.4s
+        umin            v0.4h, v0.4h, v29.4h
+        st1h            {z0.h}, p1, [\dst]
+        add             \dst, \dst, \d_strd, lsl #1
+.endif
+        b.gt            4b
+        ret             x15
+
+        .align JUMP_ALIGN
+44:     // H4V4 - 4x4, put only: 4x2, 2x4, 2x2
+        bl              L(\type\()_hv_filter4_\isa)
+        xtn             v17.4h, v0.4s
+        bl              L(\type\()_hv_filter4_\isa)
+        xtn             v18.4h, v0.4s
+        bl              L(\type\()_hv_filter4_\isa)
+        xtn             v19.4h, v0.4s
+
+        .align LOOP_ALIGN
+4:
+        ld1             {v3.16b}, [\src], \s_strd
+        smull           v24.4s, v17.4h, v7.h[2]
+        smlal           v24.4s, v18.4h, v7.h[3]
+
+        tbl             v2.16b, {v3.16b}, v30.16b
+        tbl             v3.16b, {v3.16b}, v31.16b
+        movi            v0.2d, #0
+        movi            v1.2d, #0
+        sdot            z0.d, z2.h, z4.h[0]
+        sdot            z1.d, z3.h, z4.h[0]
+        uzp1            v0.4s, v0.4s, v1.4s
+
+        mov             v17.16b, v18.16b
+        mov             v18.16b, v19.16b
+        subs            \h, \h, #1
+        srshl           v0.4s, v0.4s, v5.4s
+        smlal           v24.4s, v19.4h, v7.h[4]
+        xtn             v19.4h, v0.4s
+        smlal           v24.4s, v19.4h, v7.h[5]
+
+.ifc \type, prep
+        rshrn           v0.4h, v24.4s, #6
+        sub             z0.h, z0.h, #PREP_BIAS
+        str             d0, [\dst], #8
+.else   // put
+        srshl           v0.4s, v24.4s, v6.4s
+        sqxtun          v0.4h, v0.4s
+        umin            v0.4h, v0.4h, v29.4h
+        st1h            {z0.h}, p1, [\dst]
+        add             \dst, \dst, \d_strd, lsl #1
+.endif
+        b.gt            4b
+        ret             x15
+
+        .align JUMP_ALIGN
+L(\type\()_8tap_h_\isa):
+        movrel          x11, \type\()_8tap_h_\isa\()_tbl
+        ldrsw           x12, [x11, x8, lsl #2]
+.ifc \bdmax, w8
+        ldr             \bdmax, [sp]
+.endif
+.ifc \type, prep
+        clz             \bdmax, \bdmax
+        sub             \bdmax, \bdmax, #24
+        dup             v5.4s, \bdmax
+.else   // put
+        mov             w9, #34             // rounding for 10-bit case
+        mov             w10, #40            // rounding for 12-bit case
+        cmp             \bdmax, #0xFFF
+        csel            w9, w9, w10, ne     // select rounding based on \bdmax
+        dup             v5.8h, \bdmax
+        dup             v6.2d, x9
+.endif
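+        // prep: v5 = negative rounding shift for srshl (-2 for 10-bit,
+        //       -4 for 12-bit)
+        // put:  v5 = bdmax for the final clamp, v6 = rounding bias preloaded
+        //       into the 64-bit SDOT accumulators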
+        add             x11, x11, x12
+        ld1sb           {z4.h}, p0/z, [\xmx]
+        br              x11
+
+        .align JUMP_ALIGN
+20:     // H - 4xN, put only: 2xN
+40:
+        AARCH64_VALID_JUMP_TARGET
+        add             \src, \src, #4              // src - 1 * 2
+        ext             v4.16b, v4.16b, v4.16b, #4  // [\xmx + 2 * 2]
+.ifc \type, put
+        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
+        whilelt         p1.h, wzr, \w               // masking for writes
+.endif
+        .align LOOP_ALIGN
+4:
+        ldr             q17, [\src]
+        ldr             q19, [\src, \s_strd]
+        add             \src, \src, \s_strd, lsl #1
+
+.ifc \type, prep
+        movi            v0.2d, #0
+        movi            v1.2d, #0
+        movi            v2.2d, #0
+        movi            v3.2d, #0
+.else
+        mov             v0.16b, v6.16b
+        mov             v1.16b, v6.16b
+        mov             v2.16b, v6.16b
+        mov             v3.16b, v6.16b
+.endif
+        tbl             v16.16b, {v17.16b}, v30.16b
+        tbl             v17.16b, {v17.16b}, v31.16b
+        sdot            z0.d, z16.h, z4.h[0]
+        sdot            z1.d, z17.h, z4.h[0]
+        subs            \h, \h, #2
+        tbl             v18.16b, {v19.16b}, v30.16b
+        tbl             v19.16b, {v19.16b}, v31.16b
+        sdot            z2.d, z18.h, z4.h[0]
+        sdot            z3.d, z19.h, z4.h[0]
+
+        uzp1            v0.4s, v0.4s, v1.4s
+        uzp1            v1.4s, v2.4s, v3.4s
+.ifc \type, prep
+        srshl           v0.4s, v0.4s, v5.4s
+        srshl           v1.4s, v1.4s, v5.4s
+        uzp1            v0.8h, v0.8h, v1.8h
+        sub             z0.h, z0.h, #PREP_BIAS
+        str             q0, [\dst], #16
+.else   // put
+        sqshrun         v0.4h, v0.4s, #6
+        sqshrun         v1.4h, v1.4s, #6
+        umin            v0.4h, v0.4h, v5.4h
+        umin            v1.4h, v1.4h, v5.4h
+        st1h            {z0.h}, p1, [\dst]
+        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
+        add             \dst, \dst, \d_strd, lsl #2
+.endif
+        b.gt            4b
+        ret
+
+        .align JUMP_ALIGN
+80:     // H - 8xN
+        AARCH64_VALID_JUMP_TARGET
+
+        .align LOOP_ALIGN
+8:
+        ldp             q17, q21, [\src]
+        ldur            q19, [\src, #8]
+
+.ifc \type, prep
+        movi            v0.2d, #0
+        movi            v2.2d, #0
+.else
+        mov             v0.16b, v6.16b
+        mov             v2.16b, v6.16b
+.endif
+        tbl             v16.16b, {v17.16b}, v30.16b
+        tbl             v17.16b, {v17.16b}, v31.16b
+        add             \src, \src, \s_strd
+        sdot            z0.d, z16.h, z4.h[0]
+        sdot            z2.d, z17.h, z4.h[0]
+
+        tbl             v18.16b, {v19.16b}, v30.16b
+        tbl             v19.16b, {v19.16b}, v31.16b
+.ifc \type, prep
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+.else
+        mov             v16.16b, v6.16b
+        mov             v17.16b, v6.16b
+.endif
+        ldp             q23, q27, [\src]
+        ldur            q25, [\src, #8]
+
+        sdot            z16.d, z18.h, z4.h[0]
+        sdot            z17.d, z19.h, z4.h[0]
+
+        tbl             v22.16b, {v23.16b}, v30.16b
+        tbl             v23.16b, {v23.16b}, v31.16b
+.ifc \type, prep
+        movi            v1.2d, #0
+        movi            v3.2d, #0
+.else
+        mov             v1.16b, v6.16b
+        mov             v3.16b, v6.16b
+.endif
+        add             \src, \src, \s_strd
+        sdot            z1.d, z22.h, z4.h[0]
+        sdot            z3.d, z23.h, z4.h[0]
+
+        tbl             v24.16b, {v25.16b}, v30.16b
+        tbl             v25.16b, {v25.16b}, v31.16b
+.ifc \type, prep
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+.else
+        mov             v22.16b, v6.16b
+        mov             v23.16b, v6.16b
+.endif
+        sdot            z22.d, z24.h, z4.h[0]
+        sdot            z23.d, z25.h, z4.h[0]
+
+        tbl             v20.16b, {v21.16b}, v30.16b
+        tbl             v21.16b, {v21.16b}, v31.16b
+        sdot            z0.d, z18.h, z4.h[1]
+        sdot            z2.d, z19.h, z4.h[1]
+        tbl             v26.16b, {v27.16b}, v30.16b
+        tbl             v27.16b, {v27.16b}, v31.16b
+        sdot            z16.d, z20.h, z4.h[1]
+        sdot            z17.d, z21.h, z4.h[1]
+
+        sdot            z1.d, z24.h, z4.h[1]
+        sdot            z3.d, z25.h, z4.h[1]
+
+        sdot            z22.d, z26.h, z4.h[1]
+        sdot            z23.d, z27.h, z4.h[1]
+
+        subs            \h, \h, #2
+        uzp1            v0.4s, v0.4s, v2.4s
+        uzp1            v2.4s, v16.4s, v17.4s
+        uzp1            v1.4s, v1.4s, v3.4s
+        uzp1            v3.4s, v22.4s, v23.4s
+.ifc \type, prep
+        srshl           v0.4s, v0.4s, v5.4s
+        srshl           v2.4s, v2.4s, v5.4s
+        srshl           v1.4s, v1.4s, v5.4s
+        srshl           v3.4s, v3.4s, v5.4s
+        uzp1            v0.8h, v0.8h, v2.8h
+        uzp1            v1.8h, v1.8h, v3.8h
+        sub             z0.h, z0.h, #PREP_BIAS
+        sub             z1.h, z1.h, #PREP_BIAS
+        stp             q0, q1, [\dst], #32
+.else   // put
+        sqshrun         v0.4h, v0.4s, #6
+        sqshrun2        v0.8h, v2.4s, #6
+        sqshrun         v1.4h, v1.4s, #6
+        sqshrun2        v1.8h, v3.4s, #6
+        umin            v0.8h, v0.8h, v5.8h
+        umin            v1.8h, v1.8h, v5.8h
+        st1             {v0.16b}, [\dst], \d_strd
+        st1             {v1.16b}, [\dst], \d_strd
+.endif
+        b.gt            8b
+        ret
+
+        .align JUMP_ALIGN
+160:    // H - 16xN
+        AARCH64_VALID_JUMP_TARGET
+
+        .align LOOP_ALIGN
+16:
+        ldp             q17, q21, [\src]
+        ldur            q19, [\src, #8]
+
+.ifc \type, prep
+        movi            v0.2d, #0
+        movi            v2.2d, #0
+.else
+        mov             v0.16b, v6.16b
+        mov             v2.16b, v6.16b
+.endif
+        tbl             v16.16b, {v17.16b}, v30.16b
+        tbl             v17.16b, {v17.16b}, v31.16b
+        sdot            z0.d, z16.h, z4.h[0]
+        sdot            z2.d, z17.h, z4.h[0]
+
+        tbl             v18.16b, {v19.16b}, v30.16b
+        tbl             v19.16b, {v19.16b}, v31.16b
+.ifc \type, prep
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+.else
+        mov             v16.16b, v6.16b
+        mov             v17.16b, v6.16b
+.endif
+        ldur            q25, [\src, #24]
+        ldr             q27, [\src, #32]
+
+        sdot            z16.d, z18.h, z4.h[0]
+        sdot            z17.d, z19.h, z4.h[0]
+
+        tbl             v22.16b, {v21.16b}, v30.16b
+        tbl             v23.16b, {v21.16b}, v31.16b
+.ifc \type, prep
+        movi            v1.2d, #0
+        movi            v3.2d, #0
+.else
+        mov             v1.16b, v6.16b
+        mov             v3.16b, v6.16b
+.endif
+        add             \src, \src, \s_strd
+        sdot            z1.d, z22.h, z4.h[0]
+        sdot            z3.d, z23.h, z4.h[0]
+
+        tbl             v24.16b, {v25.16b}, v30.16b
+        tbl             v25.16b, {v25.16b}, v31.16b
+.ifc \type, prep
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+.else
+        mov             v22.16b, v6.16b
+        mov             v23.16b, v6.16b
+.endif
+        sdot            z22.d, z24.h, z4.h[0]
+        sdot            z23.d, z25.h, z4.h[0]
+
+        tbl             v20.16b, {v21.16b}, v30.16b
+        tbl             v21.16b, {v21.16b}, v31.16b
+        sdot            z0.d, z18.h, z4.h[1]
+        sdot            z2.d, z19.h, z4.h[1]
+        tbl             v26.16b, {v27.16b}, v30.16b
+        tbl             v27.16b, {v27.16b}, v31.16b
+        sdot            z16.d, z20.h, z4.h[1]
+        sdot            z17.d, z21.h, z4.h[1]
+
+        sdot            z1.d, z24.h, z4.h[1]
+        sdot            z3.d, z25.h, z4.h[1]
+
+        sdot            z22.d, z26.h, z4.h[1]
+        sdot            z23.d, z27.h, z4.h[1]
+
+        subs            \h, \h, #1
+        uzp1            v0.4s, v0.4s, v2.4s
+        uzp1            v2.4s, v16.4s, v17.4s
+        uzp1            v1.4s, v1.4s, v3.4s
+        uzp1            v3.4s, v22.4s, v23.4s
+.ifc \type, prep
+        srshl           v0.4s, v0.4s, v5.4s
+        srshl           v2.4s, v2.4s, v5.4s
+        srshl           v1.4s, v1.4s, v5.4s
+        srshl           v3.4s, v3.4s, v5.4s
+        uzp1            v0.8h, v0.8h, v2.8h
+        uzp1            v1.8h, v1.8h, v3.8h
+        sub             z0.h, z0.h, #PREP_BIAS
+        sub             z1.h, z1.h, #PREP_BIAS
+        stp             q0, q1, [\dst], #32
+.else   // put
+        sqshrun         v0.4h, v0.4s, #6
+        sqshrun2        v0.8h, v2.4s, #6
+        sqshrun         v1.4h, v1.4s, #6
+        sqshrun2        v1.8h, v3.4s, #6
+        umin            v0.8h, v0.8h, v5.8h
+        umin            v1.8h, v1.8h, v5.8h
+        st1             {v0.16b, v1.16b}, [\dst], \d_strd
+.endif
+        b.gt            16b
+        ret
+
+        .align JUMP_ALIGN
+320:    // H - 32xN+
+640:
+1280:
+        AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+        sub             \d_strd, \d_strd, \w, uxtw #1
+.endif
+        sub             \s_strd, \s_strd, \w, uxtw #1
+        mov             w8, \w
+
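+        // The source stride (and, for put, the destination stride) is reduced
+        // by the row width (2 * \w bytes) so the inner loop can post-increment
+        // by 32 bytes per 16 output pixels; the leftover stride is added once
+        // per row below.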
+        .align LOOP_ALIGN
+32:
+        ldp             q17, q21, [\src]
+        ldur            q19, [\src, #8]
+
+.ifc \type, prep
+        movi            v0.2d, #0
+        movi            v2.2d, #0
+.else
+        mov             v0.16b, v6.16b
+        mov             v2.16b, v6.16b
+.endif
+        tbl             v16.16b, {v17.16b}, v30.16b
+        tbl             v17.16b, {v17.16b}, v31.16b
+        sdot            z0.d, z16.h, z4.h[0]
+        sdot            z2.d, z17.h, z4.h[0]
+
+        tbl             v18.16b, {v19.16b}, v30.16b
+        tbl             v19.16b, {v19.16b}, v31.16b
+.ifc \type, prep
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+.else
+        mov             v16.16b, v6.16b
+        mov             v17.16b, v6.16b
+.endif
+        ldur            q25, [\src, #24]
+
+        sdot            z16.d, z18.h, z4.h[0]
+        sdot            z17.d, z19.h, z4.h[0]
+
+        ldr             q27, [\src, #32]!
+
+        tbl             v22.16b, {v21.16b}, v30.16b
+        tbl             v23.16b, {v21.16b}, v31.16b
+.ifc \type, prep
+        movi            v1.2d, #0
+        movi            v3.2d, #0
+.else
+        mov             v1.16b, v6.16b
+        mov             v3.16b, v6.16b
+.endif
+        sdot            z1.d, z22.h, z4.h[0]
+        sdot            z3.d, z23.h, z4.h[0]
+
+        tbl             v24.16b, {v25.16b}, v30.16b
+        tbl             v25.16b, {v25.16b}, v31.16b
+.ifc \type, prep
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+.else
+        mov             v22.16b, v6.16b
+        mov             v23.16b, v6.16b
+.endif
+        sdot            z22.d, z24.h, z4.h[0]
+        sdot            z23.d, z25.h, z4.h[0]
+
+        tbl             v20.16b, {v21.16b}, v30.16b
+        tbl             v21.16b, {v21.16b}, v31.16b
+        sdot            z0.d, z18.h, z4.h[1]
+        sdot            z2.d, z19.h, z4.h[1]
+        tbl             v26.16b, {v27.16b}, v30.16b
+        tbl             v27.16b, {v27.16b}, v31.16b
+        sdot            z16.d, z20.h, z4.h[1]
+        sdot            z17.d, z21.h, z4.h[1]
+
+        sdot            z1.d, z24.h, z4.h[1]
+        sdot            z3.d, z25.h, z4.h[1]
+
+        sdot            z22.d, z26.h, z4.h[1]
+        sdot            z23.d, z27.h, z4.h[1]
+
+        subs            w8, w8, #16
+        uzp1            v0.4s, v0.4s, v2.4s
+        uzp1            v2.4s, v16.4s, v17.4s
+        uzp1            v1.4s, v1.4s, v3.4s
+        uzp1            v3.4s, v22.4s, v23.4s
+.ifc \type, prep
+        srshl           v0.4s, v0.4s, v5.4s
+        srshl           v2.4s, v2.4s, v5.4s
+        srshl           v1.4s, v1.4s, v5.4s
+        srshl           v3.4s, v3.4s, v5.4s
+        uzp1            v0.8h, v0.8h, v2.8h
+        uzp1            v1.8h, v1.8h, v3.8h
+        sub             z0.h, z0.h, #PREP_BIAS
+        sub             z1.h, z1.h, #PREP_BIAS
+.else   // put
+        sqshrun         v0.4h, v0.4s, #6
+        sqshrun2        v0.8h, v2.4s, #6
+        sqshrun         v1.4h, v1.4s, #6
+        sqshrun2        v1.8h, v3.4s, #6
+        umin            v0.8h, v0.8h, v5.8h
+        umin            v1.8h, v1.8h, v5.8h
+.endif
+        stp             q0, q1, [\dst], #32
+        b.gt            32b
+
+        add             \src, \src, \s_strd
+.ifc \type, put
+        add             \dst, \dst, \d_strd
+.endif
+        subs            \h, \h, #1
+        mov             w8, \w
+        b.gt            32b
+        ret
+endfunc
+
+jumptable \type\()_8tap_h_\isa\()_tbl
+        .word 1280b - \type\()_8tap_h_\isa\()_tbl
+        .word 640b  - \type\()_8tap_h_\isa\()_tbl
+        .word 320b  - \type\()_8tap_h_\isa\()_tbl
+        .word 160b  - \type\()_8tap_h_\isa\()_tbl
+        .word 80b   - \type\()_8tap_h_\isa\()_tbl
+        .word 40b   - \type\()_8tap_h_\isa\()_tbl
+.ifc \type, put
+        .word 20b   - \type\()_8tap_h_\isa\()_tbl
+.endif
+endjumptable
+.endm
+
+
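+// prep_sve: pack pixels into the intermediate 16-bit format with a single
+// predicated MAD per vector: tmp = pixel * scale - PREP_BIAS, where scale is
+// 16 for 10-bit and 4 for 12-bit input (derived from bdmax below).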
+function prep_sve
+        movrel          x9, prep_tbl
+        mov             w6, #19
+        ldrsw           x8, [x9, x8, lsl #2]
+        sub             w6, w6, w7, lsr #8          // 19 - bdmax / 256
+        add             x9, x9, x8
+        movi            v30.8h, #PREP_BIAS_NEG
+        dup             v29.8h, w6                  // 10b: 1 << 4, 12b: 1 << 2
+        br              x9
+
+        .align JUMP_ALIGN
+40:     // prep - 4xN
+        AARCH64_VALID_JUMP_TARGET
+
+        .align LOOP_ALIGN
+4:
+        ldr             d0, [x1]
+        ldr             d1, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        subs            w4, w4, #2
+        mad             z0.h, p0/m, z29.h, z30.h
+        mad             z1.h, p0/m, z29.h, z30.h
+        stp             d0, d1, [x0], #16
+        b.gt            4b
+        ret
+
+        .align JUMP_ALIGN
+80:     // prep - 8xN
+        AARCH64_VALID_JUMP_TARGET
+
+        .align LOOP_ALIGN
+8:
+        ld1             {v0.8h}, [x1], x2
+        ld1             {v1.8h}, [x1], x2
+        subs            w4, w4, #2
+        mad             z0.h, p0/m, z29.h, z30.h
+        mad             z1.h, p0/m, z29.h, z30.h
+        stp             q0, q1, [x0], #32
+        b.gt            8b
+        ret
+
+        .align JUMP_ALIGN
+160:    // prep - 16xN
+        AARCH64_VALID_JUMP_TARGET
+
+        .align LOOP_ALIGN
+16:
+        ld1             {v0.8h, v1.8h}, [x1], x2
+        mad             z0.h, p0/m, z29.h, z30.h
+        mad             z1.h, p0/m, z29.h, z30.h
+        subs            w4, w4, #2
+        ld1             {v2.8h, v3.8h}, [x1], x2
+        mad             z2.h, p0/m, z29.h, z30.h
+        mad             z3.h, p0/m, z29.h, z30.h
+        stp             q0, q1, [x0]
+        stp             q2, q3, [x0, #32]
+        add             x0, x0, #64
+        b.gt            16b
+        ret
+
+        .align JUMP_ALIGN
+320:    // prep - 32xN
+        AARCH64_VALID_JUMP_TARGET
+
+        .align LOOP_ALIGN
+32:
+        ldp             q0, q1, [x1]
+        mad             z0.h, p0/m, z29.h, z30.h
+        mad             z1.h, p0/m, z29.h, z30.h
+        ldp             q2, q3, [x1, #32]
+        subs            w4, w4, #1
+        mad             z2.h, p0/m, z29.h, z30.h
+        mad             z3.h, p0/m, z29.h, z30.h
+        add             x1, x1, x2
+        stp             q0, q1, [x0]
+        stp             q2, q3, [x0, #32]
+        add             x0, x0, #64
+        b.gt            32b
+        ret
+
+        .align JUMP_ALIGN
+640:    // prep - 64xN
+        AARCH64_VALID_JUMP_TARGET
+
+        .align LOOP_ALIGN
+64:
+        ldp             q0, q1, [x1]
+        mad             z0.h, p0/m, z29.h, z30.h
+        mad             z1.h, p0/m, z29.h, z30.h
+        ldp             q2, q3, [x1, #32]
+        mad             z2.h, p0/m, z29.h, z30.h
+        mad             z3.h, p0/m, z29.h, z30.h
+        ldp             q4, q5, [x1, #64]
+        mad             z4.h, p0/m, z29.h, z30.h
+        mad             z5.h, p0/m, z29.h, z30.h
+        ldp             q6, q7, [x1, #96]
+        add             x1, x1, x2
+        subs            w4, w4, #1
+        mad             z6.h, p0/m, z29.h, z30.h
+        mad             z7.h, p0/m, z29.h, z30.h
+        stp             q0, q1, [x0]
+        stp             q2, q3, [x0, #32]
+        stp             q4, q5, [x0, #64]
+        stp             q6, q7, [x0, #96]
+        add             x0, x0, #128
+        b.gt            64b
+        ret
+
+        .align JUMP_ALIGN
+1280:   // prep - 128xN
+        AARCH64_VALID_JUMP_TARGET
+
+        .align LOOP_ALIGN
+128:
+        ldp             q0, q1, [x1]
+        mad             z0.h, p0/m, z29.h, z30.h
+        mad             z1.h, p0/m, z29.h, z30.h
+        ldp             q2, q3, [x1, #32]
+        mad             z2.h, p0/m, z29.h, z30.h
+        mad             z3.h, p0/m, z29.h, z30.h
+        ldp             q4, q5, [x1, #64]
+        mad             z4.h, p0/m, z29.h, z30.h
+        mad             z5.h, p0/m, z29.h, z30.h
+        ldp             q6, q7, [x1, #96]
+        mad             z6.h, p0/m, z29.h, z30.h
+        mad             z7.h, p0/m, z29.h, z30.h
+        ldp             q16, q17, [x1, #128]
+        mad             z16.h, p0/m, z29.h, z30.h
+        mad             z17.h, p0/m, z29.h, z30.h
+        ldp             q18, q19, [x1, #160]
+        mad             z18.h, p0/m, z29.h, z30.h
+        mad             z19.h, p0/m, z29.h, z30.h
+        ldp             q20, q21, [x1, #192]
+        mad             z20.h, p0/m, z29.h, z30.h
+        mad             z21.h, p0/m, z29.h, z30.h
+        ldp             q22, q23, [x1, #224]
+        add             x1, x1, x2
+        mad             z22.h, p0/m, z29.h, z30.h
+        mad             z23.h, p0/m, z29.h, z30.h
+        subs            w4, w4, #1
+        stp             q0, q1, [x0]
+        stp             q2, q3, [x0, #32]
+        stp             q4, q5, [x0, #64]
+        stp             q6, q7, [x0, #96]
+        stp             q16, q17, [x0, #128]
+        stp             q18, q19, [x0, #160]
+        stp             q20, q21, [x0, #192]
+        stp             q22, q23, [x0, #224]
+        add             x0, x0, #256
+        b.gt            128b
+        ret
+endfunc
+
+jumptable prep_tbl
+        .word 1280b - prep_tbl
+        .word 640b  - prep_tbl
+        .word 320b  - prep_tbl
+        .word 160b  - prep_tbl
+        .word 80b   - prep_tbl
+        .word 40b   - prep_tbl
+endjumptable
+
+
+// dst(x0), d_strd(x9), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6), bdmax(w7)
+// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w9), ws_strd(w2)
+filter_8tap_fn prep, sve2, x0, x9, x1, x2, w3, w4, w5, w6, w7, x5, x6, x5, x6, w9, w2
+
+// dst(x0), d_strd(x1), src(x2), s_strd(x3), w(w4), h(w5), mx(w6), my(w7), bdmax(w8)
+// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1), ws_strd(w3)
+filter_8tap_fn  put, sve2, x0, x1, x2, x3, w4, w5, w6, w7, w8, x6, x7, x6, x7, w1, w3
+
+DISABLE_SVE2
+DISABLE_SVE
+#endif  // HAVE_SVE2
diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S
index a3eef59f36f23bca4f577fcd5fe5125869603635..079ff9eb7a1261d5f87cc7b24cf7ab4ef3a2881b 100644
--- a/src/arm/64/mc_dotprod.S
+++ b/src/arm/64/mc_dotprod.S
@@ -45,32 +45,39 @@ ENABLE_DOTPROD
 #define LOOP_ALIGN      2
 
 
-// Lookup table used to help conversion of shifted 32-bit values to 8-bit.
-        .align 4
-L(hv_tbl_neon_dotprod):
-        .byte  1,  2,  5,  6,   9, 10, 13, 14,  17, 18, 21, 22,  25, 26, 29, 30
-
-// Shuffle indices to permute horizontal samples in preparation for input to
-// SDOT instructions. The 8-tap horizontal convolution uses sample indices in the
-// interval of [-3, 4] relative to the current sample position.
-        .align 4
-L(h_tbl_neon_dotprod):
+const h_tbl_neon_dotprod, align=4
+        // Shuffle indices to permute horizontal samples in preparation for
+        // input to SDOT instructions. The 8-tap horizontal convolution uses
+        // sample indices in the interval of [-3, 4] relative to the current
+        // sample position.
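+        // For example, the first row below builds four overlapping 4-sample
+        // windows (one per output pixel), so a single indexed SDOT adds four
+        // filter taps to four adjacent outputs at once.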
         .byte  0,  1,  2,  3,   1,  2,  3,  4,   2,  3,  4,  5,   3,  4,  5,  6
         .byte  4,  5,  6,  7,   5,  6,  7,  8,   6,  7,  8,  9,   7,  8,  9, 10
         .byte  8,  9, 10, 11,   9, 10, 11, 12,  10, 11, 12, 13,  11, 12, 13, 14
 
-// Vertical convolutions are also using SDOT instructions, where a 128-bit
-// register contains a transposed 4x4 matrix of values. Subsequent iterations of
-// the vertical convolution can reuse the 3x4 sub-matrix from the previous loop
-// iteration. These shuffle indices shift and merge this 4x4 matrix with the
-// values of a new line.
-        .align 4
-L(v_tbl_neon_dotprod):
+        // Shuffle indices to permute horizontal samples in preparation for
+        // input to USMMLA instructions.
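+        // Each row builds two overlapping 8-sample rows, the 2x8 operand
+        // layout USMMLA expects; together with the doubled filter this yields
+        // a 2x2 block of outputs per instruction.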
+#define OFFSET_USMMLA 48
+        .byte  0,  1,  2,  3,   4,  5,  6,  7,   2,  3,  4,  5,   6,  7,  8,  9
+        .byte  4,  5,  6,  7,   8,  9, 10, 11,   6,  7,  8,  9,  10, 11, 12, 13
+
+        // Lookup table used to help conversion of shifted 32-bit values to 8-bit.
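+        // (It picks bytes 1-2 of each 32-bit lane across a two-register TBL,
+        // i.e. a >> 8 with narrowing to 16 bits ahead of the final rounding
+        // shift.)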
+#define OFFSET_CVT_32_8 80
+        .byte  1,  2,  5,  6,   9, 10, 13, 14,  17, 18, 21, 22,  25, 26, 29, 30
+endconst
+
+const v_tbl_neon_dotprod, align=4
+        // Vertical convolutions also use SDOT instructions, where a
+        // 128-bit register contains a transposed 4x4 matrix of values.
+        // Subsequent iterations of the vertical convolution can reuse the
+        // 3x4 sub-matrix from the previous loop iteration. These shuffle
+        // indices shift and merge this 4x4 matrix with the values of a new
+        // line.
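+        // For example, the first row keeps bytes 1-3 of each 4-byte column
+        // and appends one byte of the newly loaded line (indices 16 and up),
+        // so only one new source line is fetched per output row.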
         .byte  1,  2,  3, 16,   5,  6,  7, 20,   9, 10, 11, 24,  13, 14, 15, 28
         .byte  1,  2,  3, 16,   5,  6,  7, 17,   9, 10, 11, 18,  13, 14, 15, 19
         .byte  1,  2,  3, 20,   5,  6,  7, 21,   9, 10, 11, 22,  13, 14, 15, 23
         .byte  1,  2,  3, 24,   5,  6,  7, 25,   9, 10, 11, 26,  13, 14, 15, 27
         .byte  1,  2,  3, 28,   5,  6,  7, 29,   9, 10, 11, 30,  13, 14, 15, 31
+endconst
 
 
 .macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
@@ -109,24 +116,24 @@ function \type\()_8tap_\isa, align=FUNC_ALIGN
         .align JUMP_ALIGN
 L(\type\()_8tap_v_\isa):
         madd            \my, \my, w11, w10
-        ldr             q6, L(v_tbl_neon_dotprod)
+        movrel          x13, v_tbl_neon_dotprod
         sub             \src, \src, \s_strd
 .ifc \isa, neon_dotprod
     .ifc \type, prep
-        mov             w8, 0x2002          // FILTER_WEIGHT * 128 + rounding
+        mov             w8, #0x2002         // FILTER_WEIGHT * 128 + rounding
         dup             v4.4s, w8
     .else
-        movi            v4.4s, #32, lsl 8   // FILTER_WEIGHT * 128, bias for SDOT
+        movi            v4.4s, #32, lsl #8  // FILTER_WEIGHT * 128, bias for SDOT
     .endif
 .endif
         ubfx            w11, \my, #7, #7
         and             \my, \my, #0x7F
-        ldr             q28, L(v_tbl_neon_dotprod) + 16
+        ldp             q6, q28, [x13]
         cmp             \h, #4
         csel            \my, \my, w11, le
         sub             \src, \src, \s_strd, lsl #1     // src - s_strd * 3
         add             \xmy, x12, \xmy, lsl #3         // subpel V filter address
-        ldr             q29, L(v_tbl_neon_dotprod) + 32
+        ldr             q29, [x13, #32]
 .ifc \isa, neon_dotprod
         movi            v5.16b, #128
 .endif
@@ -137,8 +144,7 @@ L(\type\()_8tap_v_\isa):
 
         // .align JUMP_ALIGN    // fallthrough
 160:    // V - 16xN+
-        ldr             q30, L(v_tbl_neon_dotprod) + 48
-        ldr             q31, L(v_tbl_neon_dotprod) + 64
+        ldp             q30, q31, [x13, #48]
 .ifc \type, prep
         add             \wd_strd, \w, \w
 .endif
@@ -676,18 +682,19 @@ L(\type\()_8tap_v_\isa):
 L(\type\()_8tap_h_hv_\isa):
         madd            \mx, \mx, w11, w9
         madd            w14, \my, w11, w10      // for HV
-        ldr             q28, L(h_tbl_neon_dotprod)
 .ifc \isa, neon_dotprod
-        mov             w13, 0x2002             // FILTER_WEIGHT * 128 + rounding
+        mov             w13, #0x2002            // FILTER_WEIGHT * 128 + rounding
         dup             v27.4s, w13             // put H overrides this
 .endif
+        movrel          x13, h_tbl_neon_dotprod
         sub             \src, \src, #3          // src - 3
-        ubfx            w9, \mx, #7, #7
+        ldr             q28, [x13]              // for 4-tap & 8-tap H filters
+        ubfx            w15, \mx, #7, #7
         and             \mx, \mx, #0x7F
         ubfx            w11, w14, #7, #7        // for HV
         and             w14, w14, #0x7F         // for HV
         cmp             \w, #4
-        csel            \mx, \mx, w9, le
+        csel            \mx, \mx, w15, le
         add             \xmx, x12, \xmx, lsl #3 // subpel H filter address
 .ifc \isa, neon_dotprod
         movi            v24.16b, #128
@@ -702,10 +709,10 @@ L(\type\()_8tap_h_hv_\isa):
         mov             x15, x30
         ldr             d7, [\xmy]
 .ifc \type, put
-        ldr             q25, L(hv_tbl_neon_dotprod)
-.endif
+        ldr             q25, [x13, #(OFFSET_CVT_32_8)] // LUT to help conversion
+.endif                                                 // of 32b values to 8b
         sxtl            v7.8h, v7.8b
-        cmp             w10, SHARP1
+        cmp             w10, #SHARP1
         b.ne            L(\type\()_6tap_hv_\isa)    // vertical != SHARP1
 
         // HV 8-tap cases
@@ -718,8 +725,7 @@ L(\type\()_8tap_h_hv_\isa):
 
         // .align JUMP_ALIGN    // fallthrough
 80:     // HV8 - 8xN+
-        ldr             q29, L(h_tbl_neon_dotprod) + 16
-        ldr             q30, L(h_tbl_neon_dotprod) + 32
+        ldp             q29, q30, [x13, #16]
         ldr             d26, [\xmx]
 .ifc \type, prep
         add             \wd_strd, \w, \w
@@ -860,7 +866,7 @@ L(\type\()_8tap_h_hv_\isa):
 
         .align JUMP_ALIGN
 40:     // HV8 - 4xN
-        ldr             s26, [\xmx, #2]
+        ldur            s26, [\xmx, #2]
         add             \src, \src, #2
 
         bl              L(\type\()_hv_filter4_\isa)
@@ -930,7 +936,7 @@ L(\type\()_8tap_h_hv_\isa):
 .ifc \type, put
         .align JUMP_ALIGN
 20:     // HV8 - 2xN
-        ldr             s26, [\xmx, #2]
+        ldur            s26, [\xmx, #2]
         add             \src, \src, #2
 
         bl              L(\type\()_hv_filter4_\isa)
@@ -1005,12 +1011,91 @@ L(\type\()_6tap_hv_\isa):
 
         // .align JUMP_ALIGN    // fallthrough
 80:     // HV6 - 8xN+
-        ldr             q29, L(h_tbl_neon_dotprod) + 16
-        ldr             q30, L(h_tbl_neon_dotprod) + 32
         ldr             d26, [\xmx]
 .ifc \type, prep
         add             \wd_strd, \w, \w
 .endif
+.ifc \isa, neon_i8mm
+        cmp             w9, #SHARP1
+        b.eq            88f             // horizontal == SHARP1
+
+        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
+        ext             v0.8b, v26.8b, v26.8b, #7
+        ins             v26.d[1], v0.d[0]
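+        // The non-SHARP1 filters have zero outer taps (as in the 6-tap
+        // vertical path), so the six remaining taps fit USMMLA: v26 holds the
+        // filter twice, the upper copy offset by one byte, and each USMMLA
+        // produces a 2x2 block of neighbouring outputs from two overlapping
+        // 8-sample rows.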
+
+        .align LOOP_ALIGN
+81:
+        mov             \lsrc, \src
+        mov             \ldst, \dst
+        mov             w8, \h
+
+        bl              L(\type\()_hv_filter6_neon_i8mm)
+        srshr           v16.8h, v22.8h, #2
+        bl              L(\type\()_hv_filter6_neon_i8mm)
+        srshr           v17.8h, v22.8h, #2
+        bl              L(\type\()_hv_filter6_neon_i8mm)
+        srshr           v18.8h, v22.8h, #2
+        bl              L(\type\()_hv_filter6_neon_i8mm)
+        srshr           v19.8h, v22.8h, #2
+        bl              L(\type\()_hv_filter6_neon_i8mm)
+        srshr           v20.8h, v22.8h, #2
+
+        .align LOOP_ALIGN
+8:
+        ld1             {v23.16b}, [\lsrc], \s_strd
+
+        smull           v0.4s, v16.4h, v7.h[1]
+        smull2          v1.4s, v16.8h, v7.h[1]
+        mov             v16.16b, v17.16b
+        movi            v5.4s, #0
+        movi            v6.4s, #0
+        tbl             v2.16b, {v23.16b}, v29.16b
+        tbl             v3.16b, {v23.16b}, v30.16b
+
+        smlal           v0.4s, v17.4h, v7.h[2]
+        smlal2          v1.4s, v17.8h, v7.h[2]
+        mov             v17.16b, v18.16b
+
+        usmmla          v5.4s, v2.16b, v26.16b
+        usmmla          v6.4s, v3.16b, v26.16b
+
+        smlal           v0.4s, v18.4h, v7.h[3]
+        smlal2          v1.4s, v18.8h, v7.h[3]
+        mov             v18.16b, v19.16b
+        subs            w8, w8, #1
+
+        smlal           v0.4s, v19.4h, v7.h[4]
+        smlal2          v1.4s, v19.8h, v7.h[4]
+        uzp1            v23.8h, v5.8h, v6.8h
+        mov             v19.16b, v20.16b
+
+        smlal           v0.4s, v20.4h, v7.h[5]
+        smlal2          v1.4s, v20.8h, v7.h[5]
+        srshr           v20.8h, v23.8h, #2
+        smlal           v0.4s, v20.4h, v7.h[6]
+        smlal2          v1.4s, v20.8h, v7.h[6]
+    .ifc \type, prep
+        rshrn           v0.4h, v0.4s, #6
+        rshrn2          v0.8h, v1.4s, #6
+        st1             {v0.8h}, [\ldst], \d_strd
+        b.gt            8b
+        add             \dst, \dst, #16
+    .else
+        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b
+        sqrshrun        v0.8b, v0.8h, #2
+        st1             {v0.8b}, [\ldst], \d_strd
+        b.gt            8b
+        add             \dst, \dst, #8
+    .endif
+        add             \src, \src, #8
+        subs            \w, \w, #8
+        b.gt            81b
+        ret             x15
+
+        .align JUMP_ALIGN
+88:
+.endif  // neon_i8mm
+        ldp             q29, q30, [x13, #16]
 
         .align LOOP_ALIGN
 81:
@@ -1042,8 +1127,8 @@ L(\type\()_6tap_hv_\isa):
 .endif
         .align LOOP_ALIGN
 8:
-        ldr             q23, [\xmy]
-        add             \xmy, \xmy, \s_strd
+        ldr             q23, [\lsrc]
+        add             \lsrc, \lsrc, \s_strd
 
         smull           v0.4s, v16.4h, v7.h[1]
         smull2          v1.4s, v16.8h, v7.h[1]
@@ -1130,6 +1215,20 @@ L(\type\()_hv_filter8_\isa):
         uzp1            v22.8h, v22.8h, v23.8h
         ret
 
+.ifc \isa, neon_i8mm
+        .align FUNC_ALIGN
+L(\type\()_hv_filter6_neon_i8mm):
+        ld1             {v4.16b}, [\lsrc], \s_strd
+        movi            v22.4s, #0
+        movi            v23.4s, #0
+        tbl             v2.16b, {v4.16b}, v29.16b
+        tbl             v3.16b, {v4.16b}, v30.16b
+        usmmla          v22.4s, v2.16b, v26.16b
+        usmmla          v23.4s, v3.16b, v26.16b
+        uzp1            v22.8h, v22.8h, v23.8h
+        ret
+.endif
+
         .align FUNC_ALIGN
 L(\type\()_hv_filter4_\isa):
         ld1             {v4.8b}, [\src], \s_strd
@@ -1145,7 +1244,7 @@ L(\type\()_hv_filter4_\isa):
 
         .align JUMP_ALIGN
 40:     // HV6 - 4xN
-        ldr             s26, [\xmx, #2]
+        ldur            s26, [\xmx, #2]
         add             \src, \src, #2
 
         bl              L(\type\()_hv_filter4_\isa)
@@ -1206,7 +1305,7 @@ L(\type\()_hv_filter4_\isa):
 .ifc \type, put
         .align JUMP_ALIGN
 20:     // HV6 - 2xN
-        ldr             s26, [\xmx, #2]
+        ldur            s26, [\xmx, #2]
         add             \src, \src, #2
 
         bl              L(\type\()_hv_filter4_\isa)
@@ -1266,8 +1365,8 @@ L(\type\()_hv_filter4_\isa):
 
         .align JUMP_ALIGN
 L(\type\()_8tap_h_\isa):
-        adr             x9, L(\type\()_8tap_h_\isa\()_tbl)
-        ldrh            w8, [x9, x8, lsl #1]
+        movrel          x11, \type\()_8tap_h_\isa\()_tbl
+        ldrsw           x8, [x11, x8, lsl #2]
 .ifc \type, put
     .ifc \isa, neon_i8mm
         movi            v27.4s, #34     // special rounding
@@ -1276,15 +1375,15 @@ L(\type\()_8tap_h_\isa):
         dup             v27.4s, w10
     .endif
 .endif
-        sub             x9, x9, x8
-        br              x9
+        add             x11, x11, x8
+        br              x11
 
 .ifc \type, put
         .align JUMP_ALIGN
 20:     // H - 2xN
         AARCH64_VALID_JUMP_TARGET
         add             \src, \src, #2
-        ldr             s26, [\xmx, #2]
+        ldur            s26, [\xmx, #2]
 
         .align LOOP_ALIGN
 2:
@@ -1321,7 +1420,7 @@ L(\type\()_8tap_h_\isa):
 40:     // H - 4xN
         AARCH64_VALID_JUMP_TARGET
         add             \src, \src, #2
-        ldr             s26, [\xmx, #2]
+        ldur            s26, [\xmx, #2]
 
         .align LOOP_ALIGN
 4:
@@ -1370,9 +1469,63 @@ L(\type\()_8tap_h_\isa):
         .align JUMP_ALIGN
 80:     // H - 8xN
         AARCH64_VALID_JUMP_TARGET
-        ldr             q29, L(h_tbl_neon_dotprod) + 16
-        ldr             q30, L(h_tbl_neon_dotprod) + 32
         ldr             d26, [\xmx]
+.ifc \isa, neon_i8mm
+        cmp             w9, #SHARP1
+        b.eq            88f             // horizontal == SHARP1
+
+        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
+        ext             v0.8b, v26.8b, v26.8b, #7
+        ins             v26.d[1], v0.d[0]
+
+        .align LOOP_ALIGN
+8:
+        ldr             q0, [\src]
+        ldr             q16, [\src, \s_strd]
+        add             \src, \src, \s_strd, lsl #1
+    .ifc \type, prep
+        movi            v4.4s, #0
+        movi            v5.4s, #0
+        movi            v20.4s, #0
+        movi            v21.4s, #0
+    .else
+        mov             v4.16b, v27.16b
+        mov             v5.16b, v27.16b
+        mov             v20.16b, v27.16b
+        mov             v21.16b, v27.16b
+    .endif
+        tbl             v1.16b, {v0.16b}, v29.16b
+        tbl             v2.16b, {v0.16b}, v30.16b
+        tbl             v17.16b, {v16.16b}, v29.16b
+        tbl             v18.16b, {v16.16b}, v30.16b
+
+        usmmla          v4.4s, v1.16b, v26.16b
+        usmmla          v5.4s, v2.16b, v26.16b
+        usmmla          v20.4s, v17.16b, v26.16b
+        usmmla          v21.4s, v18.16b, v26.16b
+
+        uzp1            v4.8h, v4.8h, v5.8h
+        uzp1            v20.8h, v20.8h, v21.8h
+    .ifc \type, prep
+        srshr           v4.8h, v4.8h, #2
+        srshr           v20.8h, v20.8h, #2
+        subs            \h, \h, #2
+        stp             q4, q20, [\dst], #32
+    .else   // put
+        sqshrun         v4.8b, v4.8h, #6
+        sqshrun         v20.8b, v20.8h, #6
+        subs            \h, \h, #2
+        str             d4, [\dst]
+        str             d20, [\dst, \d_strd]
+        add             \dst, \dst, \d_strd, lsl #1
+    .endif
+        b.gt            8b
+        ret
+
+        .align JUMP_ALIGN
+88:
+.endif  // neon_i8mm
+        ldp             q29, q30, [x13, #16]
 
         .align LOOP_ALIGN
 8:
@@ -1436,14 +1589,66 @@ L(\type\()_8tap_h_\isa):
         .align JUMP_ALIGN
 160:    // H - 16xN
         AARCH64_VALID_JUMP_TARGET
-        ldr             q29, L(h_tbl_neon_dotprod) + 16
-        ldr             q30, L(h_tbl_neon_dotprod) + 32
         ldr             d26, [\xmx]
+.ifc \isa, neon_i8mm
+        cmp             w9, #SHARP1
+        b.eq            168f            // horizontal == SHARP1
+
+        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
+        ext             v0.8b, v26.8b, v26.8b, #7
+        ins             v26.d[1], v0.d[0]
 
         .align LOOP_ALIGN
 16:
         ldr             q16, [\src]
-        ldr             q17, [\src, #12]  // avoid 2 register TBL for small cores
+        ldur            q17, [\src, #8] // avoid 2 register TBL for small cores
+        add             \src, \src, \s_strd
+    .ifc \type, prep
+        movi            v6.4s, #0
+        movi            v7.4s, #0
+        movi            v22.4s, #0
+        movi            v23.4s, #0
+    .else
+        mov             v6.16b, v27.16b
+        mov             v7.16b, v27.16b
+        mov             v22.16b, v27.16b
+        mov             v23.16b, v27.16b
+    .endif
+        tbl             v0.16b, {v16.16b}, v29.16b
+        tbl             v1.16b, {v16.16b}, v30.16b
+        tbl             v2.16b, {v17.16b}, v29.16b
+        tbl             v3.16b, {v17.16b}, v30.16b
+
+        usmmla          v6.4s, v0.16b, v26.16b
+        usmmla          v7.4s, v1.16b, v26.16b
+        usmmla          v22.4s, v2.16b, v26.16b
+        usmmla          v23.4s, v3.16b, v26.16b
+
+        uzp1            v6.8h, v6.8h, v7.8h
+        uzp1            v22.8h, v22.8h, v23.8h
+    .ifc \type, prep
+        srshr           v6.8h, v6.8h, #2
+        srshr           v22.8h, v22.8h, #2
+        subs            \h, \h, #1
+        stp             q6, q22, [\dst], #32
+    .else   // put
+        sqshrun         v6.8b, v6.8h, #6
+        sqshrun2        v6.16b, v22.8h, #6
+        subs            \h, \h, #1
+        st1             {v6.16b}, [\dst], \d_strd
+    .endif
+        b.gt            16b
+        ret
+
+        .align JUMP_ALIGN
+168:
+.endif  // neon_i8mm
+        ldp             q29, q30, [x13, #16]
+
+        .align LOOP_ALIGN
+16:
+        ldr             q16, [\src]
+        ldur            q17, [\src, #12]  // avoid 2 register TBL for small cores
         add             \src, \src, \s_strd
 .ifc \type\()_\isa, prep_neon_i8mm
         movi            v6.4s, #0
@@ -1501,8 +1706,6 @@ L(\type\()_8tap_h_\isa):
 640:
 1280:
         AARCH64_VALID_JUMP_TARGET
-        ldr             q29, L(h_tbl_neon_dotprod) + 16
-        ldr             q30, L(h_tbl_neon_dotprod) + 32
         ldr             d26, [\xmx]
 .ifc \type, put
         sub             \d_strd, \d_strd, \w, uxtw
@@ -1510,10 +1713,73 @@ L(\type\()_8tap_h_\isa):
         sub             \s_strd, \s_strd, \w, uxtw
         mov             w8, \w
 
+.ifc \isa, neon_i8mm
+        cmp             w9, #SHARP1
+        b.eq            328f            // horizontal == SHARP1
+
+        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
+        ext             v0.8b, v26.8b, v26.8b, #7
+        ins             v26.d[1], v0.d[0]
+
+        .align LOOP_ALIGN
+32:
+        ldr             q16, [\src]
+        ldur            q17, [\src, #8] // avoid 2 register TBL for small cores
+        add             \src, \src, #16
+    .ifc \type, prep
+        movi            v6.4s, #0
+        movi            v7.4s, #0
+        movi            v22.4s, #0
+        movi            v23.4s, #0
+    .else
+        mov             v6.16b, v27.16b
+        mov             v7.16b, v27.16b
+        mov             v22.16b, v27.16b
+        mov             v23.16b, v27.16b
+    .endif
+        tbl             v0.16b, {v16.16b}, v29.16b
+        tbl             v1.16b, {v16.16b}, v30.16b
+        tbl             v2.16b, {v17.16b}, v29.16b
+        tbl             v3.16b, {v17.16b}, v30.16b
+
+        usmmla          v6.4s, v0.16b, v26.16b
+        usmmla          v7.4s, v1.16b, v26.16b
+        usmmla          v22.4s, v2.16b, v26.16b
+        usmmla          v23.4s, v3.16b, v26.16b
+
+        uzp1            v6.8h, v6.8h, v7.8h
+        uzp1            v22.8h, v22.8h, v23.8h
+    .ifc \type, prep
+        srshr           v6.8h, v6.8h, #2
+        srshr           v22.8h, v22.8h, #2
+        subs            w8, w8, #16
+        stp             q6, q22, [\dst], #32
+    .else   // put
+        sqshrun         v6.8b, v6.8h, #6
+        sqshrun2        v6.16b, v22.8h, #6
+        subs            w8, w8, #16
+        str             q6, [\dst], #16
+    .endif
+        b.gt            32b
+
+        add             \src, \src, \s_strd
+    .ifc \type, put
+        add             \dst, \dst, \d_strd
+    .endif
+        mov             w8, \w
+        subs            \h, \h, #1
+        b.gt            32b
+        ret
+
+        .align JUMP_ALIGN
+328:
+.endif  // neon_i8mm
+        ldp             q29, q30, [x13, #16]
+
         .align LOOP_ALIGN
 32:
         ldr             q16, [\src]
-        ldr             q17, [\src, #12]  // avoid 2 register TBL for small cores
+        ldur            q17, [\src, #12]  // avoid 2 register TBL for small cores
         add             \src, \src, #16
 .ifc \type\()_\isa, prep_neon_i8mm
         movi            v6.4s, #0
@@ -1573,19 +1839,19 @@ L(\type\()_8tap_h_\isa):
         subs            \h, \h, #1
         b.gt            32b
         ret
+endfunc
 
-L(\type\()_8tap_h_\isa\()_tbl):
-        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 1280b)
-        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 640b)
-        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 320b)
-        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 160b)
-        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 80b)
-        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 40b)
+jumptable \type\()_8tap_h_\isa\()_tbl
+        .word 1280b - \type\()_8tap_h_\isa\()_tbl
+        .word 640b  - \type\()_8tap_h_\isa\()_tbl
+        .word 320b  - \type\()_8tap_h_\isa\()_tbl
+        .word 160b  - \type\()_8tap_h_\isa\()_tbl
+        .word 80b   - \type\()_8tap_h_\isa\()_tbl
+        .word 40b   - \type\()_8tap_h_\isa\()_tbl
 .ifc \type, put
-        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 20b)
-        .hword 0
+        .word 20b   - \type\()_8tap_h_\isa\()_tbl
 .endif
-endfunc
+endjumptable
 .endm
 
 // dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
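
The new i8mm paths above lean on the Armv8.6 USMMLA instruction to evaluate the 6-tap filters as small matrix multiplies. For readers who don't read NEON, below is a rough C model of what a single `usmmla Vd.4s, Vn.16b, Vm.16b` accumulates — an editorial sketch of the documented mixed unsigned/signed 8-bit matrix multiply-accumulate semantics, not dav1d code, and the sample data is invented.

```c
/* Editorial sketch: approximate C model of one AArch64 USMMLA instruction.
 * acc   : 2x2 matrix of int32 accumulators (a .4S vector)
 * src_u : 2x8 matrix of unsigned 8-bit samples (a .16B vector)
 * coef_s: 2x8 matrix of signed 8-bit filter taps (a .16B vector)
 * acc[i][j] += dot(src_u row i, coef_s row j)
 */
#include <stdint.h>
#include <stdio.h>

static void usmmla_model(int32_t acc[2][2],
                         const uint8_t src_u[2][8],
                         const int8_t coef_s[2][8])
{
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 2; j++)
            for (int k = 0; k < 8; k++)
                acc[i][j] += (int32_t)src_u[i][k] * coef_s[j][k];
}

int main(void) {
    /* Two rows of pixels against two copies of a 6-tap filter padded to
     * 8 taps with zeros; purely illustrative values. */
    const uint8_t src[2][8]  = { { 10, 20, 30, 40, 50, 60, 0, 0 },
                                 { 20, 30, 40, 50, 60, 70, 0, 0 } };
    const int8_t  coef[2][8] = { { -1, 3, 60, 60, 3, -1, 0, 0 },
                                 { -1, 3, 60, 60, 3, -1, 0, 0 } };
    int32_t acc[2][2] = { { 0, 0 }, { 0, 0 } };
    usmmla_model(acc, src, coef);
    printf("%d %d %d %d\n", acc[0][0], acc[0][1], acc[1][0], acc[1][1]);
    return 0;
}
```
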
diff --git a/src/arm/64/refmvs.S b/src/arm/64/refmvs.S
index e905682f47cdfa23cc2a24604c067bade503733b..c75c47890945bf1214da0947a2e48081b13d81d6 100644
--- a/src/arm/64/refmvs.S
+++ b/src/arm/64/refmvs.S
@@ -34,13 +34,13 @@
 function splat_mv_neon, export=1
         ld1             {v3.16b},  [x1]
         clz             w3,  w3
-        adr             x5,  L(splat_tbl)
+        movrel          x5,  splat_tbl
         sub             w3,  w3,  #26
         ext             v2.16b,  v3.16b,  v3.16b,  #12
-        ldrh            w3,  [x5, w3, uxtw #1]
+        ldrsw           x3,  [x5, w3, uxtw #2]
         add             w2,  w2,  w2,  lsl #1
         ext             v0.16b,  v2.16b,  v3.16b,  #4
-        sub             x3,  x5,  w3, uxtw
+        add             x3,  x5,  x3
         ext             v1.16b,  v2.16b,  v3.16b,  #8
         lsl             w2,  w2,  #2
         ext             v2.16b,  v2.16b,  v3.16b,  #12
@@ -80,16 +80,17 @@ function splat_mv_neon, export=1
         st1             {v0.16b, v1.16b, v2.16b}, [x1]
         b.gt            1b
         ret
-
-L(splat_tbl):
-        .hword L(splat_tbl) -  320b
-        .hword L(splat_tbl) -  160b
-        .hword L(splat_tbl) -   80b
-        .hword L(splat_tbl) -   40b
-        .hword L(splat_tbl) -   20b
-        .hword L(splat_tbl) -   10b
 endfunc
 
+jumptable splat_tbl
+        .word 320b  - splat_tbl
+        .word 160b  - splat_tbl
+        .word 80b   - splat_tbl
+        .word 40b   - splat_tbl
+        .word 20b   - splat_tbl
+        .word 10b   - splat_tbl
+endjumptable
+
 const mv_tbls, align=4
         .byte           255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
         .byte           0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
@@ -112,7 +113,7 @@ function save_tmvs_neon, export=1
 
         movi            v30.8b,  #0
         ld1             {v31.8b}, [x3]
-        adr             x8,  L(save_tmvs_tbl)
+        movrel          x8,  save_tmvs_tbl
         movrel          x16, mask_mult
         movrel          x13, mv_tbls
         ld1             {v29.8b}, [x16]
@@ -137,9 +138,9 @@ function save_tmvs_neon, export=1
 2:
         ldrb            w11, [x9, #10]            // cand_b->bs
         ld1             {v0.16b}, [x9]            // cand_b->mv
-        add             x11, x8,  w11, uxtw #2
+        add             x11, x8,  w11, uxtw #3
         ldr             h1,  [x9, #8]             // cand_b->ref
-        ldrh            w12, [x11]                // bw8
+        ldr             w12, [x11]                // bw8
         mov             x15, x8
         add             x9,  x9,  w12, uxtw #1    // cand_b += bw8*2
         cmp             x9,  x10
@@ -149,9 +150,9 @@ function save_tmvs_neon, export=1
         ldrb            w15, [x9, #10]            // cand_b->bs
         add             x16, x9,  #8
         ld1             {v4.16b}, [x9]            // cand_b->mv
-        add             x15, x8,  w15, uxtw #2
+        add             x15, x8,  w15, uxtw #3
         ld1             {v1.h}[1], [x16]          // cand_b->ref
-        ldrh            w12, [x15]                // bw8
+        ldr             w12, [x15]                // bw8
         add             x9,  x9,  w12, uxtw #1    // cand_b += bw8*2
         trn1            v2.2d,   v0.2d,   v4.2d
 
@@ -166,12 +167,12 @@ function save_tmvs_neon, export=1
         addp            v1.4h,   v1.4h,   v1.4h   // Combine condition for [1] and [0]
         umov            w16, v1.h[0]              // Extract case for first block
         umov            w17, v1.h[1]
-        ldrh            w11, [x11, #2]            // Fetch jump table entry
-        ldrh            w15, [x15, #2]
+        ldrsw           x11, [x11, #4]            // Fetch jump table entry
+        ldrsw           x15, [x15, #4]
         ldr             q1, [x13, w16, uxtw #4]   // Load permutation table based on case
         ldr             q5, [x13, w17, uxtw #4]
-        sub             x11, x8,  w11, uxtw       // Find jump table target
-        sub             x15, x8,  w15, uxtw
+        add             x11, x8,  x11             // Find jump table target
+        add             x15, x8,  x15
         tbl             v0.16b, {v0.16b}, v1.16b  // Permute cand_b to output refmvs_temporal_block
         tbl             v4.16b, {v4.16b}, v5.16b
 
@@ -243,50 +244,51 @@ function save_tmvs_neon, export=1
         str             q2, [x3, #(16*5-16)]
         add             x3,  x3,  #16*5
         ret
-
-L(save_tmvs_tbl):
-        .hword 16 * 12
-        .hword L(save_tmvs_tbl) - 160b
-        .hword 16 * 12
-        .hword L(save_tmvs_tbl) - 160b
-        .hword 8 * 12
-        .hword L(save_tmvs_tbl) -  80b
-        .hword 8 * 12
-        .hword L(save_tmvs_tbl) -  80b
-        .hword 8 * 12
-        .hword L(save_tmvs_tbl) -  80b
-        .hword 8 * 12
-        .hword L(save_tmvs_tbl) -  80b
-        .hword 4 * 12
-        .hword L(save_tmvs_tbl) -  40b
-        .hword 4 * 12
-        .hword L(save_tmvs_tbl) -  40b
-        .hword 4 * 12
-        .hword L(save_tmvs_tbl) -  40b
-        .hword 4 * 12
-        .hword L(save_tmvs_tbl) -  40b
-        .hword 2 * 12
-        .hword L(save_tmvs_tbl) -  20b
-        .hword 2 * 12
-        .hword L(save_tmvs_tbl) -  20b
-        .hword 2 * 12
-        .hword L(save_tmvs_tbl) -  20b
-        .hword 2 * 12
-        .hword L(save_tmvs_tbl) -  20b
-        .hword 2 * 12
-        .hword L(save_tmvs_tbl) -  20b
-        .hword 1 * 12
-        .hword L(save_tmvs_tbl) -  10b
-        .hword 1 * 12
-        .hword L(save_tmvs_tbl) -  10b
-        .hword 1 * 12
-        .hword L(save_tmvs_tbl) -  10b
-        .hword 1 * 12
-        .hword L(save_tmvs_tbl) -  10b
-        .hword 1 * 12
-        .hword L(save_tmvs_tbl) -  10b
-        .hword 1 * 12
-        .hword L(save_tmvs_tbl) -  10b
-        .hword 1 * 12
-        .hword L(save_tmvs_tbl) -  10b
 endfunc
+
+jumptable save_tmvs_tbl
+        .word 16 * 12
+        .word 160b - save_tmvs_tbl
+        .word 16 * 12
+        .word 160b - save_tmvs_tbl
+        .word 8 * 12
+        .word 80b  - save_tmvs_tbl
+        .word 8 * 12
+        .word 80b  - save_tmvs_tbl
+        .word 8 * 12
+        .word 80b  - save_tmvs_tbl
+        .word 8 * 12
+        .word 80b  - save_tmvs_tbl
+        .word 4 * 12
+        .word 40b  - save_tmvs_tbl
+        .word 4 * 12
+        .word 40b  - save_tmvs_tbl
+        .word 4 * 12
+        .word 40b  - save_tmvs_tbl
+        .word 4 * 12
+        .word 40b  - save_tmvs_tbl
+        .word 2 * 12
+        .word 20b  - save_tmvs_tbl
+        .word 2 * 12
+        .word 20b  - save_tmvs_tbl
+        .word 2 * 12
+        .word 20b  - save_tmvs_tbl
+        .word 2 * 12
+        .word 20b  - save_tmvs_tbl
+        .word 2 * 12
+        .word 20b  - save_tmvs_tbl
+        .word 1 * 12
+        .word 10b  - save_tmvs_tbl
+        .word 1 * 12
+        .word 10b  - save_tmvs_tbl
+        .word 1 * 12
+        .word 10b  - save_tmvs_tbl
+        .word 1 * 12
+        .word 10b  - save_tmvs_tbl
+        .word 1 * 12
+        .word 10b  - save_tmvs_tbl
+        .word 1 * 12
+        .word 10b  - save_tmvs_tbl
+        .word 1 * 12
+        .word 10b  - save_tmvs_tbl
+endjumptable
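
Both tables in refmvs.S now store 32-bit offsets of the targets relative to the table itself, resolved with `ldrsw` plus an `add` of the table address, instead of 16-bit offsets subtracted from it. The sketch below is an editorial C analogue of that base-relative dispatch idea; the function-pointer/integer casts are not strictly portable C and exist only to mirror the assembly pattern.

```c
/* Editorial sketch of "offsets relative to the table, resolved at run time",
 * the pattern behind the new jumptable entries.  Not portable ISO C (it
 * round-trips function pointers through intptr_t), purely illustrative. */
#include <stdint.h>
#include <stdio.h>

static void size4(void)  { puts("handle width 4");  }
static void size8(void)  { puts("handle width 8");  }
static void size16(void) { puts("handle width 16"); }

typedef void (*handler)(void);

int main(void) {
    /* The assembler emits `.word target - table`; here we emulate that by
     * storing each handler's signed distance from a base address. */
    const handler handlers[3] = { size4, size8, size16 };
    const intptr_t base = (intptr_t)handlers[0];
    int32_t offsets[3];
    for (int i = 0; i < 3; i++)
        offsets[i] = (int32_t)((intptr_t)handlers[i] - base);

    for (int i = 0; i < 3; i++) {
        handler h = (handler)(base + offsets[i]);   /* ldrsw + add in the asm */
        h();
    }
    return 0;
}
```
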
diff --git a/src/arm/arm-arch.h b/src/arm/arm-arch.h
new file mode 100644
index 0000000000000000000000000000000000000000..f00b9b2fcecbd105f27a2d958be0da18ea4cc634
--- /dev/null
+++ b/src/arm/arm-arch.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright © 2024, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ARM_ARM_ARCH_H
+#define ARM_ARM_ARCH_H
+
+/* Compatibility header to define __ARM_ARCH with older compilers */
+#ifndef __ARM_ARCH
+
+#ifdef _M_ARM
+#define __ARM_ARCH _M_ARM
+
+#elif defined(__ARM_ARCH_8A__) || defined(_M_ARM64)
+#define __ARM_ARCH 8
+
+#elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
+      defined(__ARM_ARCH_7EM__) || defined(__ARM_ARCH_7R__) || \
+      defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__)
+#define __ARM_ARCH 7
+
+#elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+      defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || \
+      defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__)
+#define __ARM_ARCH 6
+
+#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \
+      defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__)
+#define __ARM_ARCH 5
+
+#elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
+#define __ARM_ARCH 4
+
+#elif defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__)
+#define __ARM_ARCH 3
+
+#elif defined(__ARM_ARCH_2__)
+#define __ARM_ARCH 2
+
+#else
+#error Unknown ARM architecture version
+#endif
+
+#endif /* !__ARM_ARCH */
+
+#endif /* ARM_ARM_ARCH_H */
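
The header above only back-fills `__ARM_ARCH` on toolchains that lack the ACLE macro. A minimal, hypothetical usage sketch follows; the `HAVE_FANCY_PATH` macro is invented, and this only preprocesses when targeting Arm, since the header errors out elsewhere.

```c
/* Hypothetical consumer of src/arm/arm-arch.h: after including it,
 * __ARM_ARCH is usable even with compilers that only define the
 * per-version __ARM_ARCH_7A__ style macros or MSVC's _M_ARM/_M_ARM64. */
#include "src/arm/arm-arch.h"

#if __ARM_ARCH >= 8
#define HAVE_FANCY_PATH 1   /* e.g. AArch64-only code paths */
#else
#define HAVE_FANCY_PATH 0
#endif

int arch_major(void) {
    return __ARM_ARCH;      /* 7 on ARMv7-A, 8 on AArch64, ... */
}
```
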
diff --git a/src/arm/asm.S b/src/arm/asm.S
index fed73b30483cf8b760fbb296d71dea1e8f3413b3..e3731fe732bf9eb7358fcfedf2b7a99577869f5f 100644
--- a/src/arm/asm.S
+++ b/src/arm/asm.S
@@ -323,6 +323,32 @@ EXTERN\name:
 \name:
 .endm
 
+.macro jumptable name
+#ifdef _WIN32
+// MS armasm64 doesn't seem to be able to create relocations for subtraction
+// of labels in different sections; for armasm64 (and all of Windows for
+// simplicity), write the jump table in the text section, to allow calculating
+// differences at assembly time. See
+// https://developercommunity.visualstudio.com/t/armasm64-unable-to-create-cross-section/10722340
+// for reference. (LLVM can create such relocations, but checking for _WIN32
+// for simplicity, as execute-only memory isn't relevant on Windows at the
+// moment.)
+        function \name
+#else
+// For other platforms, write jump tables in a const data section, to allow
+// working in environments where executable memory isn't readable.
+        const \name
+#endif
+.endm
+
+.macro endjumptable
+#ifdef _WIN32
+        endfunc
+#else
+        endconst
+#endif
+.endm
+
 #ifdef __APPLE__
 #define L(x) L ## x
 #else
diff --git a/src/arm/cpu.c b/src/arm/cpu.c
index d9b1751a6ae221c3af7f8fbc108e54a748e088e5..5275b7404a0486dc192a0560ac262df1770053f5 100644
--- a/src/arm/cpu.c
+++ b/src/arm/cpu.c
@@ -29,9 +29,10 @@
 
 #include "common/attributes.h"
 
+#include "src/cpu.h"
 #include "src/arm/cpu.h"
 
-#if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
 #include <sys/auxv.h>
 
 #if ARCH_AARCH64
@@ -42,7 +43,7 @@
 #define HWCAP2_AARCH64_I8MM   (1 << 13)
 
 COLD unsigned dav1d_get_cpu_flags_arm(void) {
-#ifdef HAVE_GETAUXVAL
+#if HAVE_GETAUXVAL
     unsigned long hw_cap = getauxval(AT_HWCAP);
     unsigned long hw_cap2 = getauxval(AT_HWCAP2);
 #else
@@ -52,7 +53,7 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
     elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2));
 #endif
 
-    unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
+    unsigned flags = dav1d_get_default_cpu_flags();
     flags |= (hw_cap & HWCAP_AARCH64_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
     flags |= (hw_cap2 & HWCAP2_AARCH64_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
     flags |= (hw_cap & HWCAP_AARCH64_SVE) ? DAV1D_ARM_CPU_FLAG_SVE : 0;
@@ -68,14 +69,15 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
 #define HWCAP_ARM_I8MM    (1 << 27)
 
 COLD unsigned dav1d_get_cpu_flags_arm(void) {
-#ifdef HAVE_GETAUXVAL
+#if HAVE_GETAUXVAL
     unsigned long hw_cap = getauxval(AT_HWCAP);
 #else
     unsigned long hw_cap = 0;
     elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
 #endif
 
-    unsigned flags = (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+    unsigned flags = dav1d_get_default_cpu_flags();
+    flags |= (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
     flags |= (hw_cap & HWCAP_ARM_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
     flags |= (hw_cap & HWCAP_ARM_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
     return flags;
@@ -95,7 +97,7 @@ static int have_feature(const char *feature) {
 }
 
 COLD unsigned dav1d_get_cpu_flags_arm(void) {
-    unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
+    unsigned flags = dav1d_get_default_cpu_flags();
     if (have_feature("hw.optional.arm.FEAT_DotProd"))
         flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
     if (have_feature("hw.optional.arm.FEAT_I8MM"))
@@ -104,17 +106,67 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
     return flags;
 }
 
+#elif defined(__OpenBSD__) && ARCH_AARCH64
+#include <machine/armreg.h>
+#include <machine/cpu.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+    unsigned flags = dav1d_get_default_cpu_flags();
+
+#ifdef CPU_ID_AA64ISAR0
+    int mib[2];
+    uint64_t isar0;
+    uint64_t isar1;
+    size_t len;
+
+    mib[0] = CTL_MACHDEP;
+    mib[1] = CPU_ID_AA64ISAR0;
+    len = sizeof(isar0);
+    if (sysctl(mib, 2, &isar0, &len, NULL, 0) != -1) {
+        if (ID_AA64ISAR0_DP(isar0) >= ID_AA64ISAR0_DP_IMPL)
+            flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
+    }
+
+    mib[0] = CTL_MACHDEP;
+    mib[1] = CPU_ID_AA64ISAR1;
+    len = sizeof(isar1);
+    if (sysctl(mib, 2, &isar1, &len, NULL, 0) != -1) {
+#ifdef ID_AA64ISAR1_I8MM_IMPL
+        if (ID_AA64ISAR1_I8MM(isar1) >= ID_AA64ISAR1_I8MM_IMPL)
+            flags |= DAV1D_ARM_CPU_FLAG_I8MM;
+#endif
+    }
+#endif
+
+    return flags;
+}
+
 #elif defined(_WIN32)
 #include <windows.h>
 
 COLD unsigned dav1d_get_cpu_flags_arm(void) {
-    unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
+    unsigned flags = dav1d_get_default_cpu_flags();
 #ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
     if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE))
         flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
 #endif
-    /* No I8MM or SVE feature detection available on Windows at the time of
-     * writing. */
+#ifdef PF_ARM_SVE_INSTRUCTIONS_AVAILABLE
+    if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE))
+        flags |= DAV1D_ARM_CPU_FLAG_SVE;
+#endif
+#ifdef PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE
+    if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE))
+        flags |= DAV1D_ARM_CPU_FLAG_SVE2;
+#endif
+#ifdef PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE
+    /* There's no PF_* flag that indicates whether plain I8MM is available
+     * or not. But if SVE_I8MM is available, that also implies that
+     * regular I8MM is available. */
+    if (IsProcessorFeaturePresent(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE))
+        flags |= DAV1D_ARM_CPU_FLAG_I8MM;
+#endif
     return flags;
 }
 
@@ -160,7 +212,8 @@ static unsigned parse_proc_cpuinfo(const char *flag) {
 }
 
 COLD unsigned dav1d_get_cpu_flags_arm(void) {
-    unsigned flags = parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+    unsigned flags = dav1d_get_default_cpu_flags();
+    flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
     flags |= parse_proc_cpuinfo("asimd") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
     flags |= parse_proc_cpuinfo("asimddp") ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
     flags |= parse_proc_cpuinfo("i8mm") ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
@@ -174,7 +227,7 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
 #else  /* Unsupported OS */
 
 COLD unsigned dav1d_get_cpu_flags_arm(void) {
-    return 0;
+    return dav1d_get_default_cpu_flags();
 }
 
 #endif
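
Every detection backend in cpu.c now starts from `dav1d_get_default_cpu_flags()` and ORs in whatever the OS reports, so a failed or unavailable runtime query can never drop below what the build target already guarantees. A stand-alone sketch of that shape, with invented flag names and a stubbed OS query:

```c
/* Editorial sketch of the pattern the cpu.c changes converge on: start from
 * the flags the build itself guarantees, then OR in what the OS reports.
 * FLAG_BASELINE/FLAG_EXTRA and os_reports_extra() are made-up stand-ins. */
#include <stdbool.h>
#include <stdio.h>

#define FLAG_BASELINE 0x1u   /* e.g. NEON, implied by the target triple */
#define FLAG_EXTRA    0x2u   /* e.g. DOTPROD, only known at run time */

static unsigned get_default_flags(void) {
#if defined(__aarch64__) || defined(_M_ARM64)
    return FLAG_BASELINE;    /* always present on this target */
#else
    return 0;
#endif
}

static bool os_reports_extra(void) { return false; } /* stand-in for getauxval/sysctl */

static unsigned get_flags(void) {
    unsigned flags = get_default_flags();
    if (os_reports_extra())
        flags |= FLAG_EXTRA;
    return flags;            /* never less than the build-time baseline */
}

int main(void) {
    printf("flags: 0x%x\n", get_flags());
    return 0;
}
```
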
diff --git a/src/arm/itx.h b/src/arm/itx.h
index 2a58a31322ee6c3afb864d695f217ca544a8de9c..657f85e613231c59ca4b388d92f94db170a7a767 100644
--- a/src/arm/itx.h
+++ b/src/arm/itx.h
@@ -49,7 +49,9 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
 decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
 decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
 
-static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) {
+static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc,
+                                           int *const all_simd)
+{
     const unsigned flags = dav1d_get_cpu_flags();
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
@@ -77,4 +79,5 @@ static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int
     assign_itx1_fn (R, 64, 16, neon);
     assign_itx1_fn (R, 64, 32, neon);
     assign_itx1_fn ( , 64, 64, neon);
+    *all_simd = 1;
 }
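
The new `all_simd` out-parameter reports that every transform was assigned a NEON implementation. How the caller consumes it lives in itx_tmpl.c outside this hunk; the sketch below is a hypothetical wiring with stand-in names, showing the likely intent of skipping the C fallback setup when asm covers everything.

```c
/* Hypothetical caller sketch for the new all_simd flag.  FakeItxDSPContext,
 * fake_init_arm() and fake_init_c() are invented stand-ins; the real hookup
 * is in itx_tmpl.c, which is not part of this hunk. */
#include <stdio.h>

typedef struct { int initialized_by_asm; } FakeItxDSPContext;

static void fake_init_arm(FakeItxDSPContext *c, int bpc, int *all_simd) {
    (void)bpc;
    c->initialized_by_asm = 1;
    *all_simd = 1;              /* every size/type got an asm version */
}
static void fake_init_c(FakeItxDSPContext *c, int bpc) {
    (void)c; (void)bpc;
    puts("assigning C fallbacks");
}

static void itx_dsp_init_sketch(FakeItxDSPContext *const c, const int bpc) {
    int all_simd = 0;
    fake_init_arm(c, bpc, &all_simd);
    if (!all_simd)
        fake_init_c(c, bpc);    /* only needed if some sizes lack asm */
}

int main(void) {
    FakeItxDSPContext c = { 0 };
    itx_dsp_init_sketch(&c, 8);
    return 0;
}
```
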
diff --git a/src/arm/mc.h b/src/arm/mc.h
index dabdab35753e9451f6f61cb4dec9b3db320468aa..bb6f7f8cae60e159613b95695d530b3ff9ac6376 100644
--- a/src/arm/mc.h
+++ b/src/arm/mc.h
@@ -63,6 +63,7 @@
 decl_8tap_fns(neon);
 decl_8tap_fns(neon_dotprod);
 decl_8tap_fns(neon_i8mm);
+decl_8tap_fns(sve2);
 
 decl_mc_fn(BF(dav1d_put_bilin, neon));
 decl_mct_fn(BF(dav1d_prep_bilin, neon));
@@ -110,17 +111,27 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {
     c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
     c->emu_edge = BF(dav1d_emu_edge, neon);
 
-#if ARCH_AARCH64 && BITDEPTH == 8
+#if ARCH_AARCH64
+#if BITDEPTH == 8
 #if HAVE_DOTPROD
-    if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return;
-
-    init_8tap_fns(neon_dotprod);
+    if (flags & DAV1D_ARM_CPU_FLAG_DOTPROD) {
+        init_8tap_fns(neon_dotprod);
+    }
 #endif  // HAVE_DOTPROD
 
 #if HAVE_I8MM
-    if (!(flags & DAV1D_ARM_CPU_FLAG_I8MM)) return;
-
-    init_8tap_fns(neon_i8mm);
+    if (flags & DAV1D_ARM_CPU_FLAG_I8MM) {
+        init_8tap_fns(neon_i8mm);
+    }
 #endif  // HAVE_I8MM
-#endif  // ARCH_AARCH64 && BITDEPTH == 8
+#endif  // BITDEPTH == 8
+
+#if BITDEPTH == 16
+#if HAVE_SVE2
+    if (flags & DAV1D_ARM_CPU_FLAG_SVE2) {
+        init_8tap_fns(sve2);
+    }
+#endif  // HAVE_SVE2
+#endif  // BITDEPTH == 16
+#endif  // ARCH_AARCH64
 }
diff --git a/src/cpu.c b/src/cpu.c
index 9bb85f151b1b656bc6435db4c716e5dc9c43b64e..415266711e11ebc06ce8ed9e9609c576151d0b15 100644
--- a/src/cpu.c
+++ b/src/cpu.c
@@ -33,20 +33,24 @@
 
 #ifdef _WIN32
 #include <windows.h>
-#elif defined(__APPLE__)
+#endif
+#ifdef __APPLE__
 #include <sys/sysctl.h>
 #include <sys/types.h>
-#else
-#include <pthread.h>
+#endif
+#if HAVE_UNISTD_H
 #include <unistd.h>
 #endif
 
-#ifdef HAVE_PTHREAD_NP_H
+#if HAVE_PTHREAD_GETAFFINITY_NP
+#include <pthread.h>
+#if HAVE_PTHREAD_NP_H
 #include <pthread_np.h>
 #endif
 #if defined(__FreeBSD__)
 #define cpu_set_t cpuset_t
 #endif
+#endif
 
 unsigned dav1d_cpu_flags = 0U;
 unsigned dav1d_cpu_flags_mask = ~0U;
@@ -87,7 +91,7 @@ COLD int dav1d_num_logical_processors(Dav1dContext *const c) {
     GetNativeSystemInfo(&system_info);
     return system_info.dwNumberOfProcessors;
 #endif
-#elif defined(HAVE_PTHREAD_GETAFFINITY_NP) && defined(CPU_COUNT)
+#elif HAVE_PTHREAD_GETAFFINITY_NP && defined(CPU_COUNT)
     cpu_set_t affinity;
     if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity))
         return CPU_COUNT(&affinity);
diff --git a/src/cpu.h b/src/cpu.h
index 7205e8e62ff66943cc0ba8169392333de8a6a7d8..c18b7ff1fb97155ce43c4fe541921e49187532e3 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -54,12 +54,9 @@ void dav1d_init_cpu(void);
 DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
 int dav1d_num_logical_processors(Dav1dContext *c);
 
-static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
-    unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
+static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) {
+    unsigned flags = 0;
 
-#if TRIM_DSP_FUNCTIONS
-/* Since this function is inlined, unconditionally setting a flag here will
- * enable dead code elimination in the calling function. */
 #if ARCH_AARCH64 || ARCH_ARM
 #if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
     flags |= DAV1D_ARM_CPU_FLAG_NEON;
@@ -119,6 +116,17 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
     flags |= DAV1D_X86_CPU_FLAG_SSE2;
 #endif
 #endif
+
+    return flags;
+}
+
+static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
+    unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
+
+#if TRIM_DSP_FUNCTIONS
+/* Since this function is inlined, unconditionally setting a flag here will
+ * enable dead code elimination in the calling function. */
+    flags |= dav1d_get_default_cpu_flags();
 #endif
 
     return flags;
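
Factoring the compile-time-known bits into `dav1d_get_default_cpu_flags()` keeps the TRIM_DSP_FUNCTIONS dead-code-elimination trick intact while letting the runtime detectors reuse the same baseline. A small stand-alone illustration of the DCE idea (flag names invented):

```c
/* Editorial illustration of the dead-code-elimination idea behind
 * TRIM_DSP_FUNCTIONS: when get_flags() is inlined and a flag is known to be
 * set at compile time, `if (!(flags & FLAG_NEON))` folds to false and the
 * C fallback path can be dropped from the binary.  FLAG_NEON is invented. */
#include <stdio.h>

#define FLAG_NEON 0x1u

static inline unsigned get_default_flags(void) {
#if defined(__aarch64__)
    return FLAG_NEON;          /* known at compile time on this target */
#else
    return 0;
#endif
}

static inline unsigned get_flags(unsigned runtime_flags) {
    /* runtime detection, plus compile-time-known bits for the optimizer */
    return runtime_flags | get_default_flags();
}

static void use_c_path(void)    { puts("C fallback"); }
static void use_simd_path(void) { puts("SIMD"); }

void dsp_init(unsigned runtime_flags) {
    const unsigned flags = get_flags(runtime_flags);
    if (!(flags & FLAG_NEON)) { /* provably false on AArch64 -> removed */
        use_c_path();
        return;
    }
    use_simd_path();
}

int main(void) {
    dsp_init(0);
    return 0;
}
```
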
diff --git a/src/ctx.c b/src/ctx.c
new file mode 100644
index 0000000000000000000000000000000000000000..0a0fe54c7ceacee1adf1625aa96dd9983b7897c0
--- /dev/null
+++ b/src/ctx.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright © 2024, VideoLAN and dav1d authors
+ * Copyright © 2024, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "ctx.h"
+
+static void memset_w1(void *const ptr, const int value) {
+    set_ctx1((uint8_t *) ptr, 0, value);
+}
+
+static void memset_w2(void *const ptr, const int value) {
+    set_ctx2((uint8_t *) ptr, 0, value);
+}
+
+static void memset_w4(void *const ptr, const int value) {
+    set_ctx4((uint8_t *) ptr, 0, value);
+}
+
+static void memset_w8(void *const ptr, const int value) {
+    set_ctx8((uint8_t *) ptr, 0, value);
+}
+
+static void memset_w16(void *const ptr, const int value) {
+    set_ctx16((uint8_t *) ptr, 0, value);
+}
+
+static void memset_w32(void *const ptr, const int value) {
+    set_ctx32((uint8_t *) ptr, 0, value);
+}
+
+const dav1d_memset_pow2_fn dav1d_memset_pow2[6] = {
+    memset_w1,
+    memset_w2,
+    memset_w4,
+    memset_w8,
+    memset_w16,
+    memset_w32
+};
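
These memset_w* helpers wrap the set_ctxN macros from ctx.h, which broadcast a byte by multiplying with a 0x01…01 constant for the small sizes and fall back to memset for 16 and 32. Below is an editorial demo of the broadcast trick and of indexing such a table by log2 of the size, using simplified stand-ins rather than the dav1d definitions.

```c
/* Editorial demo of byte broadcast by multiplication and of dispatching on
 * log2(size) like dav1d_memset_pow2.  Simplified stand-ins, not dav1d code. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef void (*fill_fn)(uint8_t *dst, int value);

static void fill1(uint8_t *dst, int value) { dst[0] = (uint8_t)value; }
static void fill2(uint8_t *dst, int value) {
    uint16_t p = (uint16_t)((value & 0xff) * 0x0101);
    memcpy(dst, &p, 2);
}
static void fill4(uint8_t *dst, int value) {
    uint32_t p = (uint32_t)(value & 0xff) * 0x01010101u;
    memcpy(dst, &p, 4);
}
static void fill8(uint8_t *dst, int value) {
    /* 0x37 * 0x0101010101010101 == 0x3737373737373737 */
    uint64_t p = (uint64_t)(value & 0xff) * 0x0101010101010101ULL;
    memcpy(dst, &p, 8);        /* dav1d stores through an alias union */
}

static const fill_fn fill_pow2[4] = { fill1, fill2, fill4, fill8 };

int main(void) {
    uint8_t buf[8] = { 0 };
    const int n = 8;                        /* power of two: 1, 2, 4 or 8 */
    const int log2n = 31 - __builtin_clz((unsigned)n); /* much like ulog2() */
    fill_pow2[log2n](buf, 0x37);            /* like dav1d_memset_pow2[ulog2(n)] */
    for (int i = 0; i < 8; i++) printf("%02x ", buf[i]);
    printf("\n");
    return 0;
}
```
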
diff --git a/src/ctx.h b/src/ctx.h
index d0e1f310ae2d9c65637a9da8bd6c10021ff5e2bd..7dea8b68948eebac6b1581b0dbc0a7221a44b8a8 100644
--- a/src/ctx.h
+++ b/src/ctx.h
@@ -31,61 +31,59 @@
 #include <stdint.h>
 
 #include "common/attributes.h"
+#include "common/intops.h"
 
 union alias64 { uint64_t u64; uint8_t u8[8]; } ATTR_ALIAS;
 union alias32 { uint32_t u32; uint8_t u8[4]; } ATTR_ALIAS;
 union alias16 { uint16_t u16; uint8_t u8[2]; } ATTR_ALIAS;
 union alias8 { uint8_t u8; } ATTR_ALIAS;
 
-#define set_ctx_rep4(type, var, off, val) do { \
-        const uint64_t const_val = val; \
-        ((union alias64 *) &var[off +  0])->u64 = const_val; \
-        ((union alias64 *) &var[off +  8])->u64 = const_val; \
-        ((union alias64 *) &var[off + 16])->u64 = const_val; \
-        ((union alias64 *) &var[off + 24])->u64 = const_val; \
+typedef void (*dav1d_memset_pow2_fn)(void *ptr, int value);
+EXTERN const dav1d_memset_pow2_fn dav1d_memset_pow2[6];
+
+static inline void dav1d_memset_likely_pow2(void *const ptr, const int value, const int n) {
+    assert(n >= 1 && n <= 32);
+    if ((n&(n-1)) == 0) {
+        dav1d_memset_pow2[ulog2(n)](ptr, value);
+    } else {
+        memset(ptr, value, n);
+    }
+}
+
+// For the smaller sizes, use multiplication to broadcast bytes, as memset is inefficient for such small fixed sizes.
+// For the larger sizes, we want to use memset to get access to vector operations.
+#define set_ctx1(var, off, val) \
+    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
+#define set_ctx2(var, off, val) \
+    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
+#define set_ctx4(var, off, val) \
+    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
+#define set_ctx8(var, off, val) \
+    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
+#define set_ctx16(var, off, val) do { \
+        memset(&(var)[off], val, 16); \
     } while (0)
-#define set_ctx_rep2(type, var, off, val) do { \
-        const uint64_t const_val = val; \
-        ((union alias64 *) &var[off + 0])->u64 = const_val; \
-        ((union alias64 *) &var[off + 8])->u64 = const_val; \
+#define set_ctx32(var, off, val) do { \
+        memset(&(var)[off], val, 32); \
     } while (0)
-#define set_ctx_rep1(typesz, var, off, val) \
-    ((union alias##typesz *) &var[off])->u##typesz = val
-#define case_set(var, dir, diridx, off) \
-    switch (var) { \
-    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
-    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
-    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
-    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
-    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
-    case 32: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
-    }
-#define case_set_upto16(var, dir, diridx, off) \
-    switch (var) { \
-    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
-    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
-    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
-    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
-    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
-    }
-#define case_set_upto32_with_default(var, dir, diridx, off) \
+#define case_set(var) \
     switch (var) { \
-    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
-    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
-    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
-    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
-    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
-    case 32: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
-    default: default_memset(dir, diridx, off, var); break; \
+    case 0: set_ctx(set_ctx1); break; \
+    case 1: set_ctx(set_ctx2); break; \
+    case 2: set_ctx(set_ctx4); break; \
+    case 3: set_ctx(set_ctx8); break; \
+    case 4: set_ctx(set_ctx16); break; \
+    case 5: set_ctx(set_ctx32); break; \
+    default: assert(0); \
     }
-#define case_set_upto16_with_default(var, dir, diridx, off) \
+#define case_set_upto16(var) \
     switch (var) { \
-    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
-    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
-    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
-    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
-    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
-    default: default_memset(dir, diridx, off, var); break; \
+    case 0: set_ctx(set_ctx1); break; \
+    case 1: set_ctx(set_ctx2); break; \
+    case 2: set_ctx(set_ctx4); break; \
+    case 3: set_ctx(set_ctx8); break; \
+    case 4: set_ctx(set_ctx16); break; \
+    default: assert(0); \
     }
 
 #endif /* DAV1D_SRC_CTX_H */
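
Callers of the rewritten `case_set()` first define a local `set_ctx(rep_macro)` that performs every per-entry store, then dispatch on a log2 element count; the decode.c hunks below show the real uses. Here is a stripped-down, self-contained sketch of that calling convention, with simplified macro bodies and invented data:

```c
/* Editorial sketch of the new case_set() calling convention: the caller
 * defines set_ctx() in terms of the rep_macro it is handed, then dispatches
 * on a log2 size.  The set_ctxN macros are simplified re-definitions here;
 * the real ones live in ctx.h. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define set_ctx1(var, off, val) ((var)[off] = (uint8_t)(val))
#define set_ctx2(var, off, val) memset(&(var)[off], val, 2)
#define set_ctx4(var, off, val) memset(&(var)[off], val, 4)
#define set_ctx8(var, off, val) memset(&(var)[off], val, 8)

#define case_set(lsz) \
    switch (lsz) { \
    case 0: set_ctx(set_ctx1); break; \
    case 1: set_ctx(set_ctx2); break; \
    case 2: set_ctx(set_ctx4); break; \
    case 3: set_ctx(set_ctx8); break; \
    default: assert(0); \
    }

int main(void) {
    uint8_t mode[16] = { 0 }, intra[16] = { 0 };
    const int off = 4, log2_bw4 = 2;        /* fill 4 entries at offset 4 */

#define set_ctx(rep_macro) \
    rep_macro(mode, off, 3); \
    rep_macro(intra, off, 1)
    case_set(log2_bw4);
#undef set_ctx

    for (int i = 0; i < 16; i++) printf("%d", mode[i]);
    printf("\n");
    return 0;
}
```
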
diff --git a/src/decode.c b/src/decode.c
index ea371324216de34f748db67287c9bc9513019726..f5b6db95838d24695489bbf205f3cbe8a4074831 100644
--- a/src/decode.c
+++ b/src/decode.c
@@ -161,14 +161,8 @@ static void read_tx_tree(Dav1dTaskContext *const t,
         }
         t->by -= txsh;
     } else {
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-        rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txh)
-        case_set_upto16(t_dim->h, l., 1, by4);
-#undef set_ctx
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-        rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txw)
-        case_set_upto16(t_dim->w, a->, 0, bx4);
-#undef set_ctx
+        dav1d_memset_pow2[t_dim->lw](&t->a->tx[bx4], is_split ? TX_4X4 : txw);
+        dav1d_memset_pow2[t_dim->lh](&t->l.tx[by4], is_split ? TX_4X4 : txh);
     }
 }
 
@@ -464,19 +458,13 @@ static void read_vartx_tree(Dav1dTaskContext *const t,
     {
         b->max_ytx = b->uvtx = TX_4X4;
         if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-            rep_macro(type, t->dir tx, off, TX_4X4)
-            case_set(bh4, l., 1, by4);
-            case_set(bw4, a->, 0, bx4);
-#undef set_ctx
+            dav1d_memset_pow2[b_dim[2]](&t->a->tx[bx4], TX_4X4);
+            dav1d_memset_pow2[b_dim[3]](&t->l.tx[by4], TX_4X4);
         }
     } else if (f->frame_hdr->txfm_mode != DAV1D_TX_SWITCHABLE || b->skip) {
         if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-            rep_macro(type, t->dir tx, off, mul * b_dim[2 + diridx])
-            case_set(bh4, l., 1, by4);
-            case_set(bw4, a->, 0, bx4);
-#undef set_ctx
+            dav1d_memset_pow2[b_dim[2]](&t->a->tx[bx4], b_dim[2 + 0]);
+            dav1d_memset_pow2[b_dim[3]](&t->l.tx[by4], b_dim[2 + 1]);
         }
         b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
     } else {
@@ -696,8 +684,7 @@ static int decode_b(Dav1dTaskContext *const t,
                     const enum BlockLevel bl,
                     const enum BlockSize bs,
                     const enum BlockPartition bp,
-                    const enum EdgeFlags intra_edge_flags)
-{
+                    const enum EdgeFlags intra_edge_flags) {
     Dav1dTileState *const ts = t->ts;
     const Dav1dFrameContext *const f = t->f;
     Av1Block b_mem, *const b = t->frame_thread.pass ?
@@ -722,11 +709,13 @@ static int decode_b(Dav1dTaskContext *const t,
 
             const enum IntraPredMode y_mode_nofilt =
                 b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-            rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
-            rep_macro(type, t->dir intra, off, mul)
-            case_set(bh4, l., 1, by4);
-            case_set(bw4, a->, 0, bx4);
+#define set_ctx(rep_macro) \
+            rep_macro(edge->mode, off, y_mode_nofilt); \
+            rep_macro(edge->intra, off, 1)
+            BlockContext *edge = t->a;
+            for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
+                case_set(b_dim[2 + i]);
+            }
 #undef set_ctx
             if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
                 refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
@@ -742,11 +731,9 @@ static int decode_b(Dav1dTaskContext *const t,
             }
 
             if (has_chroma) {
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-                rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
-                case_set(cbh4, l., 1, cby4);
-                case_set(cbw4, a->, 0, cbx4);
-#undef set_ctx
+                uint8_t uv_mode = b->uv_mode;
+                dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], uv_mode);
+                dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], uv_mode);
             }
         } else {
             if (IS_INTER_OR_SWITCH(f->frame_hdr) /* not intrabc */ &&
@@ -784,13 +771,15 @@ static int decode_b(Dav1dTaskContext *const t,
             if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
 
             const uint8_t *const filter = dav1d_filter_dir[b->filter2d];
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-            rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
-            rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
-            rep_macro(type, t->dir intra, off, 0)
-            case_set(bh4, l., 1, by4);
-            case_set(bw4, a->, 0, bx4);
+            BlockContext *edge = t->a;
+            for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
+#define set_ctx(rep_macro) \
+                rep_macro(edge->filter[0], off, filter[0]); \
+                rep_macro(edge->filter[1], off, filter[1]); \
+                rep_macro(edge->intra, off, 0)
+                case_set(b_dim[2 + i]);
 #undef set_ctx
+            }
 
             if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
                 refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
@@ -808,11 +797,8 @@ static int decode_b(Dav1dTaskContext *const t,
             }
 
             if (has_chroma) {
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-                rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
-                case_set(cbh4, l., 1, cby4);
-                case_set(cbw4, a->, 0, cbx4);
-#undef set_ctx
+                dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], DC_PRED);
+                dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], DC_PRED);
             }
         }
         return 0;
@@ -1240,39 +1226,39 @@ static int decode_b(Dav1dTaskContext *const t,
                                        has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
                                        has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
         }
-
         // update contexts
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-        rep_macro(type, t->dir tx_intra, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
-        rep_macro(type, t->dir tx, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
-        rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
-        rep_macro(type, t->dir pal_sz, off, mul * b->pal_sz[0]); \
-        rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
-        rep_macro(type, t->dir skip_mode, off, 0); \
-        rep_macro(type, t->dir intra, off, mul); \
-        rep_macro(type, t->dir skip, off, mul * b->skip); \
-        /* see aomedia bug 2183 for why we use luma coordinates here */ \
-        rep_macro(type, t->pal_sz_uv[diridx], off, mul * (has_chroma ? b->pal_sz[1] : 0)); \
-        if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \
-            rep_macro(type, t->dir comp_type, off, mul * COMP_INTER_NONE); \
-            rep_macro(type, t->dir ref[0], off, mul * ((uint8_t) -1)); \
-            rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) -1)); \
-            rep_macro(type, t->dir filter[0], off, mul * DAV1D_N_SWITCHABLE_FILTERS); \
-            rep_macro(type, t->dir filter[1], off, mul * DAV1D_N_SWITCHABLE_FILTERS); \
-        }
         const enum IntraPredMode y_mode_nofilt =
             b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
-        case_set(bh4, l., 1, by4);
-        case_set(bw4, a->, 0, bx4);
+        BlockContext *edge = t->a;
+        for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
+            int t_lsz = ((uint8_t *) &t_dim->lw)[i]; // lw then lh
+#define set_ctx(rep_macro) \
+            rep_macro(edge->tx_intra, off, t_lsz); \
+            rep_macro(edge->tx, off, t_lsz); \
+            rep_macro(edge->mode, off, y_mode_nofilt); \
+            rep_macro(edge->pal_sz, off, b->pal_sz[0]); \
+            rep_macro(edge->seg_pred, off, seg_pred); \
+            rep_macro(edge->skip_mode, off, 0); \
+            rep_macro(edge->intra, off, 1); \
+            rep_macro(edge->skip, off, b->skip); \
+            /* see aomedia bug 2183 for why we use luma coordinates here */ \
+            rep_macro(t->pal_sz_uv[i], off, (has_chroma ? b->pal_sz[1] : 0)); \
+            if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \
+                rep_macro(edge->comp_type, off, COMP_INTER_NONE); \
+                rep_macro(edge->ref[0], off, ((uint8_t) -1)); \
+                rep_macro(edge->ref[1], off, ((uint8_t) -1)); \
+                rep_macro(edge->filter[0], off, DAV1D_N_SWITCHABLE_FILTERS); \
+                rep_macro(edge->filter[1], off, DAV1D_N_SWITCHABLE_FILTERS); \
+            }
+            case_set(b_dim[2 + i]);
 #undef set_ctx
+        }
         if (b->pal_sz[0])
             f->bd_fn.copy_pal_block_y(t, bx4, by4, bw4, bh4);
         if (has_chroma) {
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-                rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
-                case_set(cbh4, l., 1, cby4);
-                case_set(cbw4, a->, 0, cbx4);
-#undef set_ctx
+            uint8_t uv_mode = b->uv_mode;
+            dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], uv_mode);
+            dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], uv_mode);
             if (b->pal_sz[1])
                 f->bd_fn.copy_pal_block_uv(t, bx4, by4, bw4, bh4);
         }
@@ -1374,26 +1360,24 @@ static int decode_b(Dav1dTaskContext *const t,
         }
 
         splat_intrabc_mv(f->c, t, bs, b, bw4, bh4);
-
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-        rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
-        rep_macro(type, t->dir mode, off, mul * DC_PRED); \
-        rep_macro(type, t->dir pal_sz, off, 0); \
-        /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
-        rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
-        rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
-        rep_macro(type, t->dir skip_mode, off, 0); \
-        rep_macro(type, t->dir intra, off, 0); \
-        rep_macro(type, t->dir skip, off, mul * b->skip)
-        case_set(bh4, l., 1, by4);
-        case_set(bw4, a->, 0, bx4);
+        BlockContext *edge = t->a;
+        for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
+#define set_ctx(rep_macro) \
+            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
+            rep_macro(edge->mode, off, DC_PRED); \
+            rep_macro(edge->pal_sz, off, 0); \
+            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
+            rep_macro(t->pal_sz_uv[i], off, 0); \
+            rep_macro(edge->seg_pred, off, seg_pred); \
+            rep_macro(edge->skip_mode, off, 0); \
+            rep_macro(edge->intra, off, 0); \
+            rep_macro(edge->skip, off, b->skip)
+            case_set(b_dim[2 + i]);
 #undef set_ctx
+        }
         if (has_chroma) {
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-            rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
-            case_set(cbh4, l., 1, cby4);
-            case_set(cbw4, a->, 0, cbx4);
-#undef set_ctx
+            dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], DC_PRED);
+            dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], DC_PRED);
         }
     } else {
         // inter-specific mode/mv coding
@@ -1922,32 +1906,29 @@ static int decode_b(Dav1dTaskContext *const t,
             splat_tworef_mv(f->c, t, bs, b, bw4, bh4);
         else
             splat_oneref_mv(f->c, t, bs, b, bw4, bh4);
-
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-        rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
-        rep_macro(type, t->dir skip_mode, off, mul * b->skip_mode); \
-        rep_macro(type, t->dir intra, off, 0); \
-        rep_macro(type, t->dir skip, off, mul * b->skip); \
-        rep_macro(type, t->dir pal_sz, off, 0); \
-        /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
-        rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
-        rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
-        rep_macro(type, t->dir comp_type, off, mul * b->comp_type); \
-        rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
-        rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
-        rep_macro(type, t->dir mode, off, mul * b->inter_mode); \
-        rep_macro(type, t->dir ref[0], off, mul * b->ref[0]); \
-        rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) b->ref[1]))
-        case_set(bh4, l., 1, by4);
-        case_set(bw4, a->, 0, bx4);
+        BlockContext *edge = t->a;
+        for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
+#define set_ctx(rep_macro) \
+            rep_macro(edge->seg_pred, off, seg_pred); \
+            rep_macro(edge->skip_mode, off, b->skip_mode); \
+            rep_macro(edge->intra, off, 0); \
+            rep_macro(edge->skip, off, b->skip); \
+            rep_macro(edge->pal_sz, off, 0); \
+            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
+            rep_macro(t->pal_sz_uv[i], off, 0); \
+            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
+            rep_macro(edge->comp_type, off, b->comp_type); \
+            rep_macro(edge->filter[0], off, filter[0]); \
+            rep_macro(edge->filter[1], off, filter[1]); \
+            rep_macro(edge->mode, off, b->inter_mode); \
+            rep_macro(edge->ref[0], off, b->ref[0]); \
+            rep_macro(edge->ref[1], off, ((uint8_t) b->ref[1]))
+            case_set(b_dim[2 + i]);
 #undef set_ctx
-
+        }
         if (has_chroma) {
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-            rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
-            case_set(cbh4, l., 1, cby4);
-            case_set(cbw4, a->, 0, cbx4);
-#undef set_ctx
+            dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], DC_PRED);
+            dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], DC_PRED);
         }
     }
 
@@ -1956,12 +1937,12 @@ static int decode_b(Dav1dTaskContext *const t,
         f->frame_hdr->segmentation.update_map)
     {
         uint8_t *seg_ptr = &f->cur_segmap[t->by * f->b4_stride + t->bx];
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+#define set_ctx(rep_macro) \
         for (int y = 0; y < bh4; y++) { \
-            rep_macro(type, seg_ptr, 0, mul * b->seg_id); \
+            rep_macro(seg_ptr, 0, b->seg_id); \
             seg_ptr += f->b4_stride; \
         }
-        case_set(bw4, NULL, 0, 0);
+        case_set(b_dim[2]);
 #undef set_ctx
     }
     if (!b->skip) {
@@ -2398,10 +2379,10 @@ static int decode_sb(Dav1dTaskContext *const t, const enum BlockLevel bl,
     }
 
     if (t->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) {
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-        rep_macro(type, t->a->partition, bx8, mul * dav1d_al_part_ctx[0][bl][bp]); \
-        rep_macro(type, t->l.partition, by8, mul * dav1d_al_part_ctx[1][bl][bp])
-        case_set_upto16(hsz,,,);
+#define set_ctx(rep_macro) \
+        rep_macro(t->a->partition, bx8, dav1d_al_part_ctx[0][bl][bp]); \
+        rep_macro(t->l.partition, by8, dav1d_al_part_ctx[1][bl][bp])
+        case_set_upto16(ulog2(hsz));
 #undef set_ctx
     }
 
diff --git a/src/itx_1d.c b/src/itx_1d.c
index 8f75c653afefb10120d2efd488a0e67648f7b773..14e89ca0c886863013976cb0d71bb41136ebf8ab 100644
--- a/src/itx_1d.c
+++ b/src/itx_1d.c
@@ -89,8 +89,8 @@ inv_dct4_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
     c[3 * stride] = CLIP(t0 - t3);
 }
 
-void dav1d_inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,
-                         const int min, const int max)
+static void inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,
+                          const int min, const int max)
 {
     inv_dct4_1d_internal_c(c, stride, min, max, 0);
 }
@@ -142,8 +142,8 @@ inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
     c[7 * stride] = CLIP(t0 - t7);
 }
 
-void dav1d_inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,
-                         const int min, const int max)
+static void inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,
+                          const int min, const int max)
 {
     inv_dct8_1d_internal_c(c, stride, min, max, 0);
 }
@@ -237,8 +237,8 @@ inv_dct16_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
     c[15 * stride] = CLIP(t0 - t15a);
 }
 
-void dav1d_inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,
-                          const int min, const int max)
+static void inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,
+                           const int min, const int max)
 {
     inv_dct16_1d_internal_c(c, stride, min, max, 0);
 }
@@ -427,14 +427,14 @@ inv_dct32_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
     c[31 * stride] = CLIP(t0  - t31);
 }
 
-void dav1d_inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
-                          const int min, const int max)
+static void inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
+                           const int min, const int max)
 {
     inv_dct32_1d_internal_c(c, stride, min, max, 0);
 }
 
-void dav1d_inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,
-                          const int min, const int max)
+static void inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,
+                           const int min, const int max)
 {
     assert(stride > 0);
     inv_dct32_1d_internal_c(c, stride << 1, min, max, 1);
@@ -962,13 +962,13 @@ inv_adst16_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
 }
 
 #define inv_adst_1d(sz) \
-void dav1d_inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
-                               const int min, const int max) \
+static void inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
+                                const int min, const int max) \
 { \
     inv_adst##sz##_1d_internal_c(c, stride, min, max, c, stride); \
 } \
-void dav1d_inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
-                                   const int min, const int max) \
+static void inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
+                                    const int min, const int max) \
 { \
     inv_adst##sz##_1d_internal_c(c, stride, min, max, \
                                  &c[(sz - 1) * stride], -stride); \
@@ -980,8 +980,8 @@ inv_adst_1d(16)
 
 #undef inv_adst_1d
 
-void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
-                              const int min, const int max)
+static void inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
+                               const int min, const int max)
 {
     assert(stride > 0);
     for (int i = 0; i < 4; i++) {
@@ -990,16 +990,16 @@ void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
     }
 }
 
-void dav1d_inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
-                              const int min, const int max)
+static void inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
+                               const int min, const int max)
 {
     assert(stride > 0);
     for (int i = 0; i < 8; i++)
         c[stride * i] *= 2;
 }
 
-void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
-                               const int min, const int max)
+static void inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
+                                const int min, const int max)
 {
     assert(stride > 0);
     for (int i = 0; i < 16; i++) {
@@ -1008,14 +1008,57 @@ void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
     }
 }
 
-void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
-                               const int min, const int max)
+static void inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
+                                const int min, const int max)
 {
     assert(stride > 0);
     for (int i = 0; i < 32; i++)
         c[stride * i] *= 4;
 }
 
+const itx_1d_fn dav1d_tx1d_fns[N_TX_SIZES][N_TX_1D_TYPES] = {
+    [TX_4X4] = {
+        [DCT] = inv_dct4_1d_c,
+        [ADST] = inv_adst4_1d_c,
+        [FLIPADST] = inv_flipadst4_1d_c,
+        [IDENTITY] = inv_identity4_1d_c,
+    }, [TX_8X8] = {
+        [DCT] = inv_dct8_1d_c,
+        [ADST] = inv_adst8_1d_c,
+        [FLIPADST] = inv_flipadst8_1d_c,
+        [IDENTITY] = inv_identity8_1d_c,
+    }, [TX_16X16] = {
+        [DCT] = inv_dct16_1d_c,
+        [ADST] = inv_adst16_1d_c,
+        [FLIPADST] = inv_flipadst16_1d_c,
+        [IDENTITY] = inv_identity16_1d_c,
+    }, [TX_32X32] = {
+        [DCT] = inv_dct32_1d_c,
+        [IDENTITY] = inv_identity32_1d_c,
+    }, [TX_64X64] = {
+        [DCT] = inv_dct64_1d_c,
+    },
+};
+
+const uint8_t /* enum Tx1dType */ dav1d_tx1d_types[N_TX_TYPES][2] = {
+    [DCT_DCT]           = { DCT, DCT },
+    [ADST_DCT]          = { ADST, DCT },
+    [DCT_ADST]          = { DCT, ADST },
+    [ADST_ADST]         = { ADST, ADST },
+    [FLIPADST_DCT]      = { FLIPADST, DCT },
+    [DCT_FLIPADST]      = { DCT, FLIPADST },
+    [FLIPADST_FLIPADST] = { FLIPADST, FLIPADST },
+    [ADST_FLIPADST]     = { ADST, FLIPADST },
+    [FLIPADST_ADST]     = { FLIPADST, ADST },
+    [IDTX]              = { IDENTITY, IDENTITY },
+    [V_DCT]             = { DCT, IDENTITY },
+    [H_DCT]             = { IDENTITY, DCT },
+    [V_ADST]            = { ADST, IDENTITY },
+    [H_ADST]            = { IDENTITY, ADST },
+    [V_FLIPADST]        = { FLIPADST, IDENTITY },
+    [H_FLIPADST]        = { IDENTITY, FLIPADST },
+};
+
 #if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
   ARCH_AARCH64 || \
   (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
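
itx_1d.c now exposes its kernels through two lookup tables instead of a long list of exported symbols: `dav1d_tx1d_types` maps a 2-D transform type to its pair of 1-D types, and `dav1d_tx1d_fns` maps (transform size, 1-D type) to the kernel. The sketch below mirrors how inv_txfm_add_c in itx_tmpl.c chains the two lookups, using tiny stand-in tables and no-op kernels:

```c
/* Editorial sketch of the two-level lookup replacing the exported 1-D
 * symbols: txtp -> pair of 1-D types -> function pointers by size.
 * Tiny stand-ins; the real tables are dav1d_tx1d_types/dav1d_tx1d_fns. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum { DCT, ADST, N_1D };
enum { DCT_DCT, ADST_DCT, N_2D };
enum { TX_4X4, TX_8X8, N_SIZES };

typedef void (*itx_1d_fn)(int32_t *c, ptrdiff_t stride);

static void dct4(int32_t *c, ptrdiff_t s)  { (void)c; (void)s; puts("dct4");  }
static void dct8(int32_t *c, ptrdiff_t s)  { (void)c; (void)s; puts("dct8");  }
static void adst4(int32_t *c, ptrdiff_t s) { (void)c; (void)s; puts("adst4"); }
static void adst8(int32_t *c, ptrdiff_t s) { (void)c; (void)s; puts("adst8"); }

static const uint8_t tx1d_types[N_2D][2] = {
    [DCT_DCT]  = { DCT,  DCT },
    [ADST_DCT] = { ADST, DCT },
};
static const itx_1d_fn tx1d_fns[N_SIZES][N_1D] = {
    [TX_4X4] = { [DCT] = dct4, [ADST] = adst4 },
    [TX_8X8] = { [DCT] = dct8, [ADST] = adst8 },
};

int main(void) {
    const int txtp = ADST_DCT, lw = TX_4X4, lh = TX_8X8;   /* a 4x8 block */
    const uint8_t *const types = tx1d_types[txtp];
    const itx_1d_fn first  = tx1d_fns[lw][types[0]];  /* one 1-D pass */
    const itx_1d_fn second = tx1d_fns[lh][types[1]];  /* the other pass */
    int32_t coeffs[32] = { 0 };
    first(coeffs, 1);
    second(coeffs, 4);
    return 0;
}
```
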
diff --git a/src/itx_1d.h b/src/itx_1d.h
index b63d71b020bdac7da9cd27cdb615874476471ad3..880ac99a358cfa14ebe4ddcb11678c75720c25c0 100644
--- a/src/itx_1d.h
+++ b/src/itx_1d.h
@@ -28,31 +28,25 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include "src/levels.h"
+
 #ifndef DAV1D_SRC_ITX_1D_H
 #define DAV1D_SRC_ITX_1D_H
 
+enum Tx1dType {
+    DCT,
+    ADST,
+    IDENTITY,
+    FLIPADST,
+    N_TX_1D_TYPES,
+};
+
 #define decl_itx_1d_fn(name) \
 void (name)(int32_t *c, ptrdiff_t stride, int min, int max)
 typedef decl_itx_1d_fn(*itx_1d_fn);
 
-decl_itx_1d_fn(dav1d_inv_dct4_1d_c);
-decl_itx_1d_fn(dav1d_inv_dct8_1d_c);
-decl_itx_1d_fn(dav1d_inv_dct16_1d_c);
-decl_itx_1d_fn(dav1d_inv_dct32_1d_c);
-decl_itx_1d_fn(dav1d_inv_dct64_1d_c);
-
-decl_itx_1d_fn(dav1d_inv_adst4_1d_c);
-decl_itx_1d_fn(dav1d_inv_adst8_1d_c);
-decl_itx_1d_fn(dav1d_inv_adst16_1d_c);
-
-decl_itx_1d_fn(dav1d_inv_flipadst4_1d_c);
-decl_itx_1d_fn(dav1d_inv_flipadst8_1d_c);
-decl_itx_1d_fn(dav1d_inv_flipadst16_1d_c);
-
-decl_itx_1d_fn(dav1d_inv_identity4_1d_c);
-decl_itx_1d_fn(dav1d_inv_identity8_1d_c);
-decl_itx_1d_fn(dav1d_inv_identity16_1d_c);
-decl_itx_1d_fn(dav1d_inv_identity32_1d_c);
+EXTERN const itx_1d_fn dav1d_tx1d_fns[N_TX_SIZES][N_TX_1D_TYPES];
+EXTERN const uint8_t /* enum Tx1dType */ dav1d_tx1d_types[N_TX_TYPES][2];
 
 void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride);
 
diff --git a/src/itx_tmpl.c b/src/itx_tmpl.c
index a226223c960273a6d83d50644d7a58c2cda2662a..bafe0a86a6c0909bd3040d6d3ce8a39bbf709c36 100644
--- a/src/itx_tmpl.c
+++ b/src/itx_tmpl.c
@@ -29,6 +29,7 @@
 
 #include <stddef.h>
 #include <stdint.h>
+#include <stdlib.h>
 #include <string.h>
 
 #include "common/attributes.h"
@@ -36,13 +37,17 @@
 
 #include "src/itx.h"
 #include "src/itx_1d.h"
+#include "src/scan.h"
+#include "src/tables.h"
 
 static NOINLINE void
 inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
-               const int eob, const int w, const int h, const int shift,
-               const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
-               const int has_dconly HIGHBD_DECL_SUFFIX)
+               const int eob, const /*enum RectTxfmSize*/ int tx, const int shift,
+               const enum TxfmType txtp HIGHBD_DECL_SUFFIX)
 {
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
+    const int w = 4 * t_dim->w, h = 4 * t_dim->h;
+    const int has_dconly = txtp == DCT_DCT;
     assert(w >= 4 && w <= 64);
     assert(h >= 4 && h <= 64);
     assert(eob >= 0);
@@ -64,6 +69,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
         return;
     }
 
+    const uint8_t *const txtps = dav1d_tx1d_types[txtp];
+    const itx_1d_fn first_1d_fn = dav1d_tx1d_fns[t_dim->lw][txtps[0]];
+    const itx_1d_fn second_1d_fn = dav1d_tx1d_fns[t_dim->lh][txtps[1]];
     const int sh = imin(h, 32), sw = imin(w, 32);
 #if BITDEPTH == 8
     const int row_clip_min = INT16_MIN;
@@ -76,7 +84,16 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
     const int col_clip_max = ~col_clip_min;
 
     int32_t tmp[64 * 64], *c = tmp;
-    for (int y = 0; y < sh; y++, c += w) {
+    int last_nonzero_col; // in first 1d itx
+    if (txtps[1] == IDENTITY && txtps[0] != IDENTITY) {
+        last_nonzero_col = imin(sh - 1, eob);
+    } else if (txtps[0] == IDENTITY && txtps[1] != IDENTITY) {
+        last_nonzero_col = eob >> (t_dim->lw + 2);
+    } else {
+        last_nonzero_col = dav1d_last_nonzero_col_from_eob[tx][eob];
+    }
+    assert(last_nonzero_col < sh);
+    for (int y = 0; y <= last_nonzero_col; y++, c += w) {
         if (is_rect2)
             for (int x = 0; x < sw; x++)
                 c[x] = (coeff[y + x * sh] * 181 + 128) >> 8;
@@ -85,6 +102,8 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
                 c[x] = coeff[y + x * sh];
         first_1d_fn(c, 1, row_clip_min, row_clip_max);
     }
+    if (last_nonzero_col + 1 < sh)
+        memset(c, 0, sizeof(*c) * (sh - last_nonzero_col - 1) * w);
 
     memset(coeff, 0, sizeof(*coeff) * sw * sh);
     for (int i = 0; i < w * sh; i++)
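
The eob-driven bound above keeps the first 1-D pass from visiting rows that can only contain zeros. Here is a small worked example of the two identity shortcuts, with made-up numbers; the general case reads dav1d_last_nonzero_col_from_eob, a generated table not reproduced here.

```c
/* Editorial arithmetic check of the last_nonzero_col selection, for a
 * hypothetical 8-wide block (lw = 1, sh = 8) with eob = 20. */
#include <stdio.h>

static int imin(int a, int b) { return a < b ? a : b; }

int main(void) {
    const int eob = 20, lw = 1, sh = 8;

    /* txtps[1] == IDENTITY, txtps[0] != IDENTITY:
     * last_nonzero_col = min(sh - 1, eob) = min(7, 20) = 7 */
    printf("second pass identity: rows 0..%d\n", imin(sh - 1, eob));

    /* txtps[0] == IDENTITY, txtps[1] != IDENTITY:
     * last_nonzero_col = eob >> (lw + 2) = 20 / 8 = 2 */
    printf("first pass identity:  rows 0..%d\n", eob >> (lw + 2));
    return 0;
}
```
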
@@ -99,7 +118,7 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
             dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4));
 }
 
-#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \
+#define inv_txfm_fn(type1, type2, type, pfx, w, h, shift) \
 static void \
 inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
                                                const ptrdiff_t stride, \
@@ -107,57 +126,56 @@ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
                                                const int eob \
                                                HIGHBD_DECL_SUFFIX) \
 { \
-    inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
-                   dav1d_inv_##type1##w##_1d_c, dav1d_inv_##type2##h##_1d_c, \
-                   has_dconly HIGHBD_TAIL_SUFFIX); \
+    inv_txfm_add_c(dst, stride, coeff, eob, pfx##TX_##w##X##h, shift, type \
+                   HIGHBD_TAIL_SUFFIX); \
 }
 
-#define inv_txfm_fn64(w, h, shift) \
-inv_txfm_fn(dct, dct, w, h, shift, 1)
+#define inv_txfm_fn64(pfx, w, h, shift) \
+inv_txfm_fn(dct, dct, DCT_DCT, pfx, w, h, shift)
 
-#define inv_txfm_fn32(w, h, shift) \
-inv_txfm_fn64(w, h, shift) \
-inv_txfm_fn(identity, identity, w, h, shift, 0)
+#define inv_txfm_fn32(pfx, w, h, shift) \
+inv_txfm_fn64(pfx, w, h, shift) \
+inv_txfm_fn(identity, identity, IDTX, pfx, w, h, shift)
 
-#define inv_txfm_fn16(w, h, shift) \
-inv_txfm_fn32(w, h, shift) \
-inv_txfm_fn(adst,     dct,      w, h, shift, 0) \
-inv_txfm_fn(dct,      adst,     w, h, shift, 0) \
-inv_txfm_fn(adst,     adst,     w, h, shift, 0) \
-inv_txfm_fn(dct,      flipadst, w, h, shift, 0) \
-inv_txfm_fn(flipadst, dct,      w, h, shift, 0) \
-inv_txfm_fn(adst,     flipadst, w, h, shift, 0) \
-inv_txfm_fn(flipadst, adst,     w, h, shift, 0) \
-inv_txfm_fn(flipadst, flipadst, w, h, shift, 0) \
-inv_txfm_fn(identity, dct,      w, h, shift, 0) \
-inv_txfm_fn(dct,      identity, w, h, shift, 0) \
+#define inv_txfm_fn16(pfx, w, h, shift) \
+inv_txfm_fn32(pfx, w, h, shift) \
+inv_txfm_fn(adst,     dct,      ADST_DCT,          pfx, w, h, shift) \
+inv_txfm_fn(dct,      adst,     DCT_ADST,          pfx, w, h, shift) \
+inv_txfm_fn(adst,     adst,     ADST_ADST,         pfx, w, h, shift) \
+inv_txfm_fn(dct,      flipadst, DCT_FLIPADST,      pfx, w, h, shift) \
+inv_txfm_fn(flipadst, dct,      FLIPADST_DCT,      pfx, w, h, shift) \
+inv_txfm_fn(adst,     flipadst, ADST_FLIPADST,     pfx, w, h, shift) \
+inv_txfm_fn(flipadst, adst,     FLIPADST_ADST,     pfx, w, h, shift) \
+inv_txfm_fn(flipadst, flipadst, FLIPADST_FLIPADST, pfx, w, h, shift) \
+inv_txfm_fn(identity, dct,      H_DCT,             pfx, w, h, shift) \
+inv_txfm_fn(dct,      identity, V_DCT,             pfx, w, h, shift) \
 
-#define inv_txfm_fn84(w, h, shift) \
-inv_txfm_fn16(w, h, shift) \
-inv_txfm_fn(identity, flipadst, w, h, shift, 0) \
-inv_txfm_fn(flipadst, identity, w, h, shift, 0) \
-inv_txfm_fn(identity, adst,     w, h, shift, 0) \
-inv_txfm_fn(adst,     identity, w, h, shift, 0) \
+#define inv_txfm_fn84(pfx, w, h, shift) \
+inv_txfm_fn16(pfx, w, h, shift) \
+inv_txfm_fn(identity, flipadst, H_FLIPADST, pfx, w, h, shift) \
+inv_txfm_fn(flipadst, identity, V_FLIPADST, pfx, w, h, shift) \
+inv_txfm_fn(identity, adst,     H_ADST,     pfx, w, h, shift) \
+inv_txfm_fn(adst,     identity, V_ADST,     pfx, w, h, shift) \
 
-inv_txfm_fn84( 4,  4, 0)
-inv_txfm_fn84( 4,  8, 0)
-inv_txfm_fn84( 4, 16, 1)
-inv_txfm_fn84( 8,  4, 0)
-inv_txfm_fn84( 8,  8, 1)
-inv_txfm_fn84( 8, 16, 1)
-inv_txfm_fn32( 8, 32, 2)
-inv_txfm_fn84(16,  4, 1)
-inv_txfm_fn84(16,  8, 1)
-inv_txfm_fn16(16, 16, 2)
-inv_txfm_fn32(16, 32, 1)
-inv_txfm_fn64(16, 64, 2)
-inv_txfm_fn32(32,  8, 2)
-inv_txfm_fn32(32, 16, 1)
-inv_txfm_fn32(32, 32, 2)
-inv_txfm_fn64(32, 64, 1)
-inv_txfm_fn64(64, 16, 2)
-inv_txfm_fn64(64, 32, 1)
-inv_txfm_fn64(64, 64, 2)
+inv_txfm_fn84( ,  4,  4, 0)
+inv_txfm_fn84(R,  4,  8, 0)
+inv_txfm_fn84(R,  4, 16, 1)
+inv_txfm_fn84(R,  8,  4, 0)
+inv_txfm_fn84( ,  8,  8, 1)
+inv_txfm_fn84(R,  8, 16, 1)
+inv_txfm_fn32(R,  8, 32, 2)
+inv_txfm_fn84(R, 16,  4, 1)
+inv_txfm_fn84(R, 16,  8, 1)
+inv_txfm_fn16( , 16, 16, 2)
+inv_txfm_fn32(R, 16, 32, 1)
+inv_txfm_fn64(R, 16, 64, 2)
+inv_txfm_fn32(R, 32,  8, 2)
+inv_txfm_fn32(R, 32, 16, 1)
+inv_txfm_fn32( , 32, 32, 2)
+inv_txfm_fn64(R, 32, 64, 1)
+inv_txfm_fn64(R, 64, 16, 2)
+inv_txfm_fn64(R, 64, 32, 1)
+inv_txfm_fn64( , 64, 64, 2)
 
 #if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
   ARCH_AARCH64 || \
@@ -190,6 +208,8 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
 #include "src/arm/itx.h"
 #elif ARCH_LOONGARCH64
 #include "src/loongarch/itx.h"
+#elif ARCH_PPC64LE
+#include "src/ppc/itx.h"
 #elif ARCH_RISCV
 #include "src/riscv/itx.h"
 #elif ARCH_X86
@@ -267,18 +287,25 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
     assign_itx_all_fn64(64, 32, R);
     assign_itx_all_fn64(64, 64, );
 
+    int all_simd = 0;
 #if HAVE_ASM
 #if ARCH_AARCH64 || ARCH_ARM
-    itx_dsp_init_arm(c, bpc);
+    itx_dsp_init_arm(c, bpc, &all_simd);
 #endif
 #if ARCH_LOONGARCH64
     itx_dsp_init_loongarch(c, bpc);
 #endif
+#if ARCH_PPC64LE
+    itx_dsp_init_ppc(c, bpc);
+#endif
 #if ARCH_RISCV
     itx_dsp_init_riscv(c, bpc);
 #endif
 #if ARCH_X86
-    itx_dsp_init_x86(c, bpc);
+    itx_dsp_init_x86(c, bpc, &all_simd);
 #endif
 #endif
+
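+    // The eob -> last-nonzero-row tables are only used by the C fallback paths,
+    // so skip building them when every function has been replaced by SIMD.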
+    if (!all_simd)
+        dav1d_init_last_nonzero_col_from_eob_tables();
 }
diff --git a/src/lf_mask.c b/src/lf_mask.c
index 09a5c532c4b8435db56e1f9a0add3aa27dbba55d..c81bd9b5f9bd0958e5abb3825481ebd3aef6db9e 100644
--- a/src/lf_mask.c
+++ b/src/lf_mask.c
@@ -64,18 +64,15 @@ static void decomp_tx(uint8_t (*const txa)[2 /* txsz, step */][32 /* y */][32 /*
     } else {
         const int lw = imin(2, t_dim->lw), lh = imin(2, t_dim->lh);
 
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+#define set_ctx(rep_macro) \
         for (int y = 0; y < t_dim->h; y++) { \
-            rep_macro(type, txa[0][0][y], off, mul * lw); \
-            rep_macro(type, txa[1][0][y], off, mul * lh); \
+            rep_macro(txa[0][0][y], 0, lw); \
+            rep_macro(txa[1][0][y], 0, lh); \
             txa[0][1][y][0] = t_dim->w; \
         }
-        case_set_upto16(t_dim->w,,, 0);
-#undef set_ctx
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-        rep_macro(type, txa[1][1][0], off, mul * t_dim->h)
-        case_set_upto16(t_dim->w,,, 0);
+        case_set_upto16(t_dim->lw);
 #undef set_ctx
+        dav1d_memset_pow2[t_dim->lw](txa[1][1][0], t_dim->h);
     }
 }
 
@@ -196,20 +193,8 @@ static inline void mask_edges_intra(uint16_t (*const masks)[32][3][2],
         if (inner2) masks[1][by4 + y][thl4c][1] |= inner2;
     }
 
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-    rep_macro(type, a, off, mul * thl4c)
-#define default_memset(dir, diridx, off, var) \
-    memset(a, thl4c, var)
-    case_set_upto32_with_default(w4,,, 0);
-#undef default_memset
-#undef set_ctx
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-    rep_macro(type, l, off, mul * twl4c)
-#define default_memset(dir, diridx, off, var) \
-    memset(l, twl4c, var)
-    case_set_upto32_with_default(h4,,, 0);
-#undef default_memset
-#undef set_ctx
+    dav1d_memset_likely_pow2(a, thl4c, w4);
+    dav1d_memset_likely_pow2(l, twl4c, h4);
 }
 
 static void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
@@ -267,20 +252,8 @@ static void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
         }
     }
 
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-    rep_macro(type, a, off, mul * thl4c)
-#define default_memset(dir, diridx, off, var) \
-    memset(a, thl4c, var)
-    case_set_upto32_with_default(cw4,,, 0);
-#undef default_memset
-#undef set_ctx
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-    rep_macro(type, l, off, mul * twl4c)
-#define default_memset(dir, diridx, off, var) \
-    memset(l, twl4c, var)
-    case_set_upto32_with_default(ch4,,, 0);
-#undef default_memset
-#undef set_ctx
+    dav1d_memset_likely_pow2(a, thl4c, cw4);
+    dav1d_memset_likely_pow2(l, twl4c, ch4);
 }
 
 void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
diff --git a/src/lib.c b/src/lib.c
index 4d9a2d30e343e3e13c44c23065347bd18030de65..6d2d80dd93aa99389132529a8a31e72739d41d69 100644
--- a/src/lib.c
+++ b/src/lib.c
@@ -31,7 +31,7 @@
 #include <errno.h>
 #include <string.h>
 
-#if defined(__linux__) && defined(HAVE_DLSYM)
+#if defined(__linux__) && HAVE_DLSYM
 #include <dlfcn.h>
 #endif
 
@@ -90,7 +90,7 @@ static void close_internal(Dav1dContext **const c_out, int flush);
 
 NO_SANITIZE("cfi-icall") // CFI is broken with dlsym()
 static COLD size_t get_stack_size_internal(const pthread_attr_t *const thread_attr) {
-#if defined(__linux__) && defined(HAVE_DLSYM) && defined(__GLIBC__)
+#if defined(__linux__) && HAVE_DLSYM && defined(__GLIBC__)
     /* glibc has an issue where the size of the TLS is subtracted from the stack
      * size instead of allocated separately. As a result the specified stack
      * size may be insufficient when used in an application with large amounts
diff --git a/src/loongarch/cpu.c b/src/loongarch/cpu.c
index a79ade5472e6a4fef8c8ce95e2ca89134463377b..383aa01e5de537b715245c56a66635f716f23777 100644
--- a/src/loongarch/cpu.c
+++ b/src/loongarch/cpu.c
@@ -26,9 +26,11 @@
 
 #include "config.h"
 #include "common/attributes.h"
+
+#include "src/cpu.h"
 #include "src/loongarch/cpu.h"
 
-#if defined(HAVE_GETAUXVAL)
+#if HAVE_GETAUXVAL
 #include <sys/auxv.h>
 
 #define LA_HWCAP_LSX    ( 1 << 4 )
@@ -36,8 +38,8 @@
 #endif
 
 COLD unsigned dav1d_get_cpu_flags_loongarch(void) {
-    unsigned flags = 0;
-#if defined(HAVE_GETAUXVAL)
+    unsigned flags = dav1d_get_default_cpu_flags();
+#if HAVE_GETAUXVAL
     unsigned long hw_cap = getauxval(AT_HWCAP);
     flags |= (hw_cap & LA_HWCAP_LSX) ? DAV1D_LOONGARCH_CPU_FLAG_LSX : 0;
     flags |= (hw_cap & LA_HWCAP_LASX) ? DAV1D_LOONGARCH_CPU_FLAG_LASX : 0;
diff --git a/src/mem.c b/src/mem.c
index 7e6eb4c066d1902ad6b5ee7b56c2135f921b2a17..9f0e3944728878550eef48875158c9fa0da168a6 100644
--- a/src/mem.c
+++ b/src/mem.c
@@ -109,16 +109,7 @@ void *dav1d_malloc(const enum AllocationType type, const size_t sz) {
 void *dav1d_alloc_aligned(const enum AllocationType type,
                           const size_t sz, const size_t align)
 {
-    assert(!(align & (align - 1)));
-    void *ptr;
-#ifdef _WIN32
-    ptr = _aligned_malloc(sz + align, align);
-#elif defined(HAVE_POSIX_MEMALIGN)
-    if (posix_memalign(&ptr, align, sz + align)) return NULL;
-#else
-    ptr = memalign(align, sz + align);
-#endif
-
+    void *const ptr = dav1d_alloc_aligned_internal(align, sz + align);
     return track_alloc(type, ptr, sz, align);
 }
 
@@ -140,12 +131,7 @@ void dav1d_free(void *ptr) {
 
 void dav1d_free_aligned(void *ptr) {
     if (ptr) {
-        ptr = track_free(ptr);
-#ifdef _WIN32
-        _aligned_free(ptr);
-#else
-        free(ptr);
-#endif
+        dav1d_free_aligned_internal(track_free(ptr));
     }
 }
 
diff --git a/src/mem.h b/src/mem.h
index 0a8c18d709b85bee708f7f66eb44709dc31244dc..c8c45d314f597399468625a1d6b19ff2299e2c52 100644
--- a/src/mem.h
+++ b/src/mem.h
@@ -32,7 +32,7 @@
 
 #include <stdlib.h>
 
-#if defined(_WIN32) || !defined(HAVE_POSIX_MEMALIGN)
+#if defined(_WIN32) || HAVE_MEMALIGN
 #include <malloc.h>
 #endif
 
@@ -79,39 +79,33 @@ typedef struct Dav1dMemPool {
 #endif
 } Dav1dMemPool;
 
-
-#if TRACK_HEAP_ALLOCATIONS
-void *dav1d_malloc(enum AllocationType type, size_t sz);
-void *dav1d_realloc(enum AllocationType type, void *ptr, size_t sz);
-void *dav1d_alloc_aligned(enum AllocationType type, size_t sz, size_t align);
-void dav1d_free(void *ptr);
-void dav1d_free_aligned(void *ptr);
-void dav1d_log_alloc_stats(Dav1dContext *c);
-#else
-#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
-#define dav1d_malloc(type, sz) malloc(sz)
-#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
-#define dav1d_free(ptr) free(ptr)
+// TODO: Move this to a common location?
+#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1))
 
 /*
  * Allocate align-byte aligned memory. The return value can be released
  * by calling the dav1d_free_aligned() function.
  */
-static inline void *dav1d_alloc_aligned(const size_t sz, const size_t align) {
+static inline void *dav1d_alloc_aligned_internal(const size_t align, const size_t sz) {
     assert(!(align & (align - 1)));
 #ifdef _WIN32
     return _aligned_malloc(sz, align);
-#elif defined(HAVE_POSIX_MEMALIGN)
+#elif HAVE_POSIX_MEMALIGN
     void *ptr;
     if (posix_memalign(&ptr, align, sz)) return NULL;
     return ptr;
-#else
+#elif HAVE_MEMALIGN
     return memalign(align, sz);
+#elif HAVE_ALIGNED_ALLOC
+    // The C11 standard specifies that the size parameter
+    // must be an integral multiple of alignment.
+    return aligned_alloc(align, ROUND_UP(sz, align));
+#else
+#error No aligned allocation functions are available
 #endif
 }
-#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned(sz, align)
 
-static inline void dav1d_free_aligned(void *ptr) {
+static inline void dav1d_free_aligned_internal(void *ptr) {
 #ifdef _WIN32
     _aligned_free(ptr);
 #else
@@ -119,6 +113,20 @@ static inline void dav1d_free_aligned(void *ptr) {
 #endif
 }
 
+#if TRACK_HEAP_ALLOCATIONS
+void *dav1d_malloc(enum AllocationType type, size_t sz);
+void *dav1d_realloc(enum AllocationType type, void *ptr, size_t sz);
+void *dav1d_alloc_aligned(enum AllocationType type, size_t sz, size_t align);
+void dav1d_free(void *ptr);
+void dav1d_free_aligned(void *ptr);
+void dav1d_log_alloc_stats(Dav1dContext *c);
+#else
+#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
+#define dav1d_malloc(type, sz) malloc(sz)
+#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
+#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(align, sz)
+#define dav1d_free(ptr) free(ptr)
+#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
 #endif /* TRACK_HEAP_ALLOCATIONS */
 
 void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf);
diff --git a/src/meson.build b/src/meson.build
index c754668053920f31ba8d86dcd8442db1382e53c1..8dbdc0cc53aaf8c1baa6a90fc69ced68bfef5065 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -30,6 +30,7 @@
 libdav1d_sources = files(
     'cdf.c',
     'cpu.c',
+    'ctx.c',
     'data.c',
     'decode.c',
     'dequant_tables.c',
@@ -119,6 +120,7 @@ if is_asm_enabled
                     'arm/64/loopfilter16.S',
                     'arm/64/looprestoration16.S',
                     'arm/64/mc16.S',
+                    'arm/64/mc16_sve.S',
                 )
             endif
         elif host_machine.cpu_family().startswith('arm')
@@ -256,6 +258,7 @@ if is_asm_enabled
         )}
         arch_flags += {'pwr9': ['-mcpu=power9', '-DDAV1D_PWR9']}
         libdav1d_arch_tmpl_sources += {'pwr9': files(
+            'ppc/itx_tmpl.c',
             'ppc/loopfilter_tmpl.c',
         )}
     elif host_machine.cpu_family().startswith('riscv')
@@ -370,7 +373,7 @@ libdav1d = library('dav1d',
 )
 
 dav1d_dep = declare_dependency(link_with: libdav1d,
-    include_directories : include_directories('../include/dav1d')
+    include_directories : include_directories('../include')
 )
 
 #
diff --git a/src/picture.c b/src/picture.c
index 94365bce8c3ee8453436e03cae223810d0a224cf..290bd095eaa8b285c101b7855029dee1c42ca924 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -201,16 +201,6 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
                                   (void **) &p->progress);
     if (res) return res;
 
-    dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref,
-                             c->mastering_display, c->mastering_display_ref,
-                             c->itut_t35, c->itut_t35_ref, c->n_itut_t35,
-                             &f->tile[0].data.m);
-
-    // Must be removed from the context after being attached to the frame
-    dav1d_ref_dec(&c->itut_t35_ref);
-    c->itut_t35 = NULL;
-    c->n_itut_t35 = 0;
-
     // Don't clear these flags from c->frame_flags if the frame is not going to be output.
     // This way they will be added to the next visible frame too.
     const int flags_mask = ((f->frame_hdr->show_frame || c->output_invisible_frames) &&
@@ -221,6 +211,22 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
 
     p->visible = f->frame_hdr->show_frame;
     p->showable = f->frame_hdr->showable_frame;
+
+    if (p->visible) {
+        // Only add HDR10+ and T35 metadata when show frame flag is enabled
+        dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref,
+                                 c->mastering_display, c->mastering_display_ref,
+                                 c->itut_t35, c->itut_t35_ref, c->n_itut_t35,
+                                 &f->tile[0].data.m);
+
+        // Must be removed from the context after being attached to the frame
+        dav1d_ref_dec(&c->itut_t35_ref);
+        c->itut_t35 = NULL;
+        c->n_itut_t35 = 0;
+    } else {
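+        // Invisible frames still carry the input data properties (timestamps
+        // etc.), just not the HDR10+/T35 metadata.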
+        dav1d_data_props_copy(&p->p.m, &f->tile[0].data.m);
+    }
+
     if (c->n_fc > 1) {
         atomic_init(&p->progress[0], 0);
         atomic_init(&p->progress[1], 0);
diff --git a/src/ppc/cpu.c b/src/ppc/cpu.c
index 53287639de8eebb8c39f28cb2f15307816065433..f58e8fbf07e7f801ab453876f7509706d5696b8b 100644
--- a/src/ppc/cpu.c
+++ b/src/ppc/cpu.c
@@ -29,25 +29,26 @@
 
 #include "common/attributes.h"
 
+#include "src/cpu.h"
 #include "src/ppc/cpu.h"
 
-#if (defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)) && ARCH_PPC64LE
+#define HAVE_AUX ((HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO) && ARCH_PPC64LE)
+#if HAVE_AUX
 #include <sys/auxv.h>
-#define HAVE_AUX
 #endif
 
 COLD unsigned dav1d_get_cpu_flags_ppc(void) {
-    unsigned flags = 0;
-#if defined(HAVE_GETAUXVAL) && ARCH_PPC64LE
+    unsigned flags = dav1d_get_default_cpu_flags();
+#if HAVE_GETAUXVAL && ARCH_PPC64LE
     unsigned long hw_cap = getauxval(AT_HWCAP);
     unsigned long hw_cap2 = getauxval(AT_HWCAP2);
-#elif defined(HAVE_ELF_AUX_INFO) && ARCH_PPC64LE
+#elif HAVE_ELF_AUX_INFO && ARCH_PPC64LE
     unsigned long hw_cap = 0;
     unsigned long hw_cap2 = 0;
     elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
     elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2));
 #endif
-#ifdef HAVE_AUX
+#if HAVE_AUX
     flags |= (hw_cap & PPC_FEATURE_HAS_VSX) ? DAV1D_PPC_CPU_FLAG_VSX : 0;
     flags |= (hw_cap2 & PPC_FEATURE2_ARCH_3_00) ? DAV1D_PPC_CPU_FLAG_PWR9 : 0;
 #endif
diff --git a/src/ppc/itx.h b/src/ppc/itx.h
new file mode 100644
index 0000000000000000000000000000000000000000..6bddf7a38fda20574ea282f6413d22f52e5d28f1
--- /dev/null
+++ b/src/ppc/itx.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2023, Luca Barbato
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+decl_itx17_fns( 4,  4, pwr9);
+decl_itx16_fns( 4,  8, pwr9);
+decl_itx16_fns( 4, 16, pwr9);
+decl_itx16_fns( 8,  4, pwr9);
+decl_itx16_fns( 8,  8, pwr9);
+decl_itx16_fns( 8, 16, pwr9);
+decl_itx2_fns ( 8, 32, pwr9);
+decl_itx16_fns(16,  4, pwr9);
+decl_itx16_fns(16,  8, pwr9);
+decl_itx12_fns(16, 16, pwr9);
+decl_itx2_fns (16, 32, pwr9);
+decl_itx2_fns (32,  8, pwr9);
+decl_itx2_fns (32, 16, pwr9);
+decl_itx2_fns (32, 32, pwr9);
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, pwr9));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, pwr9));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, pwr9));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, pwr9));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, pwr9));
+
+static ALWAYS_INLINE void itx_dsp_init_ppc(Dav1dInvTxfmDSPContext *const c, const int bpc) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_PPC_CPU_FLAG_PWR9)) return;
+
+#if BITDEPTH == 8
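+    // Note: only a subset of the sizes declared above is assigned here.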
+    assign_itx17_fn( ,  4,  4, pwr9);
+    assign_itx16_fn(R,  4,  8, pwr9);
+    assign_itx16_fn(R,  8,  4, pwr9);
+    assign_itx16_fn( ,  8,  8, pwr9);
+    assign_itx16_fn(R,  4, 16, pwr9);
+    assign_itx16_fn(R, 16,  4, pwr9);
+#endif
+}
diff --git a/src/ppc/itx_tmpl.c b/src/ppc/itx_tmpl.c
new file mode 100644
index 0000000000000000000000000000000000000000..818065522e51ea82927fa4922521e8dca3f1cd10
--- /dev/null
+++ b/src/ppc/itx_tmpl.c
@@ -0,0 +1,2006 @@
+/*
+ * Copyright © 2024, VideoLAN and dav1d authors
+ * Copyright © 2024, Luca Barbato
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/ppc/dav1d_types.h"
+#include "src/ppc/itx.h"
+#include "src/ppc/utils.h"
+
+#if BITDEPTH == 8
+
+#define LOAD_4(src, stride, a, b, c, d) \
+{  \
+    uint8_t *s = src; \
+    a = vec_xl(0, s); \
+    s += stride; \
+    b = vec_xl(0, s); \
+    s += stride; \
+    c = vec_xl(0, s); \
+    s += stride; \
+    d = vec_xl(0, s); \
+}
+
+#define LOAD_DECLARE_2_I16(src, a, b) \
+    i16x8 a = vec_xl(0, src); \
+    i16x8 b = vec_xl(0, src + 8);
+
+#define UNPACK_DECLARE_4_I16_I32(sa, sb, a, b, c, d) \
+    i32x4 a = i16h_to_i32(sa); \
+    i32x4 b = i16l_to_i32(sa); \
+    i32x4 c = i16h_to_i32(sb); \
+    i32x4 d = i16l_to_i32(sb);
+
+#define LOAD_COEFF_4(coeff) \
+    LOAD_DECLARE_2_I16(coeff, c01, c23) \
+    UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3)
+
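+// Load 4-wide coefficient rows and pre-scale them by scale/32768 via the
+// rounded high-half multiply vec_mradds (2896*8/32768 = 181/256 ~ 1/sqrt(2),
+// the rectangular-transform scale).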
+#define LOAD_SCALE_COEFF_4x8(coeff, scale) \
+    LOAD_DECLARE_2_I16(coeff, c04, c15) \
+    LOAD_DECLARE_2_I16(coeff+16, c26, c37) \
+    i16x8 c01 = (i16x8)vec_mergeh((i64x2)c04, (i64x2)c15); \
+    i16x8 c23 = (i16x8)vec_mergeh((i64x2)c26, (i64x2)c37); \
+    i16x8 c45 = (i16x8)vec_mergel((i64x2)c04, (i64x2)c15); \
+    i16x8 c67 = (i16x8)vec_mergel((i64x2)c26, (i64x2)c37); \
+    c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \
+    c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \
+    UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \
+    c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \
+    c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \
+    UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7)
+
+#define LOAD_SCALE_COEFF_8x4(coeff, scale) \
+    LOAD_DECLARE_2_I16(coeff, c01, c23) \
+    LOAD_DECLARE_2_I16(coeff+16, c45, c67) \
+    c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \
+    c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \
+    UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \
+    c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \
+    c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \
+    UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7)
+
+#define LOAD_COEFF_8x8(coeff) \
+    LOAD_DECLARE_2_I16(coeff, c0, c1) \
+    LOAD_DECLARE_2_I16(coeff+16, c2, c3) \
+    LOAD_DECLARE_2_I16(coeff+32, c4, c5) \
+    LOAD_DECLARE_2_I16(coeff+48, c6, c7) \
+    UNPACK_DECLARE_4_I16_I32(c0, c1, c0h, c0l, c1h, c1l) \
+    UNPACK_DECLARE_4_I16_I32(c2, c3, c2h, c2l, c3h, c3l) \
+    UNPACK_DECLARE_4_I16_I32(c4, c5, c4h, c4l, c5h, c5l) \
+    UNPACK_DECLARE_4_I16_I32(c6, c7, c6h, c6l, c7h, c7l) \
+
+#define LOAD_COEFF_4x16(coeff) \
+    LOAD_DECLARE_2_I16(coeff,    a0b0, c0d0) \
+    LOAD_DECLARE_2_I16(coeff+16, a1b1, c1d1) \
+    LOAD_DECLARE_2_I16(coeff+32, a2b2, c2d2) \
+    LOAD_DECLARE_2_I16(coeff+48, a3b3, c3d3) \
+    UNPACK_DECLARE_4_I16_I32(a0b0, c0d0, cA0, cB0, cC0, cD0) \
+    UNPACK_DECLARE_4_I16_I32(a1b1, c1d1, cA1, cB1, cC1, cD1) \
+    UNPACK_DECLARE_4_I16_I32(a2b2, c2d2, cA2, cB2, cC2, cD2) \
+    UNPACK_DECLARE_4_I16_I32(a3b3, c3d3, cA3, cB3, cC3, cD3)
+
+#define LOAD_DECLARE_4(src, stride, a, b, c, d) \
+    u8x16 a, b, c, d; \
+    LOAD_4(src, stride, a, b, c, d)
+
+#define STORE_LEN(l, dst, stride, a, b, c, d) \
+{ \
+    uint8_t *dst2 = dst; \
+    vec_xst_len(a, dst2, l); \
+    dst2 += stride; \
+    vec_xst_len(b, dst2, l); \
+    dst2 += stride; \
+    vec_xst_len(c, dst2, l); \
+    dst2 += stride; \
+    vec_xst_len(d, dst2, l); \
+}
+
+#define STORE_4(dst, stride, a, b, c, d) \
+    STORE_LEN(4, dst, stride, a, b, c, d)
+
+#define STORE_8(dst, stride, ab, cd, ef, gh) \
+    STORE_LEN(8, dst, stride, ab, cd, ef, gh)
+
+#define STORE_16(dst, stride, l0, l1, l2, l3) \
+{ \
+    uint8_t *dst##2 = dst; \
+    vec_xst(l0, 0, dst##2); \
+    dst##2 += stride; \
+    vec_xst(l1, 0, dst##2); \
+    dst##2 += stride; \
+    vec_xst(l2, 0, dst##2); \
+    dst##2 += stride; \
+    vec_xst(l3, 0, dst##2); \
+}
+
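+// Round the residual ((c + 8) >> 4), add it to the 8-bit pixels and pack the
+// result back with unsigned saturation.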
+#define APPLY_COEFF_4(a, b, c, d, c01, c23) \
+{ \
+    u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b); \
+    u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d); \
+ \
+    c01 = vec_adds(c01, vec_splat_s16(8)); \
+    c23 = vec_adds(c23, vec_splat_s16(8)); \
+    c01 = vec_sra(c01, vec_splat_u16(4)); \
+    c23 = vec_sra(c23, vec_splat_u16(4)); \
+ \
+    i16x8 abs = u8h_to_i16(ab); \
+    i16x8 cds = u8h_to_i16(cd); \
+ \
+    abs = vec_adds(abs, c01); \
+    cds = vec_adds(cds, c23); \
+ \
+    a = vec_packsu(abs, abs); \
+    c = vec_packsu(cds, cds); \
+ \
+    b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a); \
+    d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c); \
+}
+
+#define APPLY_COEFF_8x4(ab, cd, c01, c23) \
+{ \
+    i16x8 abs = u8h_to_i16(ab); \
+    i16x8 cds = u8h_to_i16(cd); \
+    c01 = vec_adds(c01, vec_splat_s16(8)); \
+    c23 = vec_adds(c23, vec_splat_s16(8)); \
+    c01 = vec_sra(c01, vec_splat_u16(4)); \
+    c23 = vec_sra(c23, vec_splat_u16(4)); \
+ \
+    abs = vec_adds(abs, c01); \
+    cds = vec_adds(cds, c23); \
+ \
+    ab = vec_packsu(abs, abs); \
+    cd = vec_packsu(cds, cds); \
+}
+
+#define APPLY_COEFF_16x4(a, b, c, d, \
+                         c00c01, c02c03, c04c05, c06c07, \
+                         c08c09, c10c11, c12c13, c14c15) \
+{ \
+    i16x8 ah = u8h_to_i16(a); \
+    i16x8 al = u8l_to_i16(a); \
+    i16x8 bh = u8h_to_i16(b); \
+    i16x8 bl = u8l_to_i16(b); \
+    i16x8 ch = u8h_to_i16(c); \
+    i16x8 cl = u8l_to_i16(c); \
+    i16x8 dh = u8h_to_i16(d); \
+    i16x8 dl = u8l_to_i16(d); \
+    SCALE_ROUND_4(c00c01, c02c03, c04c05, c06c07, vec_splat_s16(8), vec_splat_u16(4)) \
+    SCALE_ROUND_4(c08c09, c10c11, c12c13, c14c15, vec_splat_s16(8), vec_splat_u16(4)) \
+ \
+    ah = vec_adds(ah, c00c01); \
+    al = vec_adds(al, c02c03); \
+    bh = vec_adds(bh, c04c05); \
+    bl = vec_adds(bl, c06c07); \
+    ch = vec_adds(ch, c08c09); \
+    cl = vec_adds(cl, c10c11); \
+    dh = vec_adds(dh, c12c13); \
+    dl = vec_adds(dl, c14c15); \
+ \
+    a = vec_packsu(ah, al); \
+    b = vec_packsu(bh, bl); \
+    c = vec_packsu(ch, cl); \
+    d = vec_packsu(dh, dl); \
+}
+
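+// 4-point inverse DCT: 2896/4096 ~ 1/sqrt(2), 1567/4096 ~ cos(3*pi/8) and
+// 3784/4096 ~ sin(3*pi/8), with 12-bit rounding (+2048, >> 12).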
+#define IDCT_4_INNER(c0, c1, c2, c3) \
+{ \
+    i32x4 o0 = vec_add(c0, c2); \
+    i32x4 o1 = vec_sub(c0, c2); \
+ \
+    i32x4 v2896 = vec_splats(2896); \
+    i32x4 v1567 = vec_splats(1567); \
+    i32x4 v3784 = vec_splats(3784); \
+    i32x4 v2048 = vec_splats(2048); \
+ \
+    o0 = vec_mul(o0, v2896); \
+    o1 = vec_mul(o1, v2896); \
+ \
+    i32x4 o2a = vec_mul(c1, v1567); \
+    i32x4 o2b = vec_mul(c3, v3784); \
+    i32x4 o3a = vec_mul(c1, v3784); \
+    i32x4 o3b = vec_mul(c3, v1567); \
+ \
+    i32x4 o2 = vec_sub(o2a, o2b); \
+    i32x4 o3 = vec_add(o3a, o3b); \
+ \
+    u32x4 v12 = vec_splat_u32(12); \
+ \
+    o0 = vec_add(o0, v2048); \
+    o1 = vec_add(o1, v2048); \
+    o2 = vec_add(o2, v2048); \
+    o3 = vec_add(o3, v2048); \
+ \
+    o0 = vec_sra(o0, v12); \
+    o1 = vec_sra(o1, v12); \
+    o2 = vec_sra(o2, v12); \
+    o3 = vec_sra(o3, v12); \
+ \
+    c0 = vec_add(o0, o3); \
+    c1 = vec_add(o1, o2); \
+    c2 = vec_sub(o1, o2); \
+    c3 = vec_sub(o0, o3); \
+ \
+}
+
+#define dct4_for_dct8(c0, c1, c2, c3, c03, c12) \
+    IDCT_4_INNER(c0, c1, c2, c3) \
+    c03 = vec_packs(c0, c3); \
+    c12 = vec_packs(c1, c2); \
+
+#define dct_4_in(c0, c1, c2, c3, c01, c23) \
+{ \
+    IDCT_4_INNER(c0, c1, c2, c3) \
+    c01 = vec_packs(c0, c1); \
+    c23 = vec_packs(c2, c3); \
+    c0 = i16h_to_i32(c01); \
+    c1 = i16l_to_i32(c01); \
+    c2 = i16h_to_i32(c23); \
+    c3 = i16l_to_i32(c23); \
+}
+
+#define dct_4_out(c0, c1, c2, c3, c01, c23) \
+    IDCT_4_INNER(c0, c1, c2, c3) \
+    c01 = vec_packs(c0, c1); \
+    c23 = vec_packs(c2, c3); \
+
+
+#define IDENTITY_4(c01, c23) \
+{ \
+    i16x8 v1697 = vec_splats((int16_t)(1697*8)); \
+    i16x8 o01 = vec_mradds(c01, v1697, vec_splat_s16(0)); \
+    i16x8 o23 = vec_mradds(c23, v1697, vec_splat_s16(0)); \
+    c01 = vec_adds(c01, o01); \
+    c23 = vec_adds(c23, o23); \
+}
+
+#define identity_4_in(c0, c1, c2, c3, c01, c23) \
+{ \
+    IDENTITY_4(c01, c23) \
+    c0 = i16h_to_i32(c01); \
+    c1 = i16l_to_i32(c01); \
+    c2 = i16h_to_i32(c23); \
+    c3 = i16l_to_i32(c23); \
+}
+
+#define identity_4_out(c0, c1, c2, c3, c01, c23) \
+{ \
+    c01 = vec_packs(c0, c1); \
+    c23 = vec_packs(c2, c3); \
+    IDENTITY_4(c01, c23) \
+}
+
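+// 4-point inverse ADST using the AV1 sinpi constants 1321, 2482, 3344 and
+// 3803 (Q12), with 12-bit rounding.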
+#define ADST_INNER_4(c0, c1, c2, c3, oc0, oc1, oc2, oc3) \
+{ \
+    i32x4 v1321 = vec_splats(1321); \
+    i32x4 v3803 = vec_splats(3803); \
+    i32x4 v2482 = vec_splats(2482); \
+    i32x4 v3344 = vec_splats(3344); \
+    i32x4 v2048 = vec_splats(2048); \
+    i32x4 i0_v1321 = vec_mul(c0, v1321); \
+    i32x4 i0_v2482 = vec_mul(c0, v2482); \
+    i32x4 i0_v3803 = vec_mul(c0, v3803); \
+    i32x4 i1 = vec_mul(c1, v3344); \
+    i32x4 i2_v1321 = vec_mul(c2, v1321); \
+    i32x4 i2_v2482 = vec_mul(c2, v2482); \
+    i32x4 i2_v3803 = vec_mul(c2, v3803); \
+    i32x4 i3_v1321 = vec_mul(c3, v1321); \
+    i32x4 i3_v2482 = vec_mul(c3, v2482); \
+    i32x4 i3_v3803 = vec_mul(c3, v3803); \
+ \
+    i32x4 n1 = vec_sub(i1, v2048); \
+    i1 = vec_add(i1, v2048); \
+ \
+ \
+    i32x4 o0 = vec_add(i0_v1321, i2_v3803); \
+    i32x4 o1 = vec_sub(i0_v2482, i2_v1321); \
+    i32x4 o2 = vec_sub(c0, c2); \
+    i32x4 o3 = vec_add(i0_v3803, i2_v2482); \
+ \
+    o0 = vec_add(o0, i3_v2482); \
+    o1 = vec_sub(o1, i3_v3803); \
+    o2 = vec_add(o2, c3); \
+    o3 = vec_sub(o3, i3_v1321); \
+ \
+    o0 = vec_add(o0, i1); \
+    o1 = vec_add(o1, i1); \
+    o2 = vec_mul(o2, v3344); \
+    o3 = vec_sub(o3, n1); \
+ \
+    o2 = vec_add(o2, v2048); \
+ \
+    oc0 = vec_sra(o0, vec_splat_u32(12)); \
+    oc1 = vec_sra(o1, vec_splat_u32(12)); \
+    oc2 = vec_sra(o2, vec_splat_u32(12)); \
+    oc3 = vec_sra(o3, vec_splat_u32(12)); \
+}
+
+#define adst_4_in(c0, c1, c2, c3, c01, c23) \
+{ \
+    ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \
+}
+
+#define flipadst_4_in(c0, c1, c2, c3, c01, c23) \
+{ \
+    ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \
+}
+
+#define adst_4_out(c0, c1, c2, c3, c01, c23) \
+{ \
+    ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \
+    c01 = vec_packs(c0, c1); \
+    c23 = vec_packs(c2, c3); \
+}
+
+#define flipadst_4_out(c0, c1, c2, c3, c01, c23) \
+{ \
+    ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \
+    c01 = vec_packs(c0, c1); \
+    c23 = vec_packs(c2, c3); \
+}
+
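+// DC-only fast paths: mirror the scaling chain of the generic C dc-only case
+// (optional rect2 downscale by 181/256, the two 1-D DCT DC scales, the
+// intermediate rounding shift, and the final >> 12 which folds in the output
+// >> 4), then add the resulting constant to the destination block.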
+static void dc_only_4xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift)
+{
+    int dc = coeff[0];
+    const int rnd = (1 << shift) >> 1;
+    if (is_rect2)
+        dc = (dc * 181 + 128) >> 8;
+    dc = (dc * 181 + 128) >> 8;
+    dc = (dc + rnd) >> shift;
+    dc = (dc * 181 + 128 + 2048) >> 12;
+
+    i16x8 vdc = vec_splats((int16_t)dc);
+    coeff[0] = 0;
+    for (int i = 0; i < n; i++, dst += 4 * stride) {
+        LOAD_DECLARE_4(dst, stride, a, b, c, d)
+
+        i16x8 as = u8h_to_i16(a);
+        i16x8 bs = u8h_to_i16(b);
+        i16x8 cs = u8h_to_i16(c);
+        i16x8 ds = u8h_to_i16(d);
+
+        as = vec_adds(as, vdc);
+        bs = vec_adds(bs, vdc);
+        cs = vec_adds(cs, vdc);
+        ds = vec_adds(ds, vdc);
+
+        a = vec_packsu(as, as);
+        b = vec_packsu(bs, bs);
+        c = vec_packsu(cs, cs);
+        d = vec_packsu(ds, ds);
+
+        STORE_4(dst, stride, a, b, c, d)
+    }
+}
+
+static void dc_only_8xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift)
+{
+    int dc = coeff[0];
+    const int rnd = (1 << shift) >> 1;
+    if (is_rect2)
+        dc = (dc * 181 + 128) >> 8;
+    dc = (dc * 181 + 128) >> 8;
+    dc = (dc + rnd) >> shift;
+    dc = (dc * 181 + 128 + 2048) >> 12;
+
+    i16x8 vdc = vec_splats((int16_t)dc);
+    coeff[0] = 0;
+
+    for (int i = 0; i < n; i++, dst += 4 * stride) {
+        LOAD_DECLARE_4(dst, stride, a, b, c, d)
+
+        i16x8 as = u8h_to_i16(a);
+        i16x8 bs = u8h_to_i16(b);
+        i16x8 cs = u8h_to_i16(c);
+        i16x8 ds = u8h_to_i16(d);
+
+        as = vec_adds(as, vdc);
+        bs = vec_adds(bs, vdc);
+        cs = vec_adds(cs, vdc);
+        ds = vec_adds(ds, vdc);
+
+        a = vec_packsu(as, as);
+        b = vec_packsu(bs, bs);
+        c = vec_packsu(cs, cs);
+        d = vec_packsu(ds, ds);
+
+        STORE_8(dst, stride, a, b, c, d)
+    }
+}
+
+static void dc_only_16xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift)
+{
+    int dc = coeff[0];
+    const int rnd = (1 << shift) >> 1;
+    if (is_rect2)
+        dc = (dc * 181 + 128) >> 8;
+    dc = (dc * 181 + 128) >> 8;
+    dc = (dc + rnd) >> shift;
+    dc = (dc * 181 + 128 + 2048) >> 12;
+
+    i16x8 vdc = vec_splats((int16_t)dc);
+    coeff[0] = 0;
+
+    for (int i = 0; i < n; i++, dst += 4 * stride) {
+        LOAD_DECLARE_4(dst, stride, a, b, c, d)
+
+        i16x8 ah = u8h_to_i16(a);
+        i16x8 bh = u8h_to_i16(b);
+        i16x8 ch = u8h_to_i16(c);
+        i16x8 dh = u8h_to_i16(d);
+        i16x8 al = u8l_to_i16(a);
+        i16x8 bl = u8l_to_i16(b);
+        i16x8 cl = u8l_to_i16(c);
+        i16x8 dl = u8l_to_i16(d);
+
+        ah = vec_adds(ah, vdc);
+        bh = vec_adds(bh, vdc);
+        ch = vec_adds(ch, vdc);
+        dh = vec_adds(dh, vdc);
+        al = vec_adds(al, vdc);
+        bl = vec_adds(bl, vdc);
+        cl = vec_adds(cl, vdc);
+        dl = vec_adds(dl, vdc);
+
+        a = vec_packsu(ah, al);
+        b = vec_packsu(bh, bl);
+        c = vec_packsu(ch, cl);
+        d = vec_packsu(dh, dl);
+
+        STORE_16(dst, stride, a, b, c, d)
+    }
+}
+
+void dav1d_inv_txfm_add_dct_dct_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
+                                              int16_t *const coeff, const int eob)
+{
+    assert(eob >= 0);
+
+    if (eob < 1) {
+        return dc_only_4xN(dst, stride, coeff, 1, 0, 0);
+    }
+
+    LOAD_COEFF_4(coeff)
+
+    dct_4_in(c0, c1, c2, c3, c01, c23)
+
+    TRANSPOSE4_I32(c0, c1, c2, c3)
+
+    memset(coeff, 0, sizeof(*coeff) * 4 * 4);
+
+    dct_4_out(c0, c1, c2, c3, c01, c23)
+
+    LOAD_DECLARE_4(dst, stride, a, b, c, d)
+
+    APPLY_COEFF_4(a, b, c, d, c01, c23)
+
+    STORE_4(dst, stride, a, b, c, d)
+}
+
+void dav1d_inv_txfm_add_wht_wht_4x4_8bpc_pwr9(pixel *dst, const ptrdiff_t stride,
+                                              coef *const coeff, const int eob)
+{
+    LOAD_COEFF_4(coeff)
+
+    u32x4 v2 = vec_splat_u32(2);
+
+    c0 = vec_sra(c0, v2);
+    c1 = vec_sra(c1, v2);
+    c2 = vec_sra(c2, v2);
+    c3 = vec_sra(c3, v2);
+
+    i32x4 t0 = vec_add(c0, c1);
+    i32x4 t2 = vec_sub(c2, c3);
+    i32x4 t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1));
+    i32x4 t3 = vec_sub(t4, c3);
+    i32x4 t1 = vec_sub(t4, c1);
+    c0 = vec_sub(t0, t3);
+    c1 = t3;
+    c2 = t1;
+    c3 = vec_add(t2, t1);
+
+    memset(coeff, 0, sizeof(*coeff) * 4 * 4);
+
+    TRANSPOSE4_I32(c0, c1, c2, c3)
+
+    t0 = vec_add(c0, c1);
+    t2 = vec_sub(c2, c3);
+    t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1));
+    t3 = vec_sub(t4, c3);
+    t1 = vec_sub(t4, c1);
+    c0 = vec_sub(t0, t3);
+    c1 = t3;
+    c2 = t1;
+    c3 = vec_add(t2, t1);
+
+    c01 = vec_packs(c0, c1);
+    c23 = vec_packs(c2, c3);
+
+    LOAD_DECLARE_4(dst, stride, a, b, c, d)
+
+    u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b);
+    u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d);
+
+    i16x8 abs = u8h_to_i16(ab);
+    i16x8 cds = u8h_to_i16(cd);
+
+    abs = vec_adds(abs, c01);
+    cds = vec_adds(cds, c23);
+
+    a = vec_packsu(abs, abs);
+    c = vec_packsu(cds, cds);
+
+    b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a);
+    d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c);
+
+    STORE_4(dst, stride, a, b, c, d)
+}
+
+#define inv_txfm_fn4x4(type1, type2) \
+void dav1d_inv_txfm_add_##type1##_##type2##_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
+                                                          int16_t *const coeff, const int eob) \
+{ \
+    LOAD_COEFF_4(coeff) \
+    type1##_4_in(c0, c1, c2, c3, c01, c23) \
+    memset(coeff, 0, sizeof(*coeff) * 4 * 4); \
+    TRANSPOSE4_I32(c0, c1, c2, c3) \
+    type2##_4_out(c0, c1, c2, c3, c01, c23) \
+    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
+    APPLY_COEFF_4(a, b, c, d, c01, c23) \
+    STORE_4(dst, stride, a, b, c, d) \
+}
+
+inv_txfm_fn4x4(adst,     dct     )
+inv_txfm_fn4x4(dct,      adst    )
+inv_txfm_fn4x4(dct,      flipadst)
+inv_txfm_fn4x4(flipadst, dct     )
+inv_txfm_fn4x4(adst,     flipadst)
+inv_txfm_fn4x4(flipadst, adst    )
+inv_txfm_fn4x4(identity, dct     )
+inv_txfm_fn4x4(dct,      identity)
+inv_txfm_fn4x4(identity, flipadst)
+inv_txfm_fn4x4(flipadst, identity)
+inv_txfm_fn4x4(identity, adst    )
+inv_txfm_fn4x4(adst,     identity)
+inv_txfm_fn4x4(identity, identity)
+inv_txfm_fn4x4(adst,     adst    )
+inv_txfm_fn4x4(flipadst, flipadst)
+
+
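+// 8-point inverse DCT: even half via dct4_for_dct8, odd half using the
+// 799/4017 and 3406/2276 rotations, finished with a 181/256 (1/sqrt(2))
+// butterfly for t5/t6.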
+#define IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \
+    dct4_for_dct8(c0, c2, c4, c6, c03, c12) \
+ \
+    i32x4 v799 = vec_splats(799); \
+    i32x4 v4017 = vec_splats(4017); \
+    i32x4 v3406 = vec_splats(3406); \
+    i32x4 v2276 = vec_splats(2276); \
+    i32x4 v2048 = vec_splats(2048); \
+    u32x4 v12 = vec_splat_u32(12); \
+ \
+    i32x4 c1v799 = vec_mul(c1, v799); \
+    i32x4 c7v4017 = vec_mul(c7, v4017); \
+    i32x4 c5v3406 = vec_mul(c5, v3406); \
+    i32x4 c3v2276 = vec_mul(c3, v2276); \
+    i32x4 c5v2276 = vec_mul(c5, v2276); \
+    i32x4 c3v3406 = vec_mul(c3, v3406); \
+    i32x4 c1v4017 = vec_mul(c1, v4017); \
+    i32x4 c7v799 = vec_mul(c7, v799); \
+ \
+    i32x4 t4a = vec_subs(c1v799, c7v4017); \
+    i32x4 t5a = vec_subs(c5v3406, c3v2276); \
+    i32x4 t6a = vec_adds(c5v2276, c3v3406); \
+    i32x4 t7a = vec_adds(c1v4017, c7v799); \
+ \
+    t4a = vec_adds(t4a, v2048); \
+    t5a = vec_adds(t5a, v2048); \
+    t6a = vec_adds(t6a, v2048); \
+    t7a = vec_adds(t7a, v2048); \
+ \
+    t4a = vec_sra(t4a, v12); \
+    t7a = vec_sra(t7a, v12); \
+    t5a = vec_sra(t5a, v12); \
+    t6a = vec_sra(t6a, v12); \
+ \
+    i16x8 t7at4a = vec_packs(t7a, t4a); \
+    i16x8 t6at5a = vec_packs(t6a, t5a); \
+ \
+    i16x8 t7t4 = vec_adds(t7at4a, t6at5a); \
+    t6at5a = vec_subs(t7at4a, t6at5a); \
+ \
+    t6a = i16h_to_i32(t6at5a); \
+    t5a = i16l_to_i32(t6at5a); \
+ \
+    i32x4 t6 = vec_add(t6a, t5a); \
+    i32x4 t5 = vec_sub(t6a, t5a); \
+ \
+    t6 = vec_mul(t6, vec_splats(181)); \
+    t5 = vec_mul(t5, vec_splats(181)); \
+    t6 = vec_add(t6, vec_splats(128)); \
+    t5 = vec_add(t5, vec_splats(128)); \
+ \
+    t6 = vec_sra(t6, vec_splat_u32(8)); \
+    t5 = vec_sra(t5, vec_splat_u32(8)); \
+ \
+    i16x8 t6t5 = vec_packs(t6, t5); \
+ \
+    c74 = vec_subs(c03, t7t4); \
+    c65 = vec_subs(c12, t6t5); \
+    c03 = vec_adds(c03, t7t4); \
+    c12 = vec_adds(c12, t6t5); \
+
+#define UNPACK_4_I16_I32(t0, t1, t2, t3) \
+    t0 = i16h_to_i32(t0##t1); \
+    t1 = i16l_to_i32(t0##t1); \
+    t2 = i16h_to_i32(t2##t3); \
+    t3 = i16l_to_i32(t2##t3);
+
+#define UNPACK_PAIR_I16_I32(hi, lo, v) \
+    hi = i16h_to_i32(v); \
+    lo = i16l_to_i32(v); \
+
+
+#define dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, ...) \
+{ \
+    i16x8 c0##c3, c1##c2, c7##c4, c6##c5; \
+    IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c0##c3, c1##c2, c7##c4, c6##c5) \
+    UNPACK_4_I16_I32(c0, c3, c1, c2) \
+    UNPACK_4_I16_I32(c7, c4, c6, c5) \
+}
+
+#define dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
+{ \
+    i16x8 c03, c12, c74, c65; \
+    IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \
+    c01 = (i16x8)vec_mergeh((u64x2)c03, (u64x2)c12); \
+    c23 = (i16x8)vec_mergel((u64x2)c12, (u64x2)c03); \
+    c45 = (i16x8)vec_mergel((u64x2)c74, (u64x2)c65); \
+    c67 = (i16x8)vec_mergeh((u64x2)c65, (u64x2)c74); \
+}
+
+#define dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
+                   c0, c1, c2, c3, c4, c5, c6, c7) \
+{ \
+    dct_8_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,) \
+    dct_8_in(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,) \
+}
+
+#define dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
+                    c0, c1, c2, c3, c4, c5, c6, c7) \
+{ \
+    i16x8 c03h, c12h, c74h, c65h; \
+    i16x8 c03l, c12l, c74l, c65l; \
+    { \
+        IDCT_8_INNER(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, c03h, c12h, c74h, c65h) \
+    } \
+    { \
+        IDCT_8_INNER(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, c03l, c12l, c74l, c65l) \
+    } \
+    c0 = (i16x8)vec_mergeh((u64x2)c03h, (u64x2)c03l); \
+    c3 = (i16x8)vec_mergel((u64x2)c03h, (u64x2)c03l); \
+    c1 = (i16x8)vec_mergeh((u64x2)c12h, (u64x2)c12l); \
+    c2 = (i16x8)vec_mergel((u64x2)c12h, (u64x2)c12l); \
+    c7 = (i16x8)vec_mergeh((u64x2)c74h, (u64x2)c74l); \
+    c4 = (i16x8)vec_mergel((u64x2)c74h, (u64x2)c74l); \
+    c6 = (i16x8)vec_mergeh((u64x2)c65h, (u64x2)c65l); \
+    c5 = (i16x8)vec_mergel((u64x2)c65h, (u64x2)c65l); \
+}
+
+#define IDENTITY_8(c01, c23, c45, c67) \
+{ \
+    c01 = vec_adds(c01, c01); \
+    c23 = vec_adds(c23, c23); \
+    c45 = vec_adds(c45, c45); \
+    c67 = vec_adds(c67, c67); \
+}
+
+#define identity_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
+{ \
+    IDENTITY_8(c01, c23, c45, c67) \
+    UNPACK_PAIR_I16_I32(c0, c1, c01) \
+    UNPACK_PAIR_I16_I32(c2, c3, c23) \
+    UNPACK_PAIR_I16_I32(c4, c5, c45) \
+    UNPACK_PAIR_I16_I32(c6, c7, c67) \
+}
+
+#define identity_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
+    c01 = vec_packs(c0, c1); \
+    c23 = vec_packs(c2, c3); \
+    c45 = vec_packs(c4, c5); \
+    c67 = vec_packs(c6, c7); \
+    IDENTITY_8(c01, c23, c45, c67)
+
+#define identity_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                        c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
+                        c0, c1, c2, c3, c4, c5, c6, c7) \
+{ \
+    IDENTITY_8(c0, c1, c2, c3) \
+    IDENTITY_8(c4, c5, c6, c7) \
+    UNPACK_PAIR_I16_I32(c0h, c0l, c0) \
+    UNPACK_PAIR_I16_I32(c1h, c1l, c1) \
+    UNPACK_PAIR_I16_I32(c2h, c2l, c2) \
+    UNPACK_PAIR_I16_I32(c3h, c3l, c3) \
+    UNPACK_PAIR_I16_I32(c4h, c4l, c4) \
+    UNPACK_PAIR_I16_I32(c5h, c5l, c5) \
+    UNPACK_PAIR_I16_I32(c6h, c6l, c6) \
+    UNPACK_PAIR_I16_I32(c7h, c7l, c7) \
+}
+
+#define PACK_4(c0, c1, c2, c3, \
+               c0h, c1h, c2h, c3h, \
+               c0l, c1l, c2l, c3l) \
+{ \
+    c0 = vec_packs(c0h, c0l); \
+    c1 = vec_packs(c1h, c1l); \
+    c2 = vec_packs(c2h, c2l); \
+    c3 = vec_packs(c3h, c3l); \
+}
+
+#define DECLARE_PACK_4(c0, c1, c2, c3, \
+                       c0h, c1h, c2h, c3h, \
+                       c0l, c1l, c2l, c3l) \
+    i16x8 c0, c1, c2, c3; \
+    PACK_4(c0, c1, c2, c3, c0h, c1h, c2h, c3h, c0l, c1l, c2l, c3l);
+
+#define PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
+               c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+               c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
+{ \
+    c0 = vec_packs(c0h, c0l); \
+    c1 = vec_packs(c1h, c1l); \
+    c2 = vec_packs(c2h, c2l); \
+    c3 = vec_packs(c3h, c3l); \
+    c4 = vec_packs(c4h, c4l); \
+    c5 = vec_packs(c5h, c5l); \
+    c6 = vec_packs(c6h, c6l); \
+    c7 = vec_packs(c7h, c7l); \
+}
+
+#define identity_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                         c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
+                         c0, c1, c2, c3, c4, c5, c6, c7) \
+{ \
+    PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
+           c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+           c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
+    IDENTITY_8(c0, c1, c2, c3) \
+    IDENTITY_8(c4, c5, c6, c7) \
+}
+
+#define DECLARE_SPLAT_I32(val) \
+    i32x4 v##val = vec_splats(val);
+
+#define DECLARE_MUL_PAIR_I32(ca, cb, va, vb) \
+    i32x4 ca##va = vec_mul(ca, va); \
+    i32x4 cb##vb = vec_mul(cb, vb); \
+    i32x4 ca##vb = vec_mul(ca, vb); \
+    i32x4 cb##va = vec_mul(cb, va);
+
+#define ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \
+    r0 = vec_adds(ca##va, cb##vb); \
+    r1 = vec_subs(ca##vb, cb##va);
+
+#define DECLARE_ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \
+    i32x4 r0, r1; \
+    ADD_SUB_PAIR(r0, r1, ca, cb, va, vb)
+
+#define SCALE_ROUND_4(a, b, c, d, rnd, shift) \
+    a = vec_adds(a, rnd); \
+    b = vec_adds(b, rnd); \
+    c = vec_adds(c, rnd); \
+    d = vec_adds(d, rnd); \
+    a = vec_sra(a, shift); \
+    b = vec_sra(b, shift); \
+    c = vec_sra(c, shift); \
+    d = vec_sra(d, shift);
+
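+// 8-point inverse ADST, mirroring the scalar reference butterflies; the
+// saturating vec_packs between stages provides the intermediate 16-bit
+// clipping of the 8 bpc path.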
+#define ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
+                     o0, o1, o2, o3, o4, o5, o6, o7) \
+{ \
+    DECLARE_SPLAT_I32(4076) \
+    DECLARE_SPLAT_I32(401) \
+ \
+    DECLARE_SPLAT_I32(3612) \
+    DECLARE_SPLAT_I32(1931) \
+ \
+    DECLARE_SPLAT_I32(2598) \
+    DECLARE_SPLAT_I32(3166) \
+ \
+    DECLARE_SPLAT_I32(1189) \
+    DECLARE_SPLAT_I32(3920) \
+ \
+    DECLARE_SPLAT_I32(3784) \
+    DECLARE_SPLAT_I32(1567) \
+ \
+    DECLARE_SPLAT_I32(2048) \
+    u32x4 v12 = vec_splat_u32(12); \
+ \
+    DECLARE_MUL_PAIR_I32(c7, c0, v4076, v401) \
+    DECLARE_MUL_PAIR_I32(c5, c2, v3612, v1931) \
+    DECLARE_MUL_PAIR_I32(c3, c4, v2598, v3166) \
+    DECLARE_MUL_PAIR_I32(c1, c6, v1189, v3920) \
+ \
+    DECLARE_ADD_SUB_PAIR(t0a, t1a, c7, c0, v4076, v401) \
+    DECLARE_ADD_SUB_PAIR(t2a, t3a, c5, c2, v3612, v1931) \
+    DECLARE_ADD_SUB_PAIR(t4a, t5a, c3, c4, v2598, v3166) \
+    DECLARE_ADD_SUB_PAIR(t6a, t7a, c1, c6, v1189, v3920) \
+ \
+    SCALE_ROUND_4(t0a, t1a, t2a, t3a, v2048, v12) \
+    SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \
+ \
+    i32x4 t0 = vec_add(t0a, t4a); \
+    i32x4 t1 = vec_add(t1a, t5a); \
+    i32x4 t2 = vec_add(t2a, t6a); \
+    i32x4 t3 = vec_add(t3a, t7a); \
+    i32x4 t4 = vec_sub(t0a, t4a); \
+    i32x4 t5 = vec_sub(t1a, t5a); \
+    i32x4 t6 = vec_sub(t2a, t6a); \
+    i32x4 t7 = vec_sub(t3a, t7a); \
+ \
+    i16x8 t0t1 = vec_packs(t0, t1); \
+    i16x8 t2t3 = vec_packs(t2, t3); \
+    i16x8 t4t5 = vec_packs(t4, t5); \
+    i16x8 t6t7 = vec_packs(t6, t7); \
+ \
+    UNPACK_4_I16_I32(t4, t5, t6, t7) \
+    UNPACK_4_I16_I32(t0, t1, t2, t3) \
+ \
+    DECLARE_MUL_PAIR_I32(t4, t5, v3784, v1567) \
+    DECLARE_MUL_PAIR_I32(t7, t6, v3784, v1567) \
+ \
+    ADD_SUB_PAIR(t4a, t5a, t4, t5, v3784, v1567) \
+    ADD_SUB_PAIR(t7a, t6a, t7, t6, v1567, v3784) \
+ \
+    SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \
+  \
+    o0 = vec_add(t0, t2); \
+    o1 = vec_add(t4a, t6a); \
+    o7 = vec_add(t1, t3); \
+    o6 = vec_add(t5a, t7a); \
+    t2 = vec_sub(t0, t2); \
+    t3 = vec_sub(t1, t3); \
+    t6 = vec_sub(t4a, t6a); \
+    t7 = vec_sub(t5a, t7a); \
+ \
+    i16x8 o7##o1 = vec_packs(o7, o1); \
+    i16x8 o0##o6 = vec_packs(o0, o6); \
+    t2t3 = vec_packs(t2, t3); \
+    t6t7 = vec_packs(t6, t7); \
+ \
+    UNPACK_4_I16_I32(t2, t3, t6, t7) \
+    UNPACK_4_I16_I32(o7, o1, o0, o6) \
+ \
+    o7 = -o7; \
+    o1 = -o1; \
+ \
+    o3 = vec_add(t2, t3); \
+    o4 = vec_sub(t2, t3); \
+    o5 = vec_sub(t6, t7); \
+    o2 = vec_add(t6, t7); \
+ \
+    i32x4 v181 = vec_splats(181); \
+    i32x4 v128 = vec_splats(128); \
+    u32x4 v8 = vec_splat_u32(8); \
+ \
+    o2 = vec_mul(o2, v181); \
+    o3 = vec_mul(o3, v181); \
+    o4 = vec_mul(o4, v181); \
+    o5 = vec_mul(o5, v181); \
+ \
+    SCALE_ROUND_4(o2, o3, o4, o5, v128, v8) \
+ \
+    o3 = -o3; \
+    o5 = -o5; \
+}
+
+#define adst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
+{\
+    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
+                 c0, c1, c2, c3, c4, c5, c6, c7) \
+    c01 = vec_packs(c0, c1); \
+    c23 = vec_packs(c2, c3); \
+    c45 = vec_packs(c4, c5); \
+    c67 = vec_packs(c6, c7); \
+    UNPACK_PAIR_I16_I32(c0, c1, c01) \
+    UNPACK_PAIR_I16_I32(c2, c3, c23) \
+    UNPACK_PAIR_I16_I32(c4, c5, c45) \
+    UNPACK_PAIR_I16_I32(c6, c7, c67) \
+}
+
+#define adst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
+{\
+    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
+                 c0, c1, c2, c3, c4, c5, c6, c7) \
+    c01 = vec_packs(c0, c1); \
+    c23 = vec_packs(c2, c3); \
+    c45 = vec_packs(c4, c5); \
+    c67 = vec_packs(c6, c7); \
+}
+
+#define adst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
+                    c0, c1, c2, c3, c4, c5, c6, c7) \
+{ \
+    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                 c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \
+    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
+                 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
+}
+
+#define adst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
+                    c0, c1, c2, c3, c4, c5, c6, c7) \
+{ \
+    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                 c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \
+    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
+                 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
+    PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
+           c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+           c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
+}
+
+#define flipadst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
+{\
+    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
+                 c7, c6, c5, c4, c3, c2, c1, c0) \
+    c01 = vec_packs(c0, c1); \
+    c23 = vec_packs(c2, c3); \
+    c45 = vec_packs(c4, c5); \
+    c67 = vec_packs(c6, c7); \
+    UNPACK_PAIR_I16_I32(c0, c1, c01) \
+    UNPACK_PAIR_I16_I32(c2, c3, c23) \
+    UNPACK_PAIR_I16_I32(c4, c5, c45) \
+    UNPACK_PAIR_I16_I32(c6, c7, c67) \
+}
+
+#define flipadst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
+{\
+    ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
+                 c7, c6, c5, c4, c3, c2, c1, c0) \
+    c01 = vec_packs(c0, c1); \
+    c23 = vec_packs(c2, c3); \
+    c45 = vec_packs(c4, c5); \
+    c67 = vec_packs(c6, c7); \
+}
+
+#define flipadst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                        c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
+                        c0, c1, c2, c3, c4, c5, c6, c7) \
+{ \
+    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                 c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \
+    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
+                 c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \
+}
+
+#define flipadst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                         c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
+                         c0, c1, c2, c3, c4, c5, c6, c7) \
+{ \
+    ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                 c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \
+    ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
+                 c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \
+    PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
+           c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+           c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
+}
+
+void dav1d_inv_txfm_add_dct_dct_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
+                                              int16_t *const coeff, const int eob)
+{
+    i16x8 v = vec_splats((int16_t)(2896*8));
+
+    if (eob < 1) {
+        return dc_only_4xN(dst, stride, coeff, 2, 1, 0);
+    }
+
+    LOAD_SCALE_COEFF_4x8(coeff, v)
+
+    dct_4_in(c0, c1, c2, c3, c01, c23)
+    dct_4_in(c4, c5, c6, c7, c45, c67)
+
+
+    memset(coeff, 0, sizeof(*coeff) * 4 * 8);
+
+    TRANSPOSE4_I32(c0, c1, c2, c3);
+    TRANSPOSE4_I32(c4, c5, c6, c7);
+
+    dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67)
+
+    LOAD_DECLARE_4(dst, stride, a, b, cc, d)
+    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh)
+
+    APPLY_COEFF_4(a, b, cc, d, c01, c23)
+    APPLY_COEFF_4(e, f, g, hh, c45, c67)
+
+    STORE_4(dst, stride, a, b, cc, d)
+    STORE_4(dst + 4 * stride, stride, e, f, g, hh)
+}
+
+
+#define inv_txfm_fn4x8(type1, type2) \
+void dav1d_inv_txfm_add_##type1##_##type2##_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
+                                                          int16_t *const coeff, const int eob) \
+{ \
+    i16x8 v = vec_splats((int16_t)(2896*8)); \
+    LOAD_SCALE_COEFF_4x8(coeff, v) \
+    type1##_4_in(c0, c1, c2, c3, c01, c23) \
+    type1##_4_in(c4, c5, c6, c7, c45, c67) \
+    memset(coeff, 0, sizeof(*coeff) * 4 * 8); \
+    TRANSPOSE4_I32(c0, c1, c2, c3); \
+    TRANSPOSE4_I32(c4, c5, c6, c7); \
+    type2##_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
+    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
+    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
+    APPLY_COEFF_4(a, b, c, d, c01, c23) \
+    APPLY_COEFF_4(e, f, g, h, c45, c67) \
+    STORE_4(dst, stride, a, b, c, d) \
+    STORE_4(dst + 4 * stride, stride, e, f, g, h) \
+}
+
+inv_txfm_fn4x8(adst,     dct     )
+inv_txfm_fn4x8(dct,      adst    )
+inv_txfm_fn4x8(dct,      flipadst)
+inv_txfm_fn4x8(flipadst, dct     )
+inv_txfm_fn4x8(adst,     flipadst)
+inv_txfm_fn4x8(flipadst, adst    )
+inv_txfm_fn4x8(identity, dct     )
+inv_txfm_fn4x8(dct,      identity)
+inv_txfm_fn4x8(identity, flipadst)
+inv_txfm_fn4x8(flipadst, identity)
+inv_txfm_fn4x8(identity, adst    )
+inv_txfm_fn4x8(adst,     identity)
+inv_txfm_fn4x8(identity, identity)
+inv_txfm_fn4x8(adst,     adst    )
+inv_txfm_fn4x8(flipadst, flipadst)
+
+
+void dav1d_inv_txfm_add_dct_dct_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
+                                              int16_t *const coeff, const int eob)
+{
+    i16x8 v = vec_splats((int16_t)(2896*8));
+
+    if (eob < 1) {
+        return dc_only_8xN(dst, stride, coeff, 1, 1, 0);
+    }
+
+    LOAD_SCALE_COEFF_8x4(coeff, v)
+
+    dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67)
+
+    memset(coeff, 0, sizeof(*coeff) * 8 * 4);
+
+    TRANSPOSE4_I32(c0, c1, c2, c3)
+    TRANSPOSE4_I32(c4, c5, c6, c7)
+
+    dct_4_out(c0, c1, c2, c3, c01, c23)
+    dct_4_out(c4, c5, c6, c7, c45, c67)
+
+    LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh)
+
+    i16x8 c04 = (i16x8)vec_mergeh((u64x2)c01, (u64x2)c45);
+    i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45);
+    i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67);
+    i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67);
+
+    APPLY_COEFF_8x4(ae, bf, c04, c15)
+    APPLY_COEFF_8x4(cg, dh, c26, c37)
+
+    STORE_8(dst, stride, ae, bf, cg, dh)
+}
+
+
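+// Generates the remaining 8x4 inverse transforms: an 8-point type1 first pass,
+// a transpose, a 4-point type2 second pass, and the result is added to dst.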
+#define inv_txfm_fn8x4(type1, type2) \
+void dav1d_inv_txfm_add_##type1##_##type2##_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
+                                                          int16_t *const coeff, const int eob) \
+{ \
+    i16x8 v = vec_splats((int16_t)(2896*8)); \
+    LOAD_SCALE_COEFF_8x4(coeff, v) \
+    type1##_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
+    memset(coeff, 0, sizeof(*coeff) * 8 * 4); \
+    TRANSPOSE4_I32(c0, c1, c2, c3) \
+    TRANSPOSE4_I32(c4, c5, c6, c7) \
+    type2##_4_out(c0, c1, c2, c3, c01, c23) \
+    type2##_4_out(c4, c5, c6, c7, c45, c67) \
+    LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh) \
+    i16x8 c04 = (i16x8)vec_mergeh((u64x2)c01, (u64x2)c45); \
+    i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45); \
+    i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67); \
+    i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67); \
+    APPLY_COEFF_8x4(ae, bf, c04, c15) \
+    APPLY_COEFF_8x4(cg, dh, c26, c37) \
+    STORE_8(dst, stride, ae, bf, cg, dh) \
+}
+inv_txfm_fn8x4(adst,     dct     )
+inv_txfm_fn8x4(dct,      adst    )
+inv_txfm_fn8x4(dct,      flipadst)
+inv_txfm_fn8x4(flipadst, dct     )
+inv_txfm_fn8x4(adst,     flipadst)
+inv_txfm_fn8x4(flipadst, adst    )
+inv_txfm_fn8x4(identity, dct     )
+inv_txfm_fn8x4(dct,      identity)
+inv_txfm_fn8x4(identity, flipadst)
+inv_txfm_fn8x4(flipadst, identity)
+inv_txfm_fn8x4(identity, adst    )
+inv_txfm_fn8x4(adst,     identity)
+inv_txfm_fn8x4(identity, identity)
+inv_txfm_fn8x4(adst,     adst    )
+inv_txfm_fn8x4(flipadst, flipadst)
+
+void dav1d_inv_txfm_add_dct_dct_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
+                                              int16_t *const coeff, const int eob)
+{
+    if (eob < 1) {
+        return dc_only_8xN(dst, stride, coeff, 2, 0, 1);
+    }
+
+    LOAD_COEFF_8x8(coeff)
+
+    dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
+               c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,
+               c0, c1, c2, c3, c4, c5, c6, c7)
+
+    memset(coeff, 0, sizeof(*coeff) * 8 * 8);
+
+    SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1))
+    SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1))
+    SCALE_ROUND_4(c0l, c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1))
+    SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1))
+
+    TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
+                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l)
+
+    dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
+                c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,
+                c0, c1, c2, c3, c4, c5, c6, c7)
+
+    LOAD_DECLARE_4(dst, stride, a, b, cc, d)
+    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh)
+
+    APPLY_COEFF_8x4(a, b, c0, c1)
+    APPLY_COEFF_8x4(cc, d, c2, c3)
+    APPLY_COEFF_8x4(e, f, c4, c5)
+    APPLY_COEFF_8x4(g, hh, c6, c7)
+
+    STORE_8(dst, stride, a, b, cc, d)
+    STORE_8(dst + 4 * stride, stride, e, f, g, hh)
+}
+
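+// Generates the 8x8 inverse transforms whose first pass is not a plain
+// identity: 8-point type1 first pass, (x + 1) >> 1 inter-pass rounding,
+// transpose, 8-point type2 second pass, and the result is added to dst.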
+#define inv_txfm_fn8x8(type1, type2) \
+void dav1d_inv_txfm_add_##type1##_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
+                                                          int16_t *const coeff, const int eob) \
+{ \
+    LOAD_COEFF_8x8(coeff) \
+    type1##_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
+                   c0, c1, c2, c3, c4, c5, c6, c7) \
+    SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1)) \
+    SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1)) \
+    SCALE_ROUND_4(c0l, c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1)) \
+    SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1)) \
+    memset(coeff, 0, sizeof(*coeff) * 8 * 8); \
+    TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
+    type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
+                    c0, c1, c2, c3, c4, c5, c6, c7) \
+    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
+    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
+    APPLY_COEFF_8x4(a, b, c0, c1) \
+    APPLY_COEFF_8x4(c, d, c2, c3) \
+    APPLY_COEFF_8x4(e, f, c4, c5) \
+    APPLY_COEFF_8x4(g, h, c6, c7) \
+    STORE_8(dst, stride, a, b, c, d) \
+    STORE_8(dst + 4 * stride, stride, e, f, g, h) \
+}
+inv_txfm_fn8x8(adst,     dct     )
+inv_txfm_fn8x8(dct,      adst    )
+inv_txfm_fn8x8(dct,      flipadst)
+inv_txfm_fn8x8(flipadst, dct     )
+inv_txfm_fn8x8(adst,     flipadst)
+inv_txfm_fn8x8(flipadst, adst    )
+inv_txfm_fn8x8(dct,      identity)
+inv_txfm_fn8x8(flipadst, identity)
+inv_txfm_fn8x8(adst,     identity)
+inv_txfm_fn8x8(adst,     adst    )
+inv_txfm_fn8x8(flipadst, flipadst)
+
+// The 8-point identity first pass is a plain x2, which the (x + 1) >> 1
+// inter-pass rounding cancels exactly, so both steps are skipped here.
+#define inv_txfm_fn8x8_identity(type2) \
+void dav1d_inv_txfm_add_identity_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
+                                                         int16_t *const coeff, const int eob) \
+{ \
+    LOAD_COEFF_8x8(coeff) \
+    memset(coeff, 0, sizeof(*coeff) * 8 * 8); \
+    TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                   c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
+    type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
+                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
+                    c0, c1, c2, c3, c4, c5, c6, c7) \
+    LOAD_DECLARE_4(dst, stride, a, b, c, d) \
+    LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
+    APPLY_COEFF_8x4(a, b, c0, c1) \
+    APPLY_COEFF_8x4(c, d, c2, c3) \
+    APPLY_COEFF_8x4(e, f, c4, c5) \
+    APPLY_COEFF_8x4(g, h, c6, c7) \
+    STORE_8(dst, stride, a, b, c, d) \
+    STORE_8(dst + 4 * stride, stride, e, f, g, h) \
+}
+inv_txfm_fn8x8_identity(dct     )
+inv_txfm_fn8x8_identity(flipadst)
+inv_txfm_fn8x8_identity(adst    )
+inv_txfm_fn8x8_identity(identity)
+
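+// Saturate eight i32x4 vectors to the int16 range by packing them into four
+// i16x8 vectors (ab, cd, ef, gh) and unpacking back to i32x4.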
+#define CLIP16_I32_8(a, b, c, d, e, f, g, h, \
+                     ab, cd, ef, gh) \
+{ \
+    ab = vec_packs(a, b); \
+    cd = vec_packs(c, d); \
+    ef = vec_packs(e, f); \
+    gh = vec_packs(g, h); \
+    UNPACK_PAIR_I16_I32(a, b, ab) \
+    UNPACK_PAIR_I16_I32(c, d, cd) \
+    UNPACK_PAIR_I16_I32(e, f, ef) \
+    UNPACK_PAIR_I16_I32(g, h, gh) \
+}
+
+#define MUL_4_INPLACE(a, b, c, d, v) \
+    a = vec_mul(a, v); \
+    b = vec_mul(b, v); \
+    c = vec_mul(c, v); \
+    d = vec_mul(d, v); \
+
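+// 16-point identity scaling: x * 2*sqrt(2), computed as
+// 2*x + ((x * 1697 + 1024) >> 11) in both the i16 and i32 variants below.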
+#define IDENTITY_16_V(v) \
+{ \
+    i16x8 v_ = vec_adds(v, v); \
+    v = vec_mradds(v, v1697_16, v_); \
+}
+
+#define IDENTITY_16_INNER(c00c01, c02c03, c04c05, c06c07, \
+                          c08c09, c10c11, c12c13, c14c15) \
+{ \
+    i16x8 v1697_16 = vec_splats((int16_t)(1697*16)); \
+    IDENTITY_16_V(c00c01) \
+    IDENTITY_16_V(c02c03) \
+    IDENTITY_16_V(c04c05) \
+    IDENTITY_16_V(c06c07) \
+    IDENTITY_16_V(c08c09) \
+    IDENTITY_16_V(c10c11) \
+    IDENTITY_16_V(c12c13) \
+    IDENTITY_16_V(c14c15) \
+}
+
+#define IDENTITY_16_4_I32(a, b, c, d) \
+{ \
+    i32x4 a2 = vec_add(a, a); \
+    i32x4 b2 = vec_add(b, b); \
+    i32x4 c2 = vec_add(c, c); \
+    i32x4 d2 = vec_add(d, d); \
+    MUL_4_INPLACE(a, b, c, d, v1697) \
+    SCALE_ROUND_4(a, b, c, d, v1024, vec_splat_u32(11)); \
+    a = vec_add(a2, a); \
+    b = vec_add(b2, b); \
+    c = vec_add(c2, c); \
+    d = vec_add(d2, d); \
+}
+
+
+#define identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
+                       c08, c09, c10, c11, c12, c13, c14, c15, \
+                       c00c01, c02c03, c04c05, c06c07, \
+                       c08c09, c10c11, c12c13, c14c15) \
+{ \
+    DECLARE_SPLAT_I32(1697) \
+    DECLARE_SPLAT_I32(1024) \
+    IDENTITY_16_4_I32(c00, c01, c02, c03) \
+    IDENTITY_16_4_I32(c04, c05, c06, c07) \
+    IDENTITY_16_4_I32(c08, c09, c10, c11) \
+    IDENTITY_16_4_I32(c12, c13, c14, c15) \
+}
+
+#define identity_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
+                        c08, c09, c10, c11, c12, c13, c14, c15, \
+                        c00c01, c02c03, c04c05, c06c07, \
+                        c08c09, c10c11, c12c13, c14c15) \
+{ \
+    PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
+           c00, c02, c04, c06, c08, c10, c12, c14, \
+           c01, c03, c05, c07, c09, c11, c13, c15)  \
+    IDENTITY_16_INNER(c00c01, c02c03, c04c05, c06c07, \
+                      c08c09, c10c11, c12c13, c14c15) \
+}
+
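+// 16-point inverse DCT: the even coefficients go through the 8-point IDCT
+// while the odd coefficients run through the butterfly stages below; the two
+// halves are then combined into the packed i16x8 output pairs.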
+#define IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, \
+                      c08, c09, c10, c11, c12, c13, c14, c15, \
+                      c00c03, c01c02, c07c04, c06c05, \
+                      c08c11, c09c10, c14c13, c15c12) \
+    IDCT_8_INNER(c00, c02, c04, c06, c08, c10, c12, c14, \
+                 c00c03, c01c02, c07c04, c06c05) \
+    DECLARE_SPLAT_I32(128) \
+    DECLARE_SPLAT_I32(181) \
+    DECLARE_SPLAT_I32(401) \
+    DECLARE_SPLAT_I32(4076) \
+    DECLARE_SPLAT_I32(3166) \
+    DECLARE_SPLAT_I32(2598) \
+    DECLARE_SPLAT_I32(1931) \
+    DECLARE_SPLAT_I32(3612) \
+    DECLARE_SPLAT_I32(3920) \
+    DECLARE_SPLAT_I32(1189) \
+    DECLARE_SPLAT_I32(1567) \
+    DECLARE_SPLAT_I32(3784) \
+\
+    DECLARE_MUL_PAIR_I32(c01, c15,  v401, v4076) \
+    DECLARE_MUL_PAIR_I32(c09, c07, v3166, v2598) \
+    DECLARE_MUL_PAIR_I32(c05, c11, v1931, v3612) \
+    DECLARE_MUL_PAIR_I32(c13, c03, v3920, v1189) \
+\
+    DECLARE_ADD_SUB_PAIR(t15a, t08a, c01, c15, v4076,  v401) \
+    DECLARE_ADD_SUB_PAIR(t14a, t09a, c09, c07, v2598, v3166) \
+    DECLARE_ADD_SUB_PAIR(t13a, t10a, c05, c11, v3612, v1931) \
+    DECLARE_ADD_SUB_PAIR(t12a, t11a, c13, c03, v1189, v3920) \
+\
+    SCALE_ROUND_4(t15a, t08a, t14a, t09a, v2048, v12) \
+    SCALE_ROUND_4(t13a, t10a, t12a, t11a, v2048, v12) \
+\
+    CLIP16_I32_8(t15a, t08a, t14a, t09a, \
+                 t13a, t10a, t12a, t11a, \
+                 c08c11, c09c10, c14c13, c15c12) \
+    DECLARE_ADD_SUB_PAIR(t08, t09, t08a, t09a,,) \
+    DECLARE_ADD_SUB_PAIR(t11, t10, t11a, t10a,,) \
+    DECLARE_ADD_SUB_PAIR(t12, t13, t12a, t13a,,) \
+    DECLARE_ADD_SUB_PAIR(t15, t14, t15a, t14a,,) \
+\
+    CLIP16_I32_8(t08, t09, t11, t10, \
+                 t12, t13, t15, t14, \
+                 c08c11, c09c10, c14c13, c15c12) \
+\
+    DECLARE_MUL_PAIR_I32(t14, t09, v1567, v3784) \
+    DECLARE_MUL_PAIR_I32(t13, t10, v1567, v3784) \
+\
+    ADD_SUB_PAIR(t14a, t09a, t14, t09, v3784, v1567) \
+    ADD_SUB_PAIR(t10a, t13a, t13, t10, v3784, v1567) \
+    t10a = -t10a; \
+\
+    SCALE_ROUND_4(t14a, t09a, t13a, t10a, v2048, v12) \
+\
+    ADD_SUB_PAIR(t08a, t11a, t08, t11,,) \
+    ADD_SUB_PAIR(t09, t10, t09a, t10a,,) \
+    ADD_SUB_PAIR(t15a, t12a, t15, t12,,) \
+    ADD_SUB_PAIR(t14, t13, t14a, t13a,,) \
+\
+    CLIP16_I32_8(t08a, t11a, t09, t10, \
+                 t15a, t12a, t14, t13, \
+                 c08c11, c09c10, c14c13, c15c12) \
+    ADD_SUB_PAIR(t13a, t10a, t13, t10,,); \
+    ADD_SUB_PAIR(t12, t11, t12a, t11a,,); \
+\
+    MUL_4_INPLACE(t13a, t10a, t12, t11, v181); \
+    SCALE_ROUND_4(t13a, t10a, t12, t11, v128, vec_splat_u32(8)) \
+\
+    DECLARE_PACK_4(t15at12, t14t13a, t08at11, t09t10a, \
+                   t15a, t14, t08a, t09, \
+                   t12, t13a, t11,  t10a) \
+\
+    c15c12 = vec_subs(c00c03, t15at12); \
+    c14c13 = vec_subs(c01c02, t14t13a); \
+    c08c11 = vec_subs(c07c04, t08at11); \
+    c09c10 = vec_subs(c06c05, t09t10a); \
+    c00c03 = vec_adds(c00c03, t15at12); \
+    c01c02 = vec_adds(c01c02, t14t13a); \
+    c07c04 = vec_adds(c07c04, t08at11); \
+    c06c05 = vec_adds(c06c05, t09t10a); \
+
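+// Final-pass 16-point DCT: run IDCT_16_INNER and reorder its packed pair
+// vectors into sequential (c00c01, c02c03, ...) order for the add-to-dst step.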
+#define dct_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
+                   c08, c09, c10, c11, c12, c13, c14, c15, \
+                   c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
+\
+    i16x8 c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12; \
+    IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
+                  c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
+    c00c01 = (i16x8)vec_mergeh((u64x2)c00c03, (u64x2)c01c02); \
+    c02c03 = (i16x8)vec_mergel((u64x2)c01c02, (u64x2)c00c03); \
+    c04c05 = (i16x8)vec_mergel((u64x2)c07c04, (u64x2)c06c05); \
+    c06c07 = (i16x8)vec_mergeh((u64x2)c06c05, (u64x2)c07c04); \
+    c08c09 = (i16x8)vec_mergeh((u64x2)c08c11, (u64x2)c09c10); \
+    c10c11 = (i16x8)vec_mergel((u64x2)c09c10, (u64x2)c08c11); \
+    c12c13 = (i16x8)vec_mergel((u64x2)c15c12, (u64x2)c14c13); \
+    c14c15 = (i16x8)vec_mergeh((u64x2)c14c13, (u64x2)c15c12); \
+
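+// First-pass 16-point DCT: run IDCT_16_INNER and unpack the packed pairs back
+// into sixteen i32x4 vectors so the second pass can operate on them.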
+#define dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
+                  c08, c09, c10, c11, c12, c13, c14, c15, \
+                  c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
+\
+    IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
+                  c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
+    UNPACK_PAIR_I16_I32(c00, c03, c00c03) \
+    UNPACK_PAIR_I16_I32(c01, c02, c01c02) \
+    UNPACK_PAIR_I16_I32(c07, c04, c07c04) \
+    UNPACK_PAIR_I16_I32(c06, c05, c06c05) \
+    UNPACK_PAIR_I16_I32(c08, c11, c08c11) \
+    UNPACK_PAIR_I16_I32(c09, c10, c09c10) \
+    UNPACK_PAIR_I16_I32(c14, c13, c14c13) \
+    UNPACK_PAIR_I16_I32(c15, c12, c15c12) \
+
+
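+// Apply the 4-point transform to each of the four vector groups; the _in
+// variants are used as the first pass of 4x16, the _out variants as the
+// second pass of 16x4.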
+#define dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
+                   cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
+                   a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
+    dct_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
+    dct_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
+    dct_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
+    dct_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)
+
+
+#define PACK_4x4(c00, c01, c02, c03, \
+                 c04, c05, c06, c07, \
+                 c08, c09, c10, c11, \
+                 c12, c13, c14, c15, \
+                 c00c01, c02c03, c04c05, c06c07, \
+                 c08c09, c10c11, c12c13, c14c15) \
+{ \
+    c00c01 = vec_packs(c00, c04); c02c03 = vec_packs(c08, c12); \
+    c04c05 = vec_packs(c01, c05); c06c07 = vec_packs(c09, c13); \
+    c08c09 = vec_packs(c02, c06); c10c11 = vec_packs(c10, c14); \
+    c12c13 = vec_packs(c03, c07); c14c15 = vec_packs(c11, c15); \
+}
+
+
+
+#define dct_4x4_out(c00, c01, c02, c03, \
+                    c04, c05, c06, c07, \
+                    c08, c09, c10, c11, \
+                    c12, c13, c14, c15, \
+                    c00c01, c02c03, c04c05, c06c07, \
+                    c08c09, c10c11, c12c13, c14c15) \
+{ \
+    IDCT_4_INNER(c00, c01, c02, c03) \
+    IDCT_4_INNER(c04, c05, c06, c07) \
+    IDCT_4_INNER(c08, c09, c10, c11) \
+    IDCT_4_INNER(c12, c13, c14, c15) \
+\
+    PACK_4x4(c00, c01, c02, c03, \
+             c04, c05, c06, c07, \
+             c08, c09, c10, c11, \
+             c12, c13, c14, c15, \
+             c00c01, c02c03, c04c05, c06c07, \
+             c08c09, c10c11, c12c13, c14c15) \
+}
+
+#define IDENTITY_4_I32(a, b, c, d) \
+{ \
+    DECLARE_SPLAT_I32(5793) \
+    DECLARE_SPLAT_I32(2048) \
+    MUL_4_INPLACE(a, b, c, d, v5793) \
+    SCALE_ROUND_4(a, b, c, d, v2048, vec_splat_u32(12)) \
+}
+
+#define identity_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
+                       cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
+                       a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
+{ \
+    IDENTITY_4_I32(cA0, cA1, cA2, cA3) \
+    IDENTITY_4_I32(cB0, cB1, cB2, cB3) \
+    IDENTITY_4_I32(cC0, cC1, cC2, cC3) \
+    IDENTITY_4_I32(cD0, cD1, cD2, cD3) \
+}
+
+#define identity_4x4_out(c00, c01, c02, c03, \
+                         c04, c05, c06, c07, \
+                         c08, c09, c10, c11, \
+                         c12, c13, c14, c15, \
+                         c00c01, c02c03, c04c05, c06c07, \
+                         c08c09, c10c11, c12c13, c14c15) \
+{ \
+    PACK_4x4(c00, c01, c02, c03, \
+             c04, c05, c06, c07, \
+             c08, c09, c10, c11, \
+             c12, c13, c14, c15, \
+             c00c01, c02c03, c04c05, c06c07, \
+             c08c09, c10c11, c12c13, c14c15) \
+    IDENTITY_4(c00c01, c02c03) \
+    IDENTITY_4(c04c05, c06c07) \
+    IDENTITY_4(c08c09, c10c11) \
+    IDENTITY_4(c12c13, c14c15) \
+}
+
+#define adst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
+                    cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
+                    a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
+    adst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
+    adst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
+    adst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
+    adst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)
+
+#define adst_4x4_out(c00, c01, c02, c03, \
+                     c04, c05, c06, c07, \
+                     c08, c09, c10, c11, \
+                     c12, c13, c14, c15, \
+                     c00c01, c02c03, c04c05, c06c07, \
+                     c08c09, c10c11, c12c13, c14c15) \
+{ \
+    ADST_INNER_4(c00, c01, c02, c03, c00, c01, c02, c03) \
+    ADST_INNER_4(c04, c05, c06, c07, c04, c05, c06, c07) \
+    ADST_INNER_4(c08, c09, c10, c11, c08, c09, c10, c11) \
+    ADST_INNER_4(c12, c13, c14, c15, c12, c13, c14, c15) \
+\
+    PACK_4x4(c00, c01, c02, c03, \
+             c04, c05, c06, c07, \
+             c08, c09, c10, c11, \
+             c12, c13, c14, c15, \
+             c00c01, c02c03, c04c05, c06c07, \
+             c08c09, c10c11, c12c13, c14c15) \
+}
+
+#define flipadst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
+                        cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
+                        a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
+    flipadst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
+    flipadst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
+    flipadst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
+    flipadst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)
+
+#define flipadst_4x4_out(c00, c01, c02, c03, \
+                         c04, c05, c06, c07, \
+                         c08, c09, c10, c11, \
+                         c12, c13, c14, c15, \
+                         c00c01, c02c03, c04c05, c06c07, \
+                         c08c09, c10c11, c12c13, c14c15) \
+{ \
+    ADST_INNER_4(c00, c01, c02, c03, c03, c02, c01, c00) \
+    ADST_INNER_4(c04, c05, c06, c07, c07, c06, c05, c04) \
+    ADST_INNER_4(c08, c09, c10, c11, c11, c10, c09, c08) \
+    ADST_INNER_4(c12, c13, c14, c15, c15, c14, c13, c12) \
+\
+    PACK_4x4(c00, c01, c02, c03, \
+             c04, c05, c06, c07, \
+             c08, c09, c10, c11, \
+             c12, c13, c14, c15, \
+             c00c01, c02c03, c04c05, c06c07, \
+             c08c09, c10c11, c12c13, c14c15) \
+}
+
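+// 16-point inverse ADST. The oNN parameters select the output ordering so the
+// same macro serves both the adst (in-order) and flipadst (reversed) variants.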
+#define ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, \
+                      c08, c09, c10, c11, c12, c13, c14, c15, \
+                      o00, o01, o02, o03, o04, o05, o06, o07, \
+                      o08, o09, o10, o11, o12, o13, o14, o15, \
+                      c00c01, c02c03, c04c05, c06c07) \
+    DECLARE_SPLAT_I32(2048); \
+    u32x4 v12 = vec_splat_u32(12); \
+    DECLARE_SPLAT_I32(4091) \
+    DECLARE_SPLAT_I32(201) \
+    DECLARE_SPLAT_I32(3973) \
+    DECLARE_SPLAT_I32(995) \
+    DECLARE_SPLAT_I32(3703) \
+    DECLARE_SPLAT_I32(1751) \
+    DECLARE_SPLAT_I32(3290) \
+    DECLARE_SPLAT_I32(2440) \
+    DECLARE_SPLAT_I32(2751) \
+    DECLARE_SPLAT_I32(3035) \
+    DECLARE_SPLAT_I32(2106) \
+    DECLARE_SPLAT_I32(3513) \
+    DECLARE_SPLAT_I32(1380) \
+    DECLARE_SPLAT_I32(3857) \
+    DECLARE_SPLAT_I32(601) \
+    DECLARE_SPLAT_I32(4052) \
+\
+    DECLARE_MUL_PAIR_I32(c15, c00, v4091, v201) \
+    DECLARE_MUL_PAIR_I32(c13, c02, v3973, v995) \
+    DECLARE_MUL_PAIR_I32(c11, c04, v3703, v1751) \
+    DECLARE_MUL_PAIR_I32(c09, c06, v3290, v2440) \
+    DECLARE_MUL_PAIR_I32(c07, c08, v2751, v3035) \
+    DECLARE_MUL_PAIR_I32(c05, c10, v2106, v3513) \
+    DECLARE_MUL_PAIR_I32(c03, c12, v1380, v3857) \
+    DECLARE_MUL_PAIR_I32(c01, c14,  v601, v4052) \
+\
+    DECLARE_ADD_SUB_PAIR(t00, t01, c15, c00, v4091, v201);\
+    DECLARE_ADD_SUB_PAIR(t02, t03, c13, c02, v3973, v995) \
+    DECLARE_ADD_SUB_PAIR(t04, t05, c11, c04, v3703, v1751) \
+    DECLARE_ADD_SUB_PAIR(t06, t07, c09, c06, v3290, v2440) \
+    DECLARE_ADD_SUB_PAIR(t08, t09, c07, c08, v2751, v3035) \
+    DECLARE_ADD_SUB_PAIR(t10, t11, c05, c10, v2106, v3513) \
+    DECLARE_ADD_SUB_PAIR(t12, t13, c03, c12, v1380, v3857) \
+    DECLARE_ADD_SUB_PAIR(t14, t15, c01, c14,  v601, v4052) \
+\
+    SCALE_ROUND_4(t00, t01, t02, t03, v2048, v12) \
+    SCALE_ROUND_4(t04, t05, t06, t07, v2048, v12) \
+    SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \
+    SCALE_ROUND_4(t12, t13, t14, t15, v2048, v12) \
+\
+    DECLARE_ADD_SUB_PAIR(t00a, t08a, t00, t08,,) \
+    DECLARE_ADD_SUB_PAIR(t01a, t09a, t01, t09,,) \
+    DECLARE_ADD_SUB_PAIR(t02a, t10a, t02, t10,,) \
+    DECLARE_ADD_SUB_PAIR(t03a, t11a, t03, t11,,) \
+    DECLARE_ADD_SUB_PAIR(t04a, t12a, t04, t12,,) \
+    DECLARE_ADD_SUB_PAIR(t05a, t13a, t05, t13,,) \
+    DECLARE_ADD_SUB_PAIR(t06a, t14a, t06, t14,,) \
+    DECLARE_ADD_SUB_PAIR(t07a, t15a, t07, t15,,) \
+\
+    CLIP16_I32_8(t00a, t08a, t01a, t09a, t02a, t10a, t03a, t11a, \
+                 c00c01, c02c03, c04c05, c06c07); \
+    CLIP16_I32_8(t04a, t12a, t05a, t13a, t06a, t14a, t07a, t15a, \
+                 c00c01, c02c03, c04c05, c06c07); \
+\
+    DECLARE_SPLAT_I32(4017) \
+    DECLARE_SPLAT_I32(799) \
+    DECLARE_SPLAT_I32(2276) \
+    DECLARE_SPLAT_I32(3406) \
+\
+    DECLARE_MUL_PAIR_I32(t08a, t09a, v4017,  v799); \
+    DECLARE_MUL_PAIR_I32(t10a, t11a, v2276, v3406); \
+    DECLARE_MUL_PAIR_I32(t13a, t12a,  v799, v4017); \
+    DECLARE_MUL_PAIR_I32(t15a, t14a, v3406, v2276); \
+\
+    ADD_SUB_PAIR(t08, t09, t08a, t09a, v4017,  v799); \
+    ADD_SUB_PAIR(t10, t11, t10a, t11a, v2276, v3406); \
+    ADD_SUB_PAIR(t13, t12, t13a, t12a,  v799, v4017); \
+    ADD_SUB_PAIR(t15, t14, t15a, t14a, v3406, v2276); \
+\
+    SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \
+    SCALE_ROUND_4(t13, t12, t15, t14, v2048, v12) \
+\
+    ADD_SUB_PAIR(t00, t04, t00a, t04a,,); \
+    ADD_SUB_PAIR(t01, t05, t01a, t05a,,); \
+    ADD_SUB_PAIR(t02, t06, t02a, t06a,,); \
+    ADD_SUB_PAIR(t03, t07, t03a, t07a,,); \
+    ADD_SUB_PAIR(t08a, t12a, t08, t12,,); \
+    ADD_SUB_PAIR(t09a, t13a, t09, t13,,); \
+    ADD_SUB_PAIR(t10a, t14a, t10, t14,,); \
+    ADD_SUB_PAIR(t11a, t15a, t11, t15,,); \
+\
+    CLIP16_I32_8(t00, t04, t01, t05, t02, t06, t03, t07, \
+                 c00c01, c02c03, c04c05, c06c07) \
+    CLIP16_I32_8(t08a, t12a, t09a, t13a, t10a, t14a, t11a, t15a, \
+                 c00c01, c02c03, c04c05, c06c07) \
+\
+    DECLARE_SPLAT_I32(3784) \
+    DECLARE_SPLAT_I32(1567) \
+\
+    DECLARE_MUL_PAIR_I32(t04, t05, v3784, v1567) \
+    DECLARE_MUL_PAIR_I32(t07, t06, v1567, v3784) \
+    DECLARE_MUL_PAIR_I32(t12a, t13a, v3784, v1567) \
+    DECLARE_MUL_PAIR_I32(t15a, t14a, v1567, v3784) \
+\
+    ADD_SUB_PAIR(t04a, t05a, t04, t05, v3784, v1567) \
+    ADD_SUB_PAIR(t07a, t06a, t07, t06, v1567, v3784) \
+    ADD_SUB_PAIR(t12, t13, t12a, t13a, v3784, v1567) \
+    ADD_SUB_PAIR(t15, t14, t15a, t14a, v1567, v3784) \
+\
+    SCALE_ROUND_4(t04a, t05a, t07a, t06a, v2048, v12) \
+    SCALE_ROUND_4(t12, t13, t15, t14, v2048, v12) \
+\
+    ADD_SUB_PAIR(o00, t02a, t00,  t02,,) \
+    ADD_SUB_PAIR(o15, t03a, t01,  t03,,) \
+    ADD_SUB_PAIR(o03, t06,  t04a, t06a,,) \
+    ADD_SUB_PAIR(o12, t07,  t05a, t07a,,) \
+    ADD_SUB_PAIR(o01, t10,  t08a, t10a,,) \
+    ADD_SUB_PAIR(o14, t11,  t09a, t11a,,) \
+    ADD_SUB_PAIR(o02, t14a, t12,  t14,,) \
+    ADD_SUB_PAIR(o13, t15a, t13,  t15,,) \
+\
+    CLIP16_I32_8(o00, t02a, o15, t03a, o03, t06, o12, t07, \
+                 c00c01, c02c03, c04c05, c06c07) \
+    CLIP16_I32_8(o01, t10, o14, t11, o02, t14a, o13, t15a, \
+                 c00c01, c02c03, c04c05, c06c07) \
+\
+    DECLARE_SPLAT_I32(181) \
+    DECLARE_SPLAT_I32(128) \
+    u32x4 v8 = vec_splat_u32(8); \
+\
+    ADD_SUB_PAIR(o07, o08, t02a, t03a,,) \
+    ADD_SUB_PAIR(o04, o11, t06,  t07,,) \
+    ADD_SUB_PAIR(o06, o09, t10,  t11,,) \
+    ADD_SUB_PAIR(o05, o10, t14a, t15a,,) \
+\
+    MUL_4_INPLACE(o07, o08, o04, o11, v181) \
+    MUL_4_INPLACE(o06, o09, o05, o10, v181) \
+\
+    SCALE_ROUND_4(o07, o08, o04, o11, v128, v8) \
+    SCALE_ROUND_4(o06, o09, o05, o10, v128, v8) \
+\
+    o01 = -o01; \
+    o03 = -o03; \
+    o05 = -o05; \
+    o07 = -o07; \
+    o09 = -o09; \
+    o11 = -o11; \
+    o13 = -o13; \
+    o15 = -o15; \
+
+#define adst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
+                   c08, c09, c10, c11, c12, c13, c14, c15, \
+                   c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
+{ \
+    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
+                  c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
+                  c00c01, c02c03, c04c05, c06c07) \
+}
+
+#define adst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
+                    c08, c09, c10, c11, c12, c13, c14, c15, \
+                    c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
+{ \
+    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
+                  c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
+                  c00c01, c02c03, c04c05, c06c07) \
+    PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
+           c00, c02, c04, c06, c08, c10, c12, c14, \
+           c01, c03, c05, c07, c09, c11, c13, c15) \
+}
+
+#define flipadst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
+                       c08, c09, c10, c11, c12, c13, c14, c15, \
+                       c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
+{ \
+    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
+                  c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \
+                  c00c01, c02c03, c04c05, c06c07) \
+}
+
+#define flipadst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
+                        c08, c09, c10, c11, c12, c13, c14, c15, \
+                        c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
+{ \
+    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
+                  c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \
+                  c00c01, c02c03, c04c05, c06c07) \
+    PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
+           c00, c02, c04, c06, c08, c10, c12, c14, \
+           c01, c03, c05, c07, c09, c11, c13, c15) \
+}
+
+
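+// 4x16: 4-point DCT first pass, (x + 1) >> 1 inter-pass rounding, transpose,
+// 16-point DCT second pass, and the result is added to dst. The remaining
+// type combinations are generated by inv_txfm_fn4x16 below.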
+void dav1d_inv_txfm_add_dct_dct_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
+                                               int16_t *const coeff, const int eob)
+{
+    if (eob < 1) {
+        return dc_only_4xN(dst, stride, coeff, 4, 0, 1);
+    }
+
+    LOAD_COEFF_4x16(coeff)
+
+    dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
+               cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3,
+               a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3)
+
+    memset(coeff, 0, sizeof(*coeff) * 4 * 16);
+
+    SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1))
+    SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), vec_splat_u32(1))
+    SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1))
+    SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1))
+    TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
+                      cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3)
+
+    dct_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
+               cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3,
+               a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3)
+
+    LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03)
+    LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07)
+    LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11)
+    LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15)
+
+    APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0);
+    APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1);
+    APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2);
+    APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3);
+
+    STORE_4(dst, stride,               l00, l01, l02, l03);
+    STORE_4(dst + 4 * stride, stride,  l04, l05, l06, l07);
+    STORE_4(dst + 8 * stride, stride,  l08, l09, l10, l11);
+    STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15);
+}
+
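+// Generates the remaining 4x16 inverse transforms: 4-point type1 first pass,
+// (x + 1) >> 1 rounding, transpose, 16-point type2 second pass (no DC-only
+// fast path).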
+#define inv_txfm_fn4x16(type1, type2) \
+void dav1d_inv_txfm_add_##type1##_##type2##_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
+                                                          int16_t *const coeff, const int eob) \
+{ \
+    LOAD_COEFF_4x16(coeff) \
+    type1##_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
+                   cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
+                   a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
+    memset(coeff, 0, sizeof(*coeff) * 4 * 16); \
+    SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1)) \
+    SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), vec_splat_u32(1)) \
+    SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1)) \
+    SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1)) \
+    TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
+                      cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3) \
+    type2##_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
+                   cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
+                   a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
+    LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03) \
+    LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07) \
+    LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11) \
+    LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15) \
+    APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0); \
+    APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1); \
+    APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2); \
+    APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3); \
+    STORE_4(dst, stride,               l00, l01, l02, l03); \
+    STORE_4(dst + 4 * stride, stride,  l04, l05, l06, l07); \
+    STORE_4(dst + 8 * stride, stride,  l08, l09, l10, l11); \
+    STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15); \
+}
+inv_txfm_fn4x16(adst,     dct     )
+inv_txfm_fn4x16(dct,      adst    )
+inv_txfm_fn4x16(dct,      flipadst)
+inv_txfm_fn4x16(flipadst, dct     )
+inv_txfm_fn4x16(adst,     flipadst)
+inv_txfm_fn4x16(flipadst, adst    )
+inv_txfm_fn4x16(identity, dct     )
+inv_txfm_fn4x16(dct,      identity)
+inv_txfm_fn4x16(identity, flipadst)
+inv_txfm_fn4x16(flipadst, identity)
+inv_txfm_fn4x16(identity, adst    )
+inv_txfm_fn4x16(adst,     identity)
+inv_txfm_fn4x16(identity, identity)
+inv_txfm_fn4x16(adst,     adst    )
+inv_txfm_fn4x16(flipadst, flipadst)
+
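+// 16x4: 16-point DCT first pass, (x + 1) >> 1 inter-pass rounding, transpose,
+// 4-point DCT second pass, and the result is added to four 16-pixel rows of dst.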
+void dav1d_inv_txfm_add_dct_dct_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
+                                               int16_t *const coeff, const int eob)
+{
+    if (eob < 1) {
+        return dc_only_16xN(dst, stride, coeff, 1, 0, 1);
+    }
+
+    LOAD_DECLARE_2_I16(coeff, c00c01, c02c03)
+    LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07)
+    LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11)
+    LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15)
+    UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03)
+    UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07)
+    UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11)
+    UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15)
+
+    dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07,
+              c08, c09, c10, c11, c12, c13, c14, c15,
+              c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15)
+    memset(coeff, 0, sizeof(*coeff) * 16 * 4);
+    SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1))
+    SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1))
+    SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1))
+    SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1))
+
+    TRANSPOSE4_I32(c00, c01, c02, c03);
+    TRANSPOSE4_I32(c04, c05, c06, c07);
+    TRANSPOSE4_I32(c08, c09, c10, c11);
+    TRANSPOSE4_I32(c12, c13, c14, c15);
+
+    dct_4x4_out(c00, c01, c02, c03,
+                c04, c05, c06, c07,
+                c08, c09, c10, c11,
+                c12, c13, c14, c15,
+                c00c01, c02c03, c04c05, c06c07,
+                c08c09, c10c11, c12c13, c14c15)
+
+    LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3)
+
+    APPLY_COEFF_16x4(l0, l1, l2, l3,
+                     c00c01, c02c03, c04c05, c06c07,
+                     c08c09, c10c11, c12c13, c14c15)
+
+    STORE_16(dst, stride, l0, l1, l2, l3)
+}
+
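+// Generates further 16x4 inverse transforms: 16-point type1 first pass,
+// (x + 1) >> 1 rounding, transpose, 4-point type2 second pass.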
+#define inv_txfm_fn16x4(type1, type2) \
+void dav1d_inv_txfm_add_##type1##_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
+                                                          int16_t *const coeff, const int eob) \
+{ \
+    LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \
+    LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \
+    LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \
+    LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \
+    UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \
+    UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \
+    UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \
+    UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \
+    type1##_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
+                  c08, c09, c10, c11, c12, c13, c14, c15, \
+                  c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
+    memset(coeff, 0, sizeof(*coeff) * 16 * 4); \
+    SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \
+    SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \
+    SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \
+    SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \
+    TRANSPOSE4_I32(c00, c01, c02, c03); \
+    TRANSPOSE4_I32(c04, c05, c06, c07); \
+    TRANSPOSE4_I32(c08, c09, c10, c11); \
+    TRANSPOSE4_I32(c12, c13, c14, c15); \
+    type2##_4x4_out(c00, c01, c02, c03, \
+                    c04, c05, c06, c07, \
+                    c08, c09, c10, c11, \
+                    c12, c13, c14, c15, \
+                    c00c01, c02c03, c04c05, c06c07, \
+                    c08c09, c10c11, c12c13, c14c15); \
+    LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) \
+    APPLY_COEFF_16x4(l0, l1, l2, l3, \
+                     c00c01, c02c03, c04c05, c06c07, \
+                     c08c09, c10c11, c12c13, c14c15) \
+    STORE_16(dst, stride, l0, l1, l2, l3) \
+}
+
+inv_txfm_fn16x4(adst,     dct     )
+inv_txfm_fn16x4(dct,      adst    )
+inv_txfm_fn16x4(dct,      flipadst)
+inv_txfm_fn16x4(flipadst, dct     )
+inv_txfm_fn16x4(adst,     flipadst)
+inv_txfm_fn16x4(flipadst, adst    )
+inv_txfm_fn16x4(dct,      identity)
+inv_txfm_fn16x4(flipadst, identity)
+inv_txfm_fn16x4(adst,     identity)
+inv_txfm_fn16x4(identity, identity)
+inv_txfm_fn16x4(adst,     adst    )
+inv_txfm_fn16x4(flipadst, flipadst)
+
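+// Same as inv_txfm_fn16x4, but with the intermediate values clipped to the
+// int16 range after the 16-point identity first pass (a 2*sqrt(2) scaling).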
+#define inv_txfm_fn16x4_identity(type2) \
+void dav1d_inv_txfm_add_identity_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
+                                                          int16_t *const coeff, const int eob) \
+{ \
+    LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \
+    LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \
+    LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \
+    LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \
+    UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \
+    UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \
+    UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \
+    UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \
+    identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
+                  c08, c09, c10, c11, c12, c13, c14, c15, \
+                  c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
+    memset(coeff, 0, sizeof(*coeff) * 16 * 4); \
+    SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \
+    SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \
+    SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \
+    SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \
+    CLIP16_I32_8(c00, c01, c02, c03, c04, c05, c06, c07, c00c01, c02c03, c04c05, c06c07) \
+    CLIP16_I32_8(c08, c09, c10, c11, c12, c13, c14, c15, c08c09, c10c11, c12c13, c14c15) \
+    TRANSPOSE4_I32(c00, c01, c02, c03); \
+    TRANSPOSE4_I32(c04, c05, c06, c07); \
+    TRANSPOSE4_I32(c08, c09, c10, c11); \
+    TRANSPOSE4_I32(c12, c13, c14, c15); \
+    type2##_4x4_out(c00, c01, c02, c03, \
+                    c04, c05, c06, c07, \
+                    c08, c09, c10, c11, \
+                    c12, c13, c14, c15, \
+                    c00c01, c02c03, c04c05, c06c07, \
+                    c08c09, c10c11, c12c13, c14c15); \
+    LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) \
+    APPLY_COEFF_16x4(l0, l1, l2, l3, \
+                     c00c01, c02c03, c04c05, c06c07, \
+                     c08c09, c10c11, c12c13, c14c15) \
+    STORE_16(dst, stride, l0, l1, l2, l3) \
+}
+
+inv_txfm_fn16x4_identity(dct)
+inv_txfm_fn16x4_identity(adst)
+inv_txfm_fn16x4_identity(flipadst)
+
+#endif // BITDEPTH
diff --git a/src/ppc/loopfilter_tmpl.c b/src/ppc/loopfilter_tmpl.c
index 4e658a701a886d70e8ec7b7c7c0c71feb5f7933b..107192f8361241ee5652a89a1e567ed61566b7c4 100644
--- a/src/ppc/loopfilter_tmpl.c
+++ b/src/ppc/loopfilter_tmpl.c
@@ -342,8 +342,7 @@ static inline void store_h_8(u8x16 outa, u8x16 outb, uint8_t *dst, int stridea)
 
 static inline void
 loop_filter_h_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
-                const ptrdiff_t stridea, b32x4 apply
-                HIGHBD_DECL_SUFFIX)
+                const ptrdiff_t stridea, b32x4 apply)
 {
     dst -= 2;
     uint8_t *dst2 = dst;
@@ -428,8 +427,7 @@ loop_filter_h_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
 
 static inline void
 loop_filter_h_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
-                    const ptrdiff_t stridea, b32x4 apply, b32x4 m6
-                    HIGHBD_DECL_SUFFIX)
+                    const ptrdiff_t stridea, b32x4 apply, b32x4 m6)
 {
     uint8_t *dst2 = dst - 2;
     dst -= 3;
@@ -572,8 +570,7 @@ loop_filter_h_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
 
 static inline void
 loop_filter_h_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
-                    const ptrdiff_t stridea, b32x4 apply, b32x4 m8
-                    HIGHBD_DECL_SUFFIX)
+                    const ptrdiff_t stridea, b32x4 apply, b32x4 m8)
 {
     uint8_t *dst2 = dst - 3;
     dst -= 4;
@@ -718,8 +715,7 @@ loop_filter_h_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
 
 static inline void
 loop_filter_h_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
-                    const ptrdiff_t stridea, b32x4 apply, b32x4 m8, b32x4 m16
-                    HIGHBD_DECL_SUFFIX)
+                    const ptrdiff_t stridea, b32x4 apply, b32x4 m8, b32x4 m16)
 {
     uint8_t *dst2 = dst -6 ;
     dst -= 7;
@@ -960,8 +956,7 @@ loop_filter_h_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
 
 static inline void
 loop_filter_v_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
-                    const ptrdiff_t strideb, b32x4 apply
-                    HIGHBD_DECL_SUFFIX)
+                    const ptrdiff_t strideb, b32x4 apply)
 {
     uint8_t *p1d = dst + strideb * -2;
     uint8_t *p0d = dst + strideb * -1;
@@ -1007,8 +1002,7 @@ loop_filter_v_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
 
 static inline void
 loop_filter_v_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
-                    const ptrdiff_t strideb, b32x4 apply, b32x4 m6
-                    HIGHBD_DECL_SUFFIX)
+                    const ptrdiff_t strideb, b32x4 apply, b32x4 m6)
 {
     uint8_t *p2d = dst + strideb * -3;
     uint8_t *p1d = dst + strideb * -2;
@@ -1114,9 +1108,7 @@ loop_filter_v_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
 
 static inline void
 loop_filter_v_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
-                    const ptrdiff_t strideb, b32x4 apply, b32x4 m8
-                    HIGHBD_DECL_SUFFIX)
-
+                    const ptrdiff_t strideb, b32x4 apply, b32x4 m8)
 {
 
     uint8_t *p3d = dst + strideb * -4;
@@ -1216,9 +1208,7 @@ loop_filter_v_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
 
 static inline void
 loop_filter_v_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
-                    const ptrdiff_t strideb, b32x4 apply, b32x4 m8, b32x4 m16
-                    HIGHBD_DECL_SUFFIX)
-
+                    const ptrdiff_t strideb, b32x4 apply, b32x4 m8, b32x4 m16)
 {
 
     uint8_t *p6d = dst + strideb * -7;
@@ -1373,8 +1363,7 @@ loop_filter_v_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
 void LPF(h_sb_y)(pixel *dst, const ptrdiff_t stride,
                  const uint32_t *const vmask,
                  const uint8_t (*l)[4], ptrdiff_t b4_stride,
-                 const Av1FilterLUT *lut, const int h
-                 HIGHBD_DECL_SUFFIX)
+                 const Av1FilterLUT *lut, const int h)
 {
     unsigned vm = vmask[0] | vmask[1] | vmask[2];
 
@@ -1449,11 +1438,11 @@ void LPF(h_sb_y)(pixel *dst, const ptrdiff_t stride,
         apply = vec_and(m4, apply);
 
         if (vec_any_ne(wd16, zero)) {
-            loop_filter_h_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16 HIGHBD_TAIL_SUFFIX);
+            loop_filter_h_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16);
         } else if (vec_any_ne(wd8, zero)) {
-            loop_filter_h_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8 HIGHBD_TAIL_SUFFIX);
+            loop_filter_h_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8);
         } else { // wd4 == 0 already tested
-            loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX);
+            loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply);
         }
     }
 }
@@ -1461,8 +1450,7 @@ void LPF(h_sb_y)(pixel *dst, const ptrdiff_t stride,
 void LPF(v_sb_y)(pixel *dst, const ptrdiff_t stride,
                  const uint32_t *const vmask,
                  const uint8_t (*l)[4], ptrdiff_t b4_stride,
-                 const Av1FilterLUT *lut, const int w
-                 HIGHBD_DECL_SUFFIX)
+                 const Av1FilterLUT *lut, const int w)
 {
     unsigned vm = vmask[0] | vmask[1] | vmask[2];
 
@@ -1530,11 +1518,11 @@ void LPF(v_sb_y)(pixel *dst, const ptrdiff_t stride,
         apply = vec_and(apply, m4);
 
         if (vec_any_ne(wd16, zero)) {
-            loop_filter_v_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16 HIGHBD_TAIL_SUFFIX);
+            loop_filter_v_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16);
         } else if (vec_any_ne(wd8, zero)) {
-            loop_filter_v_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8 HIGHBD_TAIL_SUFFIX);
+            loop_filter_v_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8);
         } else {
-            loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX);
+            loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply);
         }
 
     }
@@ -1543,8 +1531,7 @@ void LPF(v_sb_y)(pixel *dst, const ptrdiff_t stride,
 void LPF(h_sb_uv)(pixel *dst, const ptrdiff_t stride,
                   const uint32_t *const vmask,
                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
-                  const Av1FilterLUT *lut, const int h
-                  HIGHBD_DECL_SUFFIX)
+                  const Av1FilterLUT *lut, const int h)
 {
     unsigned vm = vmask[0] | vmask[1];
     u32x4 vm0 = vec_splats(vm);
@@ -1614,10 +1601,10 @@ void LPF(h_sb_uv)(pixel *dst, const ptrdiff_t stride,
         apply = vec_and(m4, apply);
 
         if (vec_any_ne(wd6, zero)) {
-            loop_filter_h_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6 HIGHBD_TAIL_SUFFIX);
+            loop_filter_h_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6);
             // loop_filter_h_8
         } else { // wd4 == 0 already tested
-            loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX);
+            loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply);
 
             // loop_filter_h_4
         }
@@ -1628,8 +1615,7 @@ void LPF(h_sb_uv)(pixel *dst, const ptrdiff_t stride,
 void LPF(v_sb_uv)(pixel *dst, const ptrdiff_t stride,
                   const uint32_t *const vmask,
                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
-                  const Av1FilterLUT *lut, const int w
-                  HIGHBD_DECL_SUFFIX)
+                  const Av1FilterLUT *lut, const int w)
 {
     unsigned vm = vmask[0] | vmask[1];
 
@@ -1694,9 +1680,9 @@ void LPF(v_sb_uv)(pixel *dst, const ptrdiff_t stride,
         apply = vec_and(apply, m4);
 
         if (vec_any_ne(wd6, zero)) {
-            loop_filter_v_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6 HIGHBD_TAIL_SUFFIX);
+            loop_filter_v_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6);
         } else {
-            loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX);
+            loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply);
         }
     }
 }
diff --git a/src/ppc/looprestoration.h b/src/ppc/looprestoration.h
index 3fe16318bd53f1a5fec46297fb6680a1feb2b451..614234abfc897180c46371f869cba4f2301ba658 100644
--- a/src/ppc/looprestoration.h
+++ b/src/ppc/looprestoration.h
@@ -35,7 +35,7 @@ void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride,
                              const uint8_t *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
-                             const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
+                             const enum LrEdgeFlags edges);
 
 static ALWAYS_INLINE void loop_restoration_dsp_init_ppc(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
     const unsigned flags = dav1d_get_cpu_flags();
diff --git a/src/ppc/looprestoration_tmpl.c b/src/ppc/looprestoration_tmpl.c
index c0c64e180023cfd1939be3727d0fa8988edf282f..76c1d07f802b110bc50a1c702d8a1404226703ab 100644
--- a/src/ppc/looprestoration_tmpl.c
+++ b/src/ppc/looprestoration_tmpl.c
@@ -305,7 +305,7 @@ void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride,
                              const uint8_t *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
-                             const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+                             const enum LrEdgeFlags edges)
 {
     const int16_t (*const filter)[8] = params->filter;
 
diff --git a/src/ppc/utils.h b/src/ppc/utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b0a5445bca2bbe195a92164740e7171cca5bc5c
--- /dev/null
+++ b/src/ppc/utils.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright © 2024, VideoLAN and dav1d authors
+ * Copyright © 2024, Luca Barbato
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_PPC_UTILS_H
+#define DAV1D_SRC_PPC_UTILS_H
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "src/ppc/dav1d_types.h"
+
+#define assert_eq(a, b) \
+    do { \
+        if ((a) != (b)) \
+            printf("%d: %d vs %d\n", __LINE__, (int)(a), (int)(b)); \
+        assert((a) == (b)); \
+    } while (0)
+
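+// Interleave two i32x4 vectors: h holds the interleaved upper halves
+// ({a0, b0, a1, b1}), l the interleaved lower halves ({a2, b2, a3, b3}).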
+#define MERGE_I32(a, b, h, l) \
+{ \
+    h = vec_mergeh(a, b); \
+    l = vec_mergel(a, b); \
+}
+
+#define DECLARE_MERGE_I32(a, b, h, l) \
+    i32x4 h, l; \
+    MERGE_I32(a, b, h, l)
+
+
+// Transpose a 4x4 i32 matrix held in four i32x4 vectors
+#define TRANSPOSE4_I32(c0, c1, c2, c3) \
+{ \
+    DECLARE_MERGE_I32(c0, c2, m02h, m02l) \
+    DECLARE_MERGE_I32(c1, c3, m13h, m13l) \
+\
+    MERGE_I32(m02h, m13h, c0, c1) \
+    MERGE_I32(m02l, m13l, c2, c3) \
+}
+
+// Transpose an 8x8 i32 matrix held in 16 i32x4 vectors (two per row)
+#define TRANSPOSE8_I32(c0, c1, c2, c3, c4, c5, c6, c7, \
+                       c8, c9, cA, cB, cC, cD, cE, cF) \
+{ \
+    DECLARE_MERGE_I32(c0, c2, m02h, m02l) \
+    DECLARE_MERGE_I32(c1, c3, m13h, m13l) \
+    DECLARE_MERGE_I32(c4, c6, m46h, m46l) \
+    DECLARE_MERGE_I32(c5, c7, m57h, m57l) \
+    DECLARE_MERGE_I32(c8, cA, m8Ah, m8Al) \
+    DECLARE_MERGE_I32(c9, cB, m9Bh, m9Bl) \
+    DECLARE_MERGE_I32(cC, cE, mCEh, mCEl) \
+    DECLARE_MERGE_I32(cD, cF, mDFh, mDFl) \
+\
+    MERGE_I32(m02h, m13h, c0, c1) \
+    MERGE_I32(m02l, m13l, c2, c3) \
+    MERGE_I32(m46h, m57h, c8, c9) \
+    MERGE_I32(m46l, m57l, cA, cB) \
+    MERGE_I32(m8Ah, m9Bh, c4, c5) \
+    MERGE_I32(m8Al, m9Bl, c6, c7) \
+    MERGE_I32(mCEh, mDFh, cC, cD) \
+    MERGE_I32(mCEl, mDFl, cE, cF) \
+}
+
+// Transpose a 4x16 i32 matrix held in 16 i32x4 vectors
+#define TRANSPOSE4x16_I32(c0, c1, c2, c3, c4, c5, c6, c7, \
+                          c8, c9, cA, cB, cC, cD, cE, cF) \
+{ \
+    DECLARE_MERGE_I32(c0, c2, m02h, m02l) \
+    DECLARE_MERGE_I32(c1, c3, m13h, m13l) \
+    DECLARE_MERGE_I32(c4, c6, m46h, m46l) \
+    DECLARE_MERGE_I32(c5, c7, m57h, m57l) \
+    DECLARE_MERGE_I32(c8, cA, m8Ah, m8Al) \
+    DECLARE_MERGE_I32(c9, cB, m9Bh, m9Bl) \
+    DECLARE_MERGE_I32(cC, cE, mCEh, mCEl) \
+    DECLARE_MERGE_I32(cD, cF, mDFh, mDFl) \
+\
+    MERGE_I32(m02h, m13h, c0, c1) \
+    MERGE_I32(m02l, m13l, c2, c3) \
+    MERGE_I32(m46h, m57h, c4, c5) \
+    MERGE_I32(m46l, m57l, c6, c7) \
+    MERGE_I32(m8Ah, m9Bh, c8, c9) \
+    MERGE_I32(m8Al, m9Bl, cA, cB) \
+    MERGE_I32(mCEh, mDFh, cC, cD) \
+    MERGE_I32(mCEl, mDFl, cE, cF) \
+}
+
+#endif // DAV1D_SRC_PPC_UTILS_H
diff --git a/src/recon_tmpl.c b/src/recon_tmpl.c
index 0afd06c16bc763b1f55a6c87cbc5d28e6feb9859..426fa406ed1d1a29243c8d15158e90a1985e99e1 100644
--- a/src/recon_tmpl.c
+++ b/src/recon_tmpl.c
@@ -402,7 +402,8 @@ static int decode_coefs(Dav1dTaskContext *const t,
 
     // find end-of-block (eob)
     int eob_bin;
-    const int tx2dszctx = imin(t_dim->lw, TX_32X32) + imin(t_dim->lh, TX_32X32);
+    const int slw = imin(t_dim->lw, TX_32X32), slh = imin(t_dim->lh, TX_32X32);
+    const int tx2dszctx = slw + slh;
     const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
     const int is_1d = tx_class != TX_CLASS_2D;
     switch (tx2dszctx) {
@@ -449,10 +450,9 @@ static int decode_coefs(Dav1dTaskContext *const t,
     if (eob) {
         uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
         uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
-        const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8);
 
         /* eob */
-        unsigned ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
+        unsigned ctx = 1 + (eob > 2 << tx2dszctx) + (eob > 4 << tx2dszctx);
         int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
         int tok = eob_tok + 1;
         int level_tok = tok * 0x41;
@@ -460,6 +460,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
 
 #define DECODE_COEFS_CLASS(tx_class) \
         unsigned x, y; \
+        uint8_t *level; \
         if (tx_class == TX_CLASS_2D) \
             rc = scan[eob], x = rc >> shift, y = rc & mask; \
         else if (tx_class == TX_CLASS_H) \
@@ -480,7 +481,11 @@ static int decode_coefs(Dav1dTaskContext *const t,
                        ts->msac.rng); \
         } \
         cf[rc] = tok << 11; \
-        levels[x * stride + y] = (uint8_t) level_tok; \
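+        /* for 2D scans stride == 1 << shift, so x * stride + y == rc */ \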
+        if (tx_class == TX_CLASS_2D) \
+            level = levels + rc; \
+        else \
+            level = levels + x * stride + y; \
+        *level = (uint8_t) level_tok; \
         for (int i = eob - 1; i > 0; i--) { /* ac */ \
             unsigned rc_i; \
             if (tx_class == TX_CLASS_2D) \
@@ -490,7 +495,10 @@ static int decode_coefs(Dav1dTaskContext *const t,
             else /* tx_class == TX_CLASS_V */ \
                 x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
             assert(x < 32 && y < 32); \
-            uint8_t *const level = levels + x * stride + y; \
+            if (tx_class == TX_CLASS_2D) \
+                level = levels + rc_i; \
+            else \
+                level = levels + x * stride + y; \
             ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
             if (tx_class == TX_CLASS_2D) \
                 y |= x; \
@@ -547,26 +555,26 @@ static int decode_coefs(Dav1dTaskContext *const t,
             const uint8_t (*const lo_ctx_offsets)[5] =
                 dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
             scan = dav1d_scans[tx];
-            const ptrdiff_t stride = 4 * sh;
-            const unsigned shift = t_dim->lh < 4 ? t_dim->lh + 2 : 5, shift2 = 0;
-            const unsigned mask = 4 * sh - 1;
-            memset(levels, 0, stride * (4 * sw + 2));
+            const ptrdiff_t stride = 4 << slh;
+            const unsigned shift = slh + 2, shift2 = 0;
+            const unsigned mask = (4 << slh) - 1;
+            memset(levels, 0, stride * ((4 << slw) + 2));
             DECODE_COEFS_CLASS(TX_CLASS_2D);
         }
         case TX_CLASS_H: {
             const uint8_t (*const lo_ctx_offsets)[5] = NULL;
             const ptrdiff_t stride = 16;
-            const unsigned shift = t_dim->lh + 2, shift2 = 0;
-            const unsigned mask = 4 * sh - 1;
-            memset(levels, 0, stride * (4 * sh + 2));
+            const unsigned shift = slh + 2, shift2 = 0;
+            const unsigned mask = (4 << slh) - 1;
+            memset(levels, 0, stride * ((4 << slh) + 2));
             DECODE_COEFS_CLASS(TX_CLASS_H);
         }
         case TX_CLASS_V: {
             const uint8_t (*const lo_ctx_offsets)[5] = NULL;
             const ptrdiff_t stride = 16;
-            const unsigned shift = t_dim->lw + 2, shift2 = t_dim->lh + 2;
-            const unsigned mask = 4 * sw - 1;
-            memset(levels, 0, stride * (4 * sw + 2));
+            const unsigned shift = slw + 2, shift2 = slh + 2;
+            const unsigned mask = (4 << slw) - 1;
+            memset(levels, 0, stride * ((4 << slw) + 2));
             DECODE_COEFS_CLASS(TX_CLASS_V);
         }
 #undef DECODE_COEFS_CLASS
@@ -785,21 +793,15 @@ static void read_coef_tree(Dav1dTaskContext *const t,
             if (DEBUG_BLOCK_INFO)
                 printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                        ytx, txtp, eob, ts->msac.rng);
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-            rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
-#define default_memset(dir, diridx, off, sz) \
-            memset(&t->dir lcoef[off], cf_ctx, sz)
-            case_set_upto16_with_default(imin(txh, f->bh - t->by), l., 1, by4);
-            case_set_upto16_with_default(imin(txw, f->bw - t->bx), a->, 0, bx4);
-#undef default_memset
-#undef set_ctx
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            dav1d_memset_likely_pow2(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
+            dav1d_memset_likely_pow2(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
+#define set_ctx(rep_macro) \
             for (int y = 0; y < txh; y++) { \
-                rep_macro(type, txtp_map, 0, mul * txtp); \
+                rep_macro(txtp_map, 0, txtp); \
                 txtp_map += 32; \
             }
             uint8_t *txtp_map = &t->scratch.txtp_map[by4 * 32 + bx4];
-            case_set_upto16(txw,,,);
+            case_set_upto16(t_dim->lw);
 #undef set_ctx
             if (t->frame_thread.pass == 1)
                 *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
@@ -838,18 +840,16 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
                            (bh4 > ss_ver || t->by & 1);
 
     if (b->skip) {
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-        rep_macro(type, t->dir lcoef, off, mul * 0x40)
-        case_set(bh4, l., 1, by4);
-        case_set(bw4, a->, 0, bx4);
-#undef set_ctx
+        BlockContext *const a = t->a;
+        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
+        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
         if (has_chroma) {
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-            rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
-            rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
-            case_set(cbh4, l., 1, cby4);
-            case_set(cbw4, a->, 0, cbx4);
-#undef set_ctx
+            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
+            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
+            memset_cw(&a->ccoef[0][cbx4], 0x40);
+            memset_cw(&a->ccoef[1][cbx4], 0x40);
+            memset_ch(&t->l.ccoef[0][cby4], 0x40);
+            memset_ch(&t->l.ccoef[1][cby4], 0x40);
         }
         return;
     }
@@ -890,16 +890,8 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
                                    b->tx, txtp, eob, ts->msac.rng);
                         *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
                         ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-                        rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
-#define default_memset(dir, diridx, off, sz) \
-                        memset(&t->dir lcoef[off], cf_ctx, sz)
-                        case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by),
-                                                     l., 1, by4 + y);
-                        case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx),
-                                                     a->, 0, bx4 + x);
-#undef default_memset
-#undef set_ctx
+                        dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
+                        dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
                     }
                 }
                 t->bx -= x;
@@ -933,18 +925,10 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
                                    pl, b->uvtx, txtp, eob, ts->msac.rng);
                         *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
                         ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-                        rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
-#define default_memset(dir, diridx, off, sz) \
-                        memset(&t->dir ccoef[pl][off], cf_ctx, sz)
-                        case_set_upto16_with_default( \
-                                 imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
-                                 l., 1, cby4 + y);
-                        case_set_upto16_with_default( \
-                                 imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
-                                 a->, 0, cbx4 + x);
-#undef default_memset
-#undef set_ctx
+                        int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
+                        int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
+                        dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
+                        dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
                     }
                     t->bx -= x << ss_hor;
                 }
@@ -1329,16 +1313,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
                             if (DEBUG_BLOCK_INFO)
                                 printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                                        b->tx, txtp, eob, ts->msac.rng);
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-                            rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
-#define default_memset(dir, diridx, off, sz) \
-                            memset(&t->dir lcoef[off], cf_ctx, sz)
-                            case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by), \
-                                                         l., 1, by4 + y);
-                            case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx), \
-                                                         a->, 0, bx4 + x);
-#undef default_memset
-#undef set_ctx
+                            dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
+                            dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
                         }
                         if (eob >= 0) {
                             if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
@@ -1353,11 +1329,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
                                          t_dim->w * 4, t_dim->h * 4, "recon");
                         }
                     } else if (!t->frame_thread.pass) {
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-                        rep_macro(type, t->dir lcoef, off, mul * 0x40)
-                        case_set_upto16(t_dim->h, l., 1, by4 + y);
-                        case_set_upto16(t_dim->w, a->, 0, bx4 + x);
-#undef set_ctx
+                        dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0x40);
+                        dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0x40);
                     }
                     dst += 4 * t_dim->w;
                 }
@@ -1554,18 +1527,10 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
                                     printf("Post-uv-cf-blk[pl=%d,tx=%d,"
                                            "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
                                            pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-                                rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
-#define default_memset(dir, diridx, off, sz) \
-                                memset(&t->dir ccoef[pl][off], cf_ctx, sz)
-                                case_set_upto16_with_default( \
-                                         imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
-                                         l., 1, cby4 + y);
-                                case_set_upto16_with_default( \
-                                         imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
-                                         a->, 0, cbx4 + x);
-#undef default_memset
-#undef set_ctx
+                                int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
+                                int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
+                                dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
+                                dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
                             }
                             if (eob >= 0) {
                                 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
@@ -1579,11 +1544,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
                                              uv_t_dim->h * 4, "recon");
                             }
                         } else if (!t->frame_thread.pass) {
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-                            rep_macro(type, t->dir ccoef[pl], off, mul * 0x40)
-                            case_set_upto16(uv_t_dim->h, l., 1, cby4 + y);
-                            case_set_upto16(uv_t_dim->w, a->, 0, cbx4 + x);
-#undef set_ctx
+                            dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0x40);
+                            dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0x40);
                         }
                         dst += uv_t_dim->w * 4;
                     }
@@ -1921,18 +1883,16 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
 
     if (b->skip) {
         // reset coef contexts
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-        rep_macro(type, t->dir lcoef, off, mul * 0x40)
-        case_set(bh4, l., 1, by4);
-        case_set(bw4, a->, 0, bx4);
-#undef set_ctx
+        BlockContext *const a = t->a;
+        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
+        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
         if (has_chroma) {
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-            rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
-            rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
-            case_set(cbh4, l., 1, cby4);
-            case_set(cbw4, a->, 0, cbx4);
-#undef set_ctx
+            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
+            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
+            memset_cw(&a->ccoef[0][cbx4], 0x40);
+            memset_cw(&a->ccoef[1][cbx4], 0x40);
+            memset_ch(&t->l.ccoef[0][cby4], 0x40);
+            memset_ch(&t->l.ccoef[1][cby4], 0x40);
         }
         return 0;
     }
@@ -1998,18 +1958,10 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
                                 printf("Post-uv-cf-blk[pl=%d,tx=%d,"
                                        "txtp=%d,eob=%d]: r=%d\n",
                                        pl, b->uvtx, txtp, eob, ts->msac.rng);
-#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
-                            rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
-#define default_memset(dir, diridx, off, sz) \
-                            memset(&t->dir ccoef[pl][off], cf_ctx, sz)
-                            case_set_upto16_with_default( \
-                                     imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver),
-                                     l., 1, cby4 + y);
-                            case_set_upto16_with_default( \
-                                     imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor),
-                                     a->, 0, cbx4 + x);
-#undef default_memset
-#undef set_ctx
+                            int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor);
+                            int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver);
+                            dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
+                            dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
                         }
                         if (eob >= 0) {
                             if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
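The recon changes above repeatedly swap the old set_ctx()/case_set*() macro machinery for dav1d_memset_pow2[] (a table of fixed-size fills indexed by log2 of the element count) and dav1d_memset_likely_pow2() (for counts that are usually, but not always, a power of two because of frame-edge clipping). The actual definitions are not part of this diff; a minimal self-contained sketch of that dispatch pattern, reusing the names seen in the call sites above, could look like this:

    #include <string.h>

    typedef void (*dav1d_memset_pow2_fn)(void *ptr, int value);

    /* One fill routine per power-of-two size; each memset() has a constant
     * length, so the compiler lowers it to a handful of stores. */
    #define DEF_MEMSET_POW2(n) \
        static void memset_pow2_##n(void *const ptr, const int value) { \
            memset(ptr, value, n); \
        }
    DEF_MEMSET_POW2(1) DEF_MEMSET_POW2(2)  DEF_MEMSET_POW2(4)
    DEF_MEMSET_POW2(8) DEF_MEMSET_POW2(16) DEF_MEMSET_POW2(32)

    static const dav1d_memset_pow2_fn dav1d_memset_pow2[6] = {
        memset_pow2_1, memset_pow2_2,  memset_pow2_4,
        memset_pow2_8, memset_pow2_16, memset_pow2_32,
    };

    /* For clipped widths/heights: take the constant-size path when the
     * count happens to be a power of two, otherwise fall back to memset.
     * Assumes n > 0; __builtin_clz() is the GCC/Clang builtin. */
    static void dav1d_memset_likely_pow2(void *const ptr, const int value,
                                         const int n)
    {
        if (!(n & (n - 1)))
            dav1d_memset_pow2[31 ^ __builtin_clz(n)](ptr, value);
        else
            memset(ptr, value, n);
    }

Indexing the table by b_dim[2]/b_dim[3] or ulog2() of the block dimension, as the new call sites do, then replaces the old switch-style case_set macros with a single indirect call.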
diff --git a/src/refmvs.h b/src/refmvs.h
index d29feedce5ab5aaccd43dd82ffd620710d908c60..2c429844468710b099b9d5e845fc929e75bffde0 100644
--- a/src/refmvs.h
+++ b/src/refmvs.h
@@ -43,22 +43,26 @@ PACKED(typedef struct refmvs_temporal_block {
     mv mv;
     int8_t ref;
 }) refmvs_temporal_block;
+CHECK_SIZE(refmvs_temporal_block, 5);
 
-typedef union refmvs_refpair {
+PACKED(typedef union refmvs_refpair {
     int8_t ref[2]; // [0] = 0: intra=1, [1] = -1: comp=0
     uint16_t pair;
-} refmvs_refpair;
+}) ALIGN(refmvs_refpair, 2);
+CHECK_SIZE(refmvs_refpair, 2);
 
 typedef union refmvs_mvpair {
     mv mv[2];
     uint64_t n;
 } refmvs_mvpair;
+CHECK_SIZE(refmvs_mvpair, 8);
 
 PACKED(typedef struct refmvs_block {
     refmvs_mvpair mv;
     refmvs_refpair ref;
     uint8_t bs, mf; // 1 = globalmv+affine, 2 = newmv
 }) ALIGN(refmvs_block, 4);
+CHECK_SIZE(refmvs_block, 12);
 
 typedef struct refmvs_frame {
     const Dav1dFrameHeader *frm_hdr;
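The new CHECK_SIZE() lines assert at compile time that the PACKED()/ALIGN() combination really yields the byte layout the rest of the decoder (and the asm) expects: 5 bytes per temporal block, 2 per ref pair, 8 per mv pair, 12 per refmvs_block. The macro itself is defined elsewhere; as a hedged C11 sketch (dav1d's actual CHECK_SIZE() may be spelled differently, e.g. via a negative-size array for older compilers), such a guard can be written as:

    #include <assert.h>   /* static_assert */
    #include <stdint.h>

    #define CHECK_SIZE(type, size) \
        static_assert(sizeof(type) == (size), #type ": unexpected size")

    typedef struct { int16_t y, x; } demo_mv;                 /* 4 bytes */
    typedef struct { demo_mv mv; int8_t ref; }
        __attribute__((packed)) demo_temporal_block;          /* 5 bytes, no padding */
    CHECK_SIZE(demo_temporal_block, 5);  /* fails to compile if padding sneaks in */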
diff --git a/src/riscv/cpu.c b/src/riscv/cpu.c
index 30e135435960b18df0e98a3a71b5880f2d78eb6a..345ff079e11ea15915a2b83303098aceb582cc65 100644
--- a/src/riscv/cpu.c
+++ b/src/riscv/cpu.c
@@ -29,9 +29,10 @@
 
 #include "common/attributes.h"
 
+#include "src/cpu.h"
 #include "src/riscv/cpu.h"
 
-#if defined(HAVE_GETAUXVAL)
+#if HAVE_GETAUXVAL
 #include <sys/auxv.h>
 
 #define HWCAP_RVV (1 << ('v' - 'a'))
@@ -41,8 +42,8 @@
 int dav1d_has_compliant_rvv(void);
 
 COLD unsigned dav1d_get_cpu_flags_riscv(void) {
-    unsigned flags = 0;
-#if defined(HAVE_GETAUXVAL)
+    unsigned flags = dav1d_get_default_cpu_flags();
+#if HAVE_GETAUXVAL
     unsigned long hw_cap = getauxval(AT_HWCAP);
     flags |= (hw_cap & HWCAP_RVV) && dav1d_has_compliant_rvv() ? DAV1D_RISCV_CPU_FLAG_V : 0;
 #endif
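Starting the runtime detection from dav1d_get_default_cpu_flags() rather than 0 (here and in the x86 counterpart further down) lets flags already implied by the compile-time target be reported without probing. The helper comes from src/cpu.h and is not shown in this diff; one plausible reading, sketched purely as an assumption with a hypothetical flag name, is:

    /* Illustrative only: report extensions the compiler was already allowed
     * to assume, so the hwcap probe only adds to this baseline. */
    enum { SKETCH_RISCV_CPU_FLAG_V = 1 << 0 };   /* hypothetical value */

    static unsigned get_default_cpu_flags_sketch(void) {
        unsigned flags = 0;
    #if defined(__riscv_v)   /* toolchain already targets the V extension */
        flags |= SKETCH_RISCV_CPU_FLAG_V;
    #endif
        return flags;
    }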
diff --git a/src/scan.c b/src/scan.c
index 5261ccd3d1ad91dd91fafce1db832f89bfccec2b..6f9dc03691e4563d100bc6b19ca2a70f792c3296 100644
--- a/src/scan.c
+++ b/src/scan.c
@@ -28,7 +28,10 @@
 #include "config.h"
 
 #include "common/attributes.h"
+#include "common/intops.h"
+
 #include "src/scan.h"
+#include "src/thread.h"
 
 static const uint16_t ALIGN(scan_4x4[], 32) = {
      0,  4,  1,  2,
@@ -297,3 +300,76 @@ const uint16_t *const dav1d_scans[N_RECT_TX_SIZES] = {
     [RTX_16X64] = scan_16x32,
     [RTX_64X16] = scan_32x16,
 };
+
+static uint8_t last_nonzero_col_from_eob_4x4[16];
+static uint8_t last_nonzero_col_from_eob_8x8[64];
+static uint8_t last_nonzero_col_from_eob_16x16[256];
+static uint8_t last_nonzero_col_from_eob_32x32[1024];
+static uint8_t last_nonzero_col_from_eob_4x8[32];
+static uint8_t last_nonzero_col_from_eob_8x4[32];
+static uint8_t last_nonzero_col_from_eob_8x16[128];
+static uint8_t last_nonzero_col_from_eob_16x8[128];
+static uint8_t last_nonzero_col_from_eob_16x32[512];
+static uint8_t last_nonzero_col_from_eob_32x16[512];
+static uint8_t last_nonzero_col_from_eob_4x16[64];
+static uint8_t last_nonzero_col_from_eob_16x4[64];
+static uint8_t last_nonzero_col_from_eob_8x32[256];
+static uint8_t last_nonzero_col_from_eob_32x8[256];
+
+static COLD void init_tbl(uint8_t *const last_nonzero_col_from_eob,
+                          const uint16_t *const scan, const int w, const int h)
+{
+    int max_col = 0;
+    for (int y = 0, n = 0; y < h; y++) {
+        for (int x = 0; x < w; x++, n++) {
+            const int rc = scan[n];
+            const int rcx = rc & (h - 1);
+            max_col = imax(max_col, rcx);
+            last_nonzero_col_from_eob[n] = max_col;
+        }
+    }
+}
+
+static COLD void init_internal(void) {
+    init_tbl(last_nonzero_col_from_eob_4x4,   scan_4x4,    4,  4);
+    init_tbl(last_nonzero_col_from_eob_8x8,   scan_8x8,    8,  8);
+    init_tbl(last_nonzero_col_from_eob_16x16, scan_16x16, 16, 16);
+    init_tbl(last_nonzero_col_from_eob_32x32, scan_32x32, 32, 32);
+    init_tbl(last_nonzero_col_from_eob_4x8,   scan_4x8,    4,  8);
+    init_tbl(last_nonzero_col_from_eob_8x4,   scan_8x4,    8,  4);
+    init_tbl(last_nonzero_col_from_eob_8x16,  scan_8x16,   8, 16);
+    init_tbl(last_nonzero_col_from_eob_16x8,  scan_16x8,  16,  8);
+    init_tbl(last_nonzero_col_from_eob_16x32, scan_16x32, 16, 32);
+    init_tbl(last_nonzero_col_from_eob_32x16, scan_32x16, 32, 16);
+    init_tbl(last_nonzero_col_from_eob_4x16,  scan_4x16,   4, 16);
+    init_tbl(last_nonzero_col_from_eob_16x4,  scan_16x4,  16,  4);
+    init_tbl(last_nonzero_col_from_eob_8x32,  scan_8x32,   8, 32);
+    init_tbl(last_nonzero_col_from_eob_32x8,  scan_32x8,  32,  8);
+}
+
+COLD void dav1d_init_last_nonzero_col_from_eob_tables(void) {
+    static pthread_once_t initted = PTHREAD_ONCE_INIT;
+    pthread_once(&initted, init_internal);
+}
+
+const uint8_t *const dav1d_last_nonzero_col_from_eob[N_RECT_TX_SIZES] = {
+    [ TX_4X4  ] = last_nonzero_col_from_eob_4x4,
+    [ TX_8X8  ] = last_nonzero_col_from_eob_8x8,
+    [ TX_16X16] = last_nonzero_col_from_eob_16x16,
+    [ TX_32X32] = last_nonzero_col_from_eob_32x32,
+    [ TX_64X64] = last_nonzero_col_from_eob_32x32,
+    [RTX_4X8  ] = last_nonzero_col_from_eob_4x8,
+    [RTX_8X4  ] = last_nonzero_col_from_eob_8x4,
+    [RTX_8X16 ] = last_nonzero_col_from_eob_8x16,
+    [RTX_16X8 ] = last_nonzero_col_from_eob_16x8,
+    [RTX_16X32] = last_nonzero_col_from_eob_16x32,
+    [RTX_32X16] = last_nonzero_col_from_eob_32x16,
+    [RTX_32X64] = last_nonzero_col_from_eob_32x32,
+    [RTX_64X32] = last_nonzero_col_from_eob_32x32,
+    [RTX_4X16 ] = last_nonzero_col_from_eob_4x16,
+    [RTX_16X4 ] = last_nonzero_col_from_eob_16x4,
+    [RTX_8X32 ] = last_nonzero_col_from_eob_8x32,
+    [RTX_32X8 ] = last_nonzero_col_from_eob_32x8,
+    [RTX_16X64] = last_nonzero_col_from_eob_16x32,
+    [RTX_64X16] = last_nonzero_col_from_eob_32x16,
+};
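Each of the new tables is a cumulative maximum over the scan order: entry n holds the largest value of rc & (h - 1) seen among scan positions 0..n, i.e. the furthest "column" (in dav1d's transposed coefficient layout) that can still hold a nonzero coefficient when the eob equals n. Wrapping the setup in pthread_once() makes dav1d_init_last_nonzero_col_from_eob_tables() safe to call from any number of threads while guaranteeing init_internal() runs exactly once. A hedged usage sketch, not an actual dav1d call site, under those assumptions:

    #include <stdint.h>

    /* Mirrors the declarations added to src/scan.h below. */
    void dav1d_init_last_nonzero_col_from_eob_tables(void);
    extern const uint8_t *const dav1d_last_nonzero_col_from_eob[];

    /* Hypothetical helper: for a transform size index (same indexing as
     * dav1d_scans[]) and an eob, how many columns can hold nonzero
     * coefficients; everything beyond that is known to be zero. */
    static int nonzero_cols(const int tx, const int eob) {
        dav1d_init_last_nonzero_col_from_eob_tables(); /* cheap after 1st call */
        return dav1d_last_nonzero_col_from_eob[tx][eob] + 1;
    }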
diff --git a/src/scan.h b/src/scan.h
index 09df9887799efbc8d7b8cb631791358944e2e21b..2bd0b5b84e9521f77fac6b4bf23d4279f8f6d7b0 100644
--- a/src/scan.h
+++ b/src/scan.h
@@ -33,5 +33,8 @@
 #include "src/levels.h"
 
 EXTERN const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
+EXTERN const uint8_t *const dav1d_last_nonzero_col_from_eob[N_RECT_TX_SIZES];
+
+void dav1d_init_last_nonzero_col_from_eob_tables(void);
 
 #endif /* DAV1D_SRC_SCAN_H */
diff --git a/src/thread.h b/src/thread.h
index c44de736c3a788166c61bb3c8cef6f3306559a67..459aaced66292722b6f844102ab12d9ef28eb192 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -132,6 +132,14 @@ static inline int pthread_cond_broadcast(pthread_cond_t *const cond) {
 #else
 
 #include <pthread.h>
+#if defined(__FreeBSD__)
+ /* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */
+#define _SYS_PARAM_H_
+#include <sys/types.h>
+#endif
+#if HAVE_PTHREAD_NP_H
+#include <pthread_np.h>
+#endif
 
 #define dav1d_init_thread() do {} while (0)
 
@@ -145,29 +153,28 @@ static inline void dav1d_set_thread_name(const char *const name) {
     prctl(PR_SET_NAME, name);
 }
 
-#elif defined(__APPLE__)
+#elif HAVE_PTHREAD_SETNAME_NP && defined(__APPLE__)
 
 static inline void dav1d_set_thread_name(const char *const name) {
     pthread_setname_np(name);
 }
 
-#elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__)
+#elif HAVE_PTHREAD_SETNAME_NP && defined(__NetBSD__)
 
-#if defined(__FreeBSD__)
- /* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */
-#define _SYS_PARAM_H_
-#include <sys/types.h>
-#endif
-#include <pthread_np.h>
+static inline void dav1d_set_thread_name(const char *const name) {
+    pthread_setname_np(pthread_self(), "%s", (void*)name);
+}
+
+#elif HAVE_PTHREAD_SETNAME_NP
 
 static inline void dav1d_set_thread_name(const char *const name) {
-    pthread_set_name_np(pthread_self(), name);
+    pthread_setname_np(pthread_self(), name);
 }
 
-#elif defined(__NetBSD__)
+#elif HAVE_PTHREAD_SET_NAME_NP
 
 static inline void dav1d_set_thread_name(const char *const name) {
-    pthread_setname_np(pthread_self(), "%s", (void*)name);
+    pthread_set_name_np(pthread_self(), name);
 }
 
 #elif defined(__HAIKU__)
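The reshuffled #elif chain now keys each variant off build-system probes (HAVE_PTHREAD_SETNAME_NP / HAVE_PTHREAD_SET_NAME_NP) instead of guessing from OS macros, with the platforms that use non-standard argument signatures (macOS, NetBSD) handled first. Callers are unaffected; a minimal usage sketch, assuming the header's usual no-op fallback when neither probe succeeds:

    /* dav1d_set_thread_name() resolves to whichever variant the build
     * detected; where no supported API exists it is expected to compile
     * to nothing. */
    static void *worker_thread(void *const arg) {
        dav1d_set_thread_name("dav1d-worker");
        return arg;
    }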
diff --git a/src/x86/cpu.c b/src/x86/cpu.c
index f570fd7f391c535d19f84313a39e645007030423..80f91e16eb08fe97649ad923087b3727983bd1be 100644
--- a/src/x86/cpu.c
+++ b/src/x86/cpu.c
@@ -32,6 +32,7 @@
 
 #include "common/attributes.h"
 
+#include "src/cpu.h"
 #include "src/x86/cpu.h"
 
 typedef struct {
@@ -52,7 +53,7 @@ COLD unsigned dav1d_get_cpu_flags_x86(void) {
         };
     } cpu;
     dav1d_cpu_cpuid(&cpu.r, 0, 0);
-    unsigned flags = 0;
+    unsigned flags = dav1d_get_default_cpu_flags();
 
     if (cpu.max_leaf >= 1) {
         CpuidRegisters r;
diff --git a/src/x86/itx.h b/src/x86/itx.h
index 23d7a73806e19ab9e7549b8e8a8a0065b596da0d..a8a490fa47d751af940e5b21f5edfee74fe13fc5 100644
--- a/src/x86/itx.h
+++ b/src/x86/itx.h
@@ -107,7 +107,9 @@ decl_itx_fns(ssse3);
 decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
 decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));
 
-static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) {
+static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c,
+                                           const int bpc, int *const all_simd)
+{
 #define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
     c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
         BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
@@ -167,6 +169,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
     assign_itx1_fn (R, 64, 16, ssse3);
     assign_itx1_fn (R, 64, 32, ssse3);
     assign_itx1_fn ( , 64, 64, ssse3);
+    *all_simd = 1;
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
@@ -192,6 +195,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
         assign_itx1_fn (R, 64, 16, sse4);
         assign_itx1_fn (R, 64, 32, sse4);
         assign_itx1_fn (,  64, 64, sse4);
+        *all_simd = 1;
     }
 #endif
 
diff --git a/src/x86/mc16_sse.asm b/src/x86/mc16_sse.asm
index b0c42597f7293c5ec5e7321f76498c1a52173b58..319dd45544c0064835f86faa19ecd38584c95097 100644
--- a/src/x86/mc16_sse.asm
+++ b/src/x86/mc16_sse.asm
@@ -67,6 +67,8 @@ pw_m512:          times 8 dw -512
 pd_63:            times 4 dd 63
 pd_64:            times 4 dd 64
 pd_512:           times 4 dd 512
+pd_2560:          times 2 dd 2560
+pd_8704:          times 2 dd 8704
 pd_m524256:       times 4 dd -524256 ; -8192 << 6 + 32
 pd_0x3ff:         times 4 dd 0x3ff
 pd_0x4000:        times 4 dd 0x4000
@@ -1158,7 +1160,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3
 %assign FILTER_SMOOTH  (1*15 << 16) | 4*15
 %assign FILTER_SHARP   (2*15 << 16) | 3*15
 
-%macro FN 4 ; prefix, type, type_h, type_v
+%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
 cglobal %1_%2_16bpc
     mov                 t0d, FILTER_%3
 %ifidn %3, %4
@@ -1166,8 +1168,8 @@ cglobal %1_%2_16bpc
 %else
     mov                 t1d, FILTER_%4
 %endif
-%ifnidn %2, regular ; skip the jump in the last filter
-    jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX)
+%if %0 == 5 ; skip the jump in the last filter
+    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
 %endif
 %endmacro
 
@@ -1180,40 +1182,25 @@ DECLARE_REG_TMP 7, 8, 8
 %endif
 
 %define PUT_8TAP_FN FN put_8tap,
-PUT_8TAP_FN sharp,          SHARP,   SHARP
-PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
-PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
-PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH
-PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR
-PUT_8TAP_FN regular_sharp,  REGULAR, SHARP
-PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR
-PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
+PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
 PUT_8TAP_FN regular,        REGULAR, REGULAR
 
+cglobal put_6tap_16bpc, 0, 9, 0, dst, ds, src, ss, w, h, mx, my
+    %define            base  t2-put_ssse3
 %if ARCH_X86_32
-cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my
-%define mxb r0b
-%define mxd r0
-%define mxq r0
-%define myb r1b
-%define myd r1
-%define myq r1
-%define  m8 [esp+16*0]
-%define  m9 [esp+16*1]
-%define m10 [esp+16*2]
-%define m11 [esp+16*3]
-%define m12 [esp+16*4]
-%define m13 [esp+16*5]
-%define m14 [esp+16*6]
-%define m15 [esp+16*7]
-%else
-cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
-%endif
-%define base t2-put_ssse3
+    %define             mxb  r0b
+    %define             mxd  r0
+    %define             mxq  r0
+    %define             myb  r1b
+    %define             myd  r1
+    %define             myq  r1
+%endif
     imul                mxd, mxm, 0x010101
-    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
+    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
     imul                myd, mym, 0x010101
-    add                 myd, t1d ; 8tap_v, my, 4tap_v
+    add                 myd, t1d ; 6tap_v, my, 4tap_v
     LEA                  t2, put_ssse3
     movifnidn            wd, wm
     movifnidn          srcq, srcmp
@@ -1223,6 +1210,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
     jnz .h
     test                myd, 0xf00
     jnz .v
+.put:
     tzcnt                wd, wd
     movzx                wd, word [base+put_ssse3_table+wq*2]
     movifnidn          dstq, dstmp
@@ -1233,24 +1221,6 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
     pop                  r7
 %endif
     jmp                  wq
-.h:
-    test                myd, 0xf00
-    jnz .hv
-    mov                 myd, r8m
-    movd                 m5, r8m
-    shr                 myd, 11
-    movddup              m4, [base+put_8tap_h_rnd+myq*8]
-    movifnidn           dsq, dsmp
-    pshufb               m5, [base+pw_256]
-    cmp                  wd, 4
-    jg .h_w8
-    movzx               mxd, mxb
-    lea                srcq, [srcq-2]
-    movq                 m3, [base+subpel_filters+mxq*8]
-    movifnidn          dstq, dstmp
-    punpcklbw            m3, m3
-    psraw                m3, 8 ; sign-extend
-    je .h_w4
 .h_w2:
     mova                 m2, [base+spel_h_shuf2]
     pshufd               m3, m3, q2121
@@ -1277,89 +1247,111 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
     jg .h_w2_loop
     RET
 .h_w4:
-    WIN64_SPILL_XMM       8
-    mova                 m6, [base+spel_h_shufA]
-    mova                 m7, [base+spel_h_shufB]
+    movzx               mxd, mxb
+    lea                srcq, [srcq-2]
+    movq                 m3, [base+subpel_filters+mxq*8]
+    movifnidn          dstq, dstmp
+    punpcklbw            m3, m3
+    psraw                m3, 8 ; sign-extend
+    jl .h_w2
+    WIN64_SPILL_XMM       9
+    mova                 m7, [base+spel_h_shufA]
+%if ARCH_X86_32
+    %define              m8  [base+spel_h_shufB]
+%else
+    mova                 m8, [base+spel_h_shufB]
+%endif
     pshufd               m2, m3, q1111
     pshufd               m3, m3, q2222
 .h_w4_loop:
-    movu                 m1, [srcq]
-    add                srcq, ssq
-    pshufb               m0, m1, m6 ; 0 1 1 2 2 3 3 4
-    pshufb               m1, m7     ; 2 3 3 4 4 5 5 6
-    pmaddwd              m0, m2
+    movu                 m0, [srcq+ssq*0]
+    movu                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb               m6, m0, m7 ; 0 1 1 2 2 3 3 4
+    pmaddwd              m6, m2
+    pshufb               m0, m8     ; 2 3 3 4 4 5 5 6
+    pmaddwd              m0, m3
+    paddd                m0, m6
+    pshufb               m6, m1, m7
+    pmaddwd              m6, m2
+    pshufb               m1, m8
     pmaddwd              m1, m3
     paddd                m0, m4
-    paddd                m0, m1
+    paddd                m6, m4
+    paddd                m1, m6
     psrad                m0, 6
-    packssdw             m0, m0
+    psrad                m1, 6
+    packssdw             m0, m1
     pxor                 m1, m1
     pminsw               m0, m5
     pmaxsw               m0, m1
-    movq             [dstq], m0
-    add                dstq, dsq
-    dec                  hd
+    movq       [dstq+dsq*0], m0
+    movhps     [dstq+dsq*1], m0
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
     jg .h_w4_loop
     RET
-.h_w8:
-    WIN64_SPILL_XMM      12
+.h:
+    RESET_STACK_STATE
+    test                myd, 0xf00
+    jnz .hv
+    mov                 myd, r8m
+    movd                 m5, r8m
+    shr                 myd, 11
+    movddup              m4, [base+put_8tap_h_rnd+myq*8]
+    movifnidn           dsq, dsmp
+    pshufb               m5, [base+pw_256]
+    sub                  wd, 4
+    jle .h_w4
+    WIN64_SPILL_XMM      11
     shr                 mxd, 16
-    movq                 m3, [base+subpel_filters+mxq*8]
+    movq                 m2, [base+subpel_filters+1+mxq*8]
     movifnidn          dstq, dstmp
     mova                 m6, [base+spel_h_shufA]
     mova                 m7, [base+spel_h_shufB]
-%if UNIX64
-    mov                  wd, wd
-%endif
     lea                srcq, [srcq+wq*2]
-    punpcklbw            m3, m3
+    punpcklbw            m2, m2
     lea                dstq, [dstq+wq*2]
-    psraw                m3, 8
+    psraw                m2, 8
     neg                  wq
 %if ARCH_X86_32
-    ALLOC_STACK       -16*4
-    pshufd               m0, m3, q0000
-    pshufd               m1, m3, q1111
-    pshufd               m2, m3, q2222
-    pshufd               m3, m3, q3333
+    ALLOC_STACK       -16*3
+    %define              m8  [rsp+16*0]
+    %define              m9  [rsp+16*1]
+    %define             m10  [rsp+16*2]
+    pshufd               m0, m2, q0000
+    pshufd               m1, m2, q1111
+    pshufd               m2, m2, q2222
     mova                 m8, m0
     mova                 m9, m1
     mova                m10, m2
-    mova                m11, m3
 %else
-    pshufd               m8, m3, q0000
-    pshufd               m9, m3, q1111
-    pshufd              m10, m3, q2222
-    pshufd              m11, m3, q3333
+    pshufd               m8, m2, q0000
+    pshufd               m9, m2, q1111
+    pshufd              m10, m2, q2222
 %endif
 .h_w8_loop0:
     mov                  r6, wq
 .h_w8_loop:
-    movu                 m0, [srcq+r6*2- 6]
-    movu                 m1, [srcq+r6*2+ 2]
-    pshufb               m2, m0, m6   ; 0 1 1 2 2 3 3 4
-    pshufb               m0, m7       ; 2 3 3 4 4 5 5 6
-    pmaddwd              m2, m8       ; abcd0
-    pmaddwd              m0, m9       ; abcd1
-    pshufb               m3, m1, m6   ; 4 5 5 6 6 7 7 8
-    pshufb               m1, m7       ; 6 7 7 8 8 9 9 a
-    paddd                m2, m4
-    paddd                m0, m2
+    movu                 m3, [srcq+r6*2-4]
+    movu                 m2, [srcq+r6*2+8]
+    pshufb               m0, m3, m6   ; 01 12 23 34
+    pmaddwd              m0, m8       ; abcd0
+    pshufb               m3, m7       ; 23 34 45 56
+    pmaddwd              m1, m9, m3   ; abcd1
+    paddd                m0, m1
+    pshufb               m1, m2, m6   ; 67 78 89 9a
+    shufpd               m3, m1, 0x01 ; 45 56 67 78
+    pmaddwd              m1, m9       ; efgh1
+    pshufb               m2, m7       ; 89 9a ab bc
+    pmaddwd              m2, m10      ; efgh2
+    paddd                m1, m2
     pmaddwd              m2, m10, m3  ; abcd2
     pmaddwd              m3, m8       ; efgh0
+    paddd                m0, m4
+    paddd                m1, m4
     paddd                m0, m2
-    pmaddwd              m2, m11, m1  ; abcd3
-    pmaddwd              m1, m9       ; efgh1
-    paddd                m0, m2
-    movu                 m2, [srcq+r6*2+10]
-    paddd                m3, m4
-    paddd                m1, m3
-    pshufb               m3, m2, m6   ; 8 9 9 a a b b c
-    pshufb               m2, m7       ; a b b c c d d e
-    pmaddwd              m3, m10      ; efgh2
-    pmaddwd              m2, m11      ; efgh3
     paddd                m1, m3
-    paddd                m1, m2
     psrad                m0, 6
     psrad                m1, 6
     packssdw             m0, m1
@@ -1379,78 +1371,71 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
     shr                 myd, 16
     cmp                  hd, 6
     cmovb               myd, mxd
-    movq                 m3, [base+subpel_filters+myq*8]
-    WIN64_SPILL_XMM      15
-    movd                 m7, r8m
+    movq                 m2, [base+subpel_filters+1+myq*8]
+    WIN64_SPILL_XMM      11, 16
+    movd                 m5, r8m
     movifnidn          dstq, dstmp
     movifnidn           dsq, dsmp
-    punpcklbw            m3, m3
-    pshufb               m7, [base+pw_256]
-    psraw                m3, 8 ; sign-extend
+    punpcklbw            m2, m2
+    pshufb               m5, [base+pw_256]
+    psraw                m2, 8 ; sign-extend
 %if ARCH_X86_32
-    ALLOC_STACK       -16*7
-    pshufd               m0, m3, q0000
-    pshufd               m1, m3, q1111
-    pshufd               m2, m3, q2222
-    pshufd               m3, m3, q3333
+    ALLOC_STACK       -16*4
+    pshufd               m0, m2, q0000
+    mov                  r6, ssq
+    pshufd               m1, m2, q1111
+    neg                  r6
+    pshufd               m2, m2, q2222
     mova                 m8, m0
     mova                 m9, m1
     mova                m10, m2
-    mova                m11, m3
-%else
-    pshufd               m8, m3, q0000
-    pshufd               m9, m3, q1111
-    pshufd              m10, m3, q2222
-    pshufd              m11, m3, q3333
-%endif
-    lea                  r6, [ssq*3]
-    sub                srcq, r6
     cmp                  wd, 2
     jne .v_w4
+%else
+    mov                  r6, ssq
+    pshufd               m8, m2, q0000
+    neg                  r6
+    cmp                  wd, 4
+    jg .v_w8
+    pshufd               m9, m2, q1111
+    pshufd              m10, m2, q2222
+    je .v_w4
+%endif
 .v_w2:
-    movd                 m1, [srcq+ssq*0]
+    movd                 m1, [srcq+r6 *2]
+    movd                 m3, [srcq+r6 *1]
+    movd                 m2, [srcq+ssq*0]
     movd                 m4, [srcq+ssq*1]
-    movd                 m2, [srcq+ssq*2]
-    add                srcq, r6
-    movd                 m5, [srcq+ssq*0]
-    movd                 m3, [srcq+ssq*1]
-    movd                 m6, [srcq+ssq*2]
-    add                srcq, r6
+    lea                srcq, [srcq+ssq*2]
     movd                 m0, [srcq+ssq*0]
-    punpckldq            m1, m4      ; 0 1
-    punpckldq            m4, m2      ; 1 2
-    punpckldq            m2, m5      ; 2 3
-    punpckldq            m5, m3      ; 3 4
-    punpckldq            m3, m6      ; 4 5
-    punpckldq            m6, m0      ; 5 6
-    punpcklwd            m1, m4      ; 01 12
-    punpcklwd            m2, m5      ; 23 34
-    punpcklwd            m3, m6      ; 45 56
+    punpckldq            m1, m3      ; 0 1
+    punpckldq            m3, m2      ; 1 2
+    punpckldq            m2, m4      ; 2 3
+    punpckldq            m4, m0      ; 3 4
+    punpcklwd            m1, m3      ; 01 12
+    punpcklwd            m2, m4      ; 23 34
     pxor                 m6, m6
 .v_w2_loop:
-    movd                 m4, [srcq+ssq*1]
+    movd                 m3, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    pmaddwd              m5, m8, m1  ; a0 b0
+    pmaddwd              m4, m8, m1  ; a0 b0
     mova                 m1, m2
     pmaddwd              m2, m9      ; a1 b1
-    paddd                m5, m2
-    mova                 m2, m3
-    pmaddwd              m3, m10     ; a2 b2
-    paddd                m5, m3
-    punpckldq            m3, m0, m4  ; 6 7
+    paddd                m4, m2
+    punpckldq            m2, m0, m3  ; 4 5
     movd                 m0, [srcq+ssq*0]
-    punpckldq            m4, m0      ; 7 8
-    punpcklwd            m3, m4      ; 67 78
-    pmaddwd              m4, m11, m3 ; a3 b3
-    paddd                m5, m4
-    psrad                m5, 5
-    packssdw             m5, m5
-    pmaxsw               m5, m6
-    pavgw                m5, m6
-    pminsw               m5, m7
-    movd       [dstq+dsq*0], m5
-    pshuflw              m5, m5, q3232
-    movd       [dstq+dsq*1], m5
+    punpckldq            m3, m0      ; 5 6
+    punpcklwd            m2, m3      ; 67 78
+    pmaddwd              m3, m10, m2 ; a2 b2
+    paddd                m4, m3
+    psrad                m4, 5
+    packssdw             m4, m4
+    pmaxsw               m4, m6
+    pavgw                m4, m6
+    pminsw               m4, m5
+    movd       [dstq+dsq*0], m4
+    pshuflw              m4, m4, q3232
+    movd       [dstq+dsq*1], m4
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .v_w2_loop
@@ -1458,563 +1443,1991 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
 .v_w4:
 %if ARCH_X86_32
     shl                  wd, 14
-%if STACK_ALIGNMENT < 16
-    mov          [esp+4*29], srcq
-    mov          [esp+4*30], dstq
-%else
-    mov               srcmp, srcq
-%endif
+    lea                srcq, [srcq+r6*2]
     lea                  wd, [wq+hq-(1<<16)]
-%else
-    shl                  wd, 6
-    mov                  r7, srcq
-    mov                  r8, dstq
-    lea                  wd, [wq+hq-(1<<8)]
+%if STACK_ALIGNMENT < 16
+    %define           dstmp  [esp+16*3]
 %endif
 .v_w4_loop0:
+    mov               dstmp, dstq
     movq                 m1, [srcq+ssq*0]
     movq                 m2, [srcq+ssq*1]
-    movq                 m3, [srcq+ssq*2]
-    add                srcq, r6
-    movq                 m4, [srcq+ssq*0]
-    movq                 m5, [srcq+ssq*1]
-    movq                 m6, [srcq+ssq*2]
-    add                srcq, r6
-    movq                 m0, [srcq+ssq*0]
+    lea                  r6, [srcq+ssq*2]
+    movq                 m3, [r6  +ssq*0]
+    movq                 m4, [r6  +ssq*1]
+    lea                  r6, [r6  +ssq*2]
+%else
+    movq                 m1, [srcq+r6 *2]
+    movq                 m2, [srcq+r6 *1]
+    lea                  r6, [srcq+ssq*2]
+    movq                 m3, [srcq+ssq*0]
+    movq                 m4, [srcq+ssq*1]
+%endif
+    movq                 m0, [r6  +ssq*0]
     punpcklwd            m1, m2      ; 01
     punpcklwd            m2, m3      ; 12
     punpcklwd            m3, m4      ; 23
-    punpcklwd            m4, m5      ; 34
-    punpcklwd            m5, m6      ; 45
-    punpcklwd            m6, m0      ; 56
-%if ARCH_X86_32
-    jmp .v_w4_loop_start
+    punpcklwd            m4, m0      ; 34
 .v_w4_loop:
-    mova                 m1, m12
-    mova                 m2, m13
-    mova                 m3, m14
-.v_w4_loop_start:
-    pmaddwd              m1, m8      ; a0
-    pmaddwd              m2, m8      ; b0
-    mova                m12, m3
-    mova                m13, m4
+    pmaddwd              m6, m8, m1  ; a0
+    pmaddwd              m7, m8, m2  ; b0
+    mova                 m1, m3
     pmaddwd              m3, m9      ; a1
+    mova                 m2, m4
     pmaddwd              m4, m9      ; b1
-    paddd                m1, m3
-    paddd                m2, m4
-    mova                m14, m5
-    mova                 m4, m6
-    pmaddwd              m5, m10     ; a2
-    pmaddwd              m6, m10     ; b2
-    paddd                m1, m5
-    paddd                m2, m6
-    movq                 m6, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
-    punpcklwd            m5, m0, m6  ; 67
-    movq                 m0, [srcq+ssq*0]
-    pmaddwd              m3, m11, m5 ; a3
-    punpcklwd            m6, m0      ; 78
-    paddd                m1, m3
-    pmaddwd              m3, m11, m6 ; b3
-    paddd                m2, m3
-    psrad                m1, 5
-    psrad                m2, 5
-    packssdw             m1, m2
-    pxor                 m2, m2
-    pmaxsw               m1, m2
-    pavgw                m1, m2
-    pminsw               m1, m7
-    movq       [dstq+dsq*0], m1
-    movhps     [dstq+dsq*1], m1
+    paddd                m6, m3
+    movq                 m3, [r6+ssq*0]
+    paddd                m7, m4
+    movq                 m4, [r6+ssq*1]
+    lea                  r6, [r6+ssq*2]
+    movq                 m0, [r6+ssq*0]
+    punpcklwd            m3, m4      ; 45
+    punpcklwd            m4, m0      ; 56
+    pmaddwd              m0, m10, m3 ; a2
+    paddd                m6, m0
+    pmaddwd              m0, m10, m4 ; b2
+    paddd                m7, m0
+    psrad                m6, 5
+    psrad                m7, 5
+    packssdw             m6, m7
+    pxor                 m7, m7
+    pmaxsw               m6, m7
+    pavgw                m6, m7
+    pminsw               m6, m5
+    movq       [dstq+dsq*0], m6
+    movhps     [dstq+dsq*1], m6
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .v_w4_loop
-%if STACK_ALIGNMENT < 16
-    mov                srcq, [esp+4*29]
-    mov                dstq, [esp+4*30]
-    movzx                hd, ww
-    add                srcq, 8
-    add                dstq, 8
-    mov          [esp+4*29], srcq
-    mov          [esp+4*30], dstq
-%else
-    mov                srcq, srcmp
+%if ARCH_X86_32
     mov                dstq, dstmp
-    movzx                hd, ww
     add                srcq, 8
+    movzx                hd, ww
     add                dstq, 8
-    mov               srcmp, srcq
-    mov               dstmp, dstq
-%endif
     sub                  wd, 1<<16
+    jg .v_w4_loop0
+    RET
 %else
-.v_w4_loop:
-    pmaddwd             m12, m8, m1  ; a0
-    pmaddwd             m13, m8, m2  ; b0
-    mova                 m1, m3
-    mova                 m2, m4
-    pmaddwd              m3, m9      ; a1
-    pmaddwd              m4, m9      ; b1
-    paddd               m12, m3
-    paddd               m13, m4
-    mova                 m3, m5
-    mova                 m4, m6
-    pmaddwd              m5, m10     ; a2
-    pmaddwd              m6, m10     ; b2
-    paddd               m12, m5
-    paddd               m13, m6
-    movq                 m6, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
-    punpcklwd            m5, m0, m6  ; 67
-    movq                 m0, [srcq+ssq*0]
-    pmaddwd             m14, m11, m5 ; a3
-    punpcklwd            m6, m0      ; 78
-    paddd               m12, m14
-    pmaddwd             m14, m11, m6 ; b3
-    paddd               m13, m14
-    psrad               m12, 5
-    psrad               m13, 5
-    packssdw            m12, m13
-    pxor                m13, m13
-    pmaxsw              m12, m13
-    pavgw               m12, m13
-    pminsw              m12, m7
-    movq       [dstq+dsq*0], m12
-    movhps     [dstq+dsq*1], m12
-    lea                dstq, [dstq+dsq*2]
+    RET
+.v_w8:
+    mova                r6m, m8
+    shl                  wd, 5
+    pshufd               m6, m2, q1111
+    lea                  wd, [wq+hq-(1<<8)]
+    pshufd               m7, m2, q2222
+    WIN64_PUSH_XMM       16
+.v_w8_loop0:
+    movu                 m9, [srcq+ r6*2]
+    movu                m11, [srcq+ r6*1]
+    lea                  r7, [srcq+ssq*2]
+    movu                m13, [srcq+ssq*0]
+    movu                m15, [srcq+ssq*1]
+    mov                  r8, dstq
+    movu                 m4, [r7  +ssq*0]
+    punpcklwd            m8, m9, m11  ; 01
+    punpckhwd            m9, m11
+    punpcklwd           m10, m11, m13 ; 12
+    punpckhwd           m11, m13
+    punpcklwd           m12, m13, m15 ; 23
+    punpckhwd           m13, m15
+    punpcklwd           m14, m15, m4  ; 34
+    punpckhwd           m15, m4
+.v_w8_loop:
+    mova                 m3, r6m
+    pmaddwd              m0, m8, m3   ; a0
+    pmaddwd              m2, m9, m3   ; a0'
+    pmaddwd              m1, m10, m3  ; b0
+    pmaddwd              m3, m11      ; b0'
+    mova                 m8, m12
+    pmaddwd             m12, m6       ; a1
+    mova                 m9, m13
+    pmaddwd             m13, m6       ; a1'
+    mova                m10, m14
+    pmaddwd             m14, m6       ; b1
+    mova                m11, m15
+    pmaddwd             m15, m6       ; b1'
+    paddd                m0, m12
+    paddd                m2, m13
+    movu                m13, [r7+ssq*0]
+    paddd                m1, m14
+    paddd                m3, m15
+    movu                m15, [r7+ssq*1]
+    lea                  r7, [r7+ssq*2]
+    movu                 m4, [r7+ssq*0]
+    punpcklwd           m12, m13, m15 ; 45
+    punpckhwd           m13, m15
+    punpcklwd           m14, m15, m4  ; 56
+    punpckhwd           m15, m4
+    pmaddwd              m4, m7, m12  ; a2
+    paddd                m0, m4
+    pmaddwd              m4, m7, m13  ; a2'
+    paddd                m2, m4
+    pmaddwd              m4, m7, m14  ; b2
+    paddd                m1, m4
+    pmaddwd              m4, m7, m15  ; b2'
+    paddd                m3, m4
+    REPX       {psrad x, 5}, m0, m2, m1, m3
+    packssdw             m0, m2
+    packssdw             m1, m3
+    pxor                 m2, m2
+    pmaxsw               m0, m2
+    pmaxsw               m1, m2
+    pavgw                m0, m2
+    pavgw                m1, m2
+    pminsw               m0, m5
+    pminsw               m1, m5
+    mova         [r8+dsq*0], m0
+    mova         [r8+dsq*1], m1
+    lea                  r8, [r8+dsq*2]
     sub                  hd, 2
-    jg .v_w4_loop
-    add                  r7, 8
-    add                  r8, 8
+    jg .v_w8_loop
+    add                srcq, 16
+    add                dstq, 16
     movzx                hd, wb
-    mov                srcq, r7
-    mov                dstq, r8
     sub                  wd, 1<<8
-%endif
-    jg .v_w4_loop0
+    jg .v_w8_loop0
     RET
+%endif
 .hv:
-    RESET_STACK_STATE
+    cmp                  wd, 4
+    jg .hv_w8
+    WIN64_SPILL_XMM      12, 16
 %if ARCH_X86_32
-    movd                 m4, r8m
-    mova                 m6, [base+pd_512]
-    pshufb               m4, [base+pw_256]
+    movd                 m3, r8m
+    pshufb               m3, [base+pw_256]
 %else
-%if WIN64
-    ALLOC_STACK        16*6, 16
-%endif
-    movd                m15, r8m
-    pshufb              m15, [base+pw_256]
+    movd                m11, r8m
+    pshufb              m11, [base+pw_256]
 %endif
-    cmp                  wd, 4
-    jg .hv_w8
     movzx               mxd, mxb
-    je .hv_w4
     movq                 m0, [base+subpel_filters+mxq*8]
     movzx               mxd, myb
     shr                 myd, 16
     cmp                  hd, 6
     cmovb               myd, mxd
-    movq                 m3, [base+subpel_filters+myq*8]
-%if ARCH_X86_32
-    mov                dstq, dstmp
-    mov                 dsq, dsmp
-    mova                 m5, [base+spel_h_shuf2]
-    ALLOC_STACK       -16*8
-%else
-    mova                 m6, [base+pd_512]
-    mova                 m9, [base+spel_h_shuf2]
-%endif
+    movq                 m2, [base+subpel_filters+1+myq*8]
+    movddup              m7, [base+pd_8704]
+    sub                srcq, 2
     pshuflw              m0, m0, q2121
-    pxor                 m7, m7
-    punpcklbw            m7, m0
-    punpcklbw            m3, m3
-    psraw                m3, 8 ; sign-extend
+    pxor                 m6, m6
+    punpcklbw            m6, m0
+    punpcklbw            m2, m2
+    psraw                m2, 8 ; sign-extend
     test          dword r8m, 0x800
     jz .hv_w2_10bpc
-    psraw                m7, 2
-    psllw                m3, 2
+    movddup              m7, [base+pd_2560]
+    psraw                m6, 2
+    psllw                m2, 2
 .hv_w2_10bpc:
-    lea                  r6, [ssq*3]
-    sub                srcq, 2
-    sub                srcq, r6
 %if ARCH_X86_32
-    pshufd               m0, m3, q0000
-    pshufd               m1, m3, q1111
-    pshufd               m2, m3, q2222
-    pshufd               m3, m3, q3333
-    mova                 m9, m5
-    mova                m11, m0
-    mova                m12, m1
-    mova                m13, m2
-    mova                m14, m3
-    mova                m15, m4
+%assign regs_used 2
+    ALLOC_STACK       -16*7
+%assign regs_used 7
+    mov                dstq, r0mp
+    mov                 dsq, r1mp
+    %define             m11  [esp+16*4]
+    pshufd               m0, m2, q0000
+    pshufd               m1, m2, q1111
+    pshufd               m2, m2, q2222
+    mova                 m8, m0
+    mova                 m9, m1
+    mova                m10, m2
+    mova                m11, m3
+    neg                 ssq
+    movu                 m3, [srcq+ssq*2]
+    movu                 m4, [srcq+ssq*1]
+    neg                 ssq
 %else
-    pshufd              m11, m3, q0000
-    pshufd              m12, m3, q1111
-    pshufd              m13, m3, q2222
-    pshufd              m14, m3, q3333
+    pshufd               m8, m2, q0000
+    mov                  r6, ssq
+    pshufd               m9, m2, q1111
+    neg                  r6
+    pshufd              m10, m2, q2222
+    movu                 m3, [srcq+r6 *2]
+    movu                 m4, [srcq+r6 *1]
 %endif
+    movu                 m1, [srcq+ssq*0]
+    movu                 m0, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
     movu                 m2, [srcq+ssq*0]
-    movu                 m3, [srcq+ssq*1]
-    movu                 m1, [srcq+ssq*2]
-    add                srcq, r6
-    movu                 m4, [srcq+ssq*0]
-%if ARCH_X86_32
-    REPX    {pshufb  x, m5}, m2, m3, m1, m4
-%else
-    REPX    {pshufb  x, m9}, m2, m3, m1, m4
-%endif
-    REPX    {pmaddwd x, m7}, m2, m3, m1, m4
-    phaddd               m2, m3        ; 0 1
-    phaddd               m1, m4        ; 2 3
-    movu                 m3, [srcq+ssq*1]
-    movu                 m4, [srcq+ssq*2]
-    add                srcq, r6
-    movu                 m0, [srcq+ssq*0]
-%if ARCH_X86_32
-    REPX    {pshufb  x, m5}, m3, m4, m0
-%else
-    REPX    {pshufb  x, m9}, m3, m4, m0
-%endif
-    REPX    {pmaddwd x, m7}, m3, m4, m0
-    phaddd               m3, m4        ; 4 5
-    phaddd               m0, m0        ; 6 6
-    REPX    {paddd   x, m6}, m2, m1, m3, m0
-    REPX    {psrad   x, 10}, m2, m1, m3, m0
-    packssdw             m2, m1        ; 0 1 2 3
-    packssdw             m3, m0        ; 4 5 6 _
-    palignr              m4, m3, m2, 4 ; 1 2 3 4
-    pshufd               m5, m3, q0321 ; 5 6 _ _
+    cmp                  wd, 4
+    je .hv_w4
+    mova                 m5, [base+spel_h_shuf2]
+    REPX    {pshufb  x, m5}, m3, m4, m0, m1, m2
+    REPX    {pmaddwd x, m6}, m3, m0, m4, m1, m2
+    phaddd               m3, m0        ; 0 3
+    phaddd               m4, m1        ; 1 2
+    phaddd               m0, m2        ; 3 4
+    REPX    {paddd   x, m7}, m3, m4, m0
+    REPX    {psrad   x, 10}, m3, m4, m0
+    packssdw             m3, m4        ; 0 3 1 2
+    packssdw             m4, m0        ; 1 2 3 4
+    pshufd               m2, m3, q1320 ; 0 1 2 3
     punpcklwd            m1, m2, m4    ; 01 12
     punpckhwd            m2, m4        ; 23 34
-    punpcklwd            m3, m5        ; 45 56
 .hv_w2_loop:
-    movu                 m4, [srcq+ssq*1]
+    movu                 m3, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    movu                 m5, [srcq+ssq*0]
-    pshufb               m4, m9
-    pshufb               m5, m9
-    pmaddwd              m4, m7
-    pmaddwd              m5, m7
-    phaddd               m4, m5
-    pmaddwd              m5, m11, m1   ; a0 b0
+    movu                 m4, [srcq+ssq*0]
+    pshufb               m3, m5
+    pshufb               m4, m5
+    pmaddwd              m3, m6
+    pmaddwd              m4, m6
+    phaddd               m3, m4
+    pmaddwd              m4, m8, m1    ; a0 b0
     mova                 m1, m2
-    pmaddwd              m2, m12       ; a1 b1
-    paddd                m5, m2
-    mova                 m2, m3
-    pmaddwd              m3, m13       ; a2 b2
+    pmaddwd              m2, m9        ; a1 b1
+    paddd                m4, m2
+    paddd                m3, m7
+    psrad                m3, 10        ; 5 6
+    packssdw             m0, m3
+    pshufd               m2, m0, q2103
+    punpckhwd            m2, m0        ; 45 56
+    mova                 m0, m3
+    pmaddwd              m3, m10, m2   ; a2 b2
+    paddd                m4, m3
+    psrad                m4, 10
+    packssdw             m4, m4
+    pxor                 m3, m3
+    pminsw               m4, m11
+    pmaxsw               m4, m3
+    movd       [dstq+dsq*0], m4
+    pshuflw              m4, m4, q1032
+    movd       [dstq+dsq*1], m4
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w2_loop
+    RET
+.hv_w4:
+%if ARCH_X86_32
+    %define             m12  [esp+16*5]
+    %define             m13  [esp+16*6]
+    %define             m14  [base+spel_h_shufA]
+    %define             m15  [base+spel_h_shufB]
+    pshufd               m5, m6, q0000
+    pshufd               m6, m6, q1111
+    mova                m12, m5
+    mova                m13, m6
+%else
+    WIN64_PUSH_XMM       16
+    mova                m14, [base+spel_h_shufA]
+    mova                m15, [base+spel_h_shufB]
+    pshufd              m12, m6, q0000
+    pshufd              m13, m6, q1111
+%endif
+%macro HV_H_W4_6TAP 3-4 m15 ; dst, src, tmp, shufB
+    pshufb               %3, %2, m14
+    pmaddwd              %3, m12
+    pshufb               %2, %4
+    pmaddwd              %2, m13
+    paddd                %3, m7
+    paddd                %1, %2, %3
+%endmacro
+    HV_H_W4_6TAP         m3, m3, m5
+    HV_H_W4_6TAP         m4, m4, m5
+    HV_H_W4_6TAP         m5, m1, m5
+    HV_H_W4_6TAP         m0, m0, m1
+    HV_H_W4_6TAP         m2, m2, m1
+    REPX      {psrad x, 10}, m3, m5, m4, m0, m2
+    packssdw             m3, m5      ; 0 2
+    packssdw             m4, m0      ; 1 3
+    packssdw             m5, m2      ; 2 4
+    punpcklwd            m1, m3, m4  ; 01
+    punpckhwd            m3, m4      ; 23
+    punpcklwd            m2, m4, m5  ; 12
+    punpckhwd            m4, m5      ; 34
+.hv_w4_loop:
+    movu                 m0, [srcq+ssq*1]
+    pmaddwd              m5, m8, m1  ; a0
+    lea                srcq, [srcq+ssq*2]
+    pmaddwd              m6, m8, m2  ; b0
+    mova                 m1, m3
+    pmaddwd              m3, m9      ; a1
+    mova                 m2, m4
+    pmaddwd              m4, m9      ; b1
     paddd                m5, m3
-    paddd                m4, m6
-    psrad                m4, 10        ; 7 8
-    packssdw             m0, m4
-    pshufd               m3, m0, q2103
-    punpckhwd            m3, m0        ; 67 78
-    mova                 m0, m4
-    pmaddwd              m4, m14, m3   ; a3 b3
-    paddd                m5, m6
-    paddd                m5, m4
+    movu                 m3, [srcq+ssq*0]
+    paddd                m6, m4
+    HV_H_W4_6TAP         m0, m0, m4
+    HV_H_W4_6TAP         m3, m3, m4
+    psrad                m4, m2, 16
+    psrad                m0, 10
+    psrad                m3, 10
+    packssdw             m4, m0      ; 4 5
+    packssdw             m0, m3      ; 5 6
+    punpcklwd            m3, m4, m0  ; 45
+    punpckhwd            m4, m0      ; 56
+    pmaddwd              m0, m10, m3 ; a2
+    paddd                m5, m0
+    pmaddwd              m0, m10, m4 ; b2
+    paddd                m6, m0
     psrad                m5, 10
-    packssdw             m5, m5
-    pxor                 m4, m4
-    pminsw               m5, m15
-    pmaxsw               m5, m4
-    movd       [dstq+dsq*0], m5
-    pshuflw              m5, m5, q3232
-    movd       [dstq+dsq*1], m5
+    psrad                m6, 10
+    packssdw             m5, m6
+    pxor                 m6, m6
+    pminsw               m5, m11
+    pmaxsw               m5, m6
+    movq       [dstq+dsq*0], m5
+    movhps     [dstq+dsq*1], m5
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
-    jg .hv_w2_loop
+    jg .hv_w4_loop
     RET
 .hv_w8:
+    RESET_STACK_STATE
     shr                 mxd, 16
-.hv_w4:
-    movq                 m2, [base+subpel_filters+mxq*8]
+    movq                 m2, [base+subpel_filters+1+mxq*8]
     movzx               mxd, myb
     shr                 myd, 16
     cmp                  hd, 6
     cmovb               myd, mxd
-    movq                 m3, [base+subpel_filters+myq*8]
-%if ARCH_X86_32
-    RESET_STACK_STATE
-    mov                dstq, dstmp
-    mov                 dsq, dsmp
-    mova                 m0, [base+spel_h_shufA]
-    mova                 m1, [base+spel_h_shufB]
-    ALLOC_STACK      -16*15
-    mova                 m8, m0
-    mova                 m9, m1
-    mova                m14, m6
-%else
-    mova                 m8, [base+spel_h_shufA]
-    mova                 m9, [base+spel_h_shufB]
-%endif
+    movq                 m1, [base+subpel_filters+1+myq*8]
+    movd                 m3, r8m
+    movddup              m4, [base+pd_8704]
+    pshufb               m3, [base+pw_256]
     pxor                 m0, m0
     punpcklbw            m0, m2
-    punpcklbw            m3, m3
-    psraw                m3, 8
+    punpcklbw            m1, m1
+    sub                srcq, 4
+    psraw                m1, 8 ; sign-extend
     test          dword r8m, 0x800
-    jz .hv_w4_10bpc
+    jz .hv_w8_10bpc
+    movddup              m4, [base+pd_2560]
     psraw                m0, 2
-    psllw                m3, 2
-.hv_w4_10bpc:
-    lea                  r6, [ssq*3]
-    sub                srcq, 6
-    sub                srcq, r6
+    psllw                m1, 2
+.hv_w8_10bpc:
+%if ARCH_X86_32
+%assign regs_used 2
+    ALLOC_STACK       -16*9
+%assign regs_used 7
+    mov                dstq, r0mp
+    mov                 dsq, r1mp
+    mova         [rsp+16*7], m4
+%else
+    ALLOC_STACK        16*7, 16
+%endif
+    mova         [rsp+16*6], m3
+    pshufd               m2, m0, q0000
+    mova         [rsp+16*0], m2
+    pshufd               m2, m0, q1111
+    mova         [rsp+16*1], m2
+    pshufd               m0, m0, q2222
+    mova         [rsp+16*2], m0
+    pshufd               m2, m1, q0000
+    mova         [rsp+16*3], m2
+    pshufd               m2, m1, q1111
+    mova         [rsp+16*4], m2
+    pshufd               m1, m1, q2222
+    mova         [rsp+16*5], m1
+    mov                  r6, ssq
+    neg                  r6
 %if ARCH_X86_32
-    %define tmp esp+16*8
     shl                  wd, 14
+    lea                 r4d, [wq+hq-(1<<16)]
 %if STACK_ALIGNMENT < 16
-    mov          [esp+4*61], srcq
-    mov          [esp+4*62], dstq
-%else
+    %define           srcmp  [esp+16*8+4*0]
+    %define           dstmp  [esp+16*8+4*1]
+%endif
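+; x86-32 horizontal 6-tap for 4 pixels: interleave the row (%2) with its
+; one-pixel-shifted copy (%3) into word pairs, then three pmaddwd against
+; the coefficient pairs mul[1-3]; dword sums land in %1.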
+%macro HV_H_6TAP 3-6 [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-2], mul[1-3]
+    punpcklwd            %1, %2, %3   ; 01 12 23 34
+    punpckhwd            %2, %3       ; 45 56 67 78
+    pmaddwd              %3, %4, %1   ; a0
+    shufpd               %1, %2, 0x01 ; 23 34 45 56
+    pmaddwd              %2, %6       ; a2
+    pmaddwd              %1, %5       ; a1
+    paddd                %2, %3
+    paddd                %1, %2
+%endmacro
+.hv_w8_loop0:
     mov               srcmp, srcq
-%endif
-    mova         [tmp+16*5], m4
-    lea                  wd, [wq+hq-(1<<16)]
-    pshufd               m1, m0, q0000
-    pshufd               m2, m0, q1111
-    pshufd               m5, m0, q2222
-    pshufd               m0, m0, q3333
-    mova                m10, m1
-    mova                m11, m2
-    mova                m12, m5
-    mova                m13, m0
-%else
-%if WIN64
-    %define tmp rsp
-%else
-    %define tmp rsp-104 ; red zone
-%endif
-    shl                  wd, 6
-    mov                  r7, srcq
-    mov                  r8, dstq
-    lea                  wd, [wq+hq-(1<<8)]
-    pshufd              m10, m0, q0000
-    pshufd              m11, m0, q1111
-    pshufd              m12, m0, q2222
-    pshufd              m13, m0, q3333
-    mova         [tmp+16*5], m15
-%endif
-    pshufd               m0, m3, q0000
-    pshufd               m1, m3, q1111
-    pshufd               m2, m3, q2222
-    pshufd               m3, m3, q3333
-    mova         [tmp+16*1], m0
-    mova         [tmp+16*2], m1
-    mova         [tmp+16*3], m2
-    mova         [tmp+16*4], m3
-%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512]
-    pshufb              m%3, m%1, m8 ; 0 1 1 2 2 3 3 4
-    pshufb              m%1, m9      ; 2 3 3 4 4 5 5 6
-    pmaddwd             m%3, m10
-    pmaddwd             m%1, m11
-    paddd               m%3, %5
-    paddd               m%1, m%3
-    pshufb              m%3, m%2, m8 ; 4 5 5 6 6 7 7 8
-    pshufb              m%2, m9      ; 6 7 7 8 8 9 9 a
-    pmaddwd             m%3, m12
-    pmaddwd             m%2, m13
-    paddd               m%1, m%3
-    paddd               m%1, m%2
-    psrad               m%1, %4
-%endmacro
-.hv_w4_loop0:
-%if ARCH_X86_64
-    mova                m14, [pd_512]
-%endif
-    movu                 m4, [srcq+ssq*0+0]
-    movu                 m1, [srcq+ssq*0+8]
+    mov               dstmp, dstq
+    movu                 m5, [srcq+r6*2+0]
+    movu                 m6, [srcq+r6*2+2]
+    mova                 m7, [rsp+16*0]
+    mova                 m1, [rsp+16*1]
+    mova                 m0, [rsp+16*2]
+    HV_H_6TAP            m2, m5, m6, m7, m1, m0
+    movu                 m5, [srcq+r6*1+0]
+    movu                 m6, [srcq+r6*1+2]
+    HV_H_6TAP            m3, m5, m6, m7, m1, m0
+    movu                 m5, [srcq+ssq*0+0]
+    movu                 m6, [srcq+ssq*0+2]
+    HV_H_6TAP            m4, m5, m6, m7, m1, m0
     movu                 m5, [srcq+ssq*1+0]
-    movu                 m2, [srcq+ssq*1+8]
-    movu                 m6, [srcq+ssq*2+0]
-    movu                 m3, [srcq+ssq*2+8]
-    add                srcq, r6
-    PUT_8TAP_HV_H         4, 1, 0, 10
-    PUT_8TAP_HV_H         5, 2, 0, 10
-    PUT_8TAP_HV_H         6, 3, 0, 10
-    movu                 m7, [srcq+ssq*0+0]
-    movu                 m2, [srcq+ssq*0+8]
-    movu                 m1, [srcq+ssq*1+0]
-    movu                 m3, [srcq+ssq*1+8]
-    PUT_8TAP_HV_H         7, 2, 0, 10
-    PUT_8TAP_HV_H         1, 3, 0, 10
-    movu                 m2, [srcq+ssq*2+0]
-    movu                 m3, [srcq+ssq*2+8]
-    add                srcq, r6
-    PUT_8TAP_HV_H         2, 3, 0, 10
-    packssdw             m4, m7      ; 0 3
-    packssdw             m5, m1      ; 1 4
-    movu                 m0, [srcq+ssq*0+0]
-    movu                 m1, [srcq+ssq*0+8]
-    PUT_8TAP_HV_H         0, 1, 3, 10
-    packssdw             m6, m2      ; 2 5
-    packssdw             m7, m0      ; 3 6
-    punpcklwd            m1, m4, m5  ; 01
-    punpckhwd            m4, m5      ; 34
-    punpcklwd            m2, m5, m6  ; 12
-    punpckhwd            m5, m6      ; 45
-    punpcklwd            m3, m6, m7  ; 23
-    punpckhwd            m6, m7      ; 56
-%if ARCH_X86_32
-    jmp .hv_w4_loop_start
-.hv_w4_loop:
-    mova                 m1, [tmp+16*6]
-    mova                 m2, m15
-.hv_w4_loop_start:
-    mova                 m7, [tmp+16*1]
-    pmaddwd              m1, m7      ; a0
-    pmaddwd              m2, m7      ; b0
-    mova                 m7, [tmp+16*2]
-    mova         [tmp+16*6], m3
-    pmaddwd              m3, m7      ; a1
-    mova                m15, m4
-    pmaddwd              m4, m7      ; b1
-    mova                 m7, [tmp+16*3]
-    paddd                m1, m3
-    paddd                m2, m4
-    mova                 m3, m5
-    pmaddwd              m5, m7      ; a2
-    mova                 m4, m6
-    pmaddwd              m6, m7      ; b2
-    paddd                m1, m5
-    paddd                m2, m6
-    movu                 m7, [srcq+ssq*1+0]
-    movu                 m5, [srcq+ssq*1+8]
+    movu                 m6, [srcq+ssq*1+2]
     lea                srcq, [srcq+ssq*2]
-    PUT_8TAP_HV_H         7, 5, 6, 10
-    packssdw             m0, m7      ; 6 7
-    mova         [tmp+16*0], m0
-    movu                 m0, [srcq+ssq*0+0]
-    movu                 m5, [srcq+ssq*0+8]
-    PUT_8TAP_HV_H         0, 5, 6, 10
-    mova                 m6, [tmp+16*0]
-    packssdw             m7, m0      ; 7 8
-    punpcklwd            m5, m6, m7  ; 67
-    punpckhwd            m6, m7      ; 78
-    pmaddwd              m7, m5, [tmp+16*4]
-    paddd                m1, m7      ; a3
-    pmaddwd              m7, m6, [tmp+16*4]
-    paddd                m2, m7      ; b3
-    psrad                m1, 9
-    psrad                m2, 9
-    packssdw             m1, m2
-    pxor                 m7, m7
-    pmaxsw               m1, m7
-    pavgw                m7, m1
-    pminsw               m7, [tmp+16*5]
-    movq       [dstq+dsq*0], m7
-    movhps     [dstq+dsq*1], m7
+    HV_H_6TAP            m0, m5, m6, m7, m1
+    movu                 m5, [srcq+ssq*0+0]
+    movu                 m6, [srcq+ssq*0+2]
+    HV_H_6TAP            m1, m5, m6, m7
+    mova                 m5, [rsp+16*7]
+    REPX      {paddd x, m5}, m2, m3, m4, m0, m1
+    REPX      {psrad x, 10}, m2, m4, m3, m0, m1
+    packssdw             m2, m4     ; 0 2
+    packssdw             m3, m0     ; 1 3
+    packssdw             m4, m1     ; 2 4
+    punpcklwd            m0, m2, m3 ; 01
+    punpckhwd            m2, m3     ; 23
+    punpcklwd            m1, m3, m4 ; 12
+    punpckhwd            m3, m4     ; 34
+.hv_w8_loop:
+    mova                 m5, [rsp+16*3]
+    mova                 m6, [rsp+16*4]
+    pmaddwd              m4, m0, m5 ; a0
+    pmaddwd              m5, m1     ; b0
+    mova                 m0, m2
+    pmaddwd              m2, m6     ; a1
+    mova                 m1, m3
+    pmaddwd              m3, m6     ; b1
+    paddd                m4, m2
+    movu                 m2, [srcq+ssq*1+0]
+    paddd                m5, m3
+    movu                 m3, [srcq+ssq*1+2]
+    lea                srcq, [srcq+ssq*2]
+    HV_H_6TAP            m6, m2, m3
+    movu                 m2, [srcq+ssq*0+0]
+    movu                 m3, [srcq+ssq*0+2]
+    HV_H_6TAP            m7, m2, m3
+    mova                 m2, [rsp+16*7]
+    psrad                m3, m1, 16 ; 4
+    paddd                m6, m2
+    paddd                m7, m2
+    psrad                m6, 10
+    psrad                m7, 10
+    packssdw             m3, m6     ; 4 5
+    packssdw             m6, m7     ; 5 6
+    mova                 m7, [rsp+16*5]
+    punpcklwd            m2, m3, m6 ; 45
+    punpckhwd            m3, m6     ; 56
+    pmaddwd              m6, m2, m7 ; a2
+    pmaddwd              m7, m3     ; b2
+    paddd                m4, m6
+    paddd                m5, m7
+    psrad                m4, 10
+    psrad                m5, 10
+    packssdw             m4, m5
+    pxor                 m5, m5
+    pminsw               m4, [rsp+16*6]
+    pmaxsw               m4, m5
+    movq       [dstq+dsq*0], m4
+    movhps     [dstq+dsq*1], m4
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
-    jg .hv_w4_loop
-%if STACK_ALIGNMENT < 16
-    mov                srcq, [esp+4*61]
-    mov                dstq, [esp+4*62]
-    add                srcq, 8
-    add                dstq, 8
-    mov          [esp+4*61], srcq
-    mov          [esp+4*62], dstq
-%else
+    jg .hv_w8_loop
     mov                srcq, srcmp
     mov                dstq, dstmp
+    movzx                hd, r4w
     add                srcq, 8
     add                dstq, 8
-    mov               srcmp, srcq
-    mov               dstmp, dstq
-%endif
-    movzx                hd, ww
-    sub                  wd, 1<<16
+    sub                 r4d, 1<<16
 %else
-.hv_w4_loop:
-    mova                m15, [tmp+16*1]
-    pmaddwd             m14, m15, m1 ; a0
-    pmaddwd             m15, m2      ; b0
-    mova                 m7, [tmp+16*2]
-    mova                 m1, m3
-    pmaddwd              m3, m7      ; a1
-    mova                 m2, m4
-    pmaddwd              m4, m7      ; b1
-    mova                 m7, [tmp+16*3]
-    paddd               m14, m3
-    paddd               m15, m4
-    mova                 m3, m5
-    pmaddwd              m5, m7      ; a2
-    mova                 m4, m6
-    pmaddwd              m6, m7      ; b2
-    paddd               m14, m5
-    paddd               m15, m6
-    movu                 m7, [srcq+ssq*1+0]
-    movu                 m5, [srcq+ssq*1+8]
-    lea                srcq, [srcq+ssq*2]
-    PUT_8TAP_HV_H         7, 5, 6, 10, [pd_512]
-    packssdw             m0, m7      ; 6 7
-    mova         [tmp+16*0], m0
-    movu                 m0, [srcq+ssq*0+0]
-    movu                 m5, [srcq+ssq*0+8]
-    PUT_8TAP_HV_H         0, 5, 6, 10, [pd_512]
-    mova                 m6, [tmp+16*0]
-    packssdw             m7, m0      ; 7 8
-    punpcklwd            m5, m6, m7  ; 67
-    punpckhwd            m6, m7      ; 78
-    pmaddwd              m7, m5, [tmp+16*4]
-    paddd               m14, m7      ; a3
-    pmaddwd              m7, m6, [tmp+16*4]
-    paddd               m15, m7      ; b3
-    psrad               m14, 9
-    psrad               m15, 9
-    packssdw            m14, m15
-    pxor                 m7, m7
-    pmaxsw              m14, m7
-    pavgw                m7, m14
-    pminsw               m7, [tmp+16*5]
-    movq       [dstq+dsq*0], m7
-    movhps     [dstq+dsq*1], m7
-    lea                dstq, [dstq+dsq*2]
+    shl                  wd, 5
+    lea                 r8d, [wq+hq-256]
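+; x86-64 horizontal 6-tap for 8 pixels: pshufb the three 16-byte loads into
+; word pairs, three pmaddwd per half against mul[1-3], add the bias in m4,
+; shift by %5 and pack the two halves to words.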
+%macro HV_H_6TAP 5-9 [spel_h_shufA], [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-3], shift, shuf, mul[1-3]
+%ifid %6
+    REPX     {pshufb x, %6}, %2, %3, %4
+%else
+    mova                 %1, %6
+    pshufb               %2, %1       ; 01 12 23 34
+    pshufb               %3, %1       ; 45 56 67 78
+    pshufb               %4, %1       ; 89 9a ab bc
+%endif
+    pmaddwd              %1, %7, %2
+    shufpd               %2, %3, 0x01 ; 23 34 45 56
+    pmaddwd              %2, %8
+    paddd                %1, %2
+    pmaddwd              %2, %9, %3
+    paddd                %1, %2
+    pmaddwd              %2, %7, %3
+    shufpd               %3, %4, 0x01 ; 67 78 89 9a
+    pmaddwd              %4, %9
+    pmaddwd              %3, %8
+    paddd                %1, m4
+    paddd                %2, m4
+    paddd                %3, %4
+    paddd                %2, %3
+    psrad                %1, %5
+    psrad                %2, %5
+    packssdw             %1, %2
+%endmacro
+.hv_w8_loop0:
+    mova                 m5, [spel_h_shufA]
+    movu                 m0, [srcq+r6*2+ 0]
+    mova                 m6, [rsp+16*0]
+    movu                 m1, [srcq+r6*2+ 8]
+    mova                 m7, [rsp+16*1]
+    movu                 m2, [srcq+r6*2+16]
+    mova                 m8, [rsp+16*2]
+    HV_H_6TAP            m9, m0, m1, m2, 10, m5, m6, m7, m8
+    movu                 m0, [srcq+r6*1+ 0]
+    movu                 m1, [srcq+r6*1+ 8]
+    movu                 m2, [srcq+r6*1+16]
+    lea                  r4, [srcq+ssq*2]
+    HV_H_6TAP           m11, m0, m1, m2, 10, m5, m6, m7, m8
+    movu                 m0, [srcq+ssq*0+ 0]
+    movu                 m1, [srcq+ssq*0+ 8]
+    movu                 m2, [srcq+ssq*0+16]
+    mov                  r7, dstq
+    HV_H_6TAP           m13, m0, m1, m2, 10, m5, m6, m7, m8
+    movu                 m0, [srcq+ssq*1+ 0]
+    movu                 m1, [srcq+ssq*1+ 8]
+    movu                 m2, [srcq+ssq*1+16]
+    HV_H_6TAP           m15, m0, m1, m2, 10, m5, m6, m7, m8
+    movu                 m0, [r4+ssq*0+ 0]
+    movu                 m1, [r4+ssq*0+ 8]
+    movu                 m2, [r4+ssq*0+16]
+    HV_H_6TAP            m5, m0, m1, m2, 10, m5, m6, m7, m8
+    punpcklwd            m8, m9, m11  ; 01
+    punpckhwd            m9, m11
+    punpcklwd           m10, m11, m13 ; 12
+    punpckhwd           m11, m13
+    punpcklwd           m12, m13, m15 ; 23
+    punpckhwd           m13, m15
+    punpcklwd           m14, m15, m5  ; 34
+    punpckhwd           m15, m5
+.hv_w8_loop:
+    mova                 m3, [rsp+16*3]
+    mova                 m7, [rsp+16*4]
+    pmaddwd              m0, m8, m3   ; a0
+    mova                 m8, m12
+    pmaddwd              m2, m9, m3   ; a0'
+    mova                 m9, m13
+    pmaddwd              m1, m10, m3  ; b0
+    mova                m10, m14
+    pmaddwd              m3, m11      ; b0'
+    mova                m11, m15
+    REPX    {pmaddwd x, m7}, m12, m13, m14, m15
+    movu                 m6, [r4+ssq*1+ 0]
+    paddd                m0, m12
+    movu                 m7, [r4+ssq*1+ 8]
+    paddd                m2, m13
+    movu                m12, [r4+ssq*1+16]
+    paddd                m1, m14
+    lea                  r4, [r4+ssq*2]
+    paddd                m3, m15
+    HV_H_6TAP           m15, m6, m7, m12, 10
+    movu                 m6, [r4+ssq*0+ 0]
+    movu                 m7, [r4+ssq*0+ 8]
+    movu                m14, [r4+ssq*0+16]
+    punpcklwd           m12, m5, m15 ; 45
+    punpckhwd           m13, m5, m15
+    HV_H_6TAP            m5, m6, m7, m14, 10
+    mova                 m7, [rsp+16*5]
+    punpcklwd           m14, m15, m5  ; 56
+    punpckhwd           m15, m5
+    pmaddwd              m6, m12, m7  ; a2
+    paddd                m0, m6
+    pmaddwd              m6, m13, m7  ; a2'
+    paddd                m2, m6
+    pmaddwd              m6, m14, m7  ; b2
+    pmaddwd              m7, m15      ; b2'
+    paddd                m1, m6
+    mova                 m6, [rsp+16*6]
+    paddd                m3, m7
+    REPX      {psrad x, 10}, m0, m2, m1, m3
+    packssdw             m0, m2
+    packssdw             m1, m3
+    pxor                 m2, m2
+    pminsw               m0, m6
+    pminsw               m1, m6
+    pmaxsw               m0, m2
+    pmaxsw               m1, m2
+    mova         [r7+dsq*0], m0
+    mova         [r7+dsq*1], m1
+    lea                  r7, [r7+dsq*2]
     sub                  hd, 2
-    jg .hv_w4_loop
-    add                  r7, 8
-    add                  r8, 8
-    movzx                hd, wb
-    mov                srcq, r7
-    mov                dstq, r8
-    sub                  wd, 1<<8
+    jg .hv_w8_loop
+    add                srcq, 16
+    add                dstq, 16
+    movzx                hd, r8b
+    sub                 r8d, 1<<8
 %endif
-    jg .hv_w4_loop0
+    jg .hv_w8_loop0
     RET
-%undef tmp
 
+PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_16bpc
+PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_16bpc
+PUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_16bpc
+PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_16bpc
+PUT_8TAP_FN sharp,          SHARP,   SHARP
+
+cglobal put_8tap_16bpc, 0, 9, 0, dst, ds, src, ss, w, h, mx, my
 %if ARCH_X86_32
-DECLARE_REG_TMP 2, 1, 6, 4
-%elif WIN64
-DECLARE_REG_TMP 6, 4, 7, 4
-%else
-DECLARE_REG_TMP 6, 7, 7, 8
+    %define             mxb  r0b
+    %define             mxd  r0
+    %define             mxq  r0
+    %define             myb  r1b
+    %define             myd  r1
+    %define             myq  r1
+    %define              m8  [esp+16*0]
+    %define              m9  [esp+16*1]
+    %define             m10  [esp+16*2]
+    %define             m11  [esp+16*3]
+    %define             m12  [esp+16*4]
+    %define             m13  [esp+16*5]
+    %define             m14  [esp+16*6]
+    %define             m15  [esp+16*7]
 %endif
+    imul                mxd, mxm, 0x010101
+    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
+    imul                myd, mym, 0x010101
+    add                 myd, t1d ; 8tap_v, my, 4tap_v
+    LEA                  t2, put_ssse3
+    movifnidn            wd, wm
+    movifnidn          srcq, srcmp
+    movifnidn           ssq, ssmp
+    movifnidn            hd, hm
+    test                mxd, 0xf00
+    jnz .h
+    test                myd, 0xf00
+    jz mangle(private_prefix %+ _put_6tap_16bpc_ssse3).put
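+; Vertical-only 8-tap: coefficient pairs broadcast into m8-m11, pixel max in
+; m7; final rounding is psrad 5 followed by pavgw against zero.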
+.v:
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovb               myd, mxd
+    movq                 m3, [base+subpel_filters+myq*8]
+    WIN64_SPILL_XMM      15
+    movd                 m7, r8m
+    movifnidn          dstq, dstmp
+    movifnidn           dsq, dsmp
+    punpcklbw            m3, m3
+    pshufb               m7, [base+pw_256]
+    psraw                m3, 8 ; sign-extend
+%if ARCH_X86_32
+    ALLOC_STACK       -16*7
+    pshufd               m0, m3, q0000
+    pshufd               m1, m3, q1111
+    pshufd               m2, m3, q2222
+    pshufd               m3, m3, q3333
+    mova                 m8, m0
+    mova                 m9, m1
+    mova                m10, m2
+    mova                m11, m3
+%else
+    pshufd               m8, m3, q0000
+    pshufd               m9, m3, q1111
+    pshufd              m10, m3, q2222
+    pshufd              m11, m3, q3333
+%endif
+    lea                  r6, [ssq*3]
+    sub                srcq, r6
+    cmp                  wd, 2
+    jne .v_w4
+.v_w2:
+    movd                 m1, [srcq+ssq*0]
+    movd                 m4, [srcq+ssq*1]
+    movd                 m2, [srcq+ssq*2]
+    add                srcq, r6
+    movd                 m5, [srcq+ssq*0]
+    movd                 m3, [srcq+ssq*1]
+    movd                 m6, [srcq+ssq*2]
+    add                srcq, r6
+    movd                 m0, [srcq+ssq*0]
+    punpckldq            m1, m4      ; 0 1
+    punpckldq            m4, m2      ; 1 2
+    punpckldq            m2, m5      ; 2 3
+    punpckldq            m5, m3      ; 3 4
+    punpckldq            m3, m6      ; 4 5
+    punpckldq            m6, m0      ; 5 6
+    punpcklwd            m1, m4      ; 01 12
+    punpcklwd            m2, m5      ; 23 34
+    punpcklwd            m3, m6      ; 45 56
+    pxor                 m6, m6
+.v_w2_loop:
+    movd                 m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pmaddwd              m5, m8, m1  ; a0 b0
+    mova                 m1, m2
+    pmaddwd              m2, m9      ; a1 b1
+    paddd                m5, m2
+    mova                 m2, m3
+    pmaddwd              m3, m10     ; a2 b2
+    paddd                m5, m3
+    punpckldq            m3, m0, m4  ; 6 7
+    movd                 m0, [srcq+ssq*0]
+    punpckldq            m4, m0      ; 7 8
+    punpcklwd            m3, m4      ; 67 78
+    pmaddwd              m4, m11, m3 ; a3 b3
+    paddd                m5, m4
+    psrad                m5, 5
+    packssdw             m5, m5
+    pmaxsw               m5, m6
+    pavgw                m5, m6
+    pminsw               m5, m7
+    movd       [dstq+dsq*0], m5
+    pshuflw              m5, m5, q3232
+    movd       [dstq+dsq*1], m5
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w2_loop
+    RET
+.v_w4:
+%if ARCH_X86_32
+    shl                  wd, 14
+%if STACK_ALIGNMENT < 16
+    mov          [esp+4*29], srcq
+    mov          [esp+4*30], dstq
+%else
+    mov               srcmp, srcq
+%endif
+    lea                  wd, [wq+hq-(1<<16)]
+%else
+    shl                  wd, 6
+    mov                  r7, srcq
+    mov                  r8, dstq
+    lea                  wd, [wq+hq-(1<<8)]
+%endif
+.v_w4_loop0:
+    movq                 m1, [srcq+ssq*0]
+    movq                 m2, [srcq+ssq*1]
+    movq                 m3, [srcq+ssq*2]
+    add                srcq, r6
+    movq                 m4, [srcq+ssq*0]
+    movq                 m5, [srcq+ssq*1]
+    movq                 m6, [srcq+ssq*2]
+    add                srcq, r6
+    movq                 m0, [srcq+ssq*0]
+    punpcklwd            m1, m2      ; 01
+    punpcklwd            m2, m3      ; 12
+    punpcklwd            m3, m4      ; 23
+    punpcklwd            m4, m5      ; 34
+    punpcklwd            m5, m6      ; 45
+    punpcklwd            m6, m0      ; 56
+%if ARCH_X86_32
+    jmp .v_w4_loop_start
+.v_w4_loop:
+    mova                 m1, m12
+    mova                 m2, m13
+    mova                 m3, m14
+.v_w4_loop_start:
+    pmaddwd              m1, m8      ; a0
+    pmaddwd              m2, m8      ; b0
+    mova                m12, m3
+    mova                m13, m4
+    pmaddwd              m3, m9      ; a1
+    pmaddwd              m4, m9      ; b1
+    paddd                m1, m3
+    paddd                m2, m4
+    mova                m14, m5
+    mova                 m4, m6
+    pmaddwd              m5, m10     ; a2
+    pmaddwd              m6, m10     ; b2
+    paddd                m1, m5
+    paddd                m2, m6
+    movq                 m6, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpcklwd            m5, m0, m6  ; 67
+    movq                 m0, [srcq+ssq*0]
+    pmaddwd              m3, m11, m5 ; a3
+    punpcklwd            m6, m0      ; 78
+    paddd                m1, m3
+    pmaddwd              m3, m11, m6 ; b3
+    paddd                m2, m3
+    psrad                m1, 5
+    psrad                m2, 5
+    packssdw             m1, m2
+    pxor                 m2, m2
+    pmaxsw               m1, m2
+    pavgw                m1, m2
+    pminsw               m1, m7
+    movq       [dstq+dsq*0], m1
+    movhps     [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w4_loop
+%if STACK_ALIGNMENT < 16
+    mov                srcq, [esp+4*29]
+    mov                dstq, [esp+4*30]
+    movzx                hd, ww
+    add                srcq, 8
+    add                dstq, 8
+    mov          [esp+4*29], srcq
+    mov          [esp+4*30], dstq
+%else
+    mov                srcq, srcmp
+    mov                dstq, dstmp
+    movzx                hd, ww
+    add                srcq, 8
+    add                dstq, 8
+    mov               srcmp, srcq
+    mov               dstmp, dstq
+%endif
+    sub                  wd, 1<<16
+%else
+.v_w4_loop:
+    pmaddwd             m12, m8, m1  ; a0
+    pmaddwd             m13, m8, m2  ; b0
+    mova                 m1, m3
+    mova                 m2, m4
+    pmaddwd              m3, m9      ; a1
+    pmaddwd              m4, m9      ; b1
+    paddd               m12, m3
+    paddd               m13, m4
+    mova                 m3, m5
+    mova                 m4, m6
+    pmaddwd              m5, m10     ; a2
+    pmaddwd              m6, m10     ; b2
+    paddd               m12, m5
+    paddd               m13, m6
+    movq                 m6, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpcklwd            m5, m0, m6  ; 67
+    movq                 m0, [srcq+ssq*0]
+    pmaddwd             m14, m11, m5 ; a3
+    punpcklwd            m6, m0      ; 78
+    paddd               m12, m14
+    pmaddwd             m14, m11, m6 ; b3
+    paddd               m13, m14
+    psrad               m12, 5
+    psrad               m13, 5
+    packssdw            m12, m13
+    pxor                m13, m13
+    pmaxsw              m12, m13
+    pavgw               m12, m13
+    pminsw              m12, m7
+    movq       [dstq+dsq*0], m12
+    movhps     [dstq+dsq*1], m12
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w4_loop
+    add                  r7, 8
+    add                  r8, 8
+    movzx                hd, wb
+    mov                srcq, r7
+    mov                dstq, r8
+    sub                  wd, 1<<8
+%endif
+    jg .v_w4_loop0
+    RET
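+; Horizontal-only: widths <= 4 reuse the 6-tap function's .h_w4; wider blocks
+; run the full 8-tap below, 8 pixels per inner iteration.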
+.h:
+    RESET_STACK_STATE
+    test                myd, 0xf00
+    jnz .hv
+    mov                 myd, r8m
+    movd                 m5, r8m
+    shr                 myd, 11
+    movddup              m4, [base+put_8tap_h_rnd+myq*8]
+    movifnidn           dsq, dsmp
+    pshufb               m5, [base+pw_256]
+    cmp                  wd, 4
+    jle mangle(private_prefix %+ _put_6tap_16bpc_ssse3).h_w4
+    WIN64_SPILL_XMM      12
+    shr                 mxd, 16
+    movq                 m3, [base+subpel_filters+mxq*8]
+    movifnidn          dstq, dstmp
+    mova                 m6, [base+spel_h_shufA]
+    mova                 m7, [base+spel_h_shufB]
+%if UNIX64
+    mov                  wd, wd
+%endif
+    lea                srcq, [srcq+wq*2]
+    punpcklbw            m3, m3
+    lea                dstq, [dstq+wq*2]
+    psraw                m3, 8
+    neg                  wq
+%if ARCH_X86_32
+    ALLOC_STACK       -16*4
+    pshufd               m0, m3, q0000
+    pshufd               m1, m3, q1111
+    pshufd               m2, m3, q2222
+    pshufd               m3, m3, q3333
+    mova                 m8, m0
+    mova                 m9, m1
+    mova                m10, m2
+    mova                m11, m3
+%else
+    pshufd               m8, m3, q0000
+    pshufd               m9, m3, q1111
+    pshufd              m10, m3, q2222
+    pshufd              m11, m3, q3333
+%endif
+.h_w8_loop0:
+    mov                  r6, wq
+.h_w8_loop:
+    movu                 m0, [srcq+r6*2- 6]
+    movu                 m1, [srcq+r6*2+ 2]
+    pshufb               m2, m0, m6   ; 0 1 1 2 2 3 3 4
+    pshufb               m0, m7       ; 2 3 3 4 4 5 5 6
+    pmaddwd              m2, m8       ; abcd0
+    pmaddwd              m0, m9       ; abcd1
+    pshufb               m3, m1, m6   ; 4 5 5 6 6 7 7 8
+    pshufb               m1, m7       ; 6 7 7 8 8 9 9 a
+    paddd                m2, m4
+    paddd                m0, m2
+    pmaddwd              m2, m10, m3  ; abcd2
+    pmaddwd              m3, m8       ; efgh0
+    paddd                m0, m2
+    pmaddwd              m2, m11, m1  ; abcd3
+    pmaddwd              m1, m9       ; efgh1
+    paddd                m0, m2
+    movu                 m2, [srcq+r6*2+10]
+    paddd                m3, m4
+    paddd                m1, m3
+    pshufb               m3, m2, m6   ; 8 9 9 a a b b c
+    pshufb               m2, m7       ; a b b c c d d e
+    pmaddwd              m3, m10      ; efgh2
+    pmaddwd              m2, m11      ; efgh3
+    paddd                m1, m3
+    paddd                m1, m2
+    psrad                m0, 6
+    psrad                m1, 6
+    packssdw             m0, m1
+    pxor                 m1, m1
+    pminsw               m0, m5
+    pmaxsw               m0, m1
+    mova        [dstq+r6*2], m0
+    add                  r6, 8
+    jl .h_w8_loop
+    add                srcq, ssq
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w8_loop0
+    RET
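+; HV: w2 is filtered inline using phaddd for the horizontal taps; w4 and w8+
+; share the .hv_w4 loop (w8+ enters through .hv_w8).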
+.hv:
+    RESET_STACK_STATE
+%if ARCH_X86_32
+    movd                 m4, r8m
+    pshufb               m4, [base+pw_256]
+%else
+%if WIN64
+    ALLOC_STACK        16*6, 16
+%endif
+    movd                m15, r8m
+    pshufb              m15, [base+pw_256]
+%endif
+    cmp                  wd, 4
+    jg .hv_w8
+    movzx               mxd, mxb
+    je .hv_w4
+    movq                 m0, [base+subpel_filters+mxq*8]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovb               myd, mxd
+    movq                 m3, [base+subpel_filters+myq*8]
+    movddup              m6, [base+pd_8704]
+    pshuflw              m0, m0, q2121
+    pxor                 m7, m7
+    punpcklbw            m7, m0
+    punpcklbw            m3, m3
+    psraw                m3, 8 ; sign-extend
+    test          dword r8m, 0x800
+    jz .hv_w2_10bpc
+    movddup              m6, [base+pd_2560]
+    psraw                m7, 2
+    psllw                m3, 2
+.hv_w2_10bpc:
+%if ARCH_X86_32
+    mov                dstq, dstmp
+    mov                 dsq, dsmp
+    mova                 m5, [base+spel_h_shuf2]
+    ALLOC_STACK       -16*8
+    pshufd               m0, m3, q0000
+    pshufd               m1, m3, q1111
+    pshufd               m2, m3, q2222
+    pshufd               m3, m3, q3333
+    mova                 m9, m5
+    mova                m11, m0
+    mova                m12, m1
+    mova                m13, m2
+    mova                m14, m3
+    mova                m15, m4
+%else
+    mova                 m9, [base+spel_h_shuf2]
+    pshufd              m11, m3, q0000
+    pshufd              m12, m3, q1111
+    pshufd              m13, m3, q2222
+    pshufd              m14, m3, q3333
+%endif
+    lea                  r6, [ssq*3]
+    sub                srcq, 2
+    sub                srcq, r6
+    movu                 m2, [srcq+ssq*0]
+    movu                 m3, [srcq+ssq*1]
+    movu                 m1, [srcq+ssq*2]
+    add                srcq, r6
+    movu                 m4, [srcq+ssq*0]
+%if ARCH_X86_32
+    REPX    {pshufb  x, m5}, m2, m3, m1, m4
+%else
+    REPX    {pshufb  x, m9}, m2, m3, m1, m4
+%endif
+    REPX    {pmaddwd x, m7}, m2, m3, m1, m4
+    phaddd               m2, m3        ; 0 1
+    phaddd               m1, m4        ; 2 3
+    movu                 m3, [srcq+ssq*1]
+    movu                 m4, [srcq+ssq*2]
+    add                srcq, r6
+    movu                 m0, [srcq+ssq*0]
+%if ARCH_X86_32
+    REPX    {pshufb  x, m5}, m3, m4, m0
+%else
+    REPX    {pshufb  x, m9}, m3, m4, m0
+%endif
+    REPX    {pmaddwd x, m7}, m3, m4, m0
+    phaddd               m3, m4        ; 4 5
+    phaddd               m0, m0        ; 6 6
+    REPX    {paddd   x, m6}, m2, m1, m3, m0
+    REPX    {psrad   x, 10}, m2, m1, m3, m0
+    packssdw             m2, m1        ; 0 1 2 3
+    packssdw             m3, m0        ; 4 5 6 _
+    palignr              m4, m3, m2, 4 ; 1 2 3 4
+    pshufd               m5, m3, q0321 ; 5 6 _ _
+    punpcklwd            m1, m2, m4    ; 01 12
+    punpckhwd            m2, m4        ; 23 34
+    punpcklwd            m3, m5        ; 45 56
+.hv_w2_loop:
+    movu                 m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movu                 m5, [srcq+ssq*0]
+    pshufb               m4, m9
+    pshufb               m5, m9
+    pmaddwd              m4, m7
+    pmaddwd              m5, m7
+    phaddd               m4, m5
+    pmaddwd              m5, m11, m1   ; a0 b0
+    mova                 m1, m2
+    pmaddwd              m2, m12       ; a1 b1
+    paddd                m5, m2
+    mova                 m2, m3
+    pmaddwd              m3, m13       ; a2 b2
+    paddd                m5, m3
+    paddd                m4, m6
+    psrad                m4, 10        ; 7 8
+    packssdw             m0, m4
+    pshufd               m3, m0, q2103
+    punpckhwd            m3, m0        ; 67 78
+    mova                 m0, m4
+    pmaddwd              m4, m14, m3   ; a3 b3
+    paddd                m5, m4
+    psrad                m5, 10
+    packssdw             m5, m5
+    pxor                 m4, m4
+    pminsw               m5, m15
+    pmaxsw               m5, m4
+    movd       [dstq+dsq*0], m5
+    pshuflw              m5, m5, q3232
+    movd       [dstq+dsq*1], m5
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w2_loop
+    RET
+.hv_w8:
+    shr                 mxd, 16
+.hv_w4:
+    movq                 m2, [base+subpel_filters+mxq*8]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovb               myd, mxd
+    movq                 m3, [base+subpel_filters+myq*8]
+%if ARCH_X86_32
+    RESET_STACK_STATE
+    mov                dstq, dstmp
+    mov                 dsq, dsmp
+    mova                 m0, [base+spel_h_shufA]
+    mova                 m1, [base+spel_h_shufB]
+    mova                 m6, [base+pd_512]
+    ALLOC_STACK      -16*15
+    mova                 m8, m0
+    mova                 m9, m1
+    mova                m14, m6
+%else
+    mova                 m8, [base+spel_h_shufA]
+    mova                 m9, [base+spel_h_shufB]
+%endif
+    pxor                 m0, m0
+    punpcklbw            m0, m2
+    punpcklbw            m3, m3
+    psraw                m3, 8
+    test          dword r8m, 0x800
+    jz .hv_w4_10bpc
+    psraw                m0, 2
+    psllw                m3, 2
+.hv_w4_10bpc:
+    lea                  r6, [ssq*3]
+    sub                srcq, 6
+    sub                srcq, r6
+%if ARCH_X86_32
+    %define tmp esp+16*8
+    shl                  wd, 14
+%if STACK_ALIGNMENT < 16
+    mov          [esp+4*61], srcq
+    mov          [esp+4*62], dstq
+%else
+    mov               srcmp, srcq
+%endif
+    mova         [tmp+16*5], m4
+    lea                  wd, [wq+hq-(1<<16)]
+    pshufd               m1, m0, q0000
+    pshufd               m2, m0, q1111
+    pshufd               m5, m0, q2222
+    pshufd               m0, m0, q3333
+    mova                m10, m1
+    mova                m11, m2
+    mova                m12, m5
+    mova                m13, m0
+%else
+%if WIN64
+    %define tmp rsp
+%else
+    %define tmp rsp-104 ; red zone
+%endif
+    shl                  wd, 6
+    mov                  r7, srcq
+    mov                  r8, dstq
+    lea                  wd, [wq+hq-(1<<8)]
+    pshufd              m10, m0, q0000
+    pshufd              m11, m0, q1111
+    pshufd              m12, m0, q2222
+    pshufd              m13, m0, q3333
+    mova         [tmp+16*5], m15
+%endif
+    pshufd               m0, m3, q0000
+    pshufd               m1, m3, q1111
+    pshufd               m2, m3, q2222
+    pshufd               m3, m3, q3333
+    mova         [tmp+16*1], m0
+    mova         [tmp+16*2], m1
+    mova         [tmp+16*3], m2
+    mova         [tmp+16*4], m3
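+; 8-tap horizontal pass over one row loaded as two overlapping halves
+; (src+0 and src+8 bytes): shuffles via m8/m9, coefficients in m10-m13,
+; rounding in %5 (m14 = pd_512 by default), shift by %4.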
+%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512]
+    pshufb              m%3, m%1, m8 ; 0 1 1 2 2 3 3 4
+    pshufb              m%1, m9      ; 2 3 3 4 4 5 5 6
+    pmaddwd             m%3, m10
+    pmaddwd             m%1, m11
+    paddd               m%3, %5
+    paddd               m%1, m%3
+    pshufb              m%3, m%2, m8 ; 4 5 5 6 6 7 7 8
+    pshufb              m%2, m9      ; 6 7 7 8 8 9 9 a
+    pmaddwd             m%3, m12
+    pmaddwd             m%2, m13
+    paddd               m%1, m%3
+    paddd               m%1, m%2
+    psrad               m%1, %4
+%endmacro
+.hv_w4_loop0:
+%if ARCH_X86_64
+    mova                m14, [pd_512]
+%endif
+    movu                 m4, [srcq+ssq*0+0]
+    movu                 m1, [srcq+ssq*0+8]
+    movu                 m5, [srcq+ssq*1+0]
+    movu                 m2, [srcq+ssq*1+8]
+    movu                 m6, [srcq+ssq*2+0]
+    movu                 m3, [srcq+ssq*2+8]
+    add                srcq, r6
+    PUT_8TAP_HV_H         4, 1, 0, 10
+    PUT_8TAP_HV_H         5, 2, 0, 10
+    PUT_8TAP_HV_H         6, 3, 0, 10
+    movu                 m7, [srcq+ssq*0+0]
+    movu                 m2, [srcq+ssq*0+8]
+    movu                 m1, [srcq+ssq*1+0]
+    movu                 m3, [srcq+ssq*1+8]
+    PUT_8TAP_HV_H         7, 2, 0, 10
+    PUT_8TAP_HV_H         1, 3, 0, 10
+    movu                 m2, [srcq+ssq*2+0]
+    movu                 m3, [srcq+ssq*2+8]
+    add                srcq, r6
+    PUT_8TAP_HV_H         2, 3, 0, 10
+    packssdw             m4, m7      ; 0 3
+    packssdw             m5, m1      ; 1 4
+    movu                 m0, [srcq+ssq*0+0]
+    movu                 m1, [srcq+ssq*0+8]
+    PUT_8TAP_HV_H         0, 1, 3, 10
+    packssdw             m6, m2      ; 2 5
+    packssdw             m7, m0      ; 3 6
+    punpcklwd            m1, m4, m5  ; 01
+    punpckhwd            m4, m5      ; 34
+    punpcklwd            m2, m5, m6  ; 12
+    punpckhwd            m5, m6      ; 45
+    punpcklwd            m3, m6, m7  ; 23
+    punpckhwd            m6, m7      ; 56
+%if ARCH_X86_32
+    jmp .hv_w4_loop_start
+.hv_w4_loop:
+    mova                 m1, [tmp+16*6]
+    mova                 m2, m15
+.hv_w4_loop_start:
+    mova                 m7, [tmp+16*1]
+    pmaddwd              m1, m7      ; a0
+    pmaddwd              m2, m7      ; b0
+    mova                 m7, [tmp+16*2]
+    mova         [tmp+16*6], m3
+    pmaddwd              m3, m7      ; a1
+    mova                m15, m4
+    pmaddwd              m4, m7      ; b1
+    mova                 m7, [tmp+16*3]
+    paddd                m1, m3
+    paddd                m2, m4
+    mova                 m3, m5
+    pmaddwd              m5, m7      ; a2
+    mova                 m4, m6
+    pmaddwd              m6, m7      ; b2
+    paddd                m1, m5
+    paddd                m2, m6
+    movu                 m7, [srcq+ssq*1+0]
+    movu                 m5, [srcq+ssq*1+8]
+    lea                srcq, [srcq+ssq*2]
+    PUT_8TAP_HV_H         7, 5, 6, 10
+    packssdw             m0, m7      ; 6 7
+    mova         [tmp+16*0], m0
+    movu                 m0, [srcq+ssq*0+0]
+    movu                 m5, [srcq+ssq*0+8]
+    PUT_8TAP_HV_H         0, 5, 6, 10
+    mova                 m6, [tmp+16*0]
+    packssdw             m7, m0      ; 7 8
+    punpcklwd            m5, m6, m7  ; 67
+    punpckhwd            m6, m7      ; 78
+    pmaddwd              m7, m5, [tmp+16*4]
+    paddd                m1, m7      ; a3
+    pmaddwd              m7, m6, [tmp+16*4]
+    paddd                m2, m7      ; b3
+    psrad                m1, 9
+    psrad                m2, 9
+    packssdw             m1, m2
+    pxor                 m7, m7
+    pmaxsw               m1, m7
+    pavgw                m7, m1
+    pminsw               m7, [tmp+16*5]
+    movq       [dstq+dsq*0], m7
+    movhps     [dstq+dsq*1], m7
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w4_loop
+%if STACK_ALIGNMENT < 16
+    mov                srcq, [esp+4*61]
+    mov                dstq, [esp+4*62]
+    add                srcq, 8
+    add                dstq, 8
+    mov          [esp+4*61], srcq
+    mov          [esp+4*62], dstq
+%else
+    mov                srcq, srcmp
+    mov                dstq, dstmp
+    add                srcq, 8
+    add                dstq, 8
+    mov               srcmp, srcq
+    mov               dstmp, dstq
+%endif
+    movzx                hd, ww
+    sub                  wd, 1<<16
+%else
+.hv_w4_loop:
+    mova                m15, [tmp+16*1]
+    pmaddwd             m14, m15, m1 ; a0
+    pmaddwd             m15, m2      ; b0
+    mova                 m7, [tmp+16*2]
+    mova                 m1, m3
+    pmaddwd              m3, m7      ; a1
+    mova                 m2, m4
+    pmaddwd              m4, m7      ; b1
+    mova                 m7, [tmp+16*3]
+    paddd               m14, m3
+    paddd               m15, m4
+    mova                 m3, m5
+    pmaddwd              m5, m7      ; a2
+    mova                 m4, m6
+    pmaddwd              m6, m7      ; b2
+    paddd               m14, m5
+    paddd               m15, m6
+    movu                 m7, [srcq+ssq*1+0]
+    movu                 m5, [srcq+ssq*1+8]
+    lea                srcq, [srcq+ssq*2]
+    PUT_8TAP_HV_H         7, 5, 6, 10, [pd_512]
+    packssdw             m0, m7      ; 6 7
+    mova         [tmp+16*0], m0
+    movu                 m0, [srcq+ssq*0+0]
+    movu                 m5, [srcq+ssq*0+8]
+    PUT_8TAP_HV_H         0, 5, 6, 10, [pd_512]
+    mova                 m6, [tmp+16*0]
+    packssdw             m7, m0      ; 7 8
+    punpcklwd            m5, m6, m7  ; 67
+    punpckhwd            m6, m7      ; 78
+    pmaddwd              m7, m5, [tmp+16*4]
+    paddd               m14, m7      ; a3
+    pmaddwd              m7, m6, [tmp+16*4]
+    paddd               m15, m7      ; b3
+    psrad               m14, 9
+    psrad               m15, 9
+    packssdw            m14, m15
+    pxor                 m7, m7
+    pmaxsw              m14, m7
+    pavgw                m7, m14
+    pminsw               m7, [tmp+16*5]
+    movq       [dstq+dsq*0], m7
+    movhps     [dstq+dsq*1], m7
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w4_loop
+    add                  r7, 8
+    add                  r8, 8
+    movzx                hd, wb
+    mov                srcq, r7
+    mov                dstq, r8
+    sub                  wd, 1<<8
+%endif
+    jg .hv_w4_loop0
+    RET
+%undef tmp
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 2, 1, 6, 4
+%elif WIN64
+DECLARE_REG_TMP 6, 4, 7, 4
+%else
+DECLARE_REG_TMP 6, 7, 7, 8
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_16bpc
+PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_16bpc
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_16bpc
+PREP_8TAP_FN regular,        REGULAR, REGULAR
+
+cglobal prep_6tap_16bpc, 0, 8, 0, tmp, src, ss, w, h, mx, my
+    %define            base  t2-prep_ssse3
+%if ARCH_X86_32
+    %define             mxb  r0b
+    %define             mxd  r0
+    %define             mxq  r0
+    %define             myb  r2b
+    %define             myd  r2
+    %define             myq  r2
+%endif
+    imul                mxd, mxm, 0x010101
+    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
+    imul                myd, mym, 0x010101
+    add                 myd, t1d ; 6tap_v, my, 4tap_v
+    LEA                  t2, prep_ssse3
+    movifnidn            wd, wm
+    movifnidn            hd, hm
+    movifnidn          srcq, srcmp
+    test                mxd, 0xf00
+    jnz .h
+    test                myd, 0xf00
+    jnz .v
+.prep:
+    tzcnt                wd, wd
+    mov                 myd, r7m ; bitdepth_max
+    movzx                wd, word [base+prep_ssse3_table+wq*2]
+    mova                 m5, [base+pw_8192]
+    shr                 myd, 11
+    add                  wq, t2
+    movddup              m4, [base+prep_mul+myq*8]
+    movifnidn           ssq, ssmp
+    movifnidn          tmpq, tmpmp
+    lea                  r6, [ssq*3]
+%if WIN64
+    pop                  r7
+%endif
+    jmp                  wq
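+; Horizontal-only: w4 reuses prep_8tap's .h_w4; wider widths run the 6-tap
+; loop below with the three coefficient pairs in m7/m8/m9.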
+.h:
+    RESET_STACK_STATE
+    test                myd, 0xf00
+    jnz .hv
+    movifnidn           ssq, r2mp
+    movddup              m5, [base+prep_8tap_1d_rnd]
+    cmp                  wd, 4
+    je mangle(private_prefix %+ _prep_8tap_16bpc_ssse3).h_w4
+    WIN64_SPILL_XMM      10
+    shr                 mxd, 16
+    movq                 m2, [base+subpel_filters+1+mxq*8]
+    movifnidn          tmpq, r0mp
+    mova                 m4, [base+spel_h_shufA]
+    add                  wd, wd
+    mova                 m6, [base+spel_h_shufB]
+    add                srcq, wq
+    punpcklbw            m2, m2
+    add                tmpq, wq
+    psraw                m2, 8
+    neg                  wq
+    test          dword r7m, 0x800
+    jnz .h_w8_12bpc
+    psllw                m2, 2
+.h_w8_12bpc:
+    pshufd               m7, m2, q0000
+%if ARCH_X86_32
+    ALLOC_STACK       -16*2
+    %define              m8  [rsp+16*0]
+    %define              m9  [rsp+16*1]
+    pshufd               m0, m2, q1111
+    pshufd               m1, m2, q2222
+    mova                 m8, m0
+    mova                 m9, m1
+%else
+    pshufd               m8, m2, q1111
+    pshufd               m9, m2, q2222
+%endif
+.h_w8_loop0:
+    mov                  r6, wq
+.h_w8_loop:
+    movu                 m3, [srcq+r6-4]
+    movu                 m2, [srcq+r6+8]
+    pshufb               m0, m3, m4  ; 01 12 23 34
+    pmaddwd              m0, m7      ; abcd0
+    pshufb               m3, m6      ; 23 34 45 56
+    pmaddwd              m1, m8, m3  ; abcd1
+    paddd                m0, m1
+    pshufb               m1, m2, m4  ; 67 78 89 9a
+    shufpd               m3, m1, 0x01; 45 56 67 78
+    pmaddwd              m1, m8      ; efgh1
+    pshufb               m2, m6      ; 89 9a ab bc
+    pmaddwd              m2, m9      ; efgh2
+    paddd                m1, m2
+    pmaddwd              m2, m9 , m3 ; abcd2
+    pmaddwd              m3, m7      ; efgh0
+    paddd                m0, m5
+    paddd                m1, m5
+    paddd                m0, m2
+    paddd                m1, m3
+    psrad                m0, 4
+    psrad                m1, 4
+    packssdw             m0, m1
+    mova          [tmpq+r6], m0
+    add                  r6, 16
+    jl .h_w8_loop
+    add                srcq, ssq
+    sub                tmpq, wq
+    dec                  hd
+    jg .h_w8_loop0
+    RET
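+; Vertical-only 6-tap: three coefficient pairs; x86-32 loops over 4-pixel
+; columns, x86-64 handles w4 with the loop below and branches to .v_w8 for
+; wider blocks.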
+.v:
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovb               myd, mxd
+    movddup              m5, [base+prep_8tap_1d_rnd]
+    movq                 m2, [base+subpel_filters+1+myq*8]
+    WIN64_SPILL_XMM      11, 16
+    movifnidn           ssq, r2mp
+    movifnidn          tmpq, r0mp
+    punpcklbw            m2, m2
+    sub                srcq, ssq
+    psraw                m2, 8 ; sign-extend
+    test          dword r7m, 0x800
+    jnz .v_12bpc
+    psllw                m2, 2
+.v_12bpc:
+    sub                srcq, ssq
+%if ARCH_X86_32
+    ALLOC_STACK       -16*4
+    pshufd               m0, m2, q0000
+    mov                 r6d, wd
+    pshufd               m1, m2, q1111
+    shl                 r6d, 14
+    pshufd               m2, m2, q2222
+    lea                 r6d, [r6+hq-(1<<16)]
+    mova                 m8, m0
+    mova                 m9, m1
+    mova                m10, m2
+%if STACK_ALIGNMENT < 16
+    %define           srcmp  [esp+16*3+4*0]
+    %define           tmpmp  [esp+16*3+4*1]
+%endif
+.v_w4_loop0:
+    mov               srcmp, srcq
+    mov               tmpmp, tmpq
+%else
+    pshufd               m8, m2, q0000
+    and                  wd, -8
+    jnz .v_w8
+    pshufd               m9, m2, q1111
+    pshufd              m10, m2, q2222
+%endif
+    movq                 m1, [srcq+ssq*0]
+    movq                 m2, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movq                 m3, [srcq+ssq*0]
+    movq                 m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movq                 m0, [srcq+ssq*0]
+    punpcklwd            m1, m2      ; 01
+    punpcklwd            m2, m3      ; 12
+    punpcklwd            m3, m4      ; 23
+    punpcklwd            m4, m0      ; 34
+.v_w4_loop:
+    pmaddwd              m6, m8, m1  ; a0
+    pmaddwd              m7, m8, m2  ; b0
+    mova                 m1, m3
+    pmaddwd              m3, m9      ; a1
+    mova                 m2, m4
+    pmaddwd              m4, m9      ; b1
+    paddd                m6, m3
+    movq                 m3, [srcq+ssq*0]
+    paddd                m7, m4
+    movq                 m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movq                 m0, [srcq+ssq*0]
+    punpcklwd            m3, m4      ; 45
+    punpcklwd            m4, m0      ; 56
+    pmaddwd              m0, m10, m3 ; a2
+    paddd                m6, m5
+    paddd                m6, m0
+    pmaddwd              m0, m10, m4 ; b2
+    paddd                m7, m5
+    paddd                m7, m0
+    psrad                m6, 4
+    psrad                m7, 4
+    packssdw             m6, m7
+%if ARCH_X86_32
+    movq        [tmpq+wq*0], m6
+    movhps      [tmpq+wq*2], m6
+    lea                tmpq, [tmpq+wq*4]
+    sub                  hd, 2
+    jg .v_w4_loop
+    mov                srcq, srcmp
+    mov                tmpq, tmpmp
+    movzx                hd, r6w
+    add                srcq, 8
+    add                tmpq, 8
+    sub                 r6d, 1<<16
+    jg .v_w4_loop0
+    RET
+%else
+    mova             [tmpq], m6
+    add                tmpq, 16
+    sub                  hd, 2
+    jg .v_w4_loop
+    RET
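+; w8+: vertical 6-tap over 8-pixel columns; the primed accumulators (a0'/b0')
+; cover the high half of each row.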
+.v_w8:
+    mova                r6m, m8
+    lea                 r6d, [wq*4-(1<<5)]
+    pshufd               m6, m2, q1111
+    lea                 r6d, [hq+r6*8]
+    pshufd               m7, m2, q2222
+    WIN64_PUSH_XMM       16
+.v_w8_loop0:
+    movu                 m9, [srcq+ssq*0]
+    lea                  r5, [srcq+ssq*2]
+    movu                m11, [srcq+ssq*1]
+    mov                  r7, tmpq
+    movu                m13, [r5+ssq*0]
+    movu                m15, [r5+ssq*1]
+    lea                  r5, [r5+ssq*2]
+    movu                 m4, [r5+ssq*0]
+    punpcklwd            m8, m9, m11  ; 01
+    punpckhwd            m9, m11
+    punpcklwd           m10, m11, m13 ; 12
+    punpckhwd           m11, m13
+    punpcklwd           m12, m13, m15 ; 23
+    punpckhwd           m13, m15
+    punpcklwd           m14, m15, m4  ; 34
+    punpckhwd           m15, m4
+.v_w8_loop:
+    mova                 m3, r6m
+    pmaddwd              m0, m8, m3   ; a0
+    pmaddwd              m2, m9, m3   ; a0'
+    pmaddwd              m1, m10, m3  ; b0
+    pmaddwd              m3, m11      ; b0'
+    mova                 m8, m12
+    pmaddwd             m12, m6       ; a1
+    mova                 m9, m13
+    pmaddwd             m13, m6       ; a1'
+    mova                m10, m14
+    pmaddwd             m14, m6       ; b1
+    mova                m11, m15
+    pmaddwd             m15, m6       ; b1'
+    paddd                m0, m12
+    paddd                m2, m13
+    movu                m13, [r5+ssq*0]
+    paddd                m1, m14
+    paddd                m3, m15
+    movu                m15, [r5+ssq*1]
+    lea                  r5, [r5+ssq*2]
+    movu                 m4, [r5+ssq*0]
+    REPX      {paddd x, m5}, m0, m2, m1, m3
+    punpcklwd           m12, m13, m15 ; 45
+    punpckhwd           m13, m15
+    punpcklwd           m14, m15, m4  ; 56
+    punpckhwd           m15, m4
+    pmaddwd              m4, m7, m12  ; a2
+    paddd                m0, m4
+    pmaddwd              m4, m7, m13  ; a2'
+    paddd                m2, m4
+    pmaddwd              m4, m7, m14  ; b2
+    paddd                m1, m4
+    pmaddwd              m4, m7, m15  ; b2'
+    paddd                m3, m4
+    REPX       {psrad x, 4}, m0, m2, m1, m3
+    packssdw             m0, m2
+    packssdw             m1, m3
+    mova          [r7+wq*0], m0
+    mova          [r7+wq*2], m1
+    lea                  r7, [r7+wq*4]
+    sub                  hd, 2
+    jg .v_w8_loop
+    add                srcq, 16
+    add                tmpq, 16
+    movzx                hd, r6b
+    sub                 r6d, 1<<8
+    jg .v_w8_loop0
+    RET
+%endif
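+; HV: w4 is handled below with the 4-tap horizontal filter; w8 and up branch
+; to .hv_w8 for the 6-tap horizontal.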
+.hv:
+    and                  wd, -8
+    jnz .hv_w8
+    movzx               mxd, mxb
+    movq                 m0, [base+subpel_filters+mxq*8]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovb               myd, mxd
+    movq                 m2, [base+subpel_filters+1+myq*8]
+    WIN64_SPILL_XMM      15
+    movifnidn           ssq, r2mp
+    movifnidn          tmpq, r0mp
+    mova                 m7, [base+prep_8tap_2d_rnd]
+    sub                srcq, 2
+    pshuflw              m0, m0, q2121
+    pxor                 m6, m6
+    punpcklbw            m6, m0
+    punpcklbw            m2, m2
+    psraw                m6, 4
+    psraw                m2, 8
+    test          dword r7m, 0x800
+    jz .hv_w4_10bpc
+    psraw                m6, 2
+.hv_w4_10bpc:
+%if ARCH_X86_32
+%assign regs_used 4
+    ALLOC_STACK       -16*7
+%assign regs_used 7
+    %define             m10  [esp+16*3]
+    %define             m12  [esp+16*5]
+    %define             m13  [esp+16*6]
+    %define             m14  [base+spel_h_shufA]
+    %define             m11  [base+spel_h_shufB]
+    pshufd               m0, m2, q0000
+    pshufd               m1, m2, q1111
+    pshufd               m2, m2, q2222
+    pshufd               m5, m6, q0000
+    pshufd               m6, m6, q1111
+    mova                 m8, m0
+    mova                 m9, m1
+    mova                m10, m2
+    mova                m12, m5
+    mova                m13, m6
+    neg                 ssq
+    movu                 m3, [srcq+ssq*2]
+    movu                 m4, [srcq+ssq*1]
+    neg                 ssq
+%else
+    mov                  r6, ssq
+    pshufd               m8, m2, q0000
+    neg                  r6
+    pshufd               m9, m2, q1111
+    movu                 m3, [srcq+r6 *2]
+    pshufd              m10, m2, q2222
+    movu                 m4, [srcq+r6 *1]
+    pshufd              m12, m6, q0000
+    mova                m14, [base+spel_h_shufA]
+    pshufd              m13, m6, q1111
+    mova                m11, [base+spel_h_shufB]
+%endif
+    movu                 m1, [srcq+ssq*0]
+    movu                 m0, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movu                 m2, [srcq+ssq*0]
+    HV_H_W4_6TAP         m3, m3, m5, m11
+    HV_H_W4_6TAP         m4, m4, m5, m11
+    HV_H_W4_6TAP         m5, m1, m5, m11
+    HV_H_W4_6TAP         m0, m0, m1, m11
+    HV_H_W4_6TAP         m2, m2, m1, m11
+    REPX       {psrad x, 6}, m3, m5, m4, m0, m2
+    packssdw             m3, m5      ; 0 2
+    packssdw             m4, m0      ; 1 3
+    packssdw             m5, m2      ; 2 4
+    punpcklwd            m1, m3, m4  ; 01
+    punpckhwd            m3, m4      ; 23
+    punpcklwd            m2, m4, m5  ; 12
+    punpckhwd            m4, m5      ; 34
+.hv_w4_loop:
+    movu                 m0, [srcq+ssq*1]
+    pmaddwd              m5, m8, m1  ; a0
+    lea                srcq, [srcq+ssq*2]
+    pmaddwd              m6, m8, m2  ; b0
+    mova                 m1, m3
+    pmaddwd              m3, m9      ; a1
+    mova                 m2, m4
+    pmaddwd              m4, m9      ; b1
+    paddd                m5, m3
+    movu                 m3, [srcq+ssq*0]
+    paddd                m6, m4
+    HV_H_W4_6TAP         m0, m0, m4, m11
+    HV_H_W4_6TAP         m3, m3, m4, m11
+    psrad                m4, m2, 16  ; 4
+    psrad                m0, 6
+    psrad                m3, 6
+    packssdw             m4, m0      ; 4 5
+    packssdw             m0, m3      ; 5 6
+    punpcklwd            m3, m4, m0  ; 45
+    punpckhwd            m4, m0      ; 56
+    pmaddwd              m0, m10, m3 ; a2
+    paddd                m5, m7
+    paddd                m5, m0
+    pmaddwd              m0, m10, m4 ; b2
+    paddd                m6, m7
+    paddd                m6, m0
+    psrad                m5, 6
+    psrad                m6, 6
+    packssdw             m5, m6
+    mova             [tmpq], m5
+    add                tmpq, 16
+    sub                  hd, 2
+    jg .hv_w4_loop
+    RET
+.hv_w8:
+    RESET_STACK_STATE
+    shr                 mxd, 16
+    movq                 m2, [base+subpel_filters+1+mxq*8]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 6
+    cmovb               myd, mxd
+    movq                 m1, [base+subpel_filters+1+myq*8]
+    movifnidn           ssq, r2mp
+    mova                 m4, [base+prep_8tap_2d_rnd]
+    pxor                 m0, m0
+    punpcklbw            m0, m2
+    punpcklbw            m1, m1
+    sub                srcq, 4
+    psraw                m0, 4
+    psraw                m1, 8
+    test          dword r7m, 0x800 ; 12bpc
+    jz .hv_w8_10bpc
+    psraw                m0, 2
+.hv_w8_10bpc:
+%if ARCH_X86_32
+%assign regs_used 1
+    ALLOC_STACK       -16*9
+%assign regs_used 7
+    mov                tmpq, r0mp
+    mova         [rsp+16*7], m4
+%else
+%if WIN64
+    PUSH                 r8
+%assign regs_used 9
+%endif
+    ALLOC_STACK        16*6, 16
+%endif
+    pshufd               m2, m0, q0000
+    mova         [rsp+16*0], m2
+    pshufd               m2, m0, q1111
+    mova         [rsp+16*1], m2
+    pshufd               m0, m0, q2222
+    mova         [rsp+16*2], m0
+    pshufd               m2, m1, q0000
+    mova         [rsp+16*3], m2
+    pshufd               m2, m1, q1111
+    mova         [rsp+16*4], m2
+    pshufd               m1, m1, q2222
+    mova         [rsp+16*5], m1
+    mov                  r6, ssq
+    neg                  r6
+%if ARCH_X86_32
+    mov                 r5d, wd
+    shl                 r5d, 14
+    lea                 r5d, [r5+hq-(1<<16)]
+%if STACK_ALIGNMENT < 16
+    %define           srcmp  [esp+16*8+4*0]
+    %define           tmpmp  [esp+16*8+4*1]
+%endif
+.hv_w8_loop0:
+    mov               srcmp, srcq
+    mov               tmpmp, tmpq
+    movu                 m5, [srcq+r6*2+0]
+    movu                 m6, [srcq+r6*2+2]
+    mova                 m7, [rsp+16*0]
+    mova                 m1, [rsp+16*1]
+    mova                 m0, [rsp+16*2]
+    HV_H_6TAP            m2, m5, m6, m7, m1, m0
+    movu                 m5, [srcq+r6*1+0]
+    movu                 m6, [srcq+r6*1+2]
+    HV_H_6TAP            m3, m5, m6, m7, m1, m0
+    movu                 m5, [srcq+ssq*0+0]
+    movu                 m6, [srcq+ssq*0+2]
+    HV_H_6TAP            m4, m5, m6, m7, m1, m0
+    movu                 m5, [srcq+ssq*1+0]
+    movu                 m6, [srcq+ssq*1+2]
+    lea                srcq, [srcq+ssq*2]
+    HV_H_6TAP            m0, m5, m6, m7, m1
+    movu                 m5, [srcq+ssq*0+0]
+    movu                 m6, [srcq+ssq*0+2]
+    HV_H_6TAP            m1, m5, m6, m7
+    mova                 m5, [rsp+16*7]
+    REPX      {paddd x, m5}, m2, m3, m4, m0, m1
+    REPX      {psrad x, 6 }, m2, m4, m3, m0, m1
+    packssdw             m2, m4     ; 0 2
+    packssdw             m3, m0     ; 1 3
+    packssdw             m4, m1     ; 2 4
+    punpcklwd            m0, m2, m3 ; 01
+    punpckhwd            m2, m3     ; 23
+    punpcklwd            m1, m3, m4 ; 12
+    punpckhwd            m3, m4     ; 34
+.hv_w8_loop:
+    mova                 m5, [rsp+16*3]
+    mova                 m6, [rsp+16*4]
+    pmaddwd              m4, m0, m5 ; a0
+    pmaddwd              m5, m1     ; b0
+    mova                 m0, m2
+    pmaddwd              m2, m6     ; a1
+    mova                 m1, m3
+    pmaddwd              m3, m6     ; b1
+    paddd                m4, m2
+    movu                 m2, [srcq+ssq*1+0]
+    paddd                m5, m3
+    movu                 m3, [srcq+ssq*1+2]
+    lea                srcq, [srcq+ssq*2]
+    HV_H_6TAP            m6, m2, m3
+    movu                 m2, [srcq+ssq*0+0]
+    movu                 m3, [srcq+ssq*0+2]
+    HV_H_6TAP            m7, m2, m3
+    mova                 m2, [rsp+16*7]
+    psrad                m3, m1, 16
+    REPX      {paddd x, m2}, m6, m7, m4, m5
+    psrad                m6, 6
+    psrad                m7, 6
+    packssdw             m3, m6     ; 4 5
+    packssdw             m6, m7     ; 5 6
+    mova                 m7, [rsp+16*5]
+    punpcklwd            m2, m3, m6 ; 45
+    punpckhwd            m3, m6     ; 56
+    pmaddwd              m6, m2, m7 ; a2
+    pmaddwd              m7, m3     ; b2
+    paddd                m4, m6
+    paddd                m5, m7
+    psrad                m4, 6
+    psrad                m5, 6
+    packssdw             m4, m5
+    movq        [tmpq+wq*0], m4
+    movhps      [tmpq+wq*2], m4
+    lea                tmpq, [tmpq+wq*4]
+    sub                  hd, 2
+    jg .hv_w8_loop
+    mov                srcq, srcmp
+    mov                tmpq, tmpmp
+    movzx                hd, r5w
+    add                srcq, 8
+    add                tmpq, 8
+    sub                 r5d, 1<<16
+%else
+    lea                 r8d, [wq*4-(1<<5)]
+    lea                 r8d, [hq+r8*8]
+.hv_w8_loop0:
+    mova                 m5, [spel_h_shufA]
+    movu                 m0, [srcq+r6*2+ 0]
+    mova                 m6, [rsp+16*0]
+    movu                 m1, [srcq+r6*2+ 8]
+    mova                 m7, [rsp+16*1]
+    movu                 m2, [srcq+r6*2+16]
+    mova                 m8, [rsp+16*2]
+    HV_H_6TAP            m9, m0, m1, m2, 6, m5, m6, m7, m8
+    movu                 m0, [srcq+r6*1+ 0]
+    movu                 m1, [srcq+r6*1+ 8]
+    movu                 m2, [srcq+r6*1+16]
+    lea                  r5, [srcq+ssq*2]
+    HV_H_6TAP           m11, m0, m1, m2, 6, m5, m6, m7, m8
+    movu                 m0, [srcq+ssq*0+ 0]
+    movu                 m1, [srcq+ssq*0+ 8]
+    movu                 m2, [srcq+ssq*0+16]
+    mov                  r7, tmpq
+    HV_H_6TAP           m13, m0, m1, m2, 6, m5, m6, m7, m8
+    movu                 m0, [srcq+ssq*1+ 0]
+    movu                 m1, [srcq+ssq*1+ 8]
+    movu                 m2, [srcq+ssq*1+16]
+    HV_H_6TAP           m15, m0, m1, m2, 6, m5, m6, m7, m8
+    movu                 m0, [r5+ssq*0+ 0]
+    movu                 m1, [r5+ssq*0+ 8]
+    movu                 m2, [r5+ssq*0+16]
+    HV_H_6TAP            m5, m0, m1, m2, 6, m5, m6, m7, m8
+    punpcklwd            m8, m9, m11  ; 01
+    punpckhwd            m9, m11
+    punpcklwd           m10, m11, m13 ; 12
+    punpckhwd           m11, m13
+    punpcklwd           m12, m13, m15 ; 23
+    punpckhwd           m13, m15
+    punpcklwd           m14, m15, m5  ; 34
+    punpckhwd           m15, m5
+.hv_w8_loop:
+    mova                 m3, [rsp+16*3]
+    mova                 m7, [rsp+16*4]
+    pmaddwd              m0, m8, m3   ; a0
+    mova                 m8, m12
+    pmaddwd              m2, m9, m3   ; a0'
+    mova                 m9, m13
+    pmaddwd              m1, m10, m3  ; b0
+    mova                m10, m14
+    pmaddwd              m3, m11      ; b0'
+    mova                m11, m15
+    REPX    {pmaddwd x, m7}, m12, m13, m14, m15
+    movu                 m6, [r5+ssq*1+ 0]
+    paddd                m0, m12
+    movu                 m7, [r5+ssq*1+ 8]
+    paddd                m2, m13
+    movu                m12, [r5+ssq*1+16]
+    paddd                m1, m14
+    lea                  r5, [r5+ssq*2]
+    paddd                m3, m15
+    HV_H_6TAP           m15, m6, m7, m12, 6
+    movu                 m6, [r5+ssq*0+ 0]
+    movu                 m7, [r5+ssq*0+ 8]
+    movu                m14, [r5+ssq*0+16]
+    punpcklwd           m12, m5, m15 ; 45
+    punpckhwd           m13, m5, m15
+    HV_H_6TAP            m5, m6, m7, m14, 6
+    mova                 m7, [rsp+16*5]
+    REPX      {paddd x, m4}, m0, m2, m1, m3
+    punpcklwd           m14, m15, m5  ; 56
+    punpckhwd           m15, m5
+    pmaddwd              m6, m12, m7  ; a2
+    paddd                m0, m6
+    pmaddwd              m6, m13, m7  ; a2'
+    paddd                m2, m6
+    pmaddwd              m6, m14, m7  ; b2
+    pmaddwd              m7, m15      ; b2'
+    paddd                m1, m6
+    paddd                m3, m7
+    REPX       {psrad x, 6}, m0, m2, m1, m3
+    packssdw             m0, m2
+    packssdw             m1, m3
+    mova          [r7+wq*0], m0
+    mova          [r7+wq*2], m1
+    lea                  r7, [r7+wq*4]
+    sub                  hd, 2
+    jg .hv_w8_loop
+    add                srcq, 16
+    add                tmpq, 16
+    movzx                hd, r8b
+    sub                 r8d, 1<<8
+%endif
+    jg .hv_w8_loop0
+    RET
 
-%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_16bpc
+PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_16bpc
+PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_16bpc
+PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_16bpc
 PREP_8TAP_FN sharp,          SHARP,   SHARP
-PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
-PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
-PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH
-PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR
-PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
-PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR
-PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PREP_8TAP_FN regular,        REGULAR, REGULAR
 
+cglobal prep_8tap_16bpc, 0, 8, 0, tmp, src, ss, w, h, mx, my
 %if ARCH_X86_32
-cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my
-%define mxb r0b
-%define mxd r0
-%define mxq r0
-%define myb r2b
-%define myd r2
-%define myq r2
-%else
-cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
+    %define             mxb  r0b
+    %define             mxd  r0
+    %define             mxq  r0
+    %define             myb  r2b
+    %define             myd  r2
+    %define             myq  r2
+    %define              m8  [esp+16*0]
+    %define              m9  [esp+16*1]
+    %define             m10  [esp+16*2]
+    %define             m11  [esp+16*3]
+    %define             m12  [esp+16*4]
+    %define             m13  [esp+16*5]
+    %define             m14  [esp+16*6]
+    %define             m15  [esp+16*7]
 %endif
-%define base t2-prep_ssse3
     imul                mxd, mxm, 0x010101
     add                 mxd, t0d ; 8tap_h, mx, 4tap_h
     imul                myd, mym, 0x010101
@@ -2026,138 +3439,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
     jnz .h
     movifnidn            hd, hm
     test                myd, 0xf00
-    jnz .v
-    tzcnt                wd, wd
-    mov                 myd, r7m ; bitdepth_max
-    movzx                wd, word [base+prep_ssse3_table+wq*2]
-    mova                 m5, [base+pw_8192]
-    shr                 myd, 11
-    add                  wq, t2
-    movddup              m4, [base+prep_mul+myq*8]
-    movifnidn           ssq, ssmp
-    movifnidn          tmpq, tmpmp
-    lea                  r6, [ssq*3]
-%if WIN64
-    pop                  r7
-%endif
-    jmp                  wq
-.h:
-    test                myd, 0xf00
-    jnz .hv
-    movifnidn           ssq, r2mp
-    movifnidn            hd, r4m
-    movddup              m5, [base+prep_8tap_1d_rnd]
-    cmp                  wd, 4
-    jne .h_w8
-    movzx               mxd, mxb
-    movq                 m0, [base+subpel_filters+mxq*8]
-    mova                 m3, [base+spel_h_shufA]
-    mova                 m4, [base+spel_h_shufB]
-    movifnidn          tmpq, tmpmp
-    sub                srcq, 2
-    WIN64_SPILL_XMM       8
-    punpcklbw            m0, m0
-    psraw                m0, 8
-    test          dword r7m, 0x800
-    jnz .h_w4_12bpc
-    psllw                m0, 2
-.h_w4_12bpc:
-    pshufd               m6, m0, q1111
-    pshufd               m7, m0, q2222
-.h_w4_loop:
-    movu                 m1, [srcq+ssq*0]
-    movu                 m2, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
-    pshufb               m0, m1, m3 ; 0 1 1 2 2 3 3 4
-    pshufb               m1, m4     ; 2 3 3 4 4 5 5 6
-    pmaddwd              m0, m6
-    pmaddwd              m1, m7
-    paddd                m0, m5
-    paddd                m0, m1
-    pshufb               m1, m2, m3
-    pshufb               m2, m4
-    pmaddwd              m1, m6
-    pmaddwd              m2, m7
-    paddd                m1, m5
-    paddd                m1, m2
-    psrad                m0, 4
-    psrad                m1, 4
-    packssdw             m0, m1
-    mova             [tmpq], m0
-    add                tmpq, 16
-    sub                  hd, 2
-    jg .h_w4_loop
-    RET
-.h_w8:
-    WIN64_SPILL_XMM      11
-    shr                 mxd, 16
-    movq                 m2, [base+subpel_filters+mxq*8]
-    mova                 m4, [base+spel_h_shufA]
-    mova                 m6, [base+spel_h_shufB]
-    movifnidn          tmpq, r0mp
-    add                  wd, wd
-    punpcklbw            m2, m2
-    add                srcq, wq
-    psraw                m2, 8
-    add                tmpq, wq
-    neg                  wq
-    test          dword r7m, 0x800
-    jnz .h_w8_12bpc
-    psllw                m2, 2
-.h_w8_12bpc:
-    pshufd               m7, m2, q0000
-%if ARCH_X86_32
-    ALLOC_STACK       -16*3
-    pshufd               m0, m2, q1111
-    pshufd               m1, m2, q2222
-    pshufd               m2, m2, q3333
-    mova                 m8, m0
-    mova                 m9, m1
-    mova                m10, m2
-%else
-    pshufd               m8, m2, q1111
-    pshufd               m9, m2, q2222
-    pshufd              m10, m2, q3333
-%endif
-.h_w8_loop0:
-    mov                  r6, wq
-.h_w8_loop:
-    movu                 m0, [srcq+r6- 6]
-    movu                 m1, [srcq+r6+ 2]
-    pshufb               m2, m0, m4  ; 0 1 1 2 2 3 3 4
-    pshufb               m0, m6      ; 2 3 3 4 4 5 5 6
-    pmaddwd              m2, m7      ; abcd0
-    pmaddwd              m0, m8      ; abcd1
-    pshufb               m3, m1, m4  ; 4 5 5 6 6 7 7 8
-    pshufb               m1, m6      ; 6 7 7 8 8 9 9 a
-    paddd                m2, m5
-    paddd                m0, m2
-    pmaddwd              m2, m9, m3  ; abcd2
-    pmaddwd              m3, m7      ; efgh0
-    paddd                m0, m2
-    pmaddwd              m2, m10, m1 ; abcd3
-    pmaddwd              m1, m8      ; efgh1
-    paddd                m0, m2
-    movu                 m2, [srcq+r6+10]
-    paddd                m3, m5
-    paddd                m1, m3
-    pshufb               m3, m2, m4  ; a b b c c d d e
-    pshufb               m2, m6      ; 8 9 9 a a b b c
-    pmaddwd              m3, m9      ; efgh2
-    pmaddwd              m2, m10     ; efgh3
-    paddd                m1, m3
-    paddd                m1, m2
-    psrad                m0, 4
-    psrad                m1, 4
-    packssdw             m0, m1
-    mova          [tmpq+r6], m0
-    add                  r6, 16
-    jl .h_w8_loop
-    add                srcq, ssq
-    sub                tmpq, wq
-    dec                  hd
-    jg .h_w8_loop0
-    RET
+    jz mangle(private_prefix %+ _prep_6tap_16bpc_ssse3).prep
 .v:
     movzx               mxd, myb
     shr                 myd, 16
@@ -2315,6 +3597,125 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
     sub                  wd, 1<<8
     jg .v_loop0
     RET
+.h:
+    RESET_STACK_STATE
+    test                myd, 0xf00
+    jnz .hv
+    movifnidn           ssq, r2mp
+    movifnidn            hd, r4m
+    movddup              m5, [base+prep_8tap_1d_rnd]
+    cmp                  wd, 4
+    jne .h_w8
+.h_w4:
+    movzx               mxd, mxb
+    movq                 m0, [base+subpel_filters+mxq*8]
+    mova                 m3, [base+spel_h_shufA]
+    mova                 m4, [base+spel_h_shufB]
+    movifnidn          tmpq, tmpmp
+    sub                srcq, 2
+    WIN64_SPILL_XMM       8
+    punpcklbw            m0, m0
+    psraw                m0, 8
+    test          dword r7m, 0x800
+    jnz .h_w4_12bpc
+    psllw                m0, 2
+.h_w4_12bpc:
+    pshufd               m6, m0, q1111
+    pshufd               m7, m0, q2222
+.h_w4_loop:
+    movu                 m1, [srcq+ssq*0]
+    movu                 m2, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb               m0, m1, m3 ; 0 1 1 2 2 3 3 4
+    pshufb               m1, m4     ; 2 3 3 4 4 5 5 6
+    pmaddwd              m0, m6
+    pmaddwd              m1, m7
+    paddd                m0, m5
+    paddd                m0, m1
+    pshufb               m1, m2, m3
+    pshufb               m2, m4
+    pmaddwd              m1, m6
+    pmaddwd              m2, m7
+    paddd                m1, m5
+    paddd                m1, m2
+    psrad                m0, 4
+    psrad                m1, 4
+    packssdw             m0, m1
+    mova             [tmpq], m0
+    add                tmpq, 16
+    sub                  hd, 2
+    jg .h_w4_loop
+    RET
+.h_w8:
+    WIN64_SPILL_XMM      11
+    shr                 mxd, 16
+    movq                 m2, [base+subpel_filters+mxq*8]
+    mova                 m4, [base+spel_h_shufA]
+    mova                 m6, [base+spel_h_shufB]
+    movifnidn          tmpq, r0mp
+    add                  wd, wd
+    punpcklbw            m2, m2
+    add                srcq, wq
+    psraw                m2, 8
+    add                tmpq, wq
+    neg                  wq
+    test          dword r7m, 0x800
+    jnz .h_w8_12bpc
+    psllw                m2, 2
+.h_w8_12bpc:
+    pshufd               m7, m2, q0000
+%if ARCH_X86_32
+    ALLOC_STACK       -16*3
+    pshufd               m0, m2, q1111
+    pshufd               m1, m2, q2222
+    pshufd               m2, m2, q3333
+    mova                 m8, m0
+    mova                 m9, m1
+    mova                m10, m2
+%else
+    pshufd               m8, m2, q1111
+    pshufd               m9, m2, q2222
+    pshufd              m10, m2, q3333
+%endif
+.h_w8_loop0:
+    mov                  r6, wq
+.h_w8_loop:
+    movu                 m0, [srcq+r6- 6]
+    movu                 m1, [srcq+r6+ 2]
+    pshufb               m2, m0, m4  ; 0 1 1 2 2 3 3 4
+    pshufb               m0, m6      ; 2 3 3 4 4 5 5 6
+    pmaddwd              m2, m7      ; abcd0
+    pmaddwd              m0, m8      ; abcd1
+    pshufb               m3, m1, m4  ; 4 5 5 6 6 7 7 8
+    pshufb               m1, m6      ; 6 7 7 8 8 9 9 a
+    paddd                m2, m5
+    paddd                m0, m2
+    pmaddwd              m2, m9, m3  ; abcd2
+    pmaddwd              m3, m7      ; efgh0
+    paddd                m0, m2
+    pmaddwd              m2, m10, m1 ; abcd3
+    pmaddwd              m1, m8      ; efgh1
+    paddd                m0, m2
+    movu                 m2, [srcq+r6+10]
+    paddd                m3, m5
+    paddd                m1, m3
+    pshufb               m3, m2, m4  ; a b b c c d d e
+    pshufb               m2, m6      ; 8 9 9 a a b b c
+    pmaddwd              m3, m9      ; efgh2
+    pmaddwd              m2, m10     ; efgh3
+    paddd                m1, m3
+    paddd                m1, m2
+    psrad                m0, 4
+    psrad                m1, 4
+    packssdw             m0, m1
+    mova          [tmpq+r6], m0
+    add                  r6, 16
+    jl .h_w8_loop
+    add                srcq, ssq
+    sub                tmpq, wq
+    dec                  hd
+    jg .h_w8_loop0
+    RET
 .hv:
     RESET_STACK_STATE
     movzx               t3d, mxb
@@ -6427,16 +7828,18 @@ DECLARE_REG_TMP 6, 8
 %else
 DECLARE_REG_TMP 1, 2
 %endif
+
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
 BILIN_SCALED_FN put
-FN put_8tap_scaled, sharp,          SHARP,   SHARP
-FN put_8tap_scaled, sharp_smooth,   SHARP,   SMOOTH
-FN put_8tap_scaled, smooth_sharp,   SMOOTH,  SHARP
-FN put_8tap_scaled, smooth,         SMOOTH,  SMOOTH
-FN put_8tap_scaled, sharp_regular,  SHARP,   REGULAR
-FN put_8tap_scaled, regular_sharp,  REGULAR, SHARP
-FN put_8tap_scaled, smooth_regular, SMOOTH,  REGULAR
-FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH
-FN put_8tap_scaled, regular,        REGULAR, REGULAR
+PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
 MC_8TAP_SCALED put
 
 %if WIN64
@@ -6446,16 +7849,18 @@ DECLARE_REG_TMP 6, 7
 %else
 DECLARE_REG_TMP 1, 2
 %endif
+
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
 BILIN_SCALED_FN prep
-FN prep_8tap_scaled, sharp,          SHARP,   SHARP
-FN prep_8tap_scaled, sharp_smooth,   SHARP,   SMOOTH
-FN prep_8tap_scaled, smooth_sharp,   SMOOTH,  SHARP
-FN prep_8tap_scaled, smooth,         SMOOTH,  SMOOTH
-FN prep_8tap_scaled, sharp_regular,  SHARP,   REGULAR
-FN prep_8tap_scaled, regular_sharp,  REGULAR, SHARP
-FN prep_8tap_scaled, smooth_regular, SMOOTH,  REGULAR
-FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
-FN prep_8tap_scaled, regular,        REGULAR, REGULAR
+PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
 MC_8TAP_SCALED prep
 
 %if ARCH_X86_64
diff --git a/tests/checkasm/arm/checkasm_32.S b/tests/checkasm/arm/checkasm_32.S
index a186ef8fc25ebedafd8a5326b481fedd37c16e15..09b88aa34601f9e7ef2d51fe084d5f180f73f5a7 100644
--- a/tests/checkasm/arm/checkasm_32.S
+++ b/tests/checkasm/arm/checkasm_32.S
@@ -101,9 +101,10 @@ function checked_call_\variant, export=1
         mov             r12, r0
         mov             r0,  r2
         mov             r1,  r3
-        ldrd            r2,  r3,  [sp, #ARG_STACK_A + pushed]
+        ldr             r2,  [sp, #ARG_STACK_A + pushed]
+        ldr             r3,  [sp, #ARG_STACK_A + pushed + 4]
         @ Call the target function
-        blx             r12
+        v4blx           r12
 
         @ Load the number of stack parameters, stack canary and its reference
         ldr             r12, [sp, #ARG_STACK_A + pushed + 8 + 4*(MAX_ARGS-4)]
@@ -120,7 +121,8 @@ function checked_call_\variant, export=1
         movrel          r12, register_init
 .ifc \variant, vfp
 .macro check_reg_vfp, dreg, offset
-        ldrd            r2,  r3,  [r12, #8 * (\offset)]
+        ldr             r2,  [r12, #(8 * (\offset))]
+        ldr             r3,  [r12, #(8 * (\offset)) + 4]
         vmov            r0,  lr,  \dreg
         eor             r2,  r2,  r0
         eor             r3,  r3,  lr
@@ -148,7 +150,8 @@ function checked_call_\variant, export=1
         @ keep track of the checked GPR
         mov             r1,  #4
 .macro check_reg reg1, reg2=
-        ldrd            r2,  r3,  [r12], #8
+        ldr             r2,  [r12], #4
+        ldr             r3,  [r12], #4
         eors            r2,  r2,  \reg1
         bne             2f
         add             r1,  r1,  #1
@@ -198,4 +201,5 @@ function checked_call_\variant, export=1
 endfunc
 .endm
 
+clobbercheck novfp
 clobbercheck vfp
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 2faa01f4a65940f05a73a1d9768fb9d9a4ab4c44..2115ed3e7a60a2ee2ae1ae94ca07be2d355854ba 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -44,12 +44,16 @@
 #define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x04
 #endif
 #else
-#include <unistd.h>
 #include <time.h>
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#if HAVE_PTHREAD_SETAFFINITY_NP
 #include <pthread.h>
-#ifdef HAVE_PTHREAD_NP_H
+#if HAVE_PTHREAD_NP_H
 #include <pthread_np.h>
 #endif
+#endif
 #ifdef __APPLE__
 #include <mach/mach_time.h>
 #endif
@@ -732,7 +736,7 @@ int main(int argc, char *argv[]) {
             } else {
                 fprintf(stderr, "checkasm: running on cpu %lu\n", affinity);
             }
-#elif defined(HAVE_PTHREAD_SETAFFINITY_NP) && defined(CPU_SET)
+#elif HAVE_PTHREAD_SETAFFINITY_NP && defined(CPU_SET)
             cpu_set_t set;
             CPU_ZERO(&set);
             CPU_SET(affinity, &set);
@@ -832,6 +836,14 @@ int main(int argc, char *argv[]) {
             state.simd_warmup = checkasm_warmup_avx2;
         checkasm_simd_warmup();
 #endif
+#if ARCH_ARM
+        void checkasm_checked_call_vfp(void *func, int dummy, ...);
+        void checkasm_checked_call_novfp(void *func, int dummy, ...);
+        if (cpu_flags & DAV1D_ARM_CPU_FLAG_NEON)
+            checkasm_checked_call_ptr = checkasm_checked_call_vfp;
+        else
+            checkasm_checked_call_ptr = checkasm_checked_call_novfp;
+#endif
 #if ARCH_X86
         unsigned checkasm_init_x86(char *name);
         char name[48];
@@ -1126,3 +1138,7 @@ void checkasm_simd_warmup(void)
         state.simd_warmup();
 }
 #endif
+
+#if ARCH_ARM
+void (*checkasm_checked_call_ptr)(void *func, int dummy, ...);
+#endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 29de9b64b3386a9f7dc43ed5ddb910064ca064c5..07ce4da581fd0e806bc4bfd89bc1cbebc961f9ef 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -68,6 +68,10 @@ typedef sigjmp_buf checkasm_context;
 #include "include/common/bitdepth.h"
 #include "include/common/intops.h"
 
+#if ARCH_ARM
+#include "src/arm/arm-arch.h"
+#endif
+
 int xor128_rand(void);
 #define rnd xor128_rand
 
@@ -254,7 +258,7 @@ void checkasm_simd_warmup(void);
  * handled orthogonally from integer parameters passed in GPR registers. */
 #define IGNORED_FP_ARGS 8
 #endif
-#ifdef HAVE_C11_GENERIC
+#if HAVE_C11_GENERIC
 #define clobber_type(arg) _Generic((void (*)(void*, arg))NULL,\
      void (*)(void*, int32_t ): clobber_mask |= 1 << mpos++,\
      void (*)(void*, uint32_t): clobber_mask |= 1 << mpos++,\
@@ -302,12 +306,12 @@ void checkasm_simd_warmup(void);
 /* Use a dummy argument, to offset the real parameters by 2, not only 1.
  * This makes sure that potential 8-byte-alignment of parameters is kept
  * the same even when the extra parameters have been removed. */
-void checkasm_checked_call_vfp(void *func, int dummy, ...);
+extern void (*checkasm_checked_call_ptr)(void *func, int dummy, ...);
 #define declare_new(ret, ...)\
     ret (*checked_call)(void *, int dummy, __VA_ARGS__,\
                         int, int, int, int, int, int, int, int,\
                         int, int, int, int, int, int, int) =\
-    (void *)checkasm_checked_call_vfp;
+    (void *)checkasm_checked_call_ptr;
 #define call_new(...)\
     (checkasm_set_signal_handler_state(1),\
      checked_call(func_new, 0, __VA_ARGS__, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0));\
diff --git a/tests/checkasm/itx.c b/tests/checkasm/itx.c
index c7cc411ff5343c4ba1b3e39ed1b38ac045eed1c7..b0de65dddc5d89e86ab87e98890547b50c64bc26 100644
--- a/tests/checkasm/itx.c
+++ b/tests/checkasm/itx.c
@@ -130,7 +130,8 @@ static void fwht4_1d(double *const out, const double *const in)
 
 static int copy_subcoefs(coef *coeff,
                          const enum RectTxfmSize tx, const enum TxfmType txtp,
-                         const int sw, const int sh, const int subsh)
+                         const int sw, const int sh, const int subsh,
+                         int *const max_eob)
 {
     /* copy the topleft coefficients such that the return value (being the
      * coefficient scantable index for the eob token) guarantees that only
@@ -160,6 +161,7 @@ static int copy_subcoefs(coef *coeff,
         } else if (!eob && (rcx > sub_low || rcy > sub_low))
             eob = n; /* lower boundary */
     }
+    *max_eob = n - 1;
 
     if (eob)
         eob += rnd() % (n - eob - 1);
@@ -182,7 +184,7 @@ static int copy_subcoefs(coef *coeff,
 
 static int ftx(coef *const buf, const enum RectTxfmSize tx,
                const enum TxfmType txtp, const int w, const int h,
-               const int subsh, const int bitdepth_max)
+               const int subsh, int *const max_eob, const int bitdepth_max)
 {
     double out[64 * 64], temp[64 * 64];
     const double scale = scaling_factors[ctz(w * h) - 4];
@@ -236,7 +238,7 @@ static int ftx(coef *const buf, const enum RectTxfmSize tx,
         for (int x = 0; x < sw; x++)
             buf[y * sw + x] = (coef) (out[y * w + x] + 0.5);
 
-    return copy_subcoefs(buf, tx, txtp, sw, sh, subsh);
+    return copy_subcoefs(buf, tx, txtp, sw, sh, subsh, max_eob);
 }
 
 static void check_itxfm_add(Dav1dInvTxfmDSPContext *const c,
@@ -272,7 +274,9 @@ static void check_itxfm_add(Dav1dInvTxfmDSPContext *const c,
                                bpc))
                 {
                     const int bitdepth_max = (1 << bpc) - 1;
-                    const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max);
+                    int max_eob;
+                    const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, &max_eob,
+                                        bitdepth_max);
                     memcpy(coeff[1], coeff[0], sizeof(*coeff));
 
                     CLEAR_PIXEL_RECT(c_dst);
@@ -295,7 +299,7 @@ static void check_itxfm_add(Dav1dInvTxfmDSPContext *const c,
                         fail();
 
                     bench_new(alternate(c_dst, a_dst), a_dst_stride,
-                              alternate(coeff[0], coeff[1]), eob HIGHBD_TAIL_SUFFIX);
+                              alternate(coeff[0], coeff[1]), max_eob HIGHBD_TAIL_SUFFIX);
                 }
     }
     report("add_%dx%d", w, h);
diff --git a/tests/dav1d_argon.bash b/tests/dav1d_argon.bash
index ead3e6ed2dc1812d432cea04edfd32098a9dc0c1..229eab5a8ff1988287806060a6d72eb7e38c341c 100755
--- a/tests/dav1d_argon.bash
+++ b/tests/dav1d_argon.bash
@@ -6,6 +6,7 @@ FILMGRAIN=1
 CPUMASK=-1
 THREADS=1
 JOBS=0
+WRAP=""
 
 usage() {
     NAME=$(basename "$0")
@@ -20,7 +21,8 @@ usage() {
         printf " -g \$num   enable filmgrain (default: 1)\n"
         printf " -c \$mask  use restricted cpumask (default: -1)\n"
         printf " -t \$num   number of threads per dav1d (default: 1)\n"
-        printf " -j \$num   number of parallel dav1d processes (default: 0)\n\n"
+        printf " -j \$num   number of parallel dav1d processes (default: 0)\n"
+        printf " -w tool   execute dav1d with a wrapper tool\n\n"
     } >&2
     exit 1
 }
@@ -79,7 +81,7 @@ if [ -d "$tests_dir/argon" ]; then
     ARGON_DIR="$tests_dir/argon"
 fi
 
-while getopts ":d:a:g:c:t:j:" opt; do
+while getopts ":d:a:g:c:t:j:w:" opt; do
     case "$opt" in
         d)
             DAV1D="$OPTARG"
@@ -99,6 +101,9 @@ while getopts ":d:a:g:c:t:j:" opt; do
         j)
             JOBS="$OPTARG"
             ;;
+        w)
+            WRAP="$OPTARG"
+            ;;
         \?)
             printf "Error! Invalid option: -%s\n" "$OPTARG" >&2
             usage
@@ -158,7 +163,7 @@ for i in "${!files[@]}"; do
     md5=${md5/ */}
 
     printf '\033[1K\r[%3d%% %*d/%d] Verifying %s' "$(((i+1)*100/num_files))" "${#num_files}" "$((i+1))" "$num_files" "${f#"$ARGON_DIR"/}"
-    cmd=("$DAV1D" -i "$f" --filmgrain "$FILMGRAIN" --verify "$md5" --cpumask "$CPUMASK" --threads "$THREADS" -q)
+    cmd=($WRAP "$DAV1D" -i "$f" --filmgrain "$FILMGRAIN" --verify "$md5" --cpumask "$CPUMASK" --threads "$THREADS" -q)
     if [ "$JOBS" -gt 1 ]; then
         "${cmd[@]}" 2>/dev/null &
         p=$!
diff --git a/tests/seek_stress.c b/tests/seek_stress.c
index a85ec86886c31d0fc52e843c250d0b6ee555f44e..7f75ea86e5c6c994a23b9cf9227a49ddd10930d0 100644
--- a/tests/seek_stress.c
+++ b/tests/seek_stress.c
@@ -60,7 +60,7 @@ static unsigned get_seed(void) {
 static unsigned get_seed(void) {
 #ifdef __APPLE__
     return (unsigned) mach_absolute_time();
-#elif defined(HAVE_CLOCK_GETTIME)
+#elif HAVE_CLOCK_GETTIME
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC, &ts);
     return (unsigned) (1000000000ULL * ts.tv_sec + ts.tv_nsec);
diff --git a/tools/compat/getopt.c b/tools/compat/getopt.c
index ac1fda426ecc59c017d4a56da7cb4ceb728e60b0..ab375bdb17e2add76ef884b1877ad93731dee3c6 100644
--- a/tools/compat/getopt.c
+++ b/tools/compat/getopt.c
@@ -55,7 +55,11 @@
 #include <getopt.h>
 #include <stdarg.h>
 #include <stdio.h>
+#ifdef _WIN32
 #include <windows.h>
+#else
+#include <err.h>
+#endif
 
 #define	REPLACE_GETOPT		/* use this getopt as the system getopt(3) */
 
@@ -80,12 +84,6 @@ char    *optarg;		/* argument associated with option */
 #define	BADARG		((*options == ':') ? (int)':' : (int)'?')
 #define	INORDER 	(int)1
 
-#ifndef __CYGWIN__
-#define __progname __argv[0]
-#else
-extern char __declspec(dllimport) *__progname;
-#endif
-
 #ifdef __CYGWIN__
 static char EMSG[] = "";
 #else
@@ -113,6 +111,13 @@ static const char noarg[] = "option doesn't take an argument -- %.*s";
 static const char illoptchar[] = "unknown option -- %c";
 static const char illoptstring[] = "unknown option -- %s";
 
+#ifdef _WIN32
+#ifndef __CYGWIN__
+#define __progname __argv[0]
+#else
+extern char __declspec(dllimport) *__progname;
+#endif
+
 static void
 _vwarnx(const char *fmt,va_list ap)
 {
@@ -130,6 +135,7 @@ warnx(const char *fmt,...)
   _vwarnx(fmt,ap);
   va_end(ap);
 }
+#endif
 
 /*
  * Compute the greatest common divisor of a and b.
diff --git a/tools/dav1d.c b/tools/dav1d.c
index 4d8d072debd7b9bfef29cd1f093f133b2ed887b9..eb19a80b358b74accf384796f6f2175c180c8cb1 100644
--- a/tools/dav1d.c
+++ b/tools/dav1d.c
@@ -38,10 +38,10 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
-#ifdef HAVE_UNISTD_H
+#if HAVE_UNISTD_H
 # include <unistd.h>
 #endif
-#ifdef HAVE_IO_H
+#if HAVE_IO_H
 # include <io.h>
 #endif
 #ifdef _WIN32
@@ -68,7 +68,7 @@ static uint64_t get_time_nanos(void) {
     uint64_t seconds = t.QuadPart / frequency.QuadPart;
     uint64_t fractions = t.QuadPart % frequency.QuadPart;
     return 1000000000 * seconds + 1000000000 * fractions / frequency.QuadPart;
-#elif defined(HAVE_CLOCK_GETTIME)
+#elif HAVE_CLOCK_GETTIME
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC, &ts);
     return 1000000000ULL * ts.tv_sec + ts.tv_nsec;
diff --git a/tools/dav1d_cli_parse.c b/tools/dav1d_cli_parse.c
index f4259643542ad227b95792af1a446b3efdaa84e5..134be46b2fb08a0da587f60125b480264bcb9354 100644
--- a/tools/dav1d_cli_parse.c
+++ b/tools/dav1d_cli_parse.c
@@ -35,7 +35,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#ifdef HAVE_UNISTD_H
+#if HAVE_UNISTD_H
 # include <unistd.h>
 #endif
 
diff --git a/tools/input/parse.h b/tools/input/parse.h
index f5805e8ca45323363d4c2017583c3f13f8f2a85a..f39f80f2c69d9e964d7bb9aafe258ce721092669 100644
--- a/tools/input/parse.h
+++ b/tools/input/parse.h
@@ -89,6 +89,8 @@ static inline int parse_obu_header(const uint8_t *buf, int buf_size,
     buf_size--;
 
     if (extension_flag) {
+        if (!buf_size)
+            return -1;
         buf++;
         buf_size--;
         // ignore fields
diff --git a/tools/input/section5.c b/tools/input/section5.c
index db1b34c227419be85baed4ed31859bba77b0d859..99cb7615d6caca56faf5d7b10294202ad0669531 100644
--- a/tools/input/section5.c
+++ b/tools/input/section5.c
@@ -32,7 +32,9 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
+#if HAVE_SYS_TYPES_H
 #include <sys/types.h>
+#endif
 
 #include "dav1d/headers.h"
 
diff --git a/tools/output/md5.c b/tools/output/md5.c
index 7d192c2459dc84c607c9e1cfd408c52fc4aab1cd..cfad4f0bfb9696254078eb034f953dea45cd1b2d 100644
--- a/tools/output/md5.c
+++ b/tools/output/md5.c
@@ -31,7 +31,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <sys/stat.h>
 
 #include "common/intops.h"
 
diff --git a/tools/output/y4m2.c b/tools/output/y4m2.c
index 8766f64868231e8003117d91e7882c8c34b3cb22..40411d15abf55a5dda71efaaa614e11485de5b75 100644
--- a/tools/output/y4m2.c
+++ b/tools/output/y4m2.c
@@ -32,7 +32,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <sys/stat.h>
 
 #include "output/muxer.h"
 
diff --git a/tools/output/yuv.c b/tools/output/yuv.c
index 406f28418893217022c8daad824b8c5557ae42e9..e0c0ec47d914f7bf5cd97aecb88119f77f67f39c 100644
--- a/tools/output/yuv.c
+++ b/tools/output/yuv.c
@@ -31,7 +31,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <sys/stat.h>
 
 #include "output/muxer.h"