diff --git a/NEWS b/NEWS
index 16825ff87e4b0b73f25ff10ad99b708c9e49a00c..0302484c1b69bed02c5d0fcd53c186c8c2ddee9f 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,27 @@
+Changes for 1.5.0 'Road Runner':
+--------------------------------
+
+1.5.0 is a major release of dav1d that:
+ - WARNING: we removed some of the SSE2 optimizations, so if you care about
+   systems without SSSE3, you should be careful when updating!
+ - Add Arm OpenBSD run-time CPU feature detection
+ - Optimize index offset calculations for decode_coefs
+ - picture: copy HDR10+ and T35 metadata only to visible frames
+ - SSSE3 new optimizations for 6-tap (8bit and hbd)
+ - AArch64/SVE: Add HBD subpel filters using 128-bit SVE2
+ - AArch64: Add USMMLA implementation for 6-tap H/HV
+ - AArch64: Optimize Armv8.0 NEON for HBD horizontal filters and 6-tap filters
+ - Allow playing videos in full-screen mode in dav1dplay
+
+
+Changes for 1.4.3 'Road Runner':
+--------------------------------
+
+1.4.3 is a small release focused on security issues:
+ - AArch64: Fix potential out of bounds access in DotProd H/HV filters
+ - cli: Prevent buffer over-read
+
+
 Changes for 1.4.2 'Road Runner':
 --------------------------------
 
diff --git a/examples/dav1dplay.c b/examples/dav1dplay.c
index 9cca8e8472b132522b5dc33c6b8126407735ee95..1f649444830f507e1c7419ac09398ada81bc0ced 100644
--- a/examples/dav1dplay.c
+++ b/examples/dav1dplay.c
@@ -120,6 +120,7 @@ static void dp_settings_print_usage(const char *const app,
             " --highquality: enable high quality rendering\n"
             " --zerocopy/-z: enable zero copy upload path\n"
             " --gpugrain/-g: enable GPU grain synthesis\n"
+            " --fullscreen/-f: enable full screen mode\n"
             " --version/-v: print version and exit\n"
             " --renderer/-r: select renderer backend (default: auto)\n");
     exit(1);
@@ -144,7 +145,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
     Dav1dSettings *lib_settings = &rd_ctx->lib_settings;
 
     // Short options
-    static const char short_opts[] = "i:vuzgr:";
+    static const char short_opts[] = "i:vuzgfr:";
 
     enum {
         ARG_THREADS = 256,
@@ -162,6 +163,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
         { "highquality", 0, NULL, ARG_HIGH_QUALITY },
         { "zerocopy", 0, NULL, 'z' },
         { "gpugrain", 0, NULL, 'g' },
+        { "fullscreen", 0, NULL, 'f'},
         { "renderer", 0, NULL, 'r'},
         { NULL, 0, NULL, 0 },
     };
@@ -186,6 +188,9 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
             case 'g':
                 settings->gpugrain = true;
                 break;
+            case 'f':
+                settings->fullscreen = true;
+                break;
             case 'r':
                 settings->renderer_name = optarg;
                 break;
@@ -240,35 +245,37 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv)
         return NULL;
     }
 
+    // Parse and validate arguments
+    dav1d_default_settings(&rd_ctx->lib_settings);
+    memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings));
+    dp_rd_ctx_parse_args(rd_ctx, argc, argv);
+
+    // Init SDL2 library
+    if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) < 0) {
+        fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
+        goto fail;
+    }
+
     // Register a custom event to notify our SDL main thread
     // about new frames
     rd_ctx->event_types = SDL_RegisterEvents(3);
     if (rd_ctx->event_types == UINT32_MAX) {
         fprintf(stderr, "Failure to create custom SDL event types!\n");
-        free(rd_ctx);
-        return NULL;
+        goto fail;
     }
 
     rd_ctx->fifo = dp_fifo_create(5);
     if (rd_ctx->fifo == NULL) {
         fprintf(stderr, "Failed to create FIFO for output pictures!\n");
-        free(rd_ctx);
-        return NULL;
+        goto fail;
     }
 
     rd_ctx->lock = SDL_CreateMutex();
     if (rd_ctx->lock == NULL) {
         fprintf(stderr, "SDL_CreateMutex
failed: %s\n", SDL_GetError()); - dp_fifo_destroy(rd_ctx->fifo); - free(rd_ctx); - return NULL; + goto fail; } - // Parse and validate arguments - dav1d_default_settings(&rd_ctx->lib_settings); - memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings)); - dp_rd_ctx_parse_args(rd_ctx, argc, argv); - // Select renderer renderer_info = dp_get_renderer(rd_ctx->settings.renderer_name); @@ -279,15 +286,21 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv) printf("Using %s renderer\n", renderer_info->name); } - rd_ctx->rd_priv = (renderer_info) ? renderer_info->create_renderer() : NULL; + rd_ctx->rd_priv = (renderer_info) ? renderer_info->create_renderer(&rd_ctx->settings) : NULL; if (rd_ctx->rd_priv == NULL) { - SDL_DestroyMutex(rd_ctx->lock); - dp_fifo_destroy(rd_ctx->fifo); - free(rd_ctx); - return NULL; + goto fail; } return rd_ctx; + +fail: + if (rd_ctx->lock) + SDL_DestroyMutex(rd_ctx->lock); + if (rd_ctx->fifo) + dp_fifo_destroy(rd_ctx->fifo); + free(rd_ctx); + SDL_Quit(); + return NULL; } /** @@ -662,10 +675,6 @@ int main(int argc, char **argv) return 1; } - // Init SDL2 library - if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) < 0) - return 10; - // Create render context Dav1dPlayRenderContext *rd_ctx = dp_rd_ctx_create(argc, argv); if (rd_ctx == NULL) { @@ -711,9 +720,7 @@ int main(int argc, char **argv) if (e->type == SDL_QUIT) { dp_rd_ctx_request_shutdown(rd_ctx); dp_fifo_flush(rd_ctx->fifo, destroy_pic); - SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME); - SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_SEEK_FRAME); - num_frame_events = 0; + goto out; } else if (e->type == SDL_WINDOWEVENT) { if (e->window.event == SDL_WINDOWEVENT_SIZE_CHANGED) { // TODO: Handle window resizes @@ -724,6 +731,10 @@ int main(int argc, char **argv) SDL_KeyboardEvent *kbde = (SDL_KeyboardEvent *)e; if (kbde->keysym.sym == SDLK_SPACE) { dp_rd_ctx_toggle_pause(rd_ctx); + } else if (kbde->keysym.sym == SDLK_ESCAPE) { + dp_rd_ctx_request_shutdown(rd_ctx); + dp_fifo_flush(rd_ctx->fifo, destroy_pic); + goto out; } else if (kbde->keysym.sym == SDLK_LEFT || kbde->keysym.sym == SDLK_RIGHT) { @@ -776,5 +787,6 @@ out:; int decoder_ret = 0; SDL_WaitThread(decoder_thread, &decoder_ret); dp_rd_ctx_destroy(rd_ctx); + SDL_Quit(); return decoder_ret; } diff --git a/examples/dp_renderer.h b/examples/dp_renderer.h index 354e140a48376303622872b174780c5f6f32c352..513e2ad61fb5fbf1c7faa1fcdf62671e6e47fbeb 100644 --- a/examples/dp_renderer.h +++ b/examples/dp_renderer.h @@ -30,22 +30,32 @@ #include "dav1d/dav1d.h" #include <SDL.h> -#ifdef HAVE_PLACEBO +#if HAVE_PLACEBO # include <libplacebo/config.h> #endif // Check libplacebo Vulkan rendering -#if defined(HAVE_VULKAN) && defined(SDL_VIDEO_VULKAN) +#if HAVE_VULKAN && defined(SDL_VIDEO_VULKAN) # if defined(PL_HAVE_VULKAN) && PL_HAVE_VULKAN -# define HAVE_RENDERER_PLACEBO -# define HAVE_PLACEBO_VULKAN +# define HAVE_RENDERER_PLACEBO 1 +# define HAVE_PLACEBO_VULKAN 1 # endif #endif // Check libplacebo OpenGL rendering #if defined(PL_HAVE_OPENGL) && PL_HAVE_OPENGL -# define HAVE_RENDERER_PLACEBO -# define HAVE_PLACEBO_OPENGL +# define HAVE_RENDERER_PLACEBO 1 +# define HAVE_PLACEBO_OPENGL 1 +#endif + +#ifndef HAVE_RENDERER_PLACEBO +#define HAVE_RENDERER_PLACEBO 0 +#endif +#ifndef HAVE_PLACEBO_VULKAN +#define HAVE_PLACEBO_VULKAN 0 +#endif +#ifndef HAVE_PLACEBO_OPENGL +#define HAVE_PLACEBO_OPENGL 0 #endif /** @@ -61,6 +71,7 @@ typedef struct { int untimed; int zerocopy; int gpugrain; + int fullscreen; } Dav1dPlaySettings; #define WINDOW_WIDTH 910 @@ 
-82,7 +93,7 @@ typedef struct rdr_info // Cookie passed to the renderer implementation callbacks void *cookie; // Callback to create the renderer - void* (*create_renderer)(void); + void* (*create_renderer)(const Dav1dPlaySettings *settings); // Callback to destroy the renderer void (*destroy_renderer)(void *cookie); // Callback to the render function that renders a prevously sent frame diff --git a/examples/dp_renderer_placebo.c b/examples/dp_renderer_placebo.c index 4ab1415f44af4febeb8a6a5c263a2282d6860e1a..972cc576883d0cd69f78686e4c9aa189a3a1b0ff 100644 --- a/examples/dp_renderer_placebo.c +++ b/examples/dp_renderer_placebo.c @@ -26,17 +26,17 @@ #include "dp_renderer.h" -#ifdef HAVE_RENDERER_PLACEBO +#if HAVE_RENDERER_PLACEBO #include <assert.h> #include <libplacebo/renderer.h> #include <libplacebo/utils/dav1d.h> -#ifdef HAVE_PLACEBO_VULKAN +#if HAVE_PLACEBO_VULKAN # include <libplacebo/vulkan.h> # include <SDL_vulkan.h> #endif -#ifdef HAVE_PLACEBO_OPENGL +#if HAVE_PLACEBO_OPENGL # include <libplacebo/opengl.h> # include <SDL_opengl.h> #endif @@ -53,7 +53,7 @@ typedef struct renderer_priv_ctx pl_log log; // Placebo renderer pl_renderer renderer; -#ifdef HAVE_PLACEBO_VULKAN +#if HAVE_PLACEBO_VULKAN // Placebo Vulkan handle pl_vulkan vk; // Placebo Vulkan instance @@ -61,9 +61,11 @@ typedef struct renderer_priv_ctx // Vulkan surface VkSurfaceKHR surf; #endif -#ifdef HAVE_PLACEBO_OPENGL +#if HAVE_PLACEBO_OPENGL // Placebo OpenGL handle pl_opengl gl; + // SDL OpenGL context + SDL_GLContext gl_context; #endif // Placebo GPU pl_gpu gpu; @@ -77,13 +79,18 @@ typedef struct renderer_priv_ctx } Dav1dPlayRendererPrivateContext; static Dav1dPlayRendererPrivateContext* - placebo_renderer_create_common(int window_flags) + placebo_renderer_create_common(const Dav1dPlaySettings *settings, int window_flags) { + if (settings->fullscreen) + window_flags |= SDL_WINDOW_FULLSCREEN_DESKTOP; + // Create Window SDL_Window *sdlwin = dp_create_sdl_window(window_flags | SDL_WINDOW_RESIZABLE); if (sdlwin == NULL) return NULL; + SDL_ShowCursor(0); + // Alloc Dav1dPlayRendererPrivateContext *const rd_priv_ctx = calloc(1, sizeof(Dav1dPlayRendererPrivateContext)); @@ -118,24 +125,25 @@ static Dav1dPlayRendererPrivateContext* return rd_priv_ctx; } -#ifdef HAVE_PLACEBO_OPENGL -static void *placebo_renderer_create_gl(void) +#if HAVE_PLACEBO_OPENGL +static void *placebo_renderer_create_gl(const Dav1dPlaySettings *settings) { SDL_Window *sdlwin = NULL; SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG); // Common init Dav1dPlayRendererPrivateContext *rd_priv_ctx = - placebo_renderer_create_common(SDL_WINDOW_OPENGL); + placebo_renderer_create_common(settings, SDL_WINDOW_OPENGL); if (rd_priv_ctx == NULL) return NULL; sdlwin = rd_priv_ctx->win; - SDL_GLContext glcontext = SDL_GL_CreateContext(sdlwin); - SDL_GL_MakeCurrent(sdlwin, glcontext); + rd_priv_ctx->gl_context = SDL_GL_CreateContext(sdlwin); + SDL_GL_MakeCurrent(sdlwin, rd_priv_ctx->gl_context); rd_priv_ctx->gl = pl_opengl_create(rd_priv_ctx->log, pl_opengl_params( + .allow_software = true, #ifndef NDEBUG .debug = true, #endif @@ -173,14 +181,14 @@ static void *placebo_renderer_create_gl(void) } #endif -#ifdef HAVE_PLACEBO_VULKAN -static void *placebo_renderer_create_vk(void) +#if HAVE_PLACEBO_VULKAN +static void *placebo_renderer_create_vk(const Dav1dPlaySettings *settings) { SDL_Window *sdlwin = NULL; // Common init Dav1dPlayRendererPrivateContext *rd_priv_ctx = - placebo_renderer_create_common(SDL_WINDOW_VULKAN); + 
placebo_renderer_create_common(settings, SDL_WINDOW_VULKAN); if (rd_priv_ctx == NULL) return NULL; @@ -270,16 +278,18 @@ static void placebo_renderer_destroy(void *cookie) for (int i = 0; i < 3; i++) pl_tex_destroy(rd_priv_ctx->gpu, &(rd_priv_ctx->plane_tex[i])); -#ifdef HAVE_PLACEBO_VULKAN +#if HAVE_PLACEBO_VULKAN if (rd_priv_ctx->vk) { pl_vulkan_destroy(&(rd_priv_ctx->vk)); vkDestroySurfaceKHR(rd_priv_ctx->vk_inst->instance, rd_priv_ctx->surf, NULL); pl_vk_inst_destroy(&(rd_priv_ctx->vk_inst)); } #endif -#ifdef HAVE_PLACEBO_OPENGL +#if HAVE_PLACEBO_OPENGL if (rd_priv_ctx->gl) pl_opengl_destroy(&(rd_priv_ctx->gl)); + if (rd_priv_ctx->gl_context) + SDL_GL_DeleteContext(rd_priv_ctx->gl_context); #endif SDL_DestroyWindow(rd_priv_ctx->win); @@ -382,7 +392,7 @@ static void placebo_release_pic(Dav1dPicture *pic, void *cookie) SDL_UnlockMutex(rd_priv_ctx->lock); } -#ifdef HAVE_PLACEBO_VULKAN +#if HAVE_PLACEBO_VULKAN const Dav1dPlayRenderInfo rdr_placebo_vk = { .name = "placebo-vk", .create_renderer = placebo_renderer_create_vk, @@ -397,7 +407,7 @@ const Dav1dPlayRenderInfo rdr_placebo_vk = { const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL }; #endif -#ifdef HAVE_PLACEBO_OPENGL +#if HAVE_PLACEBO_OPENGL const Dav1dPlayRenderInfo rdr_placebo_gl = { .name = "placebo-gl", .create_renderer = placebo_renderer_create_gl, diff --git a/examples/dp_renderer_sdl.c b/examples/dp_renderer_sdl.c index 735b0664d3313764810b14b81d17410080c67ef3..39e6ac8e00e7286d6f1c1d8b29f888d05fc88bbe 100644 --- a/examples/dp_renderer_sdl.c +++ b/examples/dp_renderer_sdl.c @@ -43,12 +43,18 @@ typedef struct renderer_priv_ctx SDL_Texture *tex; } Dav1dPlayRendererPrivateContext; -static void *sdl_renderer_create(void) +static void *sdl_renderer_create(const Dav1dPlaySettings *settings) { - SDL_Window *win = dp_create_sdl_window(0); + int window_flags = 0; + if (settings->fullscreen) + window_flags |= SDL_WINDOW_FULLSCREEN_DESKTOP; + + SDL_Window *win = dp_create_sdl_window(window_flags); if (win == NULL) return NULL; + SDL_ShowCursor(0); + // Alloc Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext)); if (rd_priv_ctx == NULL) { @@ -79,7 +85,9 @@ static void sdl_renderer_destroy(void *cookie) Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; assert(rd_priv_ctx != NULL); + SDL_DestroyTexture(rd_priv_ctx->tex); SDL_DestroyRenderer(rd_priv_ctx->renderer); + SDL_DestroyWindow(rd_priv_ctx->win); SDL_DestroyMutex(rd_priv_ctx->lock); free(rd_priv_ctx); } @@ -142,6 +150,7 @@ static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic, if (texture == NULL) { texture = SDL_CreateTexture(rd_priv_ctx->renderer, SDL_PIXELFORMAT_IYUV, SDL_TEXTUREACCESS_STREAMING, width, height); + SDL_RenderSetLogicalSize(rd_priv_ctx->renderer, width, height); } SDL_UpdateYUVTexture(texture, NULL, diff --git a/examples/meson.build b/examples/meson.build index 2b2b8bd5adc0a64432f31437cfe3536f6296915b..adbf85b7848203c435cff86eab1c0f73fcad3da7 100644 --- a/examples/meson.build +++ b/examples/meson.build @@ -48,19 +48,23 @@ if sdl2_dependency.found() placebo_dependency = dependency('libplacebo', version: '>= 4.160.0', required: false) - if placebo_dependency.found() + have_vulkan = false + have_placebo = placebo_dependency.found() + if have_placebo dav1dplay_deps += placebo_dependency - dav1dplay_cflags += '-DHAVE_PLACEBO' # If libplacebo is found, we might be able to use Vulkan # with it, in which case we need the Vulkan library too. 
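The dav1dplay changes above thread a new fullscreen setting through both renderer backends: when it is set, the SDL window is created with SDL_WINDOW_FULLSCREEN_DESKTOP and the mouse cursor is hidden. A minimal standalone SDL2 sketch of that pattern (plain C, not dav1dplay code; the window title and fallback size below are placeholders):

    /* Standalone sketch, not part of the patch: open a borderless "desktop"
     * fullscreen window and hide the cursor, as the dav1dplay renderers now
     * do when --fullscreen is given. */
    #include <SDL.h>

    int main(void)
    {
        if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) < 0)
            return 1;

        Uint32 flags = SDL_WINDOW_RESIZABLE | SDL_WINDOW_FULLSCREEN_DESKTOP;
        SDL_Window *win = SDL_CreateWindow("fullscreen demo",
                                           SDL_WINDOWPOS_CENTERED,
                                           SDL_WINDOWPOS_CENTERED,
                                           1280, 720, flags);
        if (win == NULL) {
            SDL_Quit();
            return 1;
        }
        SDL_ShowCursor(SDL_DISABLE);   /* hide the mouse cursor while playing */

        SDL_Delay(2000);               /* keep the window up briefly */

        SDL_DestroyWindow(win);
        SDL_Quit();
        return 0;
    }

Because desktop fullscreen keeps the current display mode rather than switching resolution, the plain SDL renderer also sets a logical size (SDL_RenderSetLogicalSize above) so the decoded video is scaled to fill the window.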
vulkan_dependency = dependency('vulkan', required: false) if vulkan_dependency.found() dav1dplay_deps += vulkan_dependency - dav1dplay_cflags += '-DHAVE_VULKAN' + have_vulkan = true endif endif + dav1dplay_cflags += '-DHAVE_PLACEBO=' + (have_placebo ? '1' : '0') + dav1dplay_cflags += '-DHAVE_VULKAN=' + (have_vulkan ? '1' : '0') + dav1dplay = executable('dav1dplay', dav1dplay_sources, rev_target, diff --git a/include/common/attributes.h b/include/common/attributes.h index cd058abf9b6c217dd9b3716f675e83b0569724c1..c8758c19aef74ca721c700ac1d4a995d01a9ab19 100644 --- a/include/common/attributes.h +++ b/include/common/attributes.h @@ -189,9 +189,13 @@ static inline int clzll(const unsigned long long mask) { #ifndef static_assert #define CHECK_OFFSET(type, field, name) \ struct check_##type##_##field { int x[(name == offsetof(type, field)) ? 1 : -1]; } +#define CHECK_SIZE(type, size) \ + struct check_##type##_size { int x[(size == sizeof(type)) ? 1 : -1]; } #else #define CHECK_OFFSET(type, field, name) \ static_assert(name == offsetof(type, field), #field) +#define CHECK_SIZE(type, size) \ + static_assert(size == sizeof(type), #type) #endif #ifdef _MSC_VER diff --git a/include/common/intops.h b/include/common/intops.h index 2d21998b7100c323e7f75149ebed3f162a16c0bb..089da5e15ed016af736951820a33eaaf9fa2edf9 100644 --- a/include/common/intops.h +++ b/include/common/intops.h @@ -65,11 +65,11 @@ static inline int apply_sign64(const int v, const int64_t s) { } static inline int ulog2(const unsigned v) { - return 31 - clz(v); + return 31 ^ clz(v); } static inline int u64log2(const uint64_t v) { - return 63 - clzll(v); + return 63 ^ clzll(v); } static inline unsigned inv_recenter(const unsigned r, const unsigned v) { diff --git a/include/compat/getopt.h b/include/compat/getopt.h index 930e002a139142c8600538cc6f7fbcd0a4a4bd68..ad597691ef4860e511b619d585415ee7208412e8 100644 --- a/include/compat/getopt.h +++ b/include/compat/getopt.h @@ -13,7 +13,9 @@ #define __GETOPT_H__ /* All the headers include this file. */ +#ifdef _WIN32 #include <crtdefs.h> +#endif #ifdef __cplusplus extern "C" { diff --git a/include/dav1d/meson.build b/include/dav1d/meson.build index 68faaf9a3695dc7bb40ee3367046d917b5204908..dfb69a1c164192221240449842af6925db797134 100644 --- a/include/dav1d/meson.build +++ b/include/dav1d/meson.build @@ -22,24 +22,15 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
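The intops.h hunk above changes ulog2()/u64log2() from 31 - clz(v) to 31 ^ clz(v) (and 63 for the 64-bit case). For any nonzero input, clz(v) lies in [0, 31], and subtracting a value in that range from 31 only flips its low five bits, so the two forms are equal; the XOR form can map more directly onto the bit-scan code some compilers generate. A small self-contained check, using GCC/Clang's __builtin_clz in place of dav1d's clz() wrapper (an assumption for illustration):

    /* Illustration only: verify 31 - clz(v) == 31 ^ clz(v) for nonzero v. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static int ulog2_xor(unsigned v) { return 31 ^ __builtin_clz(v); }
    static int ulog2_sub(unsigned v) { return 31 - __builtin_clz(v); }

    int main(void)
    {
        for (uint64_t v = 1; v <= UINT32_MAX; v += 0xffff)   /* sparse sweep */
            assert(ulog2_xor((unsigned)v) == ulog2_sub((unsigned)v));
        printf("ulog2(1)=%d ulog2(32)=%d ulog2(1u<<31)=%d\n",
               ulog2_xor(1), ulog2_xor(32), ulog2_xor(1u << 31));
        return 0;
    }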
-# installed version.h header generation -version_h_data = configuration_data() -version_h_data.set('DAV1D_API_VERSION_MAJOR', dav1d_api_version_major) -version_h_data.set('DAV1D_API_VERSION_MINOR', dav1d_api_version_minor) -version_h_data.set('DAV1D_API_VERSION_PATCH', dav1d_api_version_revision) -version_h_target = configure_file(input: 'version.h.in', - output: 'version.h', - configuration: version_h_data) - dav1d_api_headers = [ 'common.h', 'data.h', 'dav1d.h', 'headers.h', 'picture.h', + 'version.h', ] # install headers install_headers(dav1d_api_headers, - version_h_target, subdir : 'dav1d') diff --git a/include/dav1d/version.h.in b/include/dav1d/version.h similarity index 88% rename from include/dav1d/version.h.in rename to include/dav1d/version.h index 4fa420ded31e977a49f05c574b68af2c3d33be37..43df60391531695037f582d64e19e912f6a14e40 100644 --- a/include/dav1d/version.h.in +++ b/include/dav1d/version.h @@ -1,5 +1,5 @@ /* - * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019-2024, VideoLAN and dav1d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,9 +31,9 @@ extern "C" { #endif -#define DAV1D_API_VERSION_MAJOR @DAV1D_API_VERSION_MAJOR@ -#define DAV1D_API_VERSION_MINOR @DAV1D_API_VERSION_MINOR@ -#define DAV1D_API_VERSION_PATCH @DAV1D_API_VERSION_PATCH@ +#define DAV1D_API_VERSION_MAJOR 7 +#define DAV1D_API_VERSION_MINOR 0 +#define DAV1D_API_VERSION_PATCH 0 /** * Extract version components from the value returned by diff --git a/meson.build b/meson.build index f5010ac4855e55d30764e6d721a7474ade037e35..798abc1deb7b8680c529ca3de1f3a8e37478af60 100644 --- a/meson.build +++ b/meson.build @@ -1,4 +1,4 @@ -# Copyright © 2018-2022, VideoLAN and dav1d authors +# Copyright © 2018-2024, VideoLAN and dav1d authors # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -23,19 +23,13 @@ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
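With the hunks around this point, include/dav1d/version.h is no longer generated from version.h.in at build time: the API version is hard-coded in a checked-in header and installed as-is, and meson.build (below) reads the three defines back with cc.get_define() to derive the soname. Nothing changes for library users; as a sketch, the compile-time API version can still be printed next to the run-time library version (assumes an installed libdav1d, built with e.g. cc demo.c $(pkg-config --cflags --libs dav1d)):

    /* Sketch, not part of the patch: compare the API version this program
     * was compiled against with the version string of the loaded library. */
    #include <stdio.h>
    #include <dav1d/dav1d.h>
    #include <dav1d/version.h>

    int main(void)
    {
        printf("compiled against dav1d API %d.%d.%d, running \"%s\"\n",
               DAV1D_API_VERSION_MAJOR, DAV1D_API_VERSION_MINOR,
               DAV1D_API_VERSION_PATCH, dav1d_version());
        return 0;
    }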
project('dav1d', ['c'], - version: '1.4.2', + version: '1.5.0', default_options: ['c_std=c99', 'warning_level=2', 'buildtype=release', 'b_ndebug=if-release'], meson_version: '>= 0.49.0') -dav1d_soname_version = '7.0.0' -dav1d_api_version_array = dav1d_soname_version.split('.') -dav1d_api_version_major = dav1d_api_version_array[0] -dav1d_api_version_minor = dav1d_api_version_array[1] -dav1d_api_version_revision = dav1d_api_version_array[2] - dav1d_src_root = meson.current_source_dir() cc = meson.get_compiler('c') @@ -48,7 +42,18 @@ cdata_asm = configuration_data() # Include directories dav1d_inc_dirs = include_directories(['.', 'include/dav1d', 'include']) - +dav1d_api_version_major = cc.get_define('DAV1D_API_VERSION_MAJOR', + prefix: '#include "dav1d/version.h"', + include_directories: dav1d_inc_dirs).strip() +dav1d_api_version_minor = cc.get_define('DAV1D_API_VERSION_MINOR', + prefix: '#include "dav1d/version.h"', + include_directories: dav1d_inc_dirs).strip() +dav1d_api_version_revision = cc.get_define('DAV1D_API_VERSION_PATCH', + prefix: '#include "dav1d/version.h"', + include_directories: dav1d_inc_dirs).strip() +dav1d_soname_version = '@0@.@1@.@2@'.format(dav1d_api_version_major, + dav1d_api_version_minor, + dav1d_api_version_revision) # # Option handling @@ -98,6 +103,10 @@ if host_machine.system() in ['linux', 'gnu', 'emscripten'] add_project_arguments('-D_GNU_SOURCE', language: 'c') endif +have_clock_gettime = false +have_posix_memalign = false +have_memalign = false +have_aligned_alloc = false if host_machine.system() == 'windows' cdata.set('_WIN32_WINNT', '0x0601') cdata.set('UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs @@ -145,20 +154,25 @@ else rt_dependency = [] if cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args) - cdata.set('HAVE_CLOCK_GETTIME', 1) + have_clock_gettime = true elif host_machine.system() not in ['darwin', 'ios', 'tvos'] rt_dependency = cc.find_library('rt', required: false) if not cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args, dependencies : rt_dependency) error('clock_gettime not found') endif - cdata.set('HAVE_CLOCK_GETTIME', 1) + have_clock_gettime = true endif - if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args) - cdata.set('HAVE_POSIX_MEMALIGN', 1) - endif + have_posix_memalign = cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args) + have_memalign = cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args) + have_aligned_alloc = cc.has_function('aligned_alloc', prefix : '#include <stdlib.h>', args : test_args) endif +cdata.set10('HAVE_CLOCK_GETTIME', have_clock_gettime) +cdata.set10('HAVE_POSIX_MEMALIGN', have_posix_memalign) +cdata.set10('HAVE_MEMALIGN', have_memalign) +cdata.set10('HAVE_ALIGNED_ALLOC', have_aligned_alloc) + # check for fseeko on android. 
It is not always available if _FILE_OFFSET_BITS is defined to 64 have_fseeko = true if host_machine.system() == 'android' @@ -175,12 +189,12 @@ if host_machine.system() == 'android' endif libdl_dependency = [] +have_dlsym = false if host_machine.system() == 'linux' libdl_dependency = cc.find_library('dl', required : false) - if cc.has_function('dlsym', prefix : '#include <dlfcn.h>', args : test_args, dependencies : libdl_dependency) - cdata.set('HAVE_DLSYM', 1) - endif + have_dlsym = cc.has_function('dlsym', prefix : '#include <dlfcn.h>', args : test_args, dependencies : libdl_dependency) endif +cdata.set10('HAVE_DLSYM', have_dlsym) libm_dependency = cc.find_library('m', required: false) @@ -209,19 +223,13 @@ if host_machine.cpu_family().startswith('wasm') stdatomic_dependencies += thread_dependency.partial_dependency(compile_args: true) endif -if cc.check_header('unistd.h') - cdata.set('HAVE_UNISTD_H', 1) -endif - -if cc.check_header('io.h') - cdata.set('HAVE_IO_H', 1) -endif - -if cc.check_header('pthread_np.h') - cdata.set('HAVE_PTHREAD_NP_H', 1) - test_args += '-DHAVE_PTHREAD_NP_H' -endif +cdata.set10('HAVE_SYS_TYPES_H', cc.check_header('sys/types.h')) +cdata.set10('HAVE_UNISTD_H', cc.check_header('unistd.h')) +cdata.set10('HAVE_IO_H', cc.check_header('io.h')) +have_pthread_np = cc.check_header('pthread_np.h') +cdata.set10('HAVE_PTHREAD_NP_H', have_pthread_np) +test_args += '-DHAVE_PTHREAD_NP_H=' + (have_pthread_np ? '1' : '0') # Function checks @@ -234,35 +242,32 @@ else getopt_dependency = [] endif +have_getauxval = false +have_elf_aux_info = false if (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm') or host_machine.cpu_family().startswith('loongarch') or host_machine.cpu() == 'ppc64le' or host_machine.cpu_family().startswith('riscv')) - if cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args) - cdata.set('HAVE_GETAUXVAL', 1) - endif - if cc.has_function('elf_aux_info', prefix : '#include <sys/auxv.h>', args : test_args) - cdata.set('HAVE_ELF_AUX_INFO', 1) - endif + have_getauxval = cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args) + have_elf_aux_info = cc.has_function('elf_aux_info', prefix : '#include <sys/auxv.h>', args : test_args) endif +cdata.set10('HAVE_GETAUXVAL', have_getauxval) +cdata.set10('HAVE_ELF_AUX_INFO', have_elf_aux_info) + pthread_np_prefix = ''' #include <pthread.h> -#ifdef HAVE_PTHREAD_NP_H +#if HAVE_PTHREAD_NP_H #include <pthread_np.h> #endif ''' -if cc.has_function('pthread_getaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency) - cdata.set('HAVE_PTHREAD_GETAFFINITY_NP', 1) -endif -if cc.has_function('pthread_setaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency) - cdata.set('HAVE_PTHREAD_SETAFFINITY_NP', 1) -endif +cdata.set10('HAVE_PTHREAD_GETAFFINITY_NP', cc.has_function('pthread_getaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)) +cdata.set10('HAVE_PTHREAD_SETAFFINITY_NP', cc.has_function('pthread_setaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)) +cdata.set10('HAVE_PTHREAD_SETNAME_NP', cc.has_function('pthread_setname_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)) +cdata.set10('HAVE_PTHREAD_SET_NAME_NP', cc.has_function('pthread_set_name_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)) -if cc.compiles('int x = 
_Generic(0, default: 0);', name: '_Generic', args: test_args) - cdata.set('HAVE_C11_GENERIC', 1) -endif +cdata.set10('HAVE_C11_GENERIC', cc.compiles('int x = _Generic(0, default: 0);', name: '_Generic', args: test_args)) # Compiler flag tests @@ -343,6 +348,17 @@ endif cdata.set10('ARCH_AARCH64', host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64') cdata.set10('ARCH_ARM', host_machine.cpu_family().startswith('arm') and host_machine.cpu() != 'arm64') + +have_as_func = false +have_as_arch = false +aarch64_extensions = { + 'dotprod': 'udot v0.4s, v0.16b, v0.16b', + 'i8mm': 'usdot v0.4s, v0.16b, v0.16b', + 'sve': 'whilelt p0.s, x0, x1', + 'sve2': 'sqrdmulh z0.s, z0.s, z0.s', +} +supported_aarch64_archexts = [] +supported_aarch64_instructions = [] if (is_asm_enabled and (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm'))) @@ -353,7 +369,6 @@ if (is_asm_enabled and ); ''' have_as_func = cc.compiles(as_func_code) - cdata.set10('HAVE_AS_FUNC', have_as_func) # fedora package build infrastructure uses a gcc specs file to enable # '-fPIE' by default. The chosen way only adds '-fPIE' to the C compiler @@ -374,7 +389,6 @@ if (is_asm_enabled and if host_machine.cpu_family() == 'aarch64' have_as_arch = cc.compiles('''__asm__ (".arch armv8-a");''') - cdata.set10('HAVE_AS_ARCH_DIRECTIVE', have_as_arch) as_arch_str = '' if have_as_arch as_arch_level = 'armv8-a' @@ -403,13 +417,7 @@ if (is_asm_enabled and cdata.set('AS_ARCH_LEVEL', as_arch_level) as_arch_str = '".arch ' + as_arch_level + '\\n"' endif - extensions = { - 'dotprod': 'udot v0.4s, v0.16b, v0.16b', - 'i8mm': 'usdot v0.4s, v0.16b, v0.16b', - 'sve': 'whilelt p0.s, x0, x1', - 'sve2': 'sqrdmulh z0.s, z0.s, z0.s', - } - foreach name, instr : extensions + foreach name, instr : aarch64_extensions # Test for support for the various extensions. 
First test if # the assembler supports the .arch_extension directive for # enabling/disabling the extension, then separately check whether @@ -420,19 +428,27 @@ if (is_asm_enabled and code += '".arch_extension ' + name + '\\n"' code += ');' supports_archext = cc.compiles(code) - cdata.set10('HAVE_AS_ARCHEXT_' + name.to_upper() + '_DIRECTIVE', supports_archext) code = '__asm__ (' + as_arch_str if supports_archext + supported_aarch64_archexts += name code += '".arch_extension ' + name + '\\n"' endif code += '"' + instr + '\\n"' code += ');' - supports_instr = cc.compiles(code, name: name.to_upper()) - cdata.set10('HAVE_' + name.to_upper(), supports_instr) + if cc.compiles(code, name: name.to_upper()) + supported_aarch64_instructions += name + endif endforeach endif endif +cdata.set10('HAVE_AS_FUNC', have_as_func) +cdata.set10('HAVE_AS_ARCH_DIRECTIVE', have_as_arch) +foreach name, _ : aarch64_extensions + cdata.set10('HAVE_AS_ARCHEXT_' + name.to_upper() + '_DIRECTIVE', name in supported_aarch64_archexts) + cdata.set10('HAVE_' + name.to_upper(), name in supported_aarch64_instructions) +endforeach + cdata.set10('ARCH_X86', host_machine.cpu_family().startswith('x86')) cdata.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64') cdata.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86') diff --git a/src/arm/32/util.S b/src/arm/32/util.S index c3710d37670f441dc69279c3b5f8a263940f2c6d..38d63f855d0166dad05bb4d2e2c6e1ea3af5d529 100644 --- a/src/arm/32/util.S +++ b/src/arm/32/util.S @@ -31,18 +31,36 @@ #include "config.h" #include "src/arm/asm.S" +#include "src/arm/arm-arch.h" + +.macro v4bx rd +#if __ARM_ARCH >= 5 || defined(__ARM_ARCH_4T__) + bx \rd +#else + mov pc, \rd +#endif +.endm + +.macro v4blx rd +#if __ARM_ARCH >= 5 + blx \rd +#else + mov lr, pc + v4bx \rd +#endif +.endm .macro movrel_local rd, val, offset=0 -#if defined(PIC) +#if (__ARM_ARCH >= 7 || defined(__ARM_ARCH_6T2__)) && !defined(PIC) + movw \rd, #:lower16:\val+\offset + movt \rd, #:upper16:\val+\offset +#else ldr \rd, 90001f b 90002f 90001: .word \val + \offset - (90002f + 8 - 4 * CONFIG_THUMB) 90002: add \rd, \rd, pc -#else - movw \rd, #:lower16:\val+\offset - movt \rd, #:upper16:\val+\offset #endif .endm diff --git a/src/arm/64/filmgrain.S b/src/arm/64/filmgrain.S index aa7f18bf39d53d824081550d1fc3ca39500dea1a..864ceba974a37450dc6f409baca1c49c4df03871 100644 --- a/src/arm/64/filmgrain.S +++ b/src/arm/64/filmgrain.S @@ -884,12 +884,12 @@ function generate_grain_\type\()_8bpc_neon, export=1 .else add x4, x1, #FGD_AR_COEFFS_UV .endif - adr x16, L(gen_grain_\type\()_tbl) + movrel x16, gen_grain_\type\()_tbl ldr w17, [x1, #FGD_AR_COEFF_LAG] add w9, w9, #4 - ldrh w17, [x16, w17, uxtw #1] + ldrsw x17, [x16, w17, uxtw #2] dup v31.8h, w9 // 4 + data->grain_scale_shift - sub x16, x16, w17, uxtw + add x16, x16, x17 neg v31.8h, v31.8h .ifc \type, uv_444 @@ -1075,13 +1075,14 @@ L(generate_grain_\type\()_lag3): ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret - -L(gen_grain_\type\()_tbl): - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) endfunc + +jumptable gen_grain_\type\()_tbl + .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl + .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl + .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl + .word 
L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl +endjumptable .endm gen_grain_82 y @@ -1118,12 +1119,12 @@ function generate_grain_\type\()_8bpc_neon, export=1 ldr w2, [x1, #FGD_SEED] ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] add x4, x1, #FGD_AR_COEFFS_UV - adr x16, L(gen_grain_\type\()_tbl) + movrel x16, gen_grain_\type\()_tbl ldr w17, [x1, #FGD_AR_COEFF_LAG] add w9, w9, #4 - ldrh w17, [x16, w17, uxtw #1] + ldrsw x17, [x16, w17, uxtw #2] dup v31.8h, w9 // 4 + data->grain_scale_shift - sub x16, x16, w17, uxtw + add x16, x16, x17 neg v31.8h, v31.8h cmp w13, #0 @@ -1272,13 +1273,14 @@ L(generate_grain_\type\()_lag3): ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret - -L(gen_grain_\type\()_tbl): - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) endfunc + +jumptable gen_grain_\type\()_tbl + .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl + .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl + .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl + .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl +endjumptable .endm gen_grain_44 uv_420 @@ -1407,18 +1409,18 @@ function fgy_32x32_8bpc_neon, export=1 add_offset x5, w6, x10, x5, x9 ldr w11, [sp, #24] // type - adr x13, L(fgy_loop_tbl) + movrel x13, fgy_loop_tbl add x4, x12, #32 // grain_lut += FG_BLOCK_SIZE * bx add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by tst w11, #1 - ldrh w11, [x13, w11, uxtw #1] + ldrsw x11, [x13, w11, uxtw #2] add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by add x8, x8, #32 // grain_lut += FG_BLOCK_SIZE * bx - sub x11, x13, w11, uxtw + add x11, x13, x11 b.eq 1f // y overlap @@ -1555,14 +1557,15 @@ L(loop_\ox\oy): fgy 0, 1 fgy 1, 0 fgy 1, 1 - -L(fgy_loop_tbl): - .hword L(fgy_loop_tbl) - L(loop_00) - .hword L(fgy_loop_tbl) - L(loop_01) - .hword L(fgy_loop_tbl) - L(loop_10) - .hword L(fgy_loop_tbl) - L(loop_11) endfunc +jumptable fgy_loop_tbl + .word L(loop_00) - fgy_loop_tbl + .word L(loop_01) - fgy_loop_tbl + .word L(loop_10) - fgy_loop_tbl + .word L(loop_11) - fgy_loop_tbl +endjumptable + // void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst, // const pixel *const src, // const ptrdiff_t stride, @@ -1646,11 +1649,11 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1 ldr w13, [sp, #64] // type movrel x16, overlap_coeffs_\sx - adr x14, L(fguv_loop_sx\sx\()_tbl) + movrel x14, fguv_loop_sx\sx\()_tbl ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs tst w13, #1 - ldrh w13, [x14, w13, uxtw #1] + ldrsw x13, [x14, w13, uxtw #2] b.eq 1f // y overlap @@ -1658,7 +1661,7 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1 mov w9, #(2 >> \sy) 1: - sub x13, x14, w13, uxtw + add x13, x14, x13 .if \sy movi v25.16b, #23 @@ -1848,18 +1851,19 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): ldr x30, [sp], #32 AARCH64_VALIDATE_LINK_REGISTER ret - -L(fguv_loop_sx0_tbl): - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) - .hword L(fguv_loop_sx0_tbl) - 
L(fguv_loop_sx0_csfl1_11) endfunc +jumptable fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl1_10) - fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl +endjumptable + function fguv_loop_sx1_neon .macro fguv_loop_sx1 csfl, ox, oy L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): @@ -1997,14 +2001,15 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): ldr x30, [sp], #32 AARCH64_VALIDATE_LINK_REGISTER ret - -L(fguv_loop_sx1_tbl): - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) endfunc + +jumptable fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl +endjumptable diff --git a/src/arm/64/filmgrain16.S b/src/arm/64/filmgrain16.S index 75252acfb1a8398650bff3e0a2f235fb219e77d7..aa6b75b171e05022a363f1b15bd3b9a5e23714e2 100644 --- a/src/arm/64/filmgrain16.S +++ b/src/arm/64/filmgrain16.S @@ -740,12 +740,12 @@ function generate_grain_\type\()_16bpc_neon, export=1 add x4, x1, #FGD_AR_COEFFS_UV .endif add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 - adr x16, L(gen_grain_\type\()_tbl) + movrel x16, gen_grain_\type\()_tbl ldr w17, [x1, #FGD_AR_COEFF_LAG] add w9, w9, #4 - ldrh w17, [x16, w17, uxtw #1] + ldrsw x17, [x16, w17, uxtw #2] dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift - sub x16, x16, w17, uxtw + add x16, x16, x17 neg v31.8h, v31.8h .ifc \type, uv_444 @@ -945,13 +945,14 @@ L(generate_grain_\type\()_lag3): ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret - -L(gen_grain_\type\()_tbl): - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) endfunc + +jumptable gen_grain_\type\()_tbl + .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl + .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl + .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl + .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl +endjumptable .endm gen_grain_82 y @@ -991,12 +992,12 @@ function generate_grain_\type\()_16bpc_neon, export=1 ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] add x4, x1, #FGD_AR_COEFFS_UV add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 - adr x16, L(gen_grain_\type\()_tbl) + movrel x16, gen_grain_\type\()_tbl ldr w17, [x1, #FGD_AR_COEFF_LAG] add w9, w9, #4 - ldrh w17, [x16, w17, uxtw #1] + ldrsw x17, [x16, 
w17, uxtw #2] dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift - sub x16, x16, w17, uxtw + add x16, x16, x17 neg v31.8h, v31.8h cmp w13, #0 @@ -1155,13 +1156,14 @@ L(generate_grain_\type\()_lag3): ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret - -L(gen_grain_\type\()_tbl): - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) endfunc + +jumptable gen_grain_\type\()_tbl + .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl + .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl + .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl + .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl +endjumptable .endm gen_grain_44 uv_420 @@ -1306,18 +1308,18 @@ function fgy_32x32_16bpc_neon, export=1 add_offset x5, w6, x10, x5, x9 ldr w11, [sp, #88] // type - adr x13, L(fgy_loop_tbl) + movrel x13, fgy_loop_tbl add x4, x12, #32*2 // grain_lut += FG_BLOCK_SIZE * bx add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by tst w11, #1 - ldrh w11, [x13, w11, uxtw #1] + ldrsw x11, [x13, w11, uxtw #2] add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by add x8, x8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx - sub x11, x13, w11, uxtw + add x11, x13, x11 b.eq 1f // y overlap @@ -1480,14 +1482,15 @@ L(loop_\ox\oy): fgy 0, 1 fgy 1, 0 fgy 1, 1 - -L(fgy_loop_tbl): - .hword L(fgy_loop_tbl) - L(loop_00) - .hword L(fgy_loop_tbl) - L(loop_01) - .hword L(fgy_loop_tbl) - L(loop_10) - .hword L(fgy_loop_tbl) - L(loop_11) endfunc +jumptable fgy_loop_tbl + .word L(loop_00) - fgy_loop_tbl + .word L(loop_01) - fgy_loop_tbl + .word L(loop_10) - fgy_loop_tbl + .word L(loop_11) - fgy_loop_tbl +endjumptable + // void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, // const pixel *const src, // const ptrdiff_t stride, @@ -1589,11 +1592,11 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1 ldr w13, [sp, #112] // type movrel x16, overlap_coeffs_\sx - adr x14, L(fguv_loop_sx\sx\()_tbl) + movrel x14, fguv_loop_sx\sx\()_tbl ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs tst w13, #1 - ldrh w13, [x14, w13, uxtw #1] + ldrsw x13, [x14, w13, uxtw #2] b.eq 1f // y overlap @@ -1601,7 +1604,7 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1 mov w9, #(2 >> \sy) 1: - sub x13, x14, w13, uxtw + add x13, x14, x13 .if \sy movi v25.8h, #23 @@ -1818,18 +1821,19 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): ldr x30, [sp], #80 AARCH64_VALIDATE_LINK_REGISTER ret - -L(fguv_loop_sx0_tbl): - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11) endfunc +jumptable fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl1_10) - 
fguv_loop_sx0_tbl + .word L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl +endjumptable + function fguv_loop_sx1_neon .macro fguv_loop_sx1 csfl, ox, oy L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): @@ -1984,14 +1988,15 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): ldr x30, [sp], #80 AARCH64_VALIDATE_LINK_REGISTER ret - -L(fguv_loop_sx1_tbl): - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) endfunc + +jumptable fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl + .word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl +endjumptable diff --git a/src/arm/64/ipred.S b/src/arm/64/ipred.S index 709238e2f85474218578811c16746ef152b100d5..5a375d8dca06446435126d0860488fb36feb25ed 100644 --- a/src/arm/64/ipred.S +++ b/src/arm/64/ipred.S @@ -34,16 +34,17 @@ // const int max_width, const int max_height); function ipred_dc_128_8bpc_neon, export=1 clz w3, w3 - adr x5, L(ipred_dc_128_tbl) + movrel x5, ipred_dc_128_tbl sub w3, w3, #25 - ldrh w3, [x5, w3, uxtw #1] + ldrsw x3, [x5, w3, uxtw #2] movi v0.16b, #128 - sub x5, x5, w3, uxtw + add x5, x5, x3 add x6, x0, x1 lsl x1, x1, #1 br x5 -4: +40: AARCH64_VALID_JUMP_TARGET +4: st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 subs w4, w4, #4 @@ -51,8 +52,9 @@ function ipred_dc_128_8bpc_neon, export=1 st1 {v0.s}[0], [x6], x1 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET +8: st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 subs w4, w4, #4 @@ -60,8 +62,9 @@ function ipred_dc_128_8bpc_neon, export=1 st1 {v0.8b}, [x6], x1 b.gt 8b ret -16: +160: AARCH64_VALID_JUMP_TARGET +16: st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 subs w4, w4, #4 @@ -93,26 +96,27 @@ function ipred_dc_128_8bpc_neon, export=1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 b.gt 64b ret - -L(ipred_dc_128_tbl): - .hword L(ipred_dc_128_tbl) - 640b - .hword L(ipred_dc_128_tbl) - 320b - .hword L(ipred_dc_128_tbl) - 16b - .hword L(ipred_dc_128_tbl) - 8b - .hword L(ipred_dc_128_tbl) - 4b endfunc +jumptable ipred_dc_128_tbl + .word 640b - ipred_dc_128_tbl + .word 320b - ipred_dc_128_tbl + .word 160b - ipred_dc_128_tbl + .word 80b - ipred_dc_128_tbl + .word 40b - ipred_dc_128_tbl +endjumptable + // void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_v_8bpc_neon, export=1 clz w3, w3 - adr x5, L(ipred_v_tbl) + movrel x5, ipred_v_tbl sub w3, w3, #25 - ldrh w3, [x5, w3, uxtw #1] + ldrsw x3, [x5, w3, uxtw #2] add x2, x2, #1 - sub x5, x5, w3, uxtw + add x5, x5, x3 add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -171,32 +175,34 @@ function ipred_v_8bpc_neon, export=1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 b.gt 64b ret - -L(ipred_v_tbl): - .hword L(ipred_v_tbl) - 640b - .hword L(ipred_v_tbl) - 320b - .hword L(ipred_v_tbl) - 160b - .hword 
L(ipred_v_tbl) - 80b - .hword L(ipred_v_tbl) - 40b endfunc +jumptable ipred_v_tbl + .word 640b - ipred_v_tbl + .word 320b - ipred_v_tbl + .word 160b - ipred_v_tbl + .word 80b - ipred_v_tbl + .word 40b - ipred_v_tbl +endjumptable + // void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_h_8bpc_neon, export=1 clz w3, w3 - adr x5, L(ipred_h_tbl) + movrel x5, ipred_h_tbl sub w3, w3, #25 - ldrh w3, [x5, w3, uxtw #1] + ldrsw x3, [x5, w3, uxtw #2] sub x2, x2, #4 - sub x5, x5, w3, uxtw + add x5, x5, x3 mov x7, #-4 add x6, x0, x1 lsl x1, x1, #1 br x5 -4: +40: AARCH64_VALID_JUMP_TARGET +4: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 st1 {v3.s}[0], [x0], x1 st1 {v2.s}[0], [x6], x1 @@ -205,8 +211,9 @@ function ipred_h_8bpc_neon, export=1 st1 {v0.s}[0], [x6], x1 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET +8: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 st1 {v3.8b}, [x0], x1 st1 {v2.8b}, [x6], x1 @@ -215,8 +222,9 @@ function ipred_h_8bpc_neon, export=1 st1 {v0.8b}, [x6], x1 b.gt 8b ret -16: +160: AARCH64_VALID_JUMP_TARGET +16: ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 st1 {v3.16b}, [x0], x1 st1 {v2.16b}, [x6], x1 @@ -225,8 +233,9 @@ function ipred_h_8bpc_neon, export=1 st1 {v0.16b}, [x6], x1 b.gt 16b ret -32: +320: AARCH64_VALID_JUMP_TARGET +32: ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] @@ -239,8 +248,9 @@ function ipred_h_8bpc_neon, export=1 st1 {v0.16b}, [x6], x1 b.gt 32b ret -64: +640: AARCH64_VALID_JUMP_TARGET +64: ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] @@ -257,26 +267,27 @@ function ipred_h_8bpc_neon, export=1 st1 {v0.16b}, [x6], x1 b.gt 64b ret - -L(ipred_h_tbl): - .hword L(ipred_h_tbl) - 64b - .hword L(ipred_h_tbl) - 32b - .hword L(ipred_h_tbl) - 16b - .hword L(ipred_h_tbl) - 8b - .hword L(ipred_h_tbl) - 4b endfunc +jumptable ipred_h_tbl + .word 640b - ipred_h_tbl + .word 320b - ipred_h_tbl + .word 160b - ipred_h_tbl + .word 80b - ipred_h_tbl + .word 40b - ipred_h_tbl +endjumptable + // void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_top_8bpc_neon, export=1 clz w3, w3 - adr x5, L(ipred_dc_top_tbl) + movrel x5, ipred_dc_top_tbl sub w3, w3, #25 - ldrh w3, [x5, w3, uxtw #1] + ldrsw x3, [x5, w3, uxtw #2] add x2, x2, #1 - sub x5, x5, w3, uxtw + add x5, x5, x3 add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -362,15 +373,16 @@ function ipred_dc_top_8bpc_neon, export=1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 b.gt 64b ret - -L(ipred_dc_top_tbl): - .hword L(ipred_dc_top_tbl) - 640b - .hword L(ipred_dc_top_tbl) - 320b - .hword L(ipred_dc_top_tbl) - 160b - .hword L(ipred_dc_top_tbl) - 80b - .hword L(ipred_dc_top_tbl) - 40b endfunc +jumptable ipred_dc_top_tbl + .word 640b - ipred_dc_top_tbl + .word 320b - ipred_dc_top_tbl + .word 160b - ipred_dc_top_tbl + .word 80b - ipred_dc_top_tbl + .word 40b - ipred_dc_top_tbl +endjumptable + // void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, @@ -379,13 +391,13 @@ function ipred_dc_left_8bpc_neon, export=1 sub x2, x2, w4, uxtw clz w3, w3 clz w7, w4 - adr x5, L(ipred_dc_left_tbl) + movrel x5, ipred_dc_left_tbl sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w7, 
w7, #25 - ldrh w3, [x5, w3, uxtw #1] - ldrh w7, [x5, w7, uxtw #1] - sub x3, x5, w3, uxtw - sub x5, x5, w7, uxtw + ldrsw x3, [x5, w3, uxtw #2] + ldrsw x7, [x5, w7, uxtw #2] + add x3, x5, x3 + add x5, x5, x7 add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -399,12 +411,13 @@ L(ipred_dc_left_h4): br x3 L(ipred_dc_left_w4): AARCH64_VALID_JUMP_TARGET +1: st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 subs w4, w4, #4 st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 - b.gt L(ipred_dc_left_w4) + b.gt 1b ret L(ipred_dc_left_h8): @@ -416,12 +429,13 @@ L(ipred_dc_left_h8): br x3 L(ipred_dc_left_w8): AARCH64_VALID_JUMP_TARGET +1: st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 subs w4, w4, #4 st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 - b.gt L(ipred_dc_left_w8) + b.gt 1b ret L(ipred_dc_left_h16): @@ -433,12 +447,13 @@ L(ipred_dc_left_h16): br x3 L(ipred_dc_left_w16): AARCH64_VALID_JUMP_TARGET +1: st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 - b.gt L(ipred_dc_left_w16) + b.gt 1b ret L(ipred_dc_left_h32): @@ -488,20 +503,21 @@ L(ipred_dc_left_w64): st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 b.gt 1b ret - -L(ipred_dc_left_tbl): - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) endfunc +jumptable ipred_dc_left_tbl + .word L(ipred_dc_left_h64) - ipred_dc_left_tbl + .word L(ipred_dc_left_h32) - ipred_dc_left_tbl + .word L(ipred_dc_left_h16) - ipred_dc_left_tbl + .word L(ipred_dc_left_h8) - ipred_dc_left_tbl + .word L(ipred_dc_left_h4) - ipred_dc_left_tbl + .word L(ipred_dc_left_w64) - ipred_dc_left_tbl + .word L(ipred_dc_left_w32) - ipred_dc_left_tbl + .word L(ipred_dc_left_w16) - ipred_dc_left_tbl + .word L(ipred_dc_left_w8) - ipred_dc_left_tbl + .word L(ipred_dc_left_w4) - ipred_dc_left_tbl +endjumptable + // void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, @@ -512,16 +528,16 @@ function ipred_dc_8bpc_neon, export=1 clz w3, w3 clz w6, w4 dup v16.8h, w7 // width + height - adr x5, L(ipred_dc_tbl) + movrel x5, ipred_dc_tbl rbit w7, w7 // rbit(width + height) sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w6, w6, #25 clz w7, w7 // ctz(width + height) - ldrh w3, [x5, w3, uxtw #1] - ldrh w6, [x5, w6, uxtw #1] + ldrsw x3, [x5, w3, uxtw #2] + ldrsw x6, [x5, w6, uxtw #2] neg w7, w7 // -ctz(width + height) - sub x3, x5, w3, uxtw - sub x5, x5, w6, uxtw + add x3, x5, x3 + add x5, x5, x6 ushr v16.8h, v16.8h, #1 // (width + height) >> 1 dup v17.8h, w7 // -ctz(width + height) add x6, x0, x1 @@ -713,33 +729,34 @@ L(ipred_dc_w64): st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 b.gt 2b ret - -L(ipred_dc_tbl): - .hword L(ipred_dc_tbl) - L(ipred_dc_h64) - .hword L(ipred_dc_tbl) - L(ipred_dc_h32) - .hword L(ipred_dc_tbl) - L(ipred_dc_h16) - .hword L(ipred_dc_tbl) - L(ipred_dc_h8) - .hword L(ipred_dc_tbl) - L(ipred_dc_h4) - .hword L(ipred_dc_tbl) - L(ipred_dc_w64) - .hword L(ipred_dc_tbl) - L(ipred_dc_w32) - .hword L(ipred_dc_tbl) - L(ipred_dc_w16) - .hword L(ipred_dc_tbl) - L(ipred_dc_w8) - .hword 
L(ipred_dc_tbl) - L(ipred_dc_w4) endfunc +jumptable ipred_dc_tbl + .word L(ipred_dc_h64) - ipred_dc_tbl + .word L(ipred_dc_h32) - ipred_dc_tbl + .word L(ipred_dc_h16) - ipred_dc_tbl + .word L(ipred_dc_h8) - ipred_dc_tbl + .word L(ipred_dc_h4) - ipred_dc_tbl + .word L(ipred_dc_w64) - ipred_dc_tbl + .word L(ipred_dc_w32) - ipred_dc_tbl + .word L(ipred_dc_w16) - ipred_dc_tbl + .word L(ipred_dc_w8) - ipred_dc_tbl + .word L(ipred_dc_w4) - ipred_dc_tbl +endjumptable + // void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_paeth_8bpc_neon, export=1 clz w9, w3 - adr x5, L(ipred_paeth_tbl) + movrel x5, ipred_paeth_tbl sub w9, w9, #25 - ldrh w9, [x5, w9, uxtw #1] + ldrsw x9, [x5, w9, uxtw #2] ld1r {v4.16b}, [x2] add x8, x2, #1 sub x2, x2, #4 - sub x5, x5, w9, uxtw + add x5, x5, x9 mov x7, #-4 add x6, x0, x1 lsl x1, x1, #1 @@ -898,15 +915,16 @@ function ipred_paeth_8bpc_neon, export=1 b 1b 9: ret - -L(ipred_paeth_tbl): - .hword L(ipred_paeth_tbl) - 640b - .hword L(ipred_paeth_tbl) - 320b - .hword L(ipred_paeth_tbl) - 160b - .hword L(ipred_paeth_tbl) - 80b - .hword L(ipred_paeth_tbl) - 40b endfunc +jumptable ipred_paeth_tbl + .word 640b - ipred_paeth_tbl + .word 320b - ipred_paeth_tbl + .word 160b - ipred_paeth_tbl + .word 80b - ipred_paeth_tbl + .word 40b - ipred_paeth_tbl +endjumptable + // void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, @@ -916,13 +934,13 @@ function ipred_smooth_8bpc_neon, export=1 add x11, x10, w4, uxtw add x10, x10, w3, uxtw clz w9, w3 - adr x5, L(ipred_smooth_tbl) + movrel x5, ipred_smooth_tbl sub x12, x2, w4, uxtw sub w9, w9, #25 - ldrh w9, [x5, w9, uxtw #1] + ldrsw x9, [x5, w9, uxtw #2] ld1r {v4.16b}, [x12] // bottom add x8, x2, #1 - sub x5, x5, w9, uxtw + add x5, x5, x9 add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -1079,15 +1097,16 @@ function ipred_smooth_8bpc_neon, export=1 b 1b 9: ret - -L(ipred_smooth_tbl): - .hword L(ipred_smooth_tbl) - 640b - .hword L(ipred_smooth_tbl) - 320b - .hword L(ipred_smooth_tbl) - 160b - .hword L(ipred_smooth_tbl) - 80b - .hword L(ipred_smooth_tbl) - 40b endfunc +jumptable ipred_smooth_tbl + .word 640b - ipred_smooth_tbl + .word 320b - ipred_smooth_tbl + .word 160b - ipred_smooth_tbl + .word 80b - ipred_smooth_tbl + .word 40b - ipred_smooth_tbl +endjumptable + // void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, @@ -1096,13 +1115,13 @@ function ipred_smooth_v_8bpc_neon, export=1 movrel x7, X(sm_weights) add x7, x7, w4, uxtw clz w9, w3 - adr x5, L(ipred_smooth_v_tbl) + movrel x5, ipred_smooth_v_tbl sub x8, x2, w4, uxtw sub w9, w9, #25 - ldrh w9, [x5, w9, uxtw #1] + ldrsw x9, [x5, w9, uxtw #2] ld1r {v4.16b}, [x8] // bottom add x2, x2, #1 - sub x5, x5, w9, uxtw + add x5, x5, x9 add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -1220,15 +1239,16 @@ function ipred_smooth_v_8bpc_neon, export=1 b 1b 9: ret - -L(ipred_smooth_v_tbl): - .hword L(ipred_smooth_v_tbl) - 640b - .hword L(ipred_smooth_v_tbl) - 320b - .hword L(ipred_smooth_v_tbl) - 160b - .hword L(ipred_smooth_v_tbl) - 80b - .hword L(ipred_smooth_v_tbl) - 40b endfunc +jumptable ipred_smooth_v_tbl + .word 640b - ipred_smooth_v_tbl + .word 320b - ipred_smooth_v_tbl + .word 160b - ipred_smooth_v_tbl + .word 80b - ipred_smooth_v_tbl + .word 40b - ipred_smooth_v_tbl 
+endjumptable + // void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, @@ -1237,12 +1257,12 @@ function ipred_smooth_h_8bpc_neon, export=1 movrel x8, X(sm_weights) add x8, x8, w3, uxtw clz w9, w3 - adr x5, L(ipred_smooth_h_tbl) + movrel x5, ipred_smooth_h_tbl add x12, x2, w3, uxtw sub w9, w9, #25 - ldrh w9, [x5, w9, uxtw #1] + ldrsw x9, [x5, w9, uxtw #2] ld1r {v5.16b}, [x12] // right - sub x5, x5, w9, uxtw + add x5, x5, x9 add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -1366,15 +1386,16 @@ function ipred_smooth_h_8bpc_neon, export=1 b 1b 9: ret - -L(ipred_smooth_h_tbl): - .hword L(ipred_smooth_h_tbl) - 640b - .hword L(ipred_smooth_h_tbl) - 320b - .hword L(ipred_smooth_h_tbl) - 160b - .hword L(ipred_smooth_h_tbl) - 80b - .hword L(ipred_smooth_h_tbl) - 40b endfunc +jumptable ipred_smooth_h_tbl + .word 640b - ipred_smooth_h_tbl + .word 320b - ipred_smooth_h_tbl + .word 160b - ipred_smooth_h_tbl + .word 80b - ipred_smooth_h_tbl + .word 40b - ipred_smooth_h_tbl +endjumptable + const padding_mask_buf .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 @@ -1653,11 +1674,11 @@ endfunc // const int dx, const int max_base_x); function ipred_z1_fill1_8bpc_neon, export=1 clz w9, w3 - adr x8, L(ipred_z1_fill1_tbl) + movrel x8, ipred_z1_fill1_tbl sub w9, w9, #25 - ldrh w9, [x8, w9, uxtw #1] + ldrsw x9, [x8, w9, uxtw #2] add x10, x2, w6, uxtw // top[max_base_x] - sub x8, x8, w9, uxtw + add x8, x8, x9 ld1r {v31.16b}, [x10] // padding mov w7, w5 mov w15, #64 @@ -1815,15 +1836,16 @@ function ipred_z1_fill1_8bpc_neon, export=1 add x13, x13, x1 mov w3, w12 b 169b - -L(ipred_z1_fill1_tbl): - .hword L(ipred_z1_fill1_tbl) - 640b - .hword L(ipred_z1_fill1_tbl) - 320b - .hword L(ipred_z1_fill1_tbl) - 160b - .hword L(ipred_z1_fill1_tbl) - 80b - .hword L(ipred_z1_fill1_tbl) - 40b endfunc +jumptable ipred_z1_fill1_tbl + .word 640b - ipred_z1_fill1_tbl + .word 320b - ipred_z1_fill1_tbl + .word 160b - ipred_z1_fill1_tbl + .word 80b - ipred_z1_fill1_tbl + .word 40b - ipred_z1_fill1_tbl +endjumptable + function ipred_z1_fill2_8bpc_neon, export=1 cmp w3, #8 add x10, x2, w6, uxtw // top[max_base_x] @@ -1940,11 +1962,11 @@ endconst // const int dx, const int dy); function ipred_z2_fill1_8bpc_neon, export=1 clz w10, w4 - adr x9, L(ipred_z2_fill1_tbl) + movrel x9, ipred_z2_fill1_tbl sub w10, w10, #25 - ldrh w10, [x9, w10, uxtw #1] + ldrsw x10, [x9, w10, uxtw #2] mov w8, #(1 << 6) // xpos = 1 << 6 - sub x9, x9, w10, uxtw + add x9, x9, x10 sub w8, w8, w6 // xpos -= dx movrel x11, increments @@ -2650,15 +2672,16 @@ function ipred_z2_fill1_8bpc_neon, export=1 ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret - -L(ipred_z2_fill1_tbl): - .hword L(ipred_z2_fill1_tbl) - 640b - .hword L(ipred_z2_fill1_tbl) - 320b - .hword L(ipred_z2_fill1_tbl) - 160b - .hword L(ipred_z2_fill1_tbl) - 80b - .hword L(ipred_z2_fill1_tbl) - 40b endfunc +jumptable ipred_z2_fill1_tbl + .word 640b - ipred_z2_fill1_tbl + .word 320b - ipred_z2_fill1_tbl + .word 160b - ipred_z2_fill1_tbl + .word 80b - ipred_z2_fill1_tbl + .word 40b - ipred_z2_fill1_tbl +endjumptable + function ipred_z2_fill2_8bpc_neon, export=1 cmp w4, #8 mov w8, #(2 << 6) // xpos = 2 << 6 @@ -3160,11 +3183,11 @@ endfunc function ipred_z3_fill1_8bpc_neon, export=1 cmp w6, #64 clz w9, w3 - adr x8, L(ipred_z3_fill1_tbl) + movrel x8, ipred_z3_fill1_tbl sub w9, w9, #25 - ldrh w9, [x8, w9, uxtw #1] + ldrsw x9, [x8, w9, uxtw #2] add x10, x2, w6, uxtw // 
left[max_base_y] - sub x8, x8, w9, uxtw + add x8, x8, x9 movrel x11, increments ld1r {v31.16b}, [x10] // padding ld1 {v30.8h}, [x11] // increments @@ -3502,19 +3525,20 @@ L(ipred_z3_fill1_large_h16): b 1b 9: ret - -L(ipred_z3_fill1_tbl): - .hword L(ipred_z3_fill1_tbl) - 640b - .hword L(ipred_z3_fill1_tbl) - 320b - .hword L(ipred_z3_fill1_tbl) - 160b - .hword L(ipred_z3_fill1_tbl) - 80b - .hword L(ipred_z3_fill1_tbl) - 40b endfunc +jumptable ipred_z3_fill1_tbl + .word 640b - ipred_z3_fill1_tbl + .word 320b - ipred_z3_fill1_tbl + .word 160b - ipred_z3_fill1_tbl + .word 80b - ipred_z3_fill1_tbl + .word 40b - ipred_z3_fill1_tbl +endjumptable + function ipred_z3_fill_padding_neon, export=0 cmp w3, #16 - adr x8, L(ipred_z3_fill_padding_tbl) - b.gt L(ipred_z3_fill_padding_wide) + movrel x8, ipred_z3_fill_padding_tbl + b.gt ipred_z3_fill_padding_wide // w3 = remaining width, w4 = constant height mov w12, w4 @@ -3524,12 +3548,13 @@ function ipred_z3_fill_padding_neon, export=0 // power of two in the remaining width, and repeating. clz w9, w3 sub w9, w9, #25 - ldrh w9, [x8, w9, uxtw #1] - sub x9, x8, w9, uxtw + ldrsw x9, [x8, w9, uxtw #2] + add x9, x8, x9 br x9 -2: +20: AARCH64_VALID_JUMP_TARGET +2: st1 {v31.h}[0], [x0], x1 subs w4, w4, #4 st1 {v31.h}[0], [x13], x1 @@ -3547,8 +3572,9 @@ function ipred_z3_fill_padding_neon, export=0 mov w4, w12 b 1b -4: +40: AARCH64_VALID_JUMP_TARGET +4: st1 {v31.s}[0], [x0], x1 subs w4, w4, #4 st1 {v31.s}[0], [x13], x1 @@ -3566,14 +3592,15 @@ function ipred_z3_fill_padding_neon, export=0 mov w4, w12 b 1b -8: +80: AARCH64_VALID_JUMP_TARGET +8: st1 {v31.8b}, [x0], x1 subs w4, w4, #4 st1 {v31.8b}, [x13], x1 st1 {v31.8b}, [x0], x1 st1 {v31.8b}, [x13], x1 - b.gt 4b + b.gt 8b subs w3, w3, #8 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride @@ -3585,16 +3612,17 @@ function ipred_z3_fill_padding_neon, export=0 mov w4, w12 b 1b -16: -32: -64: +160: +320: +640: AARCH64_VALID_JUMP_TARGET +16: st1 {v31.16b}, [x0], x1 subs w4, w4, #4 st1 {v31.16b}, [x13], x1 st1 {v31.16b}, [x0], x1 st1 {v31.16b}, [x13], x1 - b.gt 4b + b.gt 16b subs w3, w3, #16 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride @@ -3608,16 +3636,18 @@ function ipred_z3_fill_padding_neon, export=0 9: ret +endfunc -L(ipred_z3_fill_padding_tbl): - .hword L(ipred_z3_fill_padding_tbl) - 64b - .hword L(ipred_z3_fill_padding_tbl) - 32b - .hword L(ipred_z3_fill_padding_tbl) - 16b - .hword L(ipred_z3_fill_padding_tbl) - 8b - .hword L(ipred_z3_fill_padding_tbl) - 4b - .hword L(ipred_z3_fill_padding_tbl) - 2b +jumptable ipred_z3_fill_padding_tbl + .word 640b - ipred_z3_fill_padding_tbl + .word 320b - ipred_z3_fill_padding_tbl + .word 160b - ipred_z3_fill_padding_tbl + .word 80b - ipred_z3_fill_padding_tbl + .word 40b - ipred_z3_fill_padding_tbl + .word 20b - ipred_z3_fill_padding_tbl +endjumptable -L(ipred_z3_fill_padding_wide): +function ipred_z3_fill_padding_wide // Fill a WxH rectangle with padding, with W > 16. 
lsr x1, x1, #1 mov w12, w3 @@ -3770,13 +3800,13 @@ function ipred_filter_8bpc_neon, export=1 add x6, x6, w5, uxtw ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 clz w9, w3 - adr x5, L(ipred_filter_tbl) + movrel x5, ipred_filter_tbl ld1 {v20.8b, v21.8b, v22.8b}, [x6] sub w9, w9, #26 - ldrh w9, [x5, w9, uxtw #1] + ldrsw x9, [x5, w9, uxtw #2] sxtl v16.8h, v16.8b sxtl v17.8h, v17.8b - sub x5, x5, w9, uxtw + add x5, x5, x9 sxtl v18.8h, v18.8b sxtl v19.8h, v19.8b add x6, x0, x1 @@ -3916,30 +3946,32 @@ function ipred_filter_8bpc_neon, export=1 b 1b 9: ret - -L(ipred_filter_tbl): - .hword L(ipred_filter_tbl) - 320b - .hword L(ipred_filter_tbl) - 160b - .hword L(ipred_filter_tbl) - 80b - .hword L(ipred_filter_tbl) - 40b endfunc +jumptable ipred_filter_tbl + .word 320b - ipred_filter_tbl + .word 160b - ipred_filter_tbl + .word 80b - ipred_filter_tbl + .word 40b - ipred_filter_tbl +endjumptable + // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const pal, const uint8_t *idx, // const int w, const int h); function pal_pred_8bpc_neon, export=1 ld1 {v0.8b}, [x2] clz w9, w4 - adr x6, L(pal_pred_tbl) + movrel x6, pal_pred_tbl sub w9, w9, #25 movi v31.16b, #7 - ldrh w9, [x6, w9, uxtw #1] - sub x6, x6, w9, uxtw + ldrsw x9, [x6, w9, uxtw #2] + add x6, x6, x9 add x2, x0, x1 lsl x1, x1, #1 br x6 -4: +40: AARCH64_VALID_JUMP_TARGET +4: ld1 {v1.8b}, [x3], #8 subs w5, w5, #4 ushr v3.8b, v1.8b, #4 @@ -3952,8 +3984,9 @@ function pal_pred_8bpc_neon, export=1 st1 {v1.s}[3], [x2], x1 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET +8: ld1 {v1.16b}, [x3], #16 subs w5, w5, #4 ushr v4.16b, v1.16b, #4 @@ -3968,8 +4001,9 @@ function pal_pred_8bpc_neon, export=1 st1 {v2.d}[1], [x2], x1 b.gt 8b ret -16: +160: AARCH64_VALID_JUMP_TARGET +16: ld1 {v1.16b, v2.16b}, [x3], #32 subs w5, w5, #4 ushr v5.16b, v1.16b, #4 @@ -3990,8 +4024,9 @@ function pal_pred_8bpc_neon, export=1 st1 {v4.16b}, [x2], x1 b.gt 16b ret -32: +320: AARCH64_VALID_JUMP_TARGET +32: ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 subs w5, w5, #4 ushr v21.16b, v16.16b, #4 @@ -4024,8 +4059,9 @@ function pal_pred_8bpc_neon, export=1 st1 {v22.16b, v23.16b}, [x2], x1 b.gt 32b ret -64: +640: AARCH64_VALID_JUMP_TARGET +64: ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 subs w5, w5, #2 ushr v21.16b, v16.16b, #4 @@ -4056,32 +4092,34 @@ function pal_pred_8bpc_neon, export=1 st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1 b.gt 64b ret - -L(pal_pred_tbl): - .hword L(pal_pred_tbl) - 64b - .hword L(pal_pred_tbl) - 32b - .hword L(pal_pred_tbl) - 16b - .hword L(pal_pred_tbl) - 8b - .hword L(pal_pred_tbl) - 4b endfunc +jumptable pal_pred_tbl + .word 640b - pal_pred_tbl + .word 320b - pal_pred_tbl + .word 160b - pal_pred_tbl + .word 80b - pal_pred_tbl + .word 40b - pal_pred_tbl +endjumptable + // void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_128_8bpc_neon, export=1 clz w9, w3 - adr x7, L(ipred_cfl_128_tbl) + movrel x7, ipred_cfl_128_tbl sub w9, w9, #26 - ldrh w9, [x7, w9, uxtw #1] + ldrsw x9, [x7, w9, uxtw #2] movi v0.8h, #128 // dc dup v1.8h, w6 // alpha - sub x7, x7, w9, uxtw + add x7, x7, x9 add x6, x0, x1 lsl x1, x1, #1 br x7 L(ipred_cfl_splat_w4): AARCH64_VALID_JUMP_TARGET +1: ld1 {v2.8h, v3.8h}, [x5], #32 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha mul v3.8h, v3.8h, v1.8h @@ -4100,10 +4138,11 @@ L(ipred_cfl_splat_w4): subs w4, w4, #4 st1 {v3.s}[0], [x0], x1 st1 {v3.s}[1], [x6], x1 - b.gt 
L(ipred_cfl_splat_w4) + b.gt 1b ret L(ipred_cfl_splat_w8): AARCH64_VALID_JUMP_TARGET +1: ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha mul v3.8h, v3.8h, v1.8h @@ -4134,7 +4173,7 @@ L(ipred_cfl_splat_w8): subs w4, w4, #4 st1 {v4.8b}, [x0], x1 st1 {v5.8b}, [x6], x1 - b.gt L(ipred_cfl_splat_w8) + b.gt 1b ret L(ipred_cfl_splat_w16): AARCH64_VALID_JUMP_TARGET @@ -4180,27 +4219,28 @@ L(ipred_cfl_splat_w16): mov w3, w9 b.gt 1b ret - -L(ipred_cfl_128_tbl): -L(ipred_cfl_splat_tbl): - .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) - .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) - .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) - .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) endfunc +jumptable ipred_cfl_128_tbl +ipred_cfl_splat_tbl: + .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl + .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl + .word L(ipred_cfl_splat_w8) - ipred_cfl_128_tbl + .word L(ipred_cfl_splat_w4) - ipred_cfl_128_tbl +endjumptable + // void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_top_8bpc_neon, export=1 clz w9, w3 - adr x7, L(ipred_cfl_top_tbl) + movrel x7, ipred_cfl_top_tbl sub w9, w9, #26 - ldrh w9, [x7, w9, uxtw #1] + ldrsw x9, [x7, w9, uxtw #2] dup v1.8h, w6 // alpha add x2, x2, #1 - sub x7, x7, w9, uxtw + add x7, x7, x9 add x6, x0, x1 lsl x1, x1, #1 br x7 @@ -4234,14 +4274,15 @@ function ipred_cfl_top_8bpc_neon, export=1 urshr v2.4h, v2.4h, #5 dup v0.8h, v2.h[0] b L(ipred_cfl_splat_w16) - -L(ipred_cfl_top_tbl): - .hword L(ipred_cfl_top_tbl) - 32b - .hword L(ipred_cfl_top_tbl) - 16b - .hword L(ipred_cfl_top_tbl) - 8b - .hword L(ipred_cfl_top_tbl) - 4b endfunc +jumptable ipred_cfl_top_tbl + .word 32b - ipred_cfl_top_tbl + .word 16b - ipred_cfl_top_tbl + .word 8b - ipred_cfl_top_tbl + .word 4b - ipred_cfl_top_tbl +endjumptable + // void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, @@ -4250,15 +4291,15 @@ function ipred_cfl_left_8bpc_neon, export=1 sub x2, x2, w4, uxtw clz w9, w3 clz w8, w4 - adr x10, L(ipred_cfl_splat_tbl) - adr x7, L(ipred_cfl_left_tbl) + movrel x10, ipred_cfl_splat_tbl + movrel x7, ipred_cfl_left_tbl sub w9, w9, #26 sub w8, w8, #26 - ldrh w9, [x10, w9, uxtw #1] - ldrh w8, [x7, w8, uxtw #1] + ldrsw x9, [x10, w9, uxtw #2] + ldrsw x8, [x7, w8, uxtw #2] dup v1.8h, w6 // alpha - sub x9, x10, w9, uxtw - sub x7, x7, w8, uxtw + add x9, x10, x9 + add x7, x7, x8 add x6, x0, x1 lsl x1, x1, #1 br x7 @@ -4296,14 +4337,15 @@ L(ipred_cfl_left_h32): urshr v2.4h, v2.4h, #5 dup v0.8h, v2.h[0] br x9 - -L(ipred_cfl_left_tbl): - .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) - .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16) - .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) - .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) endfunc +jumptable ipred_cfl_left_tbl + .word L(ipred_cfl_left_h32) - ipred_cfl_left_tbl + .word L(ipred_cfl_left_h16) - ipred_cfl_left_tbl + .word L(ipred_cfl_left_h8) - ipred_cfl_left_tbl + .word L(ipred_cfl_left_h4) - ipred_cfl_left_tbl +endjumptable + // void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, @@ -4315,16 +4357,16 @@ function ipred_cfl_8bpc_neon, export=1 clz w9, w3 clz w6, w4 dup v16.8h, w8 // width + height - adr x7, L(ipred_cfl_tbl) + movrel x7, ipred_cfl_tbl 
rbit w8, w8 // rbit(width + height) sub w9, w9, #22 // 26 leading bits, minus table offset 4 sub w6, w6, #26 clz w8, w8 // ctz(width + height) - ldrh w9, [x7, w9, uxtw #1] - ldrh w6, [x7, w6, uxtw #1] + ldrsw x9, [x7, w9, uxtw #2] + ldrsw x6, [x7, w6, uxtw #2] neg w8, w8 // -ctz(width + height) - sub x9, x7, w9, uxtw - sub x7, x7, w6, uxtw + add x9, x7, x9 + add x7, x7, x6 ushr v16.8h, v16.8h, #1 // (width + height) >> 1 dup v17.8h, w8 // -ctz(width + height) add x6, x0, x1 @@ -4440,32 +4482,33 @@ L(ipred_cfl_w32): 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) - -L(ipred_cfl_tbl): - .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) endfunc +jumptable ipred_cfl_tbl + .word L(ipred_cfl_h32) - ipred_cfl_tbl + .word L(ipred_cfl_h16) - ipred_cfl_tbl + .word L(ipred_cfl_h8) - ipred_cfl_tbl + .word L(ipred_cfl_h4) - ipred_cfl_tbl + .word L(ipred_cfl_w32) - ipred_cfl_tbl + .word L(ipred_cfl_w16) - ipred_cfl_tbl + .word L(ipred_cfl_w8) - ipred_cfl_tbl + .word L(ipred_cfl_w4) - ipred_cfl_tbl +endjumptable + // void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_420_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 - adr x7, L(ipred_cfl_ac_420_tbl) + movrel x7, ipred_cfl_ac_420_tbl sub w8, w8, #27 - ldrh w8, [x7, w8, uxtw #1] + ldrsw x8, [x7, w8, uxtw #2] movi v16.8h, #0 movi v17.8h, #0 movi v18.8h, #0 movi v19.8h, #0 - sub x7, x7, w8, uxtw + add x7, x7, x8 sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) @@ -4604,9 +4647,9 @@ L(ipred_cfl_ac_420_w8_subtract_dc): L(ipred_cfl_ac_420_w16): AARCH64_VALID_JUMP_TARGET - adr x7, L(ipred_cfl_ac_420_w16_tbl) - ldrh w3, [x7, w3, uxtw #1] - sub x7, x7, w3, uxtw + movrel x7, ipred_cfl_ac_420_w16_tbl + ldrsw x3, [x7, w3, uxtw #2] + add x7, x7, x3 br x7 L(ipred_cfl_ac_420_w16_wpad0): @@ -4762,34 +4805,35 @@ L(ipred_cfl_ac_420_w16_hpad): // Double the height and reuse the w8 summing/subtracting lsl w6, w6, #1 b L(ipred_cfl_ac_420_w8_calc_subtract_dc) - -L(ipred_cfl_ac_420_tbl): - .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) - .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) - .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) - .hword 0 - -L(ipred_cfl_ac_420_w16_tbl): - .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) - .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) - .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) - .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) endfunc +jumptable ipred_cfl_ac_420_tbl + .word L(ipred_cfl_ac_420_w16) - ipred_cfl_ac_420_tbl + .word L(ipred_cfl_ac_420_w8) - ipred_cfl_ac_420_tbl + .word L(ipred_cfl_ac_420_w4) - ipred_cfl_ac_420_tbl +endjumptable + +jumptable ipred_cfl_ac_420_w16_tbl + .word L(ipred_cfl_ac_420_w16_wpad0) - ipred_cfl_ac_420_w16_tbl + .word L(ipred_cfl_ac_420_w16_wpad1) - ipred_cfl_ac_420_w16_tbl + .word L(ipred_cfl_ac_420_w16_wpad2) - ipred_cfl_ac_420_w16_tbl + .word L(ipred_cfl_ac_420_w16_wpad3) - ipred_cfl_ac_420_w16_tbl +endjumptable + // void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // 
const int h_pad, const int cw, const int ch); function ipred_cfl_ac_422_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 - adr x7, L(ipred_cfl_ac_422_tbl) + movrel x7, ipred_cfl_ac_422_tbl sub w8, w8, #27 - ldrh w8, [x7, w8, uxtw #1] + ldrsw x8, [x7, w8, uxtw #2] movi v16.8h, #0 movi v17.8h, #0 movi v18.8h, #0 movi v19.8h, #0 - sub x7, x7, w8, uxtw + add x7, x7, x8 sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) @@ -4880,9 +4924,9 @@ L(ipred_cfl_ac_422_w8_wpad): L(ipred_cfl_ac_422_w16): AARCH64_VALID_JUMP_TARGET - adr x7, L(ipred_cfl_ac_422_w16_tbl) - ldrh w3, [x7, w3, uxtw #1] - sub x7, x7, w3, uxtw + movrel x7, ipred_cfl_ac_422_w16_tbl + ldrsw x3, [x7, w3, uxtw #2] + add x7, x7, x3 br x7 L(ipred_cfl_ac_422_w16_wpad0): @@ -4984,34 +5028,35 @@ L(ipred_cfl_ac_422_w16_wpad3): mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) - -L(ipred_cfl_ac_422_tbl): - .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) - .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) - .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) - .hword 0 - -L(ipred_cfl_ac_422_w16_tbl): - .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) - .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) - .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) - .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) endfunc +jumptable ipred_cfl_ac_422_tbl + .word L(ipred_cfl_ac_422_w16) - ipred_cfl_ac_422_tbl + .word L(ipred_cfl_ac_422_w8) - ipred_cfl_ac_422_tbl + .word L(ipred_cfl_ac_422_w4) - ipred_cfl_ac_422_tbl +endjumptable + +jumptable ipred_cfl_ac_422_w16_tbl + .word L(ipred_cfl_ac_422_w16_wpad0) - ipred_cfl_ac_422_w16_tbl + .word L(ipred_cfl_ac_422_w16_wpad1) - ipred_cfl_ac_422_w16_tbl + .word L(ipred_cfl_ac_422_w16_wpad2) - ipred_cfl_ac_422_w16_tbl + .word L(ipred_cfl_ac_422_w16_wpad3) - ipred_cfl_ac_422_w16_tbl +endjumptable + // void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_444_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 - adr x7, L(ipred_cfl_ac_444_tbl) + movrel x7, ipred_cfl_ac_444_tbl sub w8, w8, #26 - ldrh w8, [x7, w8, uxtw #1] + ldrsw x8, [x7, w8, uxtw #2] movi v16.8h, #0 movi v17.8h, #0 movi v18.8h, #0 movi v19.8h, #0 - sub x7, x7, w8, uxtw + add x7, x7, x8 sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) @@ -5132,9 +5177,10 @@ L(ipred_cfl_ac_444_w16_wpad): L(ipred_cfl_ac_444_w32): AARCH64_VALID_JUMP_TARGET - adr x7, L(ipred_cfl_ac_444_w32_tbl) - ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 - sub x7, x7, w3, uxtw + movrel x7, ipred_cfl_ac_444_w32_tbl + lsr w3, w3, #1 + ldrsw x3, [x7, w3, uxtw #2] + add x7, x7, x3 br x7 L(ipred_cfl_ac_444_w32_wpad0): @@ -5279,16 +5325,18 @@ L(ipred_cfl_ac_444_w32_hpad): urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz dup v4.8h, v4.h[0] b L(ipred_cfl_ac_420_w8_subtract_dc) - -L(ipred_cfl_ac_444_tbl): - .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) - .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) - .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) - .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) - -L(ipred_cfl_ac_444_w32_tbl): - .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) - .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) - .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) - .hword 
L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) endfunc + +jumptable ipred_cfl_ac_444_tbl + .word L(ipred_cfl_ac_444_w32) - ipred_cfl_ac_444_tbl + .word L(ipred_cfl_ac_444_w16) - ipred_cfl_ac_444_tbl + .word L(ipred_cfl_ac_444_w8) - ipred_cfl_ac_444_tbl + .word L(ipred_cfl_ac_444_w4) - ipred_cfl_ac_444_tbl +endjumptable + +jumptable ipred_cfl_ac_444_w32_tbl + .word L(ipred_cfl_ac_444_w32_wpad0) - ipred_cfl_ac_444_w32_tbl + .word L(ipred_cfl_ac_444_w32_wpad2) - ipred_cfl_ac_444_w32_tbl + .word L(ipred_cfl_ac_444_w32_wpad4) - ipred_cfl_ac_444_w32_tbl + .word L(ipred_cfl_ac_444_w32_wpad6) - ipred_cfl_ac_444_w32_tbl +endjumptable diff --git a/src/arm/64/ipred16.S b/src/arm/64/ipred16.S index 3f8cff986932d9189159305fec60ed64d3fbbe5d..2292a855655bb5c2c631c0b3afa0a5f81db991ae 100644 --- a/src/arm/64/ipred16.S +++ b/src/arm/64/ipred16.S @@ -36,17 +36,18 @@ function ipred_dc_128_16bpc_neon, export=1 ldr w8, [sp] clz w3, w3 - adr x5, L(ipred_dc_128_tbl) + movrel x5, ipred_dc_128_tbl sub w3, w3, #25 - ldrh w3, [x5, w3, uxtw #1] + ldrsw x3, [x5, w3, uxtw #2] dup v0.8h, w8 - sub x5, x5, w3, uxtw + add x5, x5, x3 add x6, x0, x1 lsl x1, x1, #1 urshr v0.8h, v0.8h, #1 br x5 -4: +40: AARCH64_VALID_JUMP_TARGET +4: st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 @@ -54,8 +55,9 @@ function ipred_dc_128_16bpc_neon, export=1 st1 {v0.4h}, [x6], x1 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET +8: st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 @@ -105,26 +107,27 @@ function ipred_dc_128_16bpc_neon, export=1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 64b ret - -L(ipred_dc_128_tbl): - .hword L(ipred_dc_128_tbl) - 640b - .hword L(ipred_dc_128_tbl) - 320b - .hword L(ipred_dc_128_tbl) - 160b - .hword L(ipred_dc_128_tbl) - 8b - .hword L(ipred_dc_128_tbl) - 4b endfunc +jumptable ipred_dc_128_tbl + .word 640b - ipred_dc_128_tbl + .word 320b - ipred_dc_128_tbl + .word 160b - ipred_dc_128_tbl + .word 80b - ipred_dc_128_tbl + .word 40b - ipred_dc_128_tbl +endjumptable + // void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_v_16bpc_neon, export=1 clz w3, w3 - adr x5, L(ipred_v_tbl) + movrel x5, ipred_v_tbl sub w3, w3, #25 - ldrh w3, [x5, w3, uxtw #1] + ldrsw x3, [x5, w3, uxtw #2] add x2, x2, #2 - sub x5, x5, w3, uxtw + add x5, x5, x3 add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -189,32 +192,34 @@ function ipred_v_16bpc_neon, export=1 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 b.gt 64b ret - -L(ipred_v_tbl): - .hword L(ipred_v_tbl) - 640b - .hword L(ipred_v_tbl) - 320b - .hword L(ipred_v_tbl) - 160b - .hword L(ipred_v_tbl) - 80b - .hword L(ipred_v_tbl) - 40b endfunc +jumptable ipred_v_tbl + .word 640b - ipred_v_tbl + .word 320b - ipred_v_tbl + .word 160b - ipred_v_tbl + .word 80b - ipred_v_tbl + .word 40b - ipred_v_tbl +endjumptable + // void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_h_16bpc_neon, export=1 clz w3, w3 - adr x5, L(ipred_h_tbl) + movrel x5, ipred_h_tbl sub w3, w3, #25 - ldrh w3, [x5, w3, uxtw #1] + ldrsw x3, [x5, w3, uxtw #2] sub x2, x2, #8 - sub x5, x5, w3, uxtw + add x5, x5, x3 mov x7, #-8 add x6, x0, x1 lsl x1, x1, #1 br x5 -4: +40: AARCH64_VALID_JUMP_TARGET +4: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 st1 {v3.4h}, [x0], x1 st1 {v2.4h}, [x6], x1 @@ -223,8 +228,9 @@ 
function ipred_h_16bpc_neon, export=1 st1 {v0.4h}, [x6], x1 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET +8: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 st1 {v3.8h}, [x0], x1 st1 {v2.8h}, [x6], x1 @@ -233,8 +239,9 @@ function ipred_h_16bpc_neon, export=1 st1 {v0.8h}, [x6], x1 b.gt 8b ret -16: +160: AARCH64_VALID_JUMP_TARGET +16: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] @@ -247,8 +254,9 @@ function ipred_h_16bpc_neon, export=1 st1 {v0.8h}, [x6], x1 b.gt 16b ret -32: +320: AARCH64_VALID_JUMP_TARGET +32: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] @@ -265,8 +273,9 @@ function ipred_h_16bpc_neon, export=1 st1 {v0.8h}, [x6], x1 b.gt 32b ret -64: +640: AARCH64_VALID_JUMP_TARGET +64: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] @@ -291,26 +300,27 @@ function ipred_h_16bpc_neon, export=1 st1 {v0.8h}, [x6], x1 b.gt 64b ret - -L(ipred_h_tbl): - .hword L(ipred_h_tbl) - 64b - .hword L(ipred_h_tbl) - 32b - .hword L(ipred_h_tbl) - 16b - .hword L(ipred_h_tbl) - 8b - .hword L(ipred_h_tbl) - 4b endfunc +jumptable ipred_h_tbl + .word 640b - ipred_h_tbl + .word 320b - ipred_h_tbl + .word 160b - ipred_h_tbl + .word 80b - ipred_h_tbl + .word 40b - ipred_h_tbl +endjumptable + // void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_top_16bpc_neon, export=1 clz w3, w3 - adr x5, L(ipred_dc_top_tbl) + movrel x5, ipred_dc_top_tbl sub w3, w3, #25 - ldrh w3, [x5, w3, uxtw #1] + ldrsw x3, [x5, w3, uxtw #2] add x2, x2, #2 - sub x5, x5, w3, uxtw + add x5, x5, x3 add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -408,15 +418,16 @@ function ipred_dc_top_16bpc_neon, export=1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 64b ret - -L(ipred_dc_top_tbl): - .hword L(ipred_dc_top_tbl) - 640b - .hword L(ipred_dc_top_tbl) - 320b - .hword L(ipred_dc_top_tbl) - 160b - .hword L(ipred_dc_top_tbl) - 80b - .hword L(ipred_dc_top_tbl) - 40b endfunc +jumptable ipred_dc_top_tbl + .word 640b - ipred_dc_top_tbl + .word 320b - ipred_dc_top_tbl + .word 160b - ipred_dc_top_tbl + .word 80b - ipred_dc_top_tbl + .word 40b - ipred_dc_top_tbl +endjumptable + // void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, @@ -425,13 +436,13 @@ function ipred_dc_left_16bpc_neon, export=1 sub x2, x2, w4, uxtw #1 clz w3, w3 clz w7, w4 - adr x5, L(ipred_dc_left_tbl) + movrel x5, ipred_dc_left_tbl sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w7, w7, #25 - ldrh w3, [x5, w3, uxtw #1] - ldrh w7, [x5, w7, uxtw #1] - sub x3, x5, w3, uxtw - sub x5, x5, w7, uxtw + ldrsw x3, [x5, w3, uxtw #2] + ldrsw x7, [x5, w7, uxtw #2] + add x3, x5, x3 + add x5, x5, x7 add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -445,12 +456,13 @@ L(ipred_dc_left_h4): br x3 L(ipred_dc_left_w4): AARCH64_VALID_JUMP_TARGET +1: st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 - b.gt L(ipred_dc_left_w4) + b.gt 1b ret L(ipred_dc_left_h8): @@ -462,12 +474,13 @@ L(ipred_dc_left_h8): br x3 L(ipred_dc_left_w8): AARCH64_VALID_JUMP_TARGET +1: st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 - b.gt L(ipred_dc_left_w8) + b.gt 1b ret L(ipred_dc_left_h16): @@ -549,20 +562,21 @@ L(ipred_dc_left_w64): st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 1b 
ret - -L(ipred_dc_left_tbl): - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) endfunc +jumptable ipred_dc_left_tbl + .word L(ipred_dc_left_h64) - ipred_dc_left_tbl + .word L(ipred_dc_left_h32) - ipred_dc_left_tbl + .word L(ipred_dc_left_h16) - ipred_dc_left_tbl + .word L(ipred_dc_left_h8) - ipred_dc_left_tbl + .word L(ipred_dc_left_h4) - ipred_dc_left_tbl + .word L(ipred_dc_left_w64) - ipred_dc_left_tbl + .word L(ipred_dc_left_w32) - ipred_dc_left_tbl + .word L(ipred_dc_left_w16) - ipred_dc_left_tbl + .word L(ipred_dc_left_w8) - ipred_dc_left_tbl + .word L(ipred_dc_left_w4) - ipred_dc_left_tbl +endjumptable + // void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, @@ -573,16 +587,16 @@ function ipred_dc_16bpc_neon, export=1 clz w3, w3 clz w6, w4 dup v16.4s, w7 // width + height - adr x5, L(ipred_dc_tbl) + movrel x5, ipred_dc_tbl rbit w7, w7 // rbit(width + height) sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w6, w6, #25 clz w7, w7 // ctz(width + height) - ldrh w3, [x5, w3, uxtw #1] - ldrh w6, [x5, w6, uxtw #1] + ldrsw x3, [x5, w3, uxtw #2] + ldrsw x6, [x5, w6, uxtw #2] neg w7, w7 // -ctz(width + height) - sub x3, x5, w3, uxtw - sub x5, x5, w6, uxtw + add x3, x5, x3 + add x5, x5, x6 ushr v16.4s, v16.4s, #1 // (width + height) >> 1 dup v17.4s, w7 // -ctz(width + height) add x6, x0, x1 @@ -794,33 +808,34 @@ L(ipred_dc_w64): st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 2b ret - -L(ipred_dc_tbl): - .hword L(ipred_dc_tbl) - L(ipred_dc_h64) - .hword L(ipred_dc_tbl) - L(ipred_dc_h32) - .hword L(ipred_dc_tbl) - L(ipred_dc_h16) - .hword L(ipred_dc_tbl) - L(ipred_dc_h8) - .hword L(ipred_dc_tbl) - L(ipred_dc_h4) - .hword L(ipred_dc_tbl) - L(ipred_dc_w64) - .hword L(ipred_dc_tbl) - L(ipred_dc_w32) - .hword L(ipred_dc_tbl) - L(ipred_dc_w16) - .hword L(ipred_dc_tbl) - L(ipred_dc_w8) - .hword L(ipred_dc_tbl) - L(ipred_dc_w4) endfunc +jumptable ipred_dc_tbl + .word L(ipred_dc_h64) - ipred_dc_tbl + .word L(ipred_dc_h32) - ipred_dc_tbl + .word L(ipred_dc_h16) - ipred_dc_tbl + .word L(ipred_dc_h8) - ipred_dc_tbl + .word L(ipred_dc_h4) - ipred_dc_tbl + .word L(ipred_dc_w64) - ipred_dc_tbl + .word L(ipred_dc_w32) - ipred_dc_tbl + .word L(ipred_dc_w16) - ipred_dc_tbl + .word L(ipred_dc_w8) - ipred_dc_tbl + .word L(ipred_dc_w4) - ipred_dc_tbl +endjumptable + // void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_paeth_16bpc_neon, export=1 clz w9, w3 - adr x5, L(ipred_paeth_tbl) + movrel x5, ipred_paeth_tbl sub w9, w9, #25 - ldrh w9, [x5, w9, uxtw #1] + ldrsw x9, [x5, w9, uxtw #2] ld1r {v4.8h}, [x2] add x8, x2, #2 sub x2, x2, #8 - sub x5, x5, w9, uxtw + add x5, x5, x9 mov x7, #-8 add x6, x0, x1 lsl x1, x1, #1 @@ -933,15 +948,16 @@ function ipred_paeth_16bpc_neon, export=1 b 1b 9: ret - -L(ipred_paeth_tbl): - .hword L(ipred_paeth_tbl) - 640b - .hword L(ipred_paeth_tbl) - 320b - .hword 
L(ipred_paeth_tbl) - 160b - .hword L(ipred_paeth_tbl) - 80b - .hword L(ipred_paeth_tbl) - 40b endfunc +jumptable ipred_paeth_tbl + .word 640b - ipred_paeth_tbl + .word 320b - ipred_paeth_tbl + .word 160b - ipred_paeth_tbl + .word 80b - ipred_paeth_tbl + .word 40b - ipred_paeth_tbl +endjumptable + // void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, @@ -951,13 +967,13 @@ function ipred_smooth_16bpc_neon, export=1 add x11, x10, w4, uxtw add x10, x10, w3, uxtw clz w9, w3 - adr x5, L(ipred_smooth_tbl) + movrel x5, ipred_smooth_tbl sub x12, x2, w4, uxtw #1 sub w9, w9, #25 - ldrh w9, [x5, w9, uxtw #1] + ldrsw x9, [x5, w9, uxtw #2] ld1r {v4.8h}, [x12] // bottom add x8, x2, #2 - sub x5, x5, w9, uxtw + add x5, x5, x9 add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -1137,15 +1153,16 @@ function ipred_smooth_16bpc_neon, export=1 b 1b 9: ret - -L(ipred_smooth_tbl): - .hword L(ipred_smooth_tbl) - 640b - .hword L(ipred_smooth_tbl) - 320b - .hword L(ipred_smooth_tbl) - 160b - .hword L(ipred_smooth_tbl) - 80b - .hword L(ipred_smooth_tbl) - 40b endfunc +jumptable ipred_smooth_tbl + .word 640b - ipred_smooth_tbl + .word 320b - ipred_smooth_tbl + .word 160b - ipred_smooth_tbl + .word 80b - ipred_smooth_tbl + .word 40b - ipred_smooth_tbl +endjumptable + // void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, @@ -1154,13 +1171,13 @@ function ipred_smooth_v_16bpc_neon, export=1 movrel x7, X(sm_weights) add x7, x7, w4, uxtw clz w9, w3 - adr x5, L(ipred_smooth_v_tbl) + movrel x5, ipred_smooth_v_tbl sub x8, x2, w4, uxtw #1 sub w9, w9, #25 - ldrh w9, [x5, w9, uxtw #1] + ldrsw x9, [x5, w9, uxtw #2] ld1r {v4.8h}, [x8] // bottom add x2, x2, #2 - sub x5, x5, w9, uxtw + add x5, x5, x9 add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -1264,15 +1281,16 @@ function ipred_smooth_v_16bpc_neon, export=1 b 1b 9: ret - -L(ipred_smooth_v_tbl): - .hword L(ipred_smooth_v_tbl) - 640b - .hword L(ipred_smooth_v_tbl) - 320b - .hword L(ipred_smooth_v_tbl) - 160b - .hword L(ipred_smooth_v_tbl) - 80b - .hword L(ipred_smooth_v_tbl) - 40b endfunc +jumptable ipred_smooth_v_tbl + .word 640b - ipred_smooth_v_tbl + .word 320b - ipred_smooth_v_tbl + .word 160b - ipred_smooth_v_tbl + .word 80b - ipred_smooth_v_tbl + .word 40b - ipred_smooth_v_tbl +endjumptable + // void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, @@ -1281,12 +1299,12 @@ function ipred_smooth_h_16bpc_neon, export=1 movrel x8, X(sm_weights) add x8, x8, w3, uxtw clz w9, w3 - adr x5, L(ipred_smooth_h_tbl) + movrel x5, ipred_smooth_h_tbl add x12, x2, w3, uxtw #1 sub w9, w9, #25 - ldrh w9, [x5, w9, uxtw #1] + ldrsw x9, [x5, w9, uxtw #2] ld1r {v5.8h}, [x12] // right - sub x5, x5, w9, uxtw + add x5, x5, x9 add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -1396,15 +1414,16 @@ function ipred_smooth_h_16bpc_neon, export=1 b 1b 9: ret - -L(ipred_smooth_h_tbl): - .hword L(ipred_smooth_h_tbl) - 640b - .hword L(ipred_smooth_h_tbl) - 320b - .hword L(ipred_smooth_h_tbl) - 160b - .hword L(ipred_smooth_h_tbl) - 80b - .hword L(ipred_smooth_h_tbl) - 40b endfunc +jumptable ipred_smooth_h_tbl + .word 640b - ipred_smooth_h_tbl + .word 320b - ipred_smooth_h_tbl + .word 160b - ipred_smooth_h_tbl + .word 80b - ipred_smooth_h_tbl + .word 40b - ipred_smooth_h_tbl +endjumptable + const padding_mask_buf .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 @@ -1728,11 +1747,11 @@ endfunc // const int dx, const int max_base_x); function ipred_z1_fill1_16bpc_neon, export=1 clz w9, w3 - adr x8, L(ipred_z1_fill1_tbl) + movrel x8, ipred_z1_fill1_tbl sub w9, w9, #25 - ldrh w9, [x8, w9, uxtw #1] + ldrsw x9, [x8, w9, uxtw #2] add x10, x2, w6, uxtw #1 // top[max_base_x] - sub x8, x8, w9, uxtw + add x8, x8, x9 ld1r {v31.8h}, [x10] // padding mov w7, w5 mov w15, #64 @@ -1916,15 +1935,16 @@ function ipred_z1_fill1_16bpc_neon, export=1 add x13, x13, x1 mov w3, w12 b 169b - -L(ipred_z1_fill1_tbl): - .hword L(ipred_z1_fill1_tbl) - 640b - .hword L(ipred_z1_fill1_tbl) - 320b - .hword L(ipred_z1_fill1_tbl) - 160b - .hword L(ipred_z1_fill1_tbl) - 80b - .hword L(ipred_z1_fill1_tbl) - 40b endfunc +jumptable ipred_z1_fill1_tbl + .word 640b - ipred_z1_fill1_tbl + .word 320b - ipred_z1_fill1_tbl + .word 160b - ipred_z1_fill1_tbl + .word 80b - ipred_z1_fill1_tbl + .word 40b - ipred_z1_fill1_tbl +endjumptable + function ipred_z1_fill2_16bpc_neon, export=1 cmp w3, #8 add x10, x2, w6, uxtw // top[max_base_x] @@ -2050,11 +2070,11 @@ endconst // const int dx, const int dy); function ipred_z2_fill1_16bpc_neon, export=1 clz w10, w4 - adr x9, L(ipred_z2_fill1_tbl) + movrel x9, ipred_z2_fill1_tbl sub w10, w10, #25 - ldrh w10, [x9, w10, uxtw #1] + ldrsw x10, [x9, w10, uxtw #2] mov w8, #(1 << 6) // xpos = 1 << 6 - sub x9, x9, w10, uxtw + add x9, x9, x10 sub w8, w8, w6 // xpos -= dx movrel x11, increments @@ -2814,15 +2834,16 @@ function ipred_z2_fill1_16bpc_neon, export=1 ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret - -L(ipred_z2_fill1_tbl): - .hword L(ipred_z2_fill1_tbl) - 640b - .hword L(ipred_z2_fill1_tbl) - 320b - .hword L(ipred_z2_fill1_tbl) - 160b - .hword L(ipred_z2_fill1_tbl) - 80b - .hword L(ipred_z2_fill1_tbl) - 40b endfunc +jumptable ipred_z2_fill1_tbl + .word 640b - ipred_z2_fill1_tbl + .word 320b - ipred_z2_fill1_tbl + .word 160b - ipred_z2_fill1_tbl + .word 80b - ipred_z2_fill1_tbl + .word 40b - ipred_z2_fill1_tbl +endjumptable + function ipred_z2_fill2_16bpc_neon, export=1 cmp w4, #8 mov w8, #(2 << 6) // xpos = 2 << 6 @@ -3432,11 +3453,11 @@ endfunc // const int dy, const int max_base_y); function ipred_z3_fill1_16bpc_neon, export=1 clz w9, w4 - adr x8, L(ipred_z3_fill1_tbl) + movrel x8, ipred_z3_fill1_tbl sub w9, w9, #25 - ldrh w9, [x8, w9, uxtw #1] + ldrsw x9, [x8, w9, uxtw #2] add x10, x2, w6, uxtw #1 // left[max_base_y] - sub x8, x8, w9, uxtw + add x8, x8, x9 ld1r {v31.8h}, [x10] // padding mov w7, w5 mov w15, #64 @@ -3637,19 +3658,20 @@ function ipred_z3_fill1_16bpc_neon, export=1 b 1b 9: ret - -L(ipred_z3_fill1_tbl): - .hword L(ipred_z3_fill1_tbl) - 640b - .hword L(ipred_z3_fill1_tbl) - 320b - .hword L(ipred_z3_fill1_tbl) - 160b - .hword L(ipred_z3_fill1_tbl) - 80b - .hword L(ipred_z3_fill1_tbl) - 40b endfunc +jumptable ipred_z3_fill1_tbl + .word 640b - ipred_z3_fill1_tbl + .word 320b - ipred_z3_fill1_tbl + .word 160b - ipred_z3_fill1_tbl + .word 80b - ipred_z3_fill1_tbl + .word 40b - ipred_z3_fill1_tbl +endjumptable + function ipred_z3_fill_padding_neon, export=0 cmp w3, #8 - adr x8, L(ipred_z3_fill_padding_tbl) - b.gt L(ipred_z3_fill_padding_wide) + movrel x8, ipred_z3_fill_padding_tbl + b.gt ipred_z3_fill_padding_wide // w3 = remaining width, w4 = constant height mov w12, w4 @@ -3659,12 +3681,13 @@ function ipred_z3_fill_padding_neon, export=0 // power of two in the remaining width, and repeating. 
clz w9, w3 sub w9, w9, #25 - ldrh w9, [x8, w9, uxtw #1] - sub x9, x8, w9, uxtw + ldrsw x9, [x8, w9, uxtw #2] + add x9, x8, x9 br x9 -2: +20: AARCH64_VALID_JUMP_TARGET +2: st1 {v31.s}[0], [x0], x1 subs w4, w4, #4 st1 {v31.s}[0], [x13], x1 @@ -3682,8 +3705,9 @@ function ipred_z3_fill_padding_neon, export=0 mov w4, w12 b 1b -4: +40: AARCH64_VALID_JUMP_TARGET +4: st1 {v31.4h}, [x0], x1 subs w4, w4, #4 st1 {v31.4h}, [x13], x1 @@ -3701,17 +3725,18 @@ function ipred_z3_fill_padding_neon, export=0 mov w4, w12 b 1b -8: -16: -32: -64: +80: +160: +320: +640: AARCH64_VALID_JUMP_TARGET +8: st1 {v31.8h}, [x0], x1 subs w4, w4, #4 st1 {v31.8h}, [x13], x1 st1 {v31.8h}, [x0], x1 st1 {v31.8h}, [x13], x1 - b.gt 4b + b.gt 8b subs w3, w3, #8 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride @@ -3725,16 +3750,18 @@ function ipred_z3_fill_padding_neon, export=0 9: ret +endfunc -L(ipred_z3_fill_padding_tbl): - .hword L(ipred_z3_fill_padding_tbl) - 64b - .hword L(ipred_z3_fill_padding_tbl) - 32b - .hword L(ipred_z3_fill_padding_tbl) - 16b - .hword L(ipred_z3_fill_padding_tbl) - 8b - .hword L(ipred_z3_fill_padding_tbl) - 4b - .hword L(ipred_z3_fill_padding_tbl) - 2b +jumptable ipred_z3_fill_padding_tbl + .word 640b - ipred_z3_fill_padding_tbl + .word 320b - ipred_z3_fill_padding_tbl + .word 160b - ipred_z3_fill_padding_tbl + .word 80b - ipred_z3_fill_padding_tbl + .word 40b - ipred_z3_fill_padding_tbl + .word 20b - ipred_z3_fill_padding_tbl +endjumptable -L(ipred_z3_fill_padding_wide): +function ipred_z3_fill_padding_wide // Fill a WxH rectangle with padding, with W > 8. lsr x1, x1, #1 mov w12, w3 @@ -3883,13 +3910,13 @@ function ipred_filter_\bpc\()bpc_neon add x6, x6, w5, uxtw ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 clz w9, w3 - adr x5, L(ipred_filter\bpc\()_tbl) + movrel x5, ipred_filter\bpc\()_tbl ld1 {v20.8b, v21.8b, v22.8b}, [x6] sub w9, w9, #26 - ldrh w9, [x5, w9, uxtw #1] + ldrsw x9, [x5, w9, uxtw #2] sxtl v16.8h, v16.8b sxtl v17.8h, v17.8b - sub x5, x5, w9, uxtw + add x5, x5, x9 sxtl v18.8h, v18.8b sxtl v19.8h, v19.8b add x6, x0, x1 @@ -4162,13 +4189,14 @@ function ipred_filter_\bpc\()bpc_neon b 1b 9: ret - -L(ipred_filter\bpc\()_tbl): - .hword L(ipred_filter\bpc\()_tbl) - 320b - .hword L(ipred_filter\bpc\()_tbl) - 160b - .hword L(ipred_filter\bpc\()_tbl) - 80b - .hword L(ipred_filter\bpc\()_tbl) - 40b endfunc + +jumptable ipred_filter\bpc\()_tbl + .word 320b - ipred_filter\bpc\()_tbl + .word 160b - ipred_filter\bpc\()_tbl + .word 80b - ipred_filter\bpc\()_tbl + .word 40b - ipred_filter\bpc\()_tbl +endjumptable .endm filter_fn 10 @@ -4187,12 +4215,12 @@ endfunc function pal_pred_16bpc_neon, export=1 ld1 {v30.8h}, [x2] clz w9, w4 - adr x6, L(pal_pred_tbl) + movrel x6, pal_pred_tbl sub w9, w9, #25 movi v29.16b, #7 - ldrh w9, [x6, w9, uxtw #1] + ldrsw x9, [x6, w9, uxtw #2] movi v31.8h, #1, lsl #8 - sub x6, x6, w9, uxtw + add x6, x6, x9 br x6 40: AARCH64_VALID_JUMP_TARGET @@ -4391,15 +4419,16 @@ function pal_pred_16bpc_neon, export=1 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 b.gt 64b ret - -L(pal_pred_tbl): - .hword L(pal_pred_tbl) - 640b - .hword L(pal_pred_tbl) - 320b - .hword L(pal_pred_tbl) - 160b - .hword L(pal_pred_tbl) - 80b - .hword L(pal_pred_tbl) - 40b endfunc +jumptable pal_pred_tbl + .word 640b - pal_pred_tbl + .word 320b - pal_pred_tbl + .word 160b - pal_pred_tbl + .word 80b - pal_pred_tbl + .word 40b - pal_pred_tbl +endjumptable + // void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, @@ -4408,18 
+4437,19 @@ endfunc function ipred_cfl_128_16bpc_neon, export=1 dup v31.8h, w7 // bitdepth_max clz w9, w3 - adr x7, L(ipred_cfl_128_tbl) + movrel x7, ipred_cfl_128_tbl sub w9, w9, #26 - ldrh w9, [x7, w9, uxtw #1] + ldrsw x9, [x7, w9, uxtw #2] urshr v0.8h, v31.8h, #1 dup v1.8h, w6 // alpha - sub x7, x7, w9, uxtw + add x7, x7, x9 add x6, x0, x1 lsl x1, x1, #1 movi v30.8h, #0 br x7 L(ipred_cfl_splat_w4): AARCH64_VALID_JUMP_TARGET +1: ld1 {v4.8h, v5.8h}, [x5], #32 subs w4, w4, #4 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha @@ -4448,10 +4478,11 @@ L(ipred_cfl_splat_w4): st1 {v2.d}[1], [x6], x1 st1 {v3.d}[0], [x0], x1 st1 {v3.d}[1], [x6], x1 - b.gt L(ipred_cfl_splat_w4) + b.gt 1b ret L(ipred_cfl_splat_w8): AARCH64_VALID_JUMP_TARGET +1: ld1 {v4.8h, v5.8h}, [x5], #32 subs w4, w4, #2 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha @@ -4478,7 +4509,7 @@ L(ipred_cfl_splat_w8): smin v3.8h, v3.8h, v31.8h st1 {v2.8h}, [x0], x1 st1 {v3.8h}, [x6], x1 - b.gt L(ipred_cfl_splat_w8) + b.gt 1b ret L(ipred_cfl_splat_w16): AARCH64_VALID_JUMP_TARGET @@ -4544,15 +4575,16 @@ L(ipred_cfl_splat_w16): mov w3, w9 b.gt 1b ret - -L(ipred_cfl_128_tbl): -L(ipred_cfl_splat_tbl): - .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) - .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) - .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) - .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) endfunc +jumptable ipred_cfl_128_tbl +ipred_cfl_splat_tbl: + .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl + .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl + .word L(ipred_cfl_splat_w8) - ipred_cfl_128_tbl + .word L(ipred_cfl_splat_w4) - ipred_cfl_128_tbl +endjumptable + // void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, @@ -4561,12 +4593,12 @@ endfunc function ipred_cfl_top_16bpc_neon, export=1 dup v31.8h, w7 // bitdepth_max clz w9, w3 - adr x7, L(ipred_cfl_top_tbl) + movrel x7, ipred_cfl_top_tbl sub w9, w9, #26 - ldrh w9, [x7, w9, uxtw #1] + ldrsw x9, [x7, w9, uxtw #2] dup v1.8h, w6 // alpha add x2, x2, #2 - sub x7, x7, w9, uxtw + add x7, x7, x9 add x6, x0, x1 lsl x1, x1, #1 movi v30.8h, #0 @@ -4603,14 +4635,15 @@ function ipred_cfl_top_16bpc_neon, export=1 rshrn v0.4h, v0.4s, #5 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) - -L(ipred_cfl_top_tbl): - .hword L(ipred_cfl_top_tbl) - 32b - .hword L(ipred_cfl_top_tbl) - 16b - .hword L(ipred_cfl_top_tbl) - 8b - .hword L(ipred_cfl_top_tbl) - 4b endfunc +jumptable ipred_cfl_top_tbl + .word 32b - ipred_cfl_top_tbl + .word 16b - ipred_cfl_top_tbl + .word 8b - ipred_cfl_top_tbl + .word 4b - ipred_cfl_top_tbl +endjumptable + // void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, @@ -4621,15 +4654,15 @@ function ipred_cfl_left_16bpc_neon, export=1 sub x2, x2, w4, uxtw #1 clz w9, w3 clz w8, w4 - adr x10, L(ipred_cfl_splat_tbl) - adr x7, L(ipred_cfl_left_tbl) + movrel x10, ipred_cfl_splat_tbl + movrel x7, ipred_cfl_left_tbl sub w9, w9, #26 sub w8, w8, #26 - ldrh w9, [x10, w9, uxtw #1] - ldrh w8, [x7, w8, uxtw #1] + ldrsw x9, [x10, w9, uxtw #2] + ldrsw x8, [x7, w8, uxtw #2] dup v1.8h, w6 // alpha - sub x9, x10, w9, uxtw - sub x7, x7, w8, uxtw + add x9, x10, x9 + add x7, x7, x8 add x6, x0, x1 lsl x1, x1, #1 movi v30.8h, #0 @@ -4670,14 +4703,15 @@ L(ipred_cfl_left_h32): rshrn v0.4h, v0.4s, #5 dup v0.8h, v0.h[0] br x9 - -L(ipred_cfl_left_tbl): - .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) - .hword L(ipred_cfl_left_tbl) 
- L(ipred_cfl_left_h16) - .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) - .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) endfunc +jumptable ipred_cfl_left_tbl + .word L(ipred_cfl_left_h32) - ipred_cfl_left_tbl + .word L(ipred_cfl_left_h16) - ipred_cfl_left_tbl + .word L(ipred_cfl_left_h8) - ipred_cfl_left_tbl + .word L(ipred_cfl_left_h4) - ipred_cfl_left_tbl +endjumptable + // void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, @@ -4691,16 +4725,16 @@ function ipred_cfl_16bpc_neon, export=1 clz w9, w3 clz w6, w4 dup v16.4s, w8 // width + height - adr x7, L(ipred_cfl_tbl) + movrel x7, ipred_cfl_tbl rbit w8, w8 // rbit(width + height) sub w9, w9, #22 // 26 leading bits, minus table offset 4 sub w6, w6, #26 clz w8, w8 // ctz(width + height) - ldrh w9, [x7, w9, uxtw #1] - ldrh w6, [x7, w6, uxtw #1] + ldrsw x9, [x7, w9, uxtw #2] + ldrsw x6, [x7, w6, uxtw #2] neg w8, w8 // -ctz(width + height) - sub x9, x7, w9, uxtw - sub x7, x7, w6, uxtw + add x9, x7, x9 + add x7, x7, x6 ushr v16.4s, v16.4s, #1 // (width + height) >> 1 dup v17.4s, w8 // -ctz(width + height) add x6, x0, x1 @@ -4823,32 +4857,33 @@ L(ipred_cfl_w32): 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) - -L(ipred_cfl_tbl): - .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) endfunc +jumptable ipred_cfl_tbl + .word L(ipred_cfl_h32) - ipred_cfl_tbl + .word L(ipred_cfl_h16) - ipred_cfl_tbl + .word L(ipred_cfl_h8) - ipred_cfl_tbl + .word L(ipred_cfl_h4) - ipred_cfl_tbl + .word L(ipred_cfl_w32) - ipred_cfl_tbl + .word L(ipred_cfl_w16) - ipred_cfl_tbl + .word L(ipred_cfl_w8) - ipred_cfl_tbl + .word L(ipred_cfl_w4) - ipred_cfl_tbl +endjumptable + // void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_420_16bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 - adr x7, L(ipred_cfl_ac_420_tbl) + movrel x7, ipred_cfl_ac_420_tbl sub w8, w8, #27 - ldrh w8, [x7, w8, uxtw #1] + ldrsw x8, [x7, w8, uxtw #2] movi v24.4s, #0 movi v25.4s, #0 movi v26.4s, #0 movi v27.4s, #0 - sub x7, x7, w8, uxtw + add x7, x7, x8 sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) @@ -4980,9 +5015,9 @@ L(ipred_cfl_ac_420_w8_hpad): L(ipred_cfl_ac_420_w16): AARCH64_VALID_JUMP_TARGET - adr x7, L(ipred_cfl_ac_420_w16_tbl) - ldrh w3, [x7, w3, uxtw #1] - sub x7, x7, w3, uxtw + movrel x7, ipred_cfl_ac_420_w16_tbl + ldrsw x3, [x7, w3, uxtw #2] + add x7, x7, x3 br x7 L(ipred_cfl_ac_420_w16_wpad0): @@ -5158,34 +5193,35 @@ L(ipred_cfl_ac_420_w16_hpad): // Quadruple the height and reuse the w4 summing/subtracting lsl w6, w6, #2 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) - -L(ipred_cfl_ac_420_tbl): - .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) - .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) - .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) - .hword 0 - -L(ipred_cfl_ac_420_w16_tbl): - .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) - .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) - .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) - .hword 
L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) endfunc +jumptable ipred_cfl_ac_420_tbl + .word L(ipred_cfl_ac_420_w16) - ipred_cfl_ac_420_tbl + .word L(ipred_cfl_ac_420_w8) - ipred_cfl_ac_420_tbl + .word L(ipred_cfl_ac_420_w4) - ipred_cfl_ac_420_tbl +endjumptable + +jumptable ipred_cfl_ac_420_w16_tbl + .word L(ipred_cfl_ac_420_w16_wpad0) - ipred_cfl_ac_420_w16_tbl + .word L(ipred_cfl_ac_420_w16_wpad1) - ipred_cfl_ac_420_w16_tbl + .word L(ipred_cfl_ac_420_w16_wpad2) - ipred_cfl_ac_420_w16_tbl + .word L(ipred_cfl_ac_420_w16_wpad3) - ipred_cfl_ac_420_w16_tbl +endjumptable + // void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_422_16bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 - adr x7, L(ipred_cfl_ac_422_tbl) + movrel x7, ipred_cfl_ac_422_tbl sub w8, w8, #27 - ldrh w8, [x7, w8, uxtw #1] + ldrsw x8, [x7, w8, uxtw #2] movi v24.4s, #0 movi v25.4s, #0 movi v26.4s, #0 movi v27.4s, #0 - sub x7, x7, w8, uxtw + add x7, x7, x8 sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) @@ -5286,9 +5322,9 @@ L(ipred_cfl_ac_422_w8_wpad): L(ipred_cfl_ac_422_w16): AARCH64_VALID_JUMP_TARGET - adr x7, L(ipred_cfl_ac_422_w16_tbl) - ldrh w3, [x7, w3, uxtw #1] - sub x7, x7, w3, uxtw + movrel x7, ipred_cfl_ac_422_w16_tbl + ldrsw x3, [x7, w3, uxtw #2] + add x7, x7, x3 br x7 L(ipred_cfl_ac_422_w16_wpad0): @@ -5406,34 +5442,35 @@ L(ipred_cfl_ac_422_w16_wpad3): mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) - -L(ipred_cfl_ac_422_tbl): - .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) - .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) - .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) - .hword 0 - -L(ipred_cfl_ac_422_w16_tbl): - .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) - .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) - .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) - .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) endfunc +jumptable ipred_cfl_ac_422_tbl + .word L(ipred_cfl_ac_422_w16) - ipred_cfl_ac_422_tbl + .word L(ipred_cfl_ac_422_w8) - ipred_cfl_ac_422_tbl + .word L(ipred_cfl_ac_422_w4) - ipred_cfl_ac_422_tbl +endjumptable + +jumptable ipred_cfl_ac_422_w16_tbl + .word L(ipred_cfl_ac_422_w16_wpad0) - ipred_cfl_ac_422_w16_tbl + .word L(ipred_cfl_ac_422_w16_wpad1) - ipred_cfl_ac_422_w16_tbl + .word L(ipred_cfl_ac_422_w16_wpad2) - ipred_cfl_ac_422_w16_tbl + .word L(ipred_cfl_ac_422_w16_wpad3) - ipred_cfl_ac_422_w16_tbl +endjumptable + // void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_444_16bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 - adr x7, L(ipred_cfl_ac_444_tbl) + movrel x7, ipred_cfl_ac_444_tbl sub w8, w8, #26 - ldrh w8, [x7, w8, uxtw #1] + ldrsw x8, [x7, w8, uxtw #2] movi v24.4s, #0 movi v25.4s, #0 movi v26.4s, #0 movi v27.4s, #0 - sub x7, x7, w8, uxtw + add x7, x7, x8 sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) @@ -5542,10 +5579,11 @@ L(ipred_cfl_ac_444_w16_wpad): L(ipred_cfl_ac_444_w32): AARCH64_VALID_JUMP_TARGET - adr x7, L(ipred_cfl_ac_444_w32_tbl) - ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 + movrel x7, ipred_cfl_ac_444_w32_tbl + lsr w3, w3, #1 + ldrsw x3, [x7, w3, uxtw #2] lsr x2, x2, #1 // Restore the stride to one line 
increments - sub x7, x7, w3, uxtw + add x7, x7, x3 br x7 L(ipred_cfl_ac_444_w32_wpad0): @@ -5659,16 +5697,18 @@ L(ipred_cfl_ac_444_w32_hpad): // Multiply the height by eight and reuse the w4 subtracting lsl w6, w6, #3 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) - -L(ipred_cfl_ac_444_tbl): - .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) - .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) - .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) - .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) - -L(ipred_cfl_ac_444_w32_tbl): - .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) - .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) - .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) - .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) endfunc + +jumptable ipred_cfl_ac_444_tbl + .word L(ipred_cfl_ac_444_w32) - ipred_cfl_ac_444_tbl + .word L(ipred_cfl_ac_444_w16) - ipred_cfl_ac_444_tbl + .word L(ipred_cfl_ac_444_w8) - ipred_cfl_ac_444_tbl + .word L(ipred_cfl_ac_444_w4) - ipred_cfl_ac_444_tbl +endjumptable + +jumptable ipred_cfl_ac_444_w32_tbl + .word L(ipred_cfl_ac_444_w32_wpad0) - ipred_cfl_ac_444_w32_tbl + .word L(ipred_cfl_ac_444_w32_wpad2) - ipred_cfl_ac_444_w32_tbl + .word L(ipred_cfl_ac_444_w32_wpad4) - ipred_cfl_ac_444_w32_tbl + .word L(ipred_cfl_ac_444_w32_wpad6) - ipred_cfl_ac_444_w32_tbl +endjumptable diff --git a/src/arm/64/looprestoration_common.S b/src/arm/64/looprestoration_common.S index 745f6c20f49168be10d33292ec52c6b4b6b7e210..c10a9f3d7c44165aa2f42a999e62897de6955e3b 100644 --- a/src/arm/64/looprestoration_common.S +++ b/src/arm/64/looprestoration_common.S @@ -28,14 +28,77 @@ #include "src/arm/asm.S" #include "util.S" +// Series of LUTs for efficiently computing sgr's 1 - x/(x+1) table. +// In the comments, let RefTable denote the original, reference table. +const x_by_x_tables +// RangeMins +// +// Min(RefTable[i*8:i*8+8]) +// First two values are zeroed. +// +// Lookup using RangeMins[(x >> 3)] + .byte 0, 0, 11, 8, 6, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 2 + .byte 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 + +// DiffMasks +// +// This contains a bit pattern, indicating at which index positions the value of RefTable changes. For each range +// in the RangeMins table (covering 8 RefTable entries), we have one byte; each bit indicates whether the value of +// RefTable changes at that particular index. +// Using popcount, we can integrate the diff bit field. By shifting away bits in a byte, we can refine the range of +// the integral. Finally, adding the integral to RangeMins[(x>>3)] reconstructs RefTable (for x > 15). +// +// Lookup using DiffMasks[(x >> 3)] + .byte 0x00, 0x00, 0xD4, 0x44 + .byte 0x42, 0x04, 0x00, 0x00 + .byte 0x00, 0x80, 0x00, 0x00 + .byte 0x04, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x40, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x02 +// Binary form: +// 0b00000000, 0b00000000, 0b11010100, 0b01000100 +// 0b01000010, 0b00000100, 0b00000000, 0b00000000 +// 0b00000000, 0b10000000, 0b00000000, 0b00000000 +// 0b00000100, 0b00000000, 0b00000000, 0b00000000 +// 0b00000000, 0b00000000, 0b00000000, 0b00000000 +// 0b00000000, 0b01000000, 0b00000000, 0b00000000 +// 0b00000000, 0b00000000, 0b00000000, 0b00000000 +// 0b00000000, 0b00000000, 0b00000000, 0b00000010 + +// RefLo +// +// RefTable[0:16] +// i.e. First 16 elements of the original table. 
+// Add to the sum obtained in the rest of the other lut logic to include the first 16 bytes of RefTable. +// +// Lookup using RangeMins[x] (tbl will replace x > 15 with 0) + .byte 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16 + +// Pseudo assembly +// +// hi_bits = x >> 3 +// tbl ref, {RefLo}, x +// tbl diffs, {DiffMasks[0:16], DiffMasks[16:32]}, hi_bits +// tbl min, {RangeMins[0:16], RangeMins[16:32]}, hi_bits +// lo_bits = x & 0x7 +// diffs = diffs << lo_bits +// ref = ref + min +// integral = popcnt(diffs) +// ref = ref + integral +// return ref +endconst + // void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum, // int32_t *AA, int16_t *BB, // const int w, const int s, // const int bitdepth_max); function sgr_box3_vert_neon, export=1 - stp d8, d9, [sp, #-0x30]! + stp d8, d9, [sp, #-0x40]! stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] add w4, w4, #2 clz w9, w6 // bitdepth_max @@ -49,93 +112,112 @@ function sgr_box3_vert_neon, export=1 movi v31.4s, #9 // n sub w9, w9, #24 // -bitdepth_min_8 - movrel x12, X(sgr_x_by_x) + movrel x12, x_by_x_tables mov w13, #455 // one_by_x - ld1 {v16.16b, v17.16b, v18.16b}, [x12] + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x12] // RangeMins, DiffMasks + movi v22.16b, #0x7 + ldr q23, [x12, #64] //RefLo dup v6.8h, w9 // -bitdepth_min_8 - movi v19.16b, #5 - movi v20.8b, #55 // idx of last 5 - movi v21.8b, #72 // idx of last 4 - movi v22.8b, #101 // idx of last 3 - movi v23.8b, #169 // idx of last 2 - movi v24.8b, #254 // idx of last 1 saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8 movi v29.8h, #1, lsl #8 dup v30.4s, w13 // one_by_x - sub v16.16b, v16.16b, v19.16b - sub v17.16b, v17.16b, v19.16b - sub v18.16b, v18.16b, v19.16b - - ld1 {v8.4s, v9.4s}, [x5], #32 - ld1 {v10.4s, v11.4s}, [x6], #32 - ld1 {v12.8h}, [x7], #16 - ld1 {v13.8h}, [x8], #16 - ld1 {v0.4s, v1.4s}, [x0], #32 - ld1 {v2.8h}, [x1], #16 + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x5], #64 + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x6], #64 + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 + ld1 {v20.8h, v21.8h}, [x8], #32 + ld1 {v0.8h, v1.8h}, [x7], #32 1: - - add v8.4s, v8.4s, v10.4s - add v9.4s, v9.4s, v11.4s - - add v12.8h, v12.8h, v13.8h - - subs w4, w4, #8 - add v0.4s, v0.4s, v8.4s - add v1.4s, v1.4s, v9.4s - add v2.8h, v2.8h, v12.8h - - srshl v0.4s, v0.4s, v7.4s - srshl v1.4s, v1.4s, v7.4s - srshl v4.8h, v2.8h, v6.8h - mul v0.4s, v0.4s, v31.4s // a * n - mul v1.4s, v1.4s, v31.4s // a * n - umull v3.4s, v4.4h, v4.4h // b * b - umull2 v4.4s, v4.8h, v4.8h // b * b - uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0) - uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0) - mul v0.4s, v0.4s, v28.4s // p * s - mul v1.4s, v1.4s, v28.4s // p * s - ld1 {v8.4s, v9.4s}, [x5], #32 - uqshrn v0.4h, v0.4s, #16 - uqshrn2 v0.8h, v1.4s, #16 - ld1 {v10.4s, v11.4s}, [x6], #32 - uqrshrn v0.8b, v0.8h, #4 // imin(z, 255) - - ld1 {v12.8h}, [x7], #16 - - cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5 - cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4 - tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b - cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3 - cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 - add v25.8b, v25.8b, v26.8b - cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 - add v27.8b, v27.8b, v4.8b - add v5.8b, v5.8b, v19.8b - add v25.8b, v25.8b, v27.8b - add v5.8b, v1.8b, v5.8b - ld1 {v13.8h}, [x8], #16 - add v5.8b, v5.8b, v25.8b - ld1 {v0.4s, v1.4s}, [x0], #32 - uxtl v5.8h, v5.8b // x - - umull v3.4s, v5.4h, v2.4h 
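A scalar C model of the RangeMins/DiffMasks/RefLo lookup documented in the comment block above may help when reading the vectorized loop; this is an illustrative sketch and not part of the patch (the table names and byte values are copied from the x_by_x_tables const block, and __builtin_popcount stands in for the NEON cnt instruction):

#include <stdint.h>

/* RangeMins: per-8-entry-block minimum of the reference table (first two blocks zeroed). */
static const uint8_t range_mins[32] = {
    0, 0, 11, 8, 6, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 2,
    2, 2,  2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
};

/* DiffMasks: one bit per index marking where the reference table value changes. */
static const uint8_t diff_masks[32] = {
    0x00, 0x00, 0xD4, 0x44, 0x42, 0x04, 0x00, 0x00,
    0x00, 0x80, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
};

/* RefLo: the first 16 entries of the reference table, used directly for x < 16. */
static const uint8_t ref_lo[16] = {
    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
};

static uint8_t x_by_x_model(uint8_t x) {
    /* TBL returns 0 for out-of-range indices, so RefLo only contributes for x < 16;
     * there RangeMins and DiffMasks are zeroed and contribute nothing. */
    const uint8_t ref  = x < 16 ? ref_lo[x] : 0;
    /* Shifting the mask left by (x & 7) discards the bits for positions at or before
     * x within its 8-entry block; the popcount of the surviving bits is how far the
     * entry at x sits above the block minimum. */
    const uint8_t diff = (uint8_t)(diff_masks[x >> 3] << (x & 7));
    return ref + range_mins[x >> 3] + (uint8_t)__builtin_popcount(diff);
}

For x < 16 the RefLo term alone reproduces the first 16 entries; for larger x the per-block minimum plus the popcount of the shifted mask rebuilds the entry, mirroring the pseudo assembly above and the tbl/ushl/cnt sequence in the sgr_box3/sgr_box5 loops.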
// x * BB[i] - umull2 v4.4s, v5.8h, v2.8h // x * BB[i] - mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x - mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x - srshr v3.4s, v3.4s, #12 // AA[i] - srshr v4.4s, v4.4s, #12 // AA[i] - sub v5.8h, v29.8h, v5.8h // 256 - x - ld1 {v2.8h}, [x1], #16 - - st1 {v3.4s, v4.4s}, [x2], #32 - st1 {v5.8h}, [x3], #16 + ld1 {v2.8h, v3.8h}, [x1], #32 + add v8.4s, v8.4s, v12.4s + add v9.4s, v9.4s, v13.4s + add v10.4s, v10.4s, v14.4s + add v11.4s, v11.4s, v15.4s + add v0.8h, v0.8h, v20.8h + add v1.8h, v1.8h, v21.8h + + add v16.4s, v16.4s, v8.4s + add v17.4s, v17.4s, v9.4s + add v18.4s, v18.4s, v10.4s + add v19.4s, v19.4s, v11.4s + add v4.8h, v2.8h, v0.8h + add v5.8h, v3.8h, v1.8h + + srshl v16.4s, v16.4s, v7.4s + srshl v17.4s, v17.4s, v7.4s + srshl v18.4s, v18.4s, v7.4s + srshl v19.4s, v19.4s, v7.4s + srshl v9.8h, v4.8h, v6.8h + srshl v13.8h, v5.8h, v6.8h + mul v16.4s, v16.4s, v31.4s // a * n + mul v17.4s, v17.4s, v31.4s // a * n + mul v18.4s, v18.4s, v31.4s // a * n + mul v19.4s, v19.4s, v31.4s // a * n + umull v8.4s, v9.4h, v9.4h // b * b + umull2 v9.4s, v9.8h, v9.8h // b * b + umull v12.4s, v13.4h, v13.4h // b * b + umull2 v13.4s, v13.8h, v13.8h // b * b + uqsub v16.4s, v16.4s, v8.4s // imax(a * n - b * b, 0) + uqsub v17.4s, v17.4s, v9.4s // imax(a * n - b * b, 0) + uqsub v18.4s, v18.4s, v12.4s // imax(a * n - b * b, 0) + uqsub v19.4s, v19.4s, v13.4s // imax(a * n - b * b, 0) + mul v16.4s, v16.4s, v28.4s // p * s + mul v17.4s, v17.4s, v28.4s // p * s + mul v18.4s, v18.4s, v28.4s // p * s + mul v19.4s, v19.4s, v28.4s // p * s + uqshrn v16.4h, v16.4s, #16 + uqshrn2 v16.8h, v17.4s, #16 + uqshrn v18.4h, v18.4s, #16 + uqshrn2 v18.8h, v19.4s, #16 + uqrshrn v1.8b, v16.8h, #4 // imin(z, 255) + uqrshrn2 v1.16b, v18.8h, #4 // imin(z, 255) + + ld1 {v16.4s, v17.4s}, [x0], #32 + subs w4, w4, #16 + + ushr v0.16b, v1.16b, #3 + ld1 {v8.4s, v9.4s}, [x5], #32 + tbl v2.16b, {v26.16b, v27.16b}, v0.16b // RangeMins + tbl v0.16b, {v24.16b, v25.16b}, v0.16b // DiffMasks + tbl v3.16b, {v23.16b}, v1.16b // RefLo + and v1.16b, v1.16b, v22.16b + ld1 {v12.4s, v13.4s}, [x6], #32 + ushl v1.16b, v2.16b, v1.16b + ld1 {v20.8h, v21.8h}, [x8], #32 + add v3.16b, v3.16b, v0.16b + cnt v1.16b, v1.16b + ld1 {v18.4s, v19.4s}, [x0], #32 + add v3.16b, v3.16b, v1.16b + ld1 {v10.4s, v11.4s}, [x5], #32 + uxtl v0.8h, v3.8b // x + uxtl2 v1.8h, v3.16b // x + + ld1 {v14.4s, v15.4s}, [x6], #32 + + umull v2.4s, v0.4h, v4.4h // x * BB[i] + umull2 v3.4s, v0.8h, v4.8h // x * BB[i] + umull v4.4s, v1.4h, v5.4h // x * BB[i] + umull2 v5.4s, v1.8h, v5.8h // x * BB[i] + sub v0.8h, v29.8h, v0.8h // 256 - x + sub v1.8h, v29.8h, v1.8h // 256 - x + mul v2.4s, v2.4s, v30.4s // x * BB[i] * sgr_one_by_x + mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x + mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x + mul v5.4s, v5.4s, v30.4s // x * BB[i] * sgr_one_by_x + st1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v0.8h, v1.8h}, [x7], #32 + srshr v2.4s, v2.4s, #12 // AA[i] + srshr v3.4s, v3.4s, #12 // AA[i] + srshr v4.4s, v4.4s, #12 // AA[i] + srshr v5.4s, v5.4s, #12 // AA[i] + + st1 {v2.4s, v3.4s, v4.4s, v5.4s}, [x2], #64 b.gt 1b + ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] - ldp d8, d9, [sp], 0x30 + ldp d8, d9, [sp], 0x40 ret endfunc @@ -144,10 +226,9 @@ endfunc // const int w, const int s, // const int bitdepth_max); function sgr_box5_vert_neon, export=1 - stp d8, d9, [sp, #-0x40]! + stp d8, d9, [sp, #-0x30]! 
stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] - stp d14, d15, [sp, #0x30] add w4, w4, #2 clz w15, w6 // bitdepth_max @@ -163,24 +244,19 @@ function sgr_box5_vert_neon, export=1 movi v31.4s, #25 // n sub w15, w15, #24 // -bitdepth_min_8 - movrel x13, X(sgr_x_by_x) - mov w14, #164 // one_by_x - ld1 {v16.16b, v17.16b, v18.16b}, [x13] + movrel x13, x_by_x_tables + movi v30.4s, #164 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x13] // RangeMins, DiffMasks dup v6.8h, w15 // -bitdepth_min_8 - movi v19.16b, #5 - movi v24.8b, #254 // idx of last 1 + movi v19.8b, #0x7 + ldr q18, [x13, #64] // RefLo saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8 movi v29.8h, #1, lsl #8 - dup v30.4s, w14 // one_by_x - - sub v16.16b, v16.16b, v19.16b - sub v17.16b, v17.16b, v19.16b - sub v18.16b, v18.16b, v19.16b ld1 {v8.4s, v9.4s}, [x5], #32 ld1 {v10.4s, v11.4s}, [x6], #32 ld1 {v12.4s, v13.4s}, [x7], #32 - ld1 {v14.4s, v15.4s}, [x8], #32 + ld1 {v16.4s, v17.4s}, [x8], #32 ld1 {v20.8h}, [x9], #16 ld1 {v21.8h}, [x10], #16 ld1 {v22.8h}, [x11], #16 @@ -191,8 +267,8 @@ function sgr_box5_vert_neon, export=1 1: add v8.4s, v8.4s, v10.4s add v9.4s, v9.4s, v11.4s - add v12.4s, v12.4s, v14.4s - add v13.4s, v13.4s, v15.4s + add v12.4s, v12.4s, v16.4s + add v13.4s, v13.4s, v17.4s add v20.8h, v20.8h, v21.8h add v22.8h, v22.8h, v23.8h @@ -207,11 +283,6 @@ function sgr_box5_vert_neon, export=1 subs w4, w4, #8 - movi v20.8b, #55 // idx of last 5 - movi v21.8b, #72 // idx of last 4 - movi v22.8b, #101 // idx of last 3 - movi v23.8b, #169 // idx of last 2 - srshl v0.4s, v0.4s, v7.4s srshl v1.4s, v1.4s, v7.4s srshl v4.8h, v2.8h, v6.8h @@ -231,22 +302,19 @@ function sgr_box5_vert_neon, export=1 ld1 {v12.4s, v13.4s}, [x7], #32 - cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5 - cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4 - tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b - cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3 - cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 - ld1 {v14.4s, v15.4s}, [x8], #32 - add v25.8b, v25.8b, v26.8b - cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 - add v27.8b, v27.8b, v4.8b + ushr v1.8b, v0.8b, #3 + ld1 {v16.4s, v17.4s}, [x8], #32 + tbl v5.8b, {v26.16b, v27.16b}, v1.8b // RangeMins + tbl v1.8b, {v24.16b, v25.16b}, v1.8b // DiffMasks + tbl v4.8b, {v18.16b}, v0.8b // RefLo + and v0.8b, v0.8b, v19.8b ld1 {v20.8h}, [x9], #16 - add v5.8b, v5.8b, v19.8b - add v25.8b, v25.8b, v27.8b + ushl v5.8b, v5.8b, v0.8b + add v4.8b, v4.8b, v1.8b ld1 {v21.8h}, [x10], #16 - add v5.8b, v1.8b, v5.8b + cnt v5.8b, v5.8b ld1 {v22.8h}, [x11], #16 - add v5.8b, v5.8b, v25.8b + add v5.8b, v4.8b, v5.8b ld1 {v23.8h}, [x12], #16 uxtl v5.8h, v5.8b // x @@ -264,9 +332,8 @@ function sgr_box5_vert_neon, export=1 st1 {v5.8h}, [x3], #16 b.gt 1b - ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] - ldp d8, d9, [sp], 0x40 + ldp d8, d9, [sp], 0x30 ret endfunc diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index 736b2bb4e699beece7f67b222de5b753583a3945..24ef4d298ae92c24f2d4680a78d1ff6fe49b60df 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -79,11 +79,11 @@ function \type\()_8bpc_neon, export=1 .ifc \type, mask movi v31.16b, #256-2 .endif - adr x7, L(\type\()_tbl) + movrel x7, \type\()_tbl sub w4, w4, #24 - ldrh w4, [x7, x4, lsl #1] + ldrsw x4, [x7, x4, lsl #2] \type v4, v0, v1, v2, v3 - sub x7, x7, w4, uxtw + add x7, x7, x4 br x7 40: AARCH64_VALID_JUMP_TARGET @@ -119,17 +119,18 @@ function \type\()_8bpc_neon, export=1 add x7, x0, x1 lsl x1, x1, #1 8: - st1 
{v4.d}[0], [x0], x1 + st1 {v4.8b}, [x0], x1 \type v5, v0, v1, v2, v3 st1 {v4.d}[1], [x7], x1 - st1 {v5.d}[0], [x0], x1 + st1 {v5.8b}, [x0], x1 subs w5, w5, #4 st1 {v5.d}[1], [x7], x1 b.le 0f \type v4, v0, v1, v2, v3 b 8b -16: +160: AARCH64_VALID_JUMP_TARGET +16: \type v5, v0, v1, v2, v3 st1 {v4.16b}, [x0], x1 \type v6, v0, v1, v2, v3 @@ -192,14 +193,16 @@ function \type\()_8bpc_neon, export=1 b 128b 0: ret -L(\type\()_tbl): - .hword L(\type\()_tbl) - 1280b - .hword L(\type\()_tbl) - 640b - .hword L(\type\()_tbl) - 320b - .hword L(\type\()_tbl) - 16b - .hword L(\type\()_tbl) - 80b - .hword L(\type\()_tbl) - 40b endfunc + +jumptable \type\()_tbl + .word 1280b - \type\()_tbl + .word 640b - \type\()_tbl + .word 320b - \type\()_tbl + .word 160b - \type\()_tbl + .word 80b - \type\()_tbl + .word 40b - \type\()_tbl +endjumptable .endm bidir_fn avg @@ -210,10 +213,10 @@ bidir_fn mask .macro w_mask_fn type function w_mask_\type\()_8bpc_neon, export=1 clz w8, w4 - adr x9, L(w_mask_\type\()_tbl) + movrel x9, w_mask_\type\()_tbl sub w8, w8, #24 - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + ldrsw x8, [x9, x8, lsl #2] + add x9, x9, x8 mov w10, #6903 dup v0.8h, w10 .if \type == 444 @@ -230,8 +233,9 @@ function w_mask_\type\()_8bpc_neon, export=1 add x12, x0, x1 lsl x1, x1, #1 br x9 -4: +40: AARCH64_VALID_JUMP_TARGET +4: ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once) ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once) subs w5, w5, #4 @@ -267,7 +271,7 @@ function w_mask_\type\()_8bpc_neon, export=1 addp v18.8h, v24.8h, v24.8h sub v18.4h, v3.4h, v18.4h rshrn v18.8b, v18.8h, #2 - st1 {v18.s}[0], [x6], #4 + str s18, [x6], #4 .endif st1 {v22.s}[0], [x0], x1 st1 {v22.s}[1], [x12], x1 @@ -275,8 +279,9 @@ function w_mask_\type\()_8bpc_neon, export=1 st1 {v23.s}[1], [x12], x1 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET +8: ld1 {v4.8h, v5.8h}, [x2], #32 ld1 {v6.8h, v7.8h}, [x3], #32 subs w5, w5, #2 @@ -310,7 +315,7 @@ function w_mask_\type\()_8bpc_neon, export=1 addp v18.8h, v18.8h, v18.8h sub v18.4h, v3.4h, v18.4h rshrn v18.8b, v18.8h, #2 - st1 {v18.s}[0], [x6], #4 + str s18, [x6], #4 .endif st1 {v22.8b}, [x0], x1 st1 {v23.8b}, [x12], x1 @@ -413,14 +418,16 @@ function w_mask_\type\()_8bpc_neon, export=1 add x12, x12, x1 b.gt 161b ret -L(w_mask_\type\()_tbl): - .hword L(w_mask_\type\()_tbl) - 1280b - .hword L(w_mask_\type\()_tbl) - 640b - .hword L(w_mask_\type\()_tbl) - 320b - .hword L(w_mask_\type\()_tbl) - 160b - .hword L(w_mask_\type\()_tbl) - 8b - .hword L(w_mask_\type\()_tbl) - 4b endfunc + +jumptable w_mask_\type\()_tbl + .word 1280b - w_mask_\type\()_tbl + .word 640b - w_mask_\type\()_tbl + .word 320b - w_mask_\type\()_tbl + .word 160b - w_mask_\type\()_tbl + .word 80b - w_mask_\type\()_tbl + .word 40b - w_mask_\type\()_tbl +endjumptable .endm w_mask_fn 444 @@ -429,20 +436,21 @@ w_mask_fn 420 function blend_8bpc_neon, export=1 - adr x6, L(blend_tbl) + movrel x6, blend_tbl clz w3, w3 sub w3, w3, #26 - ldrh w3, [x6, x3, lsl #1] - sub x6, x6, w3, uxtw + ldrsw x3, [x6, x3, lsl #2] + add x6, x6, x3 movi v4.16b, #64 add x8, x0, x1 lsl x1, x1, #1 br x6 -4: +40: AARCH64_VALID_JUMP_TARGET - ld1 {v2.8b}, [x5], #8 - ld1 {v1.d}[0], [x2], #8 - ld1 {v0.s}[0], [x0] +4: + ld1 {v2.8b}, [x5], #8 + ldr d1, [x2], #8 + ldr s0, [x0] subs w4, w4, #2 ld1 {v0.s}[1], [x8] sub v3.8b, v4.8b, v2.8b @@ -453,12 +461,13 @@ function blend_8bpc_neon, export=1 st1 {v6.s}[1], [x8], x1 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET +8: ld1 {v2.16b}, [x5], #16 ld1 {v1.16b}, [x2], #16 - ld1 {v0.d}[0], [x0] - ld1 
{v0.d}[1], [x8] + ldr d0, [x0] + ld1 {v0.d}[1], [x8] sub v3.16b, v4.16b, v2.16b subs w4, w4, #2 umull v5.8h, v1.8b, v2.8b @@ -466,13 +475,14 @@ function blend_8bpc_neon, export=1 umull2 v6.8h, v1.16b, v2.16b umlal2 v6.8h, v0.16b, v3.16b rshrn v7.8b, v5.8h, #6 - rshrn2 v7.16b, v6.8h, #6 - st1 {v7.d}[0], [x0], x1 - st1 {v7.d}[1], [x8], x1 + rshrn v16.8b, v6.8h, #6 + st1 {v7.8b}, [x0], x1 + st1 {v16.8b}, [x8], x1 b.gt 8b ret -16: +160: AARCH64_VALID_JUMP_TARGET +16: ld1 {v1.16b, v2.16b}, [x5], #32 ld1 {v5.16b, v6.16b}, [x2], #32 ld1 {v0.16b}, [x0] @@ -496,8 +506,9 @@ function blend_8bpc_neon, export=1 st1 {v19.16b}, [x8], x1 b.gt 16b ret -32: +320: AARCH64_VALID_JUMP_TARGET +32: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 ld1 {v20.16b, v21.16b}, [x0] @@ -535,15 +546,17 @@ function blend_8bpc_neon, export=1 st1 {v27.16b, v28.16b}, [x8], x1 b.gt 32b ret -L(blend_tbl): - .hword L(blend_tbl) - 32b - .hword L(blend_tbl) - 16b - .hword L(blend_tbl) - 8b - .hword L(blend_tbl) - 4b endfunc +jumptable blend_tbl + .word 320b - blend_tbl + .word 160b - blend_tbl + .word 80b - blend_tbl + .word 40b - blend_tbl +endjumptable + function blend_h_8bpc_neon, export=1 - adr x6, L(blend_h_tbl) + movrel x6, blend_h_tbl movrel x5, X(obmc_masks) add x5, x5, w4, uxtw sub w4, w4, w4, lsr #2 @@ -552,15 +565,16 @@ function blend_h_8bpc_neon, export=1 add x8, x0, x1 lsl x1, x1, #1 sub w7, w7, #24 - ldrh w7, [x6, x7, lsl #1] - sub x6, x6, w7, uxtw + ldrsw x7, [x6, x7, lsl #2] + add x6, x6, x7 br x6 -2: +20: AARCH64_VALID_JUMP_TARGET - ld1 {v0.h}[0], [x5], #2 - ld1 {v1.s}[0], [x2], #4 +2: + ldr h0, [x5], #2 + ldr s1, [x2], #4 subs w4, w4, #2 - ld1 {v2.h}[0], [x0] + ldr h2, [x0] zip1 v0.8b, v0.8b, v0.8b sub v3.8b, v4.8b, v0.8b ld1 {v2.h}[1], [x8] @@ -571,13 +585,14 @@ function blend_h_8bpc_neon, export=1 st1 {v5.h}[1], [x8], x1 b.gt 2b ret -4: +40: AARCH64_VALID_JUMP_TARGET +4: ld2r {v0.8b, v1.8b}, [x5], #2 ld1 {v2.8b}, [x2], #8 subs w4, w4, #2 ext v0.8b, v0.8b, v1.8b, #4 - ld1 {v3.s}[0], [x0] + ldr s3, [x0] sub v5.8b, v4.8b, v0.8b ld1 {v3.s}[1], [x8] umull v6.8h, v2.8b, v0.8b @@ -587,27 +602,29 @@ function blend_h_8bpc_neon, export=1 st1 {v6.s}[1], [x8], x1 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET +8: ld2r {v0.16b, v1.16b}, [x5], #2 ld1 {v2.16b}, [x2], #16 - ld1 {v3.d}[0], [x0] + ldr d3, [x0] ext v0.16b, v0.16b, v1.16b, #8 sub v5.16b, v4.16b, v0.16b - ld1 {v3.d}[1], [x8] + ld1 {v3.d}[1], [x8] subs w4, w4, #2 umull v6.8h, v0.8b, v2.8b umlal v6.8h, v3.8b, v5.8b umull2 v7.8h, v0.16b, v2.16b umlal2 v7.8h, v3.16b, v5.16b rshrn v16.8b, v6.8h, #6 - rshrn2 v16.16b, v7.8h, #6 - st1 {v16.d}[0], [x0], x1 - st1 {v16.d}[1], [x8], x1 + rshrn v17.8b, v7.8h, #6 + st1 {v16.8b}, [x0], x1 + st1 {v17.8b}, [x8], x1 b.gt 8b ret -16: +160: AARCH64_VALID_JUMP_TARGET +16: ld2r {v0.16b, v1.16b}, [x5], #2 ld1 {v2.16b, v3.16b}, [x2], #32 ld1 {v5.16b}, [x0] @@ -682,18 +699,20 @@ function blend_h_8bpc_neon, export=1 add x7, x7, w3, uxtw b.gt 321b ret -L(blend_h_tbl): - .hword L(blend_h_tbl) - 1280b - .hword L(blend_h_tbl) - 640b - .hword L(blend_h_tbl) - 320b - .hword L(blend_h_tbl) - 16b - .hword L(blend_h_tbl) - 8b - .hword L(blend_h_tbl) - 4b - .hword L(blend_h_tbl) - 2b endfunc +jumptable blend_h_tbl + .word 1280b - blend_h_tbl + .word 640b - blend_h_tbl + .word 320b - blend_h_tbl + .word 160b - blend_h_tbl + .word 80b - blend_h_tbl + .word 40b - blend_h_tbl + .word 20b - blend_h_tbl +endjumptable + function blend_v_8bpc_neon, export=1 - adr x6, L(blend_v_tbl) + movrel x6, blend_v_tbl 
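// The same jump-table rewrite as in the functions above: the PC-relative
// .hword tables that used to follow each function body (adr + ldrh + sub,
// entries of the form ".hword L(tbl) - 40b") become standalone
// jumptable/endjumptable blocks of signed 32-bit offsets (movrel + ldrsw +
// add, entries of the form ".word 40b - tbl"), and where a loop label used
// to double as the indirect-branch target (4:, 8:, 16:) a separate
// 40:/80:/160: entry label now carries AARCH64_VALID_JUMP_TARGET. With this
// function's registers, the dispatch changes as follows (intervening setup
// omitted):
//
//     // old                                // new
//     adr     x6,  L(blend_v_tbl)           movrel  x6,  blend_v_tbl
//     ldrh    w3,  [x6, x3, lsl #1]         ldrsw   x3,  [x6, x3, lsl #2]
//     sub     x6,  x6,  w3, uxtw            add     x6,  x6,  x3
//     br      x6                            br      x6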
movrel x5, X(obmc_masks) add x5, x5, w3, uxtw clz w3, w3 @@ -701,16 +720,16 @@ function blend_v_8bpc_neon, export=1 add x8, x0, x1 lsl x1, x1, #1 sub w3, w3, #26 - ldrh w3, [x6, x3, lsl #1] - sub x6, x6, w3, uxtw + ldrsw x3, [x6, x3, lsl #2] + add x6, x6, x3 br x6 20: AARCH64_VALID_JUMP_TARGET ld1r {v0.8b}, [x5] sub v1.8b, v4.8b, v0.8b 2: - ld1 {v2.h}[0], [x2], #2 - ld1 {v3.b}[0], [x0] + ldr h2, [x2], #2 + ldr b3, [x0] subs w4, w4, #2 ld1 {v2.b}[1], [x2] ld1 {v3.b}[1], [x8] @@ -729,13 +748,13 @@ function blend_v_8bpc_neon, export=1 sub v1.8b, v4.8b, v0.8b 4: ld1 {v2.8b}, [x2], #8 - ld1 {v3.s}[0], [x0] + ldr s3, [x0] ld1 {v3.s}[1], [x8] subs w4, w4, #2 umull v5.8h, v2.8b, v0.8b umlal v5.8h, v3.8b, v1.8b rshrn v5.8b, v5.8h, #6 - st1 {v5.h}[0], [x0], #2 + str h5, [x0], #2 st1 {v5.h}[2], [x8], #2 st1 {v5.b}[2], [x0], x1 st1 {v5.b}[6], [x8], x1 @@ -746,21 +765,22 @@ function blend_v_8bpc_neon, export=1 ld1r {v0.2d}, [x5] sub x1, x1, #4 sub v1.16b, v4.16b, v0.16b + zip2 v16.2d, v1.2d, v1.2d 8: ld1 {v2.16b}, [x2], #16 - ld1 {v3.d}[0], [x0] - ld1 {v3.d}[1], [x8] + ldr d3, [x0] + ldr d4, [x8] subs w4, w4, #2 umull v5.8h, v0.8b, v2.8b umlal v5.8h, v3.8b, v1.8b umull2 v6.8h, v0.16b, v2.16b - umlal2 v6.8h, v3.16b, v1.16b + umlal v6.8h, v4.8b, v16.8b rshrn v7.8b, v5.8h, #6 - rshrn2 v7.16b, v6.8h, #6 - st1 {v7.s}[0], [x0], #4 - st1 {v7.s}[2], [x8], #4 + rshrn v17.8b, v6.8h, #6 + str s7, [x0], #4 + str s17, [x8], #4 st1 {v7.h}[2], [x0], x1 - st1 {v7.h}[6], [x8], x1 + st1 {v17.h}[2], [x8], x1 b.gt 8b ret 160: @@ -826,21 +846,23 @@ function blend_v_8bpc_neon, export=1 st1 {v27.8b}, [x8], x1 b.gt 32b ret -L(blend_v_tbl): - .hword L(blend_v_tbl) - 320b - .hword L(blend_v_tbl) - 160b - .hword L(blend_v_tbl) - 80b - .hword L(blend_v_tbl) - 40b - .hword L(blend_v_tbl) - 20b endfunc +jumptable blend_v_tbl + .word 320b - blend_v_tbl + .word 160b - blend_v_tbl + .word 80b - blend_v_tbl + .word 40b - blend_v_tbl + .word 20b - blend_v_tbl +endjumptable + // This has got the same signature as the put_8tap functions, // and assumes that x8 is set to (clz(w)-24). function put_neon, export=1 - adr x9, L(put_tbl) - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, x8 + movrel x9, put_tbl + ldrsw x8, [x9, x8, lsl #2] + add x9, x9, x8 br x9 20: @@ -933,34 +955,39 @@ function put_neon, export=1 add x0, x0, x1 b.gt 128b ret - -L(put_tbl): - .hword L(put_tbl) - 1280b - .hword L(put_tbl) - 640b - .hword L(put_tbl) - 320b - .hword L(put_tbl) - 160b - .hword L(put_tbl) - 80b - .hword L(put_tbl) - 40b - .hword L(put_tbl) - 20b endfunc +jumptable put_tbl + .word 1280b - put_tbl + .word 640b - put_tbl + .word 320b - put_tbl + .word 160b - put_tbl + .word 80b - put_tbl + .word 40b - put_tbl + .word 20b - put_tbl +endjumptable + // This has got the same signature as the prep_8tap functions, // and assumes that x8 is set to (clz(w)-24), and x7 to w*2. 
function prep_neon, export=1 - adr x9, L(prep_tbl) - ldrh w8, [x9, x8, lsl #1] + movrel x9, prep_tbl + ldrsw x8, [x9, x8, lsl #2] movi v24.16b, #16 - sub x9, x9, x8 + add x9, x9, x8 br x9 40: AARCH64_VALID_JUMP_TARGET 4: - ld1 {v0.s}[0], [x1], x2 - ld1 {v0.s}[1], [x1], x2 - ld1 {v1.s}[0], [x1], x2 - ld1 {v1.s}[1], [x1], x2 + ldr s0, [x1] + ldr s2, [x1, x2] + add x1, x1, x2, lsl #1 + ldr s1, [x1] + ldr s3, [x1, x2] + add x1, x1, x2, lsl #1 + mov v0.s[1], v2.s[0] + mov v1.s[1], v3.s[0] ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 subs w4, w4, #4 @@ -1092,16 +1119,17 @@ function prep_neon, export=1 add x0, x0, #256 b.gt 128b ret - -L(prep_tbl): - .hword L(prep_tbl) - 1280b - .hword L(prep_tbl) - 640b - .hword L(prep_tbl) - 320b - .hword L(prep_tbl) - 160b - .hword L(prep_tbl) - 80b - .hword L(prep_tbl) - 40b endfunc +jumptable prep_tbl + .word 1280b - prep_tbl + .word 640b - prep_tbl + .word 320b - prep_tbl + .word 160b - prep_tbl + .word 80b - prep_tbl + .word 40b - prep_tbl +endjumptable + .macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 ld1 {\d0\wd}[0], [\s0], \strd @@ -1335,10 +1363,10 @@ endfunc .endif .endm .macro st_d strd, r0, r1 - st1 {\r0\().d}[0], [x0], \strd + st1 {\r0\().8b}, [x0], \strd st1 {\r0\().d}[1], [x8], \strd .ifnb \r1 - st1 {\r1\().d}[0], [x0], \strd + st1 {\r1\().8b}, [x0], \strd st1 {\r1\().d}[1], [x8], \strd .endif .endm @@ -1439,16 +1467,15 @@ L(\type\()_\taps\()_h): add \xmx, x10, \mx, uxtw #3 b.ne L(\type\()_\taps\()_hv) - adr x9, L(\type\()_\taps\()_h_tbl) - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + movrel x9, \type\()_\taps\()_h_tbl + ldrsw x8, [x9, x8, lsl #2] + add x9, x9, x8 br x9 20: // 2xN h AARCH64_VALID_JUMP_TARGET .ifc \type, put - add \xmx, \xmx, #2 - ld1 {v0.s}[0], [\xmx] + ldur s0, [\xmx, #2] sub \src, \src, #1 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -1481,8 +1508,7 @@ L(\type\()_\taps\()_h): 40: // 4xN h AARCH64_VALID_JUMP_TARGET - add \xmx, \xmx, #2 - ld1 {v0.s}[0], [\xmx] + ldur s0, [\xmx, #2] sub \src, \src, #1 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -1514,8 +1540,10 @@ L(\type\()_\taps\()_h): .ifc \type, put sqrshrun v16.8b, v16.8h, #4 sqrshrun v20.8b, v20.8h, #4 - st1 {v16.s}[0], [\dst], \d_strd - st1 {v20.s}[0], [\ds2], \d_strd + str s16, [\dst] + str s20, [\ds2] + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd .else st1 {v16.4h}, [\dst], \d_strd st1 {v20.4h}, [\ds2], \d_strd @@ -1526,7 +1554,11 @@ L(\type\()_\taps\()_h): 80: // 8xN h AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] +.ifc \taps, 6tap + sub \src, \src, #2 +.else sub \src, \src, #3 +.endif add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 @@ -1541,25 +1573,23 @@ L(\type\()_\taps\()_h): uxtl v21.8h, v21.8b .ifc \taps, 6tap - ext v19.16b, v16.16b, v17.16b, #2 - ext v23.16b, v20.16b, v21.16b, #2 - mul v18.8h, v19.8h, v0.h[1] - mul v22.8h, v23.8h, v0.h[1] -.irpc i, 23456 - ext v19.16b, v16.16b, v17.16b, #(2*\i) - ext v23.16b, v20.16b, v21.16b, #(2*\i) + mul v18.8h, v16.8h, v0.h[1] + mul v22.8h, v20.8h, v0.h[1] + .irpc i, 23456 + ext v19.16b, v16.16b, v17.16b, #(2*\i-2) + ext v23.16b, v20.16b, v21.16b, #(2*\i-2) mla v18.8h, v19.8h, v0.h[\i] mla v22.8h, v23.8h, v0.h[\i] -.endr + .endr .else // 8tap mul v18.8h, v16.8h, v0.h[0] mul v22.8h, v20.8h, v0.h[0] -.irpc i, 1234567 + .irpc i, 1234567 ext v19.16b, v16.16b, v17.16b, #(2*\i) ext v23.16b, v20.16b, v21.16b, #(2*\i) mla v18.8h, v19.8h, v0.h[\i] mla v22.8h, v23.8h, v0.h[\i] -.endr + .endr .endif subs \h, \h, #2 srshr v18.8h, v18.8h, #2 @@ -1581,7 +1611,11 @@ 
L(\type\()_\taps\()_h): 1280: // 16xN, 32xN, ... h AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] +.ifc \taps, 6tap + sub \src, \src, #2 +.else sub \src, \src, #3 +.endif add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 @@ -1606,30 +1640,26 @@ L(\type\()_\taps\()_h): 16: .ifc \taps, 6tap - ext v28.16b, v16.16b, v17.16b, #2 - ext v29.16b, v17.16b, v18.16b, #2 - ext v30.16b, v20.16b, v21.16b, #2 - ext v31.16b, v21.16b, v22.16b, #2 - mul v24.8h, v28.8h, v0.h[1] - mul v25.8h, v29.8h, v0.h[1] - mul v26.8h, v30.8h, v0.h[1] - mul v27.8h, v31.8h, v0.h[1] -.irpc i, 23456 - ext v28.16b, v16.16b, v17.16b, #(2*\i) - ext v29.16b, v17.16b, v18.16b, #(2*\i) - ext v30.16b, v20.16b, v21.16b, #(2*\i) - ext v31.16b, v21.16b, v22.16b, #(2*\i) + mul v24.8h, v16.8h, v0.h[1] + mul v25.8h, v17.8h, v0.h[1] + mul v26.8h, v20.8h, v0.h[1] + mul v27.8h, v21.8h, v0.h[1] + .irpc i, 23456 + ext v28.16b, v16.16b, v17.16b, #(2*\i-2) + ext v29.16b, v17.16b, v18.16b, #(2*\i-2) + ext v30.16b, v20.16b, v21.16b, #(2*\i-2) + ext v31.16b, v21.16b, v22.16b, #(2*\i-2) mla v24.8h, v28.8h, v0.h[\i] mla v25.8h, v29.8h, v0.h[\i] mla v26.8h, v30.8h, v0.h[\i] mla v27.8h, v31.8h, v0.h[\i] -.endr + .endr .else // 8tap mul v24.8h, v16.8h, v0.h[0] mul v25.8h, v17.8h, v0.h[0] mul v26.8h, v20.8h, v0.h[0] mul v27.8h, v21.8h, v0.h[0] -.irpc i, 1234567 + .irpc i, 1234567 ext v28.16b, v16.16b, v17.16b, #(2*\i) ext v29.16b, v17.16b, v18.16b, #(2*\i) ext v30.16b, v20.16b, v21.16b, #(2*\i) @@ -1638,7 +1668,7 @@ L(\type\()_\taps\()_h): mla v25.8h, v29.8h, v0.h[\i] mla v26.8h, v30.8h, v0.h[\i] mla v27.8h, v31.8h, v0.h[\i] -.endr + .endr .endif srshr v24.8h, v24.8h, #2 srshr v25.8h, v25.8h, #2 @@ -1677,19 +1707,19 @@ L(\type\()_\taps\()_h): subs \h, \h, #2 b.gt 161b ret +endfunc -L(\type\()_\taps\()_h_tbl): - .hword L(\type\()_\taps\()_h_tbl) - 1280b - .hword L(\type\()_\taps\()_h_tbl) - 640b - .hword L(\type\()_\taps\()_h_tbl) - 320b - .hword L(\type\()_\taps\()_h_tbl) - 160b - .hword L(\type\()_\taps\()_h_tbl) - 80b - .hword L(\type\()_\taps\()_h_tbl) - 40b - .hword L(\type\()_\taps\()_h_tbl) - 20b - .hword 0 - - -L(\type\()_\taps\()_v): +jumptable \type\()_\taps\()_h_tbl + .word 1280b - \type\()_\taps\()_h_tbl + .word 640b - \type\()_\taps\()_h_tbl + .word 320b - \type\()_\taps\()_h_tbl + .word 160b - \type\()_\taps\()_h_tbl + .word 80b - \type\()_\taps\()_h_tbl + .word 40b - \type\()_\taps\()_h_tbl + .word 20b - \type\()_\taps\()_h_tbl +endjumptable + +function L(\type\()_\taps\()_v) cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f @@ -1698,9 +1728,9 @@ L(\type\()_\taps\()_v): 4: add \xmy, x10, \my, uxtw #3 - adr x9, L(\type\()_\taps\()_v_tbl) - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + movrel x9, \type\()_\taps\()_v_tbl + ldrsw x8, [x9, x8, lsl #2] + add x9, x9, x8 br x9 20: // 2xN v @@ -1709,8 +1739,7 @@ L(\type\()_\taps\()_v): b.gt 28f cmp \h, #2 - add \xmy, \xmy, #2 - ld1 {v0.s}[0], [\xmy] + ldur s0, [\xmy, #2] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -1789,8 +1818,7 @@ L(\type\()_\taps\()_v): // 4x2, 4x4 v cmp \h, #2 - add \xmy, \xmy, #2 - ld1 {v0.s}[0], [\xmy] + ldur s0, [\xmy, #2] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -1865,8 +1893,7 @@ L(\type\()_\taps\()_v): // 8x2, 8x4 v cmp \h, #2 - add \xmy, \xmy, #2 - ld1 {v0.s}[0], [\xmy] + ldur s0, [\xmy, #2] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -1964,8 +1991,7 @@ L(\type\()_\taps\()_v): b.gt 1680b // 16x2, 16x4 v - add \xmy, \xmy, #2 - ld1 {v0.s}[0], [\xmy] + 
ldur s0, [\xmy, #2] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -2003,18 +2029,19 @@ L(\type\()_\taps\()_v): shift_store_16 \type, \d_strd, v1, v2, v3, v4 0: ret +endfunc -L(\type\()_\taps\()_v_tbl): - .hword L(\type\()_\taps\()_v_tbl) - 1280b - .hword L(\type\()_\taps\()_v_tbl) - 640b - .hword L(\type\()_\taps\()_v_tbl) - 320b - .hword L(\type\()_\taps\()_v_tbl) - 160b - .hword L(\type\()_\taps\()_v_tbl) - 80b - .hword L(\type\()_\taps\()_v_tbl) - 40b - .hword L(\type\()_\taps\()_v_tbl) - 20b - .hword 0 - -L(\type\()_\taps\()_hv): +jumptable \type\()_\taps\()_v_tbl + .word 1280b - \type\()_\taps\()_v_tbl + .word 640b - \type\()_\taps\()_v_tbl + .word 320b - \type\()_\taps\()_v_tbl + .word 160b - \type\()_\taps\()_v_tbl + .word 80b - \type\()_\taps\()_v_tbl + .word 40b - \type\()_\taps\()_v_tbl + .word 20b - \type\()_\taps\()_v_tbl +endjumptable + +function L(\type\()_\taps\()_hv) cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f @@ -2023,19 +2050,17 @@ L(\type\()_\taps\()_hv): 4: add \xmy, x10, \my, uxtw #3 - adr x9, L(\type\()_\taps\()_hv_tbl) - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + movrel x9, \type\()_\taps\()_hv_tbl + ldrsw x8, [x9, x8, lsl #2] + add x9, x9, x8 br x9 20: AARCH64_VALID_JUMP_TARGET .ifc \type, put - add \xmx, \xmx, #2 - ld1 {v0.s}[0], [\xmx] + ldur s0, [\xmx, #2] b.gt 280f - add \xmy, \xmy, #2 - ld1 {v1.s}[0], [\xmy] + ldur s1, [\xmy, #2] // 2x2, 2x4 hv sub \sr2, \src, #1 @@ -2169,11 +2194,9 @@ L(\type\()_\taps\()_filter_2): 40: AARCH64_VALID_JUMP_TARGET - add \xmx, \xmx, #2 - ld1 {v0.s}[0], [\xmx] + ldur s0, [\xmx, #2] b.gt 480f - add \xmy, \xmy, #2 - ld1 {v1.s}[0], [\xmy] + ldur s1, [\xmy, #2] sub \sr2, \src, #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd @@ -2218,8 +2241,10 @@ L(\type\()_\taps\()_filter_2): .ifc \type, put sqxtun v2.8b, v2.8h sqxtun v3.8b, v3.8h - st1 {v2.s}[0], [\dst], \d_strd - st1 {v3.s}[0], [\ds2], \d_strd + str s2, [\dst] + str s3, [\ds2] + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd .else st1 {v2.4h}, [\dst], \d_strd st1 {v3.4h}, [\ds2], \d_strd @@ -2311,8 +2336,10 @@ L(\type\()_\taps\()_filter_2): .ifc \type, put sqxtun v2.8b, v2.8h sqxtun v3.8b, v3.8h - st1 {v2.s}[0], [\dst], \d_strd - st1 {v3.s}[0], [\ds2], \d_strd + str s2, [\dst] + str s3, [\ds2] + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd .else st1 {v2.4h}, [\dst], \d_strd st1 {v3.4h}, [\ds2], \d_strd @@ -2359,10 +2386,13 @@ L(\type\()_\taps\()_filter_4): 320: AARCH64_VALID_JUMP_TARGET b.gt 880f - add \xmy, \xmy, #2 ld1 {v0.8b}, [\xmx] - ld1 {v1.s}[0], [\xmy] + ldur s1, [\xmy, #2] +.ifc \taps, 6tap + sub \src, \src, #2 +.else sub \src, \src, #3 +.endif sub \src, \src, \s_strd sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b @@ -2440,8 +2470,10 @@ L(\type\()_\taps\()_filter_4): AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] +.ifc \taps, 6tap + sub \src, \src, #2 +.else sub \src, \src, #3 -.ifc \taps, 8tap sub \src, \src, \s_strd .endif sub \src, \src, \s_strd, lsl #1 @@ -2585,17 +2617,15 @@ L(\type\()_\taps\()_filter_8_first): uxtl v28.8h, v28.8b uxtl v29.8h, v29.8b .ifc \taps, 6tap - ext v24.16b, v28.16b, v29.16b, #(2*1) - ext v25.16b, v28.16b, v29.16b, #(2*2) - ext v26.16b, v28.16b, v29.16b, #(2*3) - ext v27.16b, v28.16b, v29.16b, #(2*4) - mul v16.8h, v24.8h, v0.h[1] + mul v16.8h, v28.8h, v0.h[1] + ext v25.16b, v28.16b, v29.16b, #(2*1) + ext v26.16b, v28.16b, v29.16b, #(2*2) + ext v27.16b, v28.16b, v29.16b, #(2*3) mla v16.8h, v25.8h, v0.h[2] mla v16.8h, v26.8h, v0.h[3] mla v16.8h, v27.8h, v0.h[4] - ext v24.16b, v28.16b, 
v29.16b, #(2*5) - ext v25.16b, v28.16b, v29.16b, #(2*6) - ext v26.16b, v28.16b, v29.16b, #(2*7) + ext v24.16b, v28.16b, v29.16b, #(2*4) + ext v25.16b, v28.16b, v29.16b, #(2*5) mla v16.8h, v24.8h, v0.h[5] mla v16.8h, v25.8h, v0.h[6] .else // 8tap @@ -2626,40 +2656,38 @@ L(\type\()_\taps\()_filter_8): uxtl v30.8h, v30.8b uxtl v31.8h, v31.8b .ifc \taps, 6tap - ext v26.16b, v28.16b, v29.16b, #2 - ext v27.16b, v30.16b, v31.16b, #2 - mul v24.8h, v26.8h, v0.h[1] - mul v25.8h, v27.8h, v0.h[1] -.irpc i, 23456 - ext v26.16b, v28.16b, v29.16b, #(2*\i) - ext v27.16b, v30.16b, v31.16b, #(2*\i) + mul v24.8h, v28.8h, v0.h[1] + mul v25.8h, v30.8h, v0.h[1] + .irpc i, 23456 + ext v26.16b, v28.16b, v29.16b, #(2*\i-2) + ext v27.16b, v30.16b, v31.16b, #(2*\i-2) mla v24.8h, v26.8h, v0.h[\i] mla v25.8h, v27.8h, v0.h[\i] -.endr + .endr .else // 8tap mul v24.8h, v28.8h, v0.h[0] mul v25.8h, v30.8h, v0.h[0] -.irpc i, 1234567 + .irpc i, 1234567 ext v26.16b, v28.16b, v29.16b, #(2*\i) ext v27.16b, v30.16b, v31.16b, #(2*\i) mla v24.8h, v26.8h, v0.h[\i] mla v25.8h, v27.8h, v0.h[\i] -.endr + .endr .endif srshr v24.8h, v24.8h, #2 srshr v25.8h, v25.8h, #2 ret - -L(\type\()_\taps\()_hv_tbl): - .hword L(\type\()_\taps\()_hv_tbl) - 1280b - .hword L(\type\()_\taps\()_hv_tbl) - 640b - .hword L(\type\()_\taps\()_hv_tbl) - 320b - .hword L(\type\()_\taps\()_hv_tbl) - 160b - .hword L(\type\()_\taps\()_hv_tbl) - 80b - .hword L(\type\()_\taps\()_hv_tbl) - 40b - .hword L(\type\()_\taps\()_hv_tbl) - 20b - .hword 0 endfunc + +jumptable \type\()_\taps\()_hv_tbl + .word 1280b - \type\()_\taps\()_hv_tbl + .word 640b - \type\()_\taps\()_hv_tbl + .word 320b - \type\()_\taps\()_hv_tbl + .word 160b - \type\()_\taps\()_hv_tbl + .word 80b - \type\()_\taps\()_hv_tbl + .word 40b - \type\()_\taps\()_hv_tbl + .word 20b - \type\()_\taps\()_hv_tbl +endjumptable .endm @@ -2686,9 +2714,9 @@ function \type\()_bilin_8bpc_neon, export=1 L(\type\()_bilin_h): cbnz \my, L(\type\()_bilin_hv) - adr x9, L(\type\()_bilin_h_tbl) - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + movrel x9, \type\()_bilin_h_tbl + ldrsw x8, [x9, x8, lsl #2] + add x9, x9, x8 br x9 20: // 2xN h @@ -2699,8 +2727,8 @@ L(\type\()_bilin_h): lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 2: - ld1 {v4.s}[0], [\src], \s_strd - ld1 {v6.s}[0], [\sr2], \s_strd + ld1r {v4.4s}, [\src], \s_strd + ld1r {v6.4s}, [\sr2], \s_strd ext v5.8b, v4.8b, v4.8b, #1 ext v7.8b, v6.8b, v6.8b, #1 trn1 v4.4h, v4.4h, v6.4h @@ -2736,7 +2764,7 @@ L(\type\()_bilin_h): st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd .else - st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.8b}, [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd .endif b.gt 4b @@ -2831,23 +2859,24 @@ L(\type\()_bilin_h): subs \h, \h, #2 b.gt 161b ret +endfunc -L(\type\()_bilin_h_tbl): - .hword L(\type\()_bilin_h_tbl) - 1280b - .hword L(\type\()_bilin_h_tbl) - 640b - .hword L(\type\()_bilin_h_tbl) - 320b - .hword L(\type\()_bilin_h_tbl) - 160b - .hword L(\type\()_bilin_h_tbl) - 80b - .hword L(\type\()_bilin_h_tbl) - 40b - .hword L(\type\()_bilin_h_tbl) - 20b - .hword 0 +jumptable \type\()_bilin_h_tbl + .word 1280b - \type\()_bilin_h_tbl + .word 640b - \type\()_bilin_h_tbl + .word 320b - \type\()_bilin_h_tbl + .word 160b - \type\()_bilin_h_tbl + .word 80b - \type\()_bilin_h_tbl + .word 40b - \type\()_bilin_h_tbl + .word 20b - \type\()_bilin_h_tbl +endjumptable -L(\type\()_bilin_v): +function L(\type\()_bilin_v) cmp \h, #4 - adr x9, L(\type\()_bilin_v_tbl) - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + movrel x9, \type\()_bilin_v_tbl + ldrsw x8, 
[x9, x8, lsl #2] + add x9, x9, x8 br x9 20: // 2xN v @@ -2860,24 +2889,24 @@ L(\type\()_bilin_v): lsl \d_strd, \d_strd, #1 // 2x2 v - ld1 {v16.h}[0], [\src], \s_strd + ld1r {v16.8h}, [\src], \s_strd b.gt 24f 22: - ld1 {v17.h}[0], [\sr2], \s_strd - ld1 {v18.h}[0], [\src], \s_strd + ld1r {v17.8h}, [\sr2], \s_strd + ld1r {v18.8h}, [\src], \s_strd trn1 v16.4h, v16.4h, v17.4h trn1 v17.4h, v17.4h, v18.4h umull v4.8h, v16.8b, v2.8b umlal v4.8h, v17.8b, v3.8b uqrshrn v4.8b, v4.8h, #4 - st1 {v4.h}[0], [\dst] + str h4, [\dst] st1 {v4.h}[1], [\ds2] ret 24: // 2x4, 2x6, 2x8, ... v - ld1 {v17.h}[0], [\sr2], \s_strd - ld1 {v18.h}[0], [\src], \s_strd - ld1 {v19.h}[0], [\sr2], \s_strd - ld1 {v20.h}[0], [\src], \s_strd + ld1r {v17.8h}, [\sr2], \s_strd + ld1r {v18.8h}, [\src], \s_strd + ld1r {v19.8h}, [\sr2], \s_strd + ld1r {v20.8h}, [\src], \s_strd sub \h, \h, #4 trn1 v16.4h, v16.4h, v17.4h trn1 v17.4h, v17.4h, v18.4h @@ -2907,10 +2936,10 @@ L(\type\()_bilin_v): add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 - ld1 {v16.s}[0], [\src], \s_strd + ld1r {v16.4s}, [\src], \s_strd 4: - ld1 {v17.s}[0], [\sr2], \s_strd - ld1 {v18.s}[0], [\src], \s_strd + ld1r {v17.4s}, [\sr2], \s_strd + ld1r {v18.4s}, [\src], \s_strd trn1 v16.2s, v16.2s, v17.2s trn1 v17.2s, v17.2s, v18.2s umull v4.8h, v16.8b, v2.8b @@ -2921,7 +2950,7 @@ L(\type\()_bilin_v): st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd .else - st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.8b}, [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd .endif b.le 0f @@ -3017,23 +3046,24 @@ L(\type\()_bilin_v): b 1b 0: ret +endfunc -L(\type\()_bilin_v_tbl): - .hword L(\type\()_bilin_v_tbl) - 1280b - .hword L(\type\()_bilin_v_tbl) - 640b - .hword L(\type\()_bilin_v_tbl) - 320b - .hword L(\type\()_bilin_v_tbl) - 160b - .hword L(\type\()_bilin_v_tbl) - 80b - .hword L(\type\()_bilin_v_tbl) - 40b - .hword L(\type\()_bilin_v_tbl) - 20b - .hword 0 - -L(\type\()_bilin_hv): +jumptable \type\()_bilin_v_tbl + .word 1280b - \type\()_bilin_v_tbl + .word 640b - \type\()_bilin_v_tbl + .word 320b - \type\()_bilin_v_tbl + .word 160b - \type\()_bilin_v_tbl + .word 80b - \type\()_bilin_v_tbl + .word 40b - \type\()_bilin_v_tbl + .word 20b - \type\()_bilin_v_tbl +endjumptable + +function L(\type\()_bilin_hv) uxtl v2.8h, v2.8b uxtl v3.8h, v3.8b - adr x9, L(\type\()_bilin_hv_tbl) - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + movrel x9, \type\()_bilin_hv_tbl + ldrsw x8, [x9, x8, lsl #2] + add x9, x9, x8 br x9 20: // 2xN hv @@ -3044,14 +3074,14 @@ L(\type\()_bilin_hv): lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 - ld1 {v28.s}[0], [\src], \s_strd + ld1r {v28.4s}, [\src], \s_strd ext v29.8b, v28.8b, v28.8b, #1 umull v16.8h, v28.8b, v0.8b umlal v16.8h, v29.8b, v1.8b 2: - ld1 {v28.s}[0], [\sr2], \s_strd - ld1 {v30.s}[0], [\src], \s_strd + ld1r {v28.4s}, [\sr2], \s_strd + ld1r {v30.4s}, [\src], \s_strd ext v29.8b, v28.8b, v28.8b, #1 ext v31.8b, v30.8b, v30.8b, #1 trn1 v28.4h, v28.4h, v30.4h @@ -3107,7 +3137,7 @@ L(\type\()_bilin_hv): st1 {v4.s}[1], [\ds2], \d_strd .else urshr v4.8h, v4.8h, #4 - st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.8b}, [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd .endif b.le 0f @@ -3182,17 +3212,17 @@ L(\type\()_bilin_hv): b 1b 0: ret - -L(\type\()_bilin_hv_tbl): - .hword L(\type\()_bilin_hv_tbl) - 1280b - .hword L(\type\()_bilin_hv_tbl) - 640b - .hword L(\type\()_bilin_hv_tbl) - 320b - .hword L(\type\()_bilin_hv_tbl) - 160b - .hword L(\type\()_bilin_hv_tbl) - 80b - .hword L(\type\()_bilin_hv_tbl) - 40b - .hword 
L(\type\()_bilin_hv_tbl) - 20b - .hword 0 endfunc + +jumptable \type\()_bilin_hv_tbl + .word 1280b - \type\()_bilin_hv_tbl + .word 640b - \type\()_bilin_hv_tbl + .word 320b - \type\()_bilin_hv_tbl + .word 160b - \type\()_bilin_hv_tbl + .word 80b - \type\()_bilin_hv_tbl + .word 40b - \type\()_bilin_hv_tbl + .word 20b - \type\()_bilin_hv_tbl +endjumptable .endm make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap diff --git a/src/arm/64/mc16.S b/src/arm/64/mc16.S index 576fab158acca6e8bafec0f0dcbf4f1ad8704f3e..66cdfff744e70e4fe057013d14f68b772865707a 100644 --- a/src/arm/64/mc16.S +++ b/src/arm/64/mc16.S @@ -145,11 +145,11 @@ function \type\()_16bpc_neon, export=1 dup v27.4s, w6 neg v27.4s, v27.4s .endif - adr x7, L(\type\()_tbl) + movrel x7, \type\()_tbl sub w4, w4, #24 \type v4, v5, v0, v1, v2, v3 - ldrh w4, [x7, x4, lsl #1] - sub x7, x7, w4, uxtw + ldrsw x4, [x7, x4, lsl #2] + add x7, x7, x4 br x7 40: AARCH64_VALID_JUMP_TARGET @@ -157,9 +157,9 @@ function \type\()_16bpc_neon, export=1 lsl x1, x1, #1 4: subs w5, w5, #4 - st1 {v4.d}[0], [x0], x1 + st1 {v4.8b}, [x0], x1 st1 {v4.d}[1], [x7], x1 - st1 {v5.d}[0], [x0], x1 + st1 {v5.8b}, [x0], x1 st1 {v5.d}[1], [x7], x1 b.le 0f \type v4, v5, v0, v1, v2, v3 @@ -175,8 +175,9 @@ function \type\()_16bpc_neon, export=1 b.le 0f \type v4, v5, v0, v1, v2, v3 b 8b -16: +160: AARCH64_VALID_JUMP_TARGET +16: \type v6, v7, v0, v1, v2, v3 st1 {v4.8h, v5.8h}, [x0], x1 subs w5, w5, #2 @@ -184,8 +185,9 @@ function \type\()_16bpc_neon, export=1 b.le 0f \type v4, v5, v0, v1, v2, v3 b 16b -32: +320: AARCH64_VALID_JUMP_TARGET +32: \type v6, v7, v0, v1, v2, v3 subs w5, w5, #1 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 @@ -228,14 +230,16 @@ function \type\()_16bpc_neon, export=1 b 128b 0: ret -L(\type\()_tbl): - .hword L(\type\()_tbl) - 1280b - .hword L(\type\()_tbl) - 640b - .hword L(\type\()_tbl) - 32b - .hword L(\type\()_tbl) - 16b - .hword L(\type\()_tbl) - 80b - .hword L(\type\()_tbl) - 40b endfunc + +jumptable \type\()_tbl + .word 1280b - \type\()_tbl + .word 640b - \type\()_tbl + .word 320b - \type\()_tbl + .word 160b - \type\()_tbl + .word 80b - \type\()_tbl + .word 40b - \type\()_tbl +endjumptable .endm bidir_fn avg, w6 @@ -247,12 +251,12 @@ bidir_fn mask, w7 function w_mask_\type\()_16bpc_neon, export=1 ldr w8, [sp] clz w9, w4 - adr x10, L(w_mask_\type\()_tbl) + movrel x10, w_mask_\type\()_tbl dup v31.8h, w8 // bitdepth_max sub w9, w9, #24 clz w8, w8 // clz(bitdepth_max) - ldrh w9, [x10, x9, lsl #1] - sub x10, x10, w9, uxtw + ldrsw x9, [x10, x9, lsl #2] + add x10, x10, x9 sub w8, w8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12 mov w9, #PREP_BIAS*64 neg w8, w8 // -sh @@ -274,8 +278,9 @@ function w_mask_\type\()_16bpc_neon, export=1 add x12, x0, x1 lsl x1, x1, #1 br x10 -4: +40: AARCH64_VALID_JUMP_TARGET +4: ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once) ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once) subs w5, w5, #4 @@ -331,16 +336,17 @@ function w_mask_\type\()_16bpc_neon, export=1 addp v20.8h, v24.8h, v24.8h // (128 - m) + (128 - n) (column wise addition) sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n)) rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 - st1 {v20.s}[0], [x6], #4 + str s20, [x6], #4 .endif - st1 {v4.d}[0], [x0], x1 + st1 {v4.8b}, [x0], x1 st1 {v4.d}[1], [x12], x1 - st1 {v5.d}[0], [x0], x1 + st1 {v5.8b}, [x0], x1 st1 {v5.d}[1], [x12], x1 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET +8: ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 ld1 {v6.8h, v7.8h}, [x3], #32 // 
tmp2 subs w5, w5, #2 @@ -394,7 +400,7 @@ function w_mask_\type\()_16bpc_neon, export=1 addp v20.8h, v20.8h, v20.8h // (128 - m) + (128 - n) (column wise addition) sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n)) rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 - st1 {v20.s}[0], [x6], #4 + str s20, [x6], #4 .endif st1 {v4.8h}, [x0], x1 st1 {v5.8h}, [x12], x1 @@ -541,14 +547,16 @@ function w_mask_\type\()_16bpc_neon, export=1 add x12, x12, x1 b.gt 161b ret -L(w_mask_\type\()_tbl): - .hword L(w_mask_\type\()_tbl) - 1280b - .hword L(w_mask_\type\()_tbl) - 640b - .hword L(w_mask_\type\()_tbl) - 320b - .hword L(w_mask_\type\()_tbl) - 160b - .hword L(w_mask_\type\()_tbl) - 8b - .hword L(w_mask_\type\()_tbl) - 4b endfunc + +jumptable w_mask_\type\()_tbl + .word 1280b - w_mask_\type\()_tbl + .word 640b - w_mask_\type\()_tbl + .word 320b - w_mask_\type\()_tbl + .word 160b - w_mask_\type\()_tbl + .word 80b - w_mask_\type\()_tbl + .word 40b - w_mask_\type\()_tbl +endjumptable .endm w_mask_fn 444 @@ -557,11 +565,11 @@ w_mask_fn 420 function blend_16bpc_neon, export=1 - adr x6, L(blend_tbl) + movrel x6, blend_tbl clz w3, w3 sub w3, w3, #26 - ldrh w3, [x6, x3, lsl #1] - sub x6, x6, w3, uxtw + ldrsw x3, [x6, x3, lsl #2] + add x6, x6, x3 add x8, x0, x1 br x6 40: @@ -570,7 +578,7 @@ function blend_16bpc_neon, export=1 4: ld1 {v2.8b}, [x5], #8 ld1 {v1.8h}, [x2], #16 - ld1 {v0.d}[0], [x0] + ldr d0, [x0] neg v2.8b, v2.8b // -m subs w4, w4, #2 ld1 {v0.d}[1], [x8] @@ -579,7 +587,7 @@ function blend_16bpc_neon, export=1 sub v1.8h, v0.8h, v1.8h // a - b sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 add v0.8h, v0.8h, v1.8h - st1 {v0.d}[0], [x0], x1 + st1 {v0.8b}, [x0], x1 st1 {v0.d}[1], [x8], x1 b.gt 4b ret @@ -642,8 +650,9 @@ function blend_16bpc_neon, export=1 st1 {v2.8h, v3.8h}, [x8], x1 b.gt 16b ret -32: +320: AARCH64_VALID_JUMP_TARGET +32: ld1 {v16.16b, v17.16b}, [x5], #32 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 subs w4, w4, #1 @@ -673,15 +682,17 @@ function blend_16bpc_neon, export=1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 b.gt 32b ret -L(blend_tbl): - .hword L(blend_tbl) - 32b - .hword L(blend_tbl) - 160b - .hword L(blend_tbl) - 80b - .hword L(blend_tbl) - 40b endfunc +jumptable blend_tbl + .word 320b - blend_tbl + .word 160b - blend_tbl + .word 80b - blend_tbl + .word 40b - blend_tbl +endjumptable + function blend_h_16bpc_neon, export=1 - adr x6, L(blend_h_tbl) + movrel x6, blend_h_tbl movrel x5, X(obmc_masks) add x5, x5, w4, uxtw sub w4, w4, w4, lsr #2 @@ -689,17 +700,18 @@ function blend_h_16bpc_neon, export=1 add x8, x0, x1 lsl x1, x1, #1 sub w7, w7, #24 - ldrh w7, [x6, x7, lsl #1] - sub x6, x6, w7, uxtw + ldrsw x7, [x6, x7, lsl #2] + add x6, x6, x7 br x6 -2: +20: AARCH64_VALID_JUMP_TARGET +2: ld2r {v2.8b, v3.8b}, [x5], #2 ld1 {v1.4h}, [x2], #8 ext v2.8b, v2.8b, v3.8b, #6 subs w4, w4, #2 neg v2.8b, v2.8b // -m - ld1 {v0.s}[0], [x0] + ldr s0, [x0] ld1 {v0.s}[1], [x8] sxtl v2.8h, v2.8b shl v2.4h, v2.4h, #9 // -m << 9 @@ -710,26 +722,28 @@ function blend_h_16bpc_neon, export=1 st1 {v0.s}[1], [x8], x1 b.gt 2b ret -4: +40: AARCH64_VALID_JUMP_TARGET +4: ld2r {v2.8b, v3.8b}, [x5], #2 ld1 {v1.8h}, [x2], #16 ext v2.8b, v2.8b, v3.8b, #4 subs w4, w4, #2 neg v2.8b, v2.8b // -m - ld1 {v0.d}[0], [x0] + ldr d0, [x0] ld1 {v0.d}[1], [x8] sxtl v2.8h, v2.8b shl v2.8h, v2.8h, #9 // -m << 9 sub v1.8h, v0.8h, v1.8h // a - b sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 add v0.8h, v0.8h, v1.8h - st1 {v0.d}[0], [x0], x1 + st1 {v0.8b}, [x0], x1 st1 {v0.d}[1], 
[x8], x1 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET +8: ld2r {v4.8b, v5.8b}, [x5], #2 ld1 {v2.8h, v3.8h}, [x2], #32 neg v4.8b, v4.8b // -m @@ -751,8 +765,9 @@ function blend_h_16bpc_neon, export=1 st1 {v1.8h}, [x8], x1 b.gt 8b ret -16: +160: AARCH64_VALID_JUMP_TARGET +16: ld2r {v16.8b, v17.8b}, [x5], #2 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 neg v16.8b, v16.8b // -m @@ -835,26 +850,28 @@ function blend_h_16bpc_neon, export=1 add x7, x7, w3, uxtw #1 b.gt 321b ret -L(blend_h_tbl): - .hword L(blend_h_tbl) - 1280b - .hword L(blend_h_tbl) - 640b - .hword L(blend_h_tbl) - 320b - .hword L(blend_h_tbl) - 16b - .hword L(blend_h_tbl) - 8b - .hword L(blend_h_tbl) - 4b - .hword L(blend_h_tbl) - 2b endfunc +jumptable blend_h_tbl + .word 1280b - blend_h_tbl + .word 640b - blend_h_tbl + .word 320b - blend_h_tbl + .word 160b - blend_h_tbl + .word 80b - blend_h_tbl + .word 40b - blend_h_tbl + .word 20b - blend_h_tbl +endjumptable + function blend_v_16bpc_neon, export=1 - adr x6, L(blend_v_tbl) + movrel x6, blend_v_tbl movrel x5, X(obmc_masks) add x5, x5, w3, uxtw clz w3, w3 add x8, x0, x1 lsl x1, x1, #1 sub w3, w3, #26 - ldrh w3, [x6, x3, lsl #1] - sub x6, x6, w3, uxtw + ldrsw x3, [x6, x3, lsl #2] + add x6, x6, x3 br x6 20: AARCH64_VALID_JUMP_TARGET @@ -863,8 +880,8 @@ function blend_v_16bpc_neon, export=1 sxtl v2.8h, v2.8b shl v2.4h, v2.4h, #9 // -m << 9 2: - ld1 {v1.s}[0], [x2], #4 - ld1 {v0.h}[0], [x0] + ldr s1, [x2], #4 + ldr h0, [x0] subs w4, w4, #2 ld1 {v1.h}[1], [x2] ld1 {v0.h}[1], [x8] @@ -885,13 +902,13 @@ function blend_v_16bpc_neon, export=1 shl v2.8h, v2.8h, #9 // -m << 9 4: ld1 {v1.8h}, [x2], #16 - ld1 {v0.d}[0], [x0] + ldr d0, [x0] ld1 {v0.d}[1], [x8] subs w4, w4, #2 sub v1.8h, v0.8h, v1.8h // a - b sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 add v0.8h, v0.8h, v1.8h - st1 {v0.s}[0], [x0], #4 + str s0, [x0], #4 st1 {v0.s}[2], [x8], #4 st1 {v0.h}[2], [x0], x1 st1 {v0.h}[6], [x8], x1 @@ -915,8 +932,8 @@ function blend_v_16bpc_neon, export=1 sqrdmulh v3.8h, v3.8h, v4.8h add v0.8h, v0.8h, v2.8h add v1.8h, v1.8h, v3.8h - st1 {v0.d}[0], [x0], #8 - st1 {v1.d}[0], [x8], #8 + str d0, [x0], #8 + str d1, [x8], #8 st1 {v0.s}[2], [x0], x1 st1 {v1.s}[2], [x8], x1 b.gt 8b @@ -992,34 +1009,38 @@ function blend_v_16bpc_neon, export=1 st1 {v4.8h, v5.8h, v6.8h}, [x8], x1 b.gt 32b ret -L(blend_v_tbl): - .hword L(blend_v_tbl) - 320b - .hword L(blend_v_tbl) - 160b - .hword L(blend_v_tbl) - 80b - .hword L(blend_v_tbl) - 40b - .hword L(blend_v_tbl) - 20b endfunc +jumptable blend_v_tbl + .word 320b - blend_v_tbl + .word 160b - blend_v_tbl + .word 80b - blend_v_tbl + .word 40b - blend_v_tbl + .word 20b - blend_v_tbl +endjumptable + // This has got the same signature as the put_8tap functions, // and assumes that x9 is set to (clz(w)-24). 
-function put_neon - adr x10, L(put_tbl) - ldrh w9, [x10, x9, lsl #1] - sub x10, x10, w9, uxtw +function put_16bpc_neon, export=1 + movrel x10, put_16bpc_tbl + ldrsw x9, [x10, x9, lsl #2] + add x10, x10, x9 br x10 -2: +20: AARCH64_VALID_JUMP_TARGET - ld1 {v0.s}[0], [x2], x3 - ld1 {v1.s}[0], [x2], x3 +2: + ld1r {v0.4s}, [x2], x3 + ld1r {v1.4s}, [x2], x3 subs w5, w5, #2 st1 {v0.s}[0], [x0], x1 st1 {v1.s}[0], [x0], x1 b.gt 2b ret -4: +40: AARCH64_VALID_JUMP_TARGET +4: ld1 {v0.4h}, [x2], x3 ld1 {v1.4h}, [x2], x3 subs w5, w5, #2 @@ -1041,8 +1062,9 @@ function put_neon st1 {v1.8h}, [x8], x1 b.gt 8b ret -16: +160: AARCH64_VALID_JUMP_TARGET +16: ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] @@ -1052,8 +1074,9 @@ function put_neon add x0, x0, x1 b.gt 16b ret -32: +320: AARCH64_VALID_JUMP_TARGET +32: ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] @@ -1067,8 +1090,9 @@ function put_neon add x0, x0, x1 b.gt 32b ret -64: +640: AARCH64_VALID_JUMP_TARGET +64: ldp q0, q1, [x2] ldp q2, q3, [x2, #32] stp q0, q1, [x0] @@ -1082,8 +1106,9 @@ function put_neon add x0, x0, x1 b.gt 64b ret -128: +1280: AARCH64_VALID_JUMP_TARGET +128: ldp q0, q1, [x2] ldp q2, q3, [x2, #32] stp q0, q1, [x0] @@ -1105,27 +1130,28 @@ function put_neon add x0, x0, x1 b.gt 128b ret - -L(put_tbl): - .hword L(put_tbl) - 128b - .hword L(put_tbl) - 64b - .hword L(put_tbl) - 32b - .hword L(put_tbl) - 16b - .hword L(put_tbl) - 80b - .hword L(put_tbl) - 4b - .hword L(put_tbl) - 2b endfunc +jumptable put_16bpc_tbl + .word 1280b - put_16bpc_tbl + .word 640b - put_16bpc_tbl + .word 320b - put_16bpc_tbl + .word 160b - put_16bpc_tbl + .word 80b - put_16bpc_tbl + .word 40b - put_16bpc_tbl + .word 20b - put_16bpc_tbl +endjumptable + // This has got the same signature as the prep_8tap functions, // and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and // x8 to w*2. 
-function prep_neon - adr x10, L(prep_tbl) - ldrh w9, [x10, x9, lsl #1] +function prep_16bpc_neon + movrel x10, prep_16bpc_tbl + ldrsw x9, [x10, x9, lsl #2] dup v31.8h, w7 // intermediate_bits movi v30.8h, #(PREP_BIAS >> 8), lsl #8 - sub x10, x10, w9, uxtw + add x10, x10, x9 br x10 40: @@ -1133,7 +1159,7 @@ function prep_neon add x9, x1, x2 lsl x2, x2, #1 4: - ld1 {v0.d}[0], [x1], x2 + ld1 {v0.8b}, [x1], x2 ld1 {v0.d}[1], [x9], x2 subs w4, w4, #2 sshl v0.8h, v0.8h, v31.8h @@ -1156,8 +1182,9 @@ function prep_neon st1 {v0.8h, v1.8h}, [x0], #32 b.gt 8b ret -16: +160: AARCH64_VALID_JUMP_TARGET +16: ldp q0, q1, [x1] add x1, x1, x2 sshl v0.8h, v0.8h, v31.8h @@ -1174,8 +1201,9 @@ function prep_neon st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 b.gt 16b ret -32: +320: AARCH64_VALID_JUMP_TARGET +32: ldp q0, q1, [x1] sshl v0.8h, v0.8h, v31.8h ldp q2, q3, [x1, #32] @@ -1191,8 +1219,9 @@ function prep_neon st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 b.gt 32b ret -64: +640: AARCH64_VALID_JUMP_TARGET +64: ldp q0, q1, [x1] subs w4, w4, #1 sshl v0.8h, v0.8h, v31.8h @@ -1222,8 +1251,9 @@ function prep_neon add x0, x0, x8 b.gt 64b ret -128: +1280: AARCH64_VALID_JUMP_TARGET +128: ldp q0, q1, [x1] subs w4, w4, #1 sshl v0.8h, v0.8h, v31.8h @@ -1277,16 +1307,17 @@ function prep_neon add x0, x0, x8 b.gt 128b ret - -L(prep_tbl): - .hword L(prep_tbl) - 128b - .hword L(prep_tbl) - 64b - .hword L(prep_tbl) - 32b - .hword L(prep_tbl) - 16b - .hword L(prep_tbl) - 80b - .hword L(prep_tbl) - 40b endfunc +jumptable prep_16bpc_tbl + .word 1280b - prep_16bpc_tbl + .word 640b - prep_16bpc_tbl + .word 320b - prep_16bpc_tbl + .word 160b - prep_16bpc_tbl + .word 80b - prep_16bpc_tbl + .word 40b - prep_16bpc_tbl +endjumptable + .macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 ld1 {\d0\wd}[0], [\s0], \strd @@ -1455,10 +1486,10 @@ endfunc .endif .endm .macro st_d strd, r0, r1 - st1 {\r0\().d}[0], [x0], \strd + st1 {\r0\().8b}, [x0], \strd st1 {\r0\().d}[1], [x9], \strd .ifnb \r1 - st1 {\r1\().d}[0], [x0], \strd + st1 {\r1\().8b}, [x0], \strd st1 {\r1\().d}[1], [x9], \strd .endif .endm @@ -1556,7 +1587,7 @@ function \type\()_\taps\()_neon b.ne L(\type\()_\taps\()_h) tst \my, #(0x7f << 14) b.ne L(\type\()_\taps\()_v) - b \type\()_neon + b \type\()_16bpc_neon L(\type\()_\taps\()_h): cmp \w, #4 @@ -1569,26 +1600,25 @@ L(\type\()_\taps\()_h): add \xmx, x11, \mx, uxtw #3 b.ne L(\type\()_\taps\()_hv) - adr x10, L(\type\()_\taps\()_h_tbl) - dup v30.4s, w12 // 6 - intermediate_bits - ldrh w9, [x10, x9, lsl #1] - neg v30.4s, v30.4s // -(6-intermediate_bits) + movrel x10, \type\()_\taps\()_h_tbl + ldrsw x9, [x10, x9, lsl #2] .ifc \type, put - dup v29.8h, \bdmax // intermediate_bits + mov w12, #34 // rounding for 10-bit + mov w13, #40 // rounding for 12-bit + cmp \bdmax, #2 // 10-bit: 4, 12-bit: 2 + csel w12, w12, w13, ne // select rounding based on \bdmax .else + neg w12, w12 // -(6 - intermediate_bits) movi v28.8h, #(PREP_BIAS >> 8), lsl #8 .endif - sub x10, x10, w9, uxtw -.ifc \type, put - neg v29.8h, v29.8h // -intermediate_bits -.endif + add x10, x10, x9 + dup v30.4s, w12 // rounding or shift amount br x10 20: // 2xN h AARCH64_VALID_JUMP_TARGET .ifc \type, put - add \xmx, \xmx, #2 - ld1 {v0.s}[0], [\xmx] + ldur s0, [\xmx, #2] sub \src, \src, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -1598,6 +1628,7 @@ L(\type\()_\taps\()_h): 2: ld1 {v4.8h}, [\src], \s_strd ld1 {v6.8h}, [\sr2], \s_strd + mov v2.16b, v30.16b ext v5.16b, v4.16b, v4.16b, #2 ext v7.16b, v6.16b, v6.16b, #2 subs \h, \h, #2 @@ -1605,24 +1636,21 @@ 
L(\type\()_\taps\()_h): trn2 v6.2s, v4.2s, v6.2s trn1 v4.2s, v5.2s, v7.2s trn2 v7.2s, v5.2s, v7.2s - smull v3.4s, v3.4h, v0.h[0] - smlal v3.4s, v4.4h, v0.h[1] - smlal v3.4s, v6.4h, v0.h[2] - smlal v3.4s, v7.4h, v0.h[3] - srshl v3.4s, v3.4s, v30.4s // -(6-intermediate_bits) - sqxtun v3.4h, v3.4s - srshl v3.4h, v3.4h, v29.4h // -intermediate_bits - umin v3.4h, v3.4h, v31.4h - st1 {v3.s}[0], [\dst], \d_strd - st1 {v3.s}[1], [\ds2], \d_strd + smlal v2.4s, v3.4h, v0.h[0] + smlal v2.4s, v4.4h, v0.h[1] + smlal v2.4s, v6.4h, v0.h[2] + smlal v2.4s, v7.4h, v0.h[3] + sqshrun v2.4h, v2.4s, #6 + umin v2.4h, v2.4h, v31.4h + st1 {v2.s}[0], [\dst], \d_strd + st1 {v2.s}[1], [\ds2], \d_strd b.gt 2b ret .endif 40: // 4xN h AARCH64_VALID_JUMP_TARGET - add \xmx, \xmx, #2 - ld1 {v0.s}[0], [\xmx] + ldur s0, [\xmx, #2] sub \src, \src, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -1632,6 +1660,10 @@ L(\type\()_\taps\()_h): 4: ld1 {v16.8h}, [\src], \s_strd ld1 {v20.8h}, [\sr2], \s_strd +.ifc \type, put + mov v2.16b, v30.16b + mov v3.16b, v30.16b +.endif ext v17.16b, v16.16b, v16.16b, #2 ext v18.16b, v16.16b, v16.16b, #4 ext v19.16b, v16.16b, v16.16b, #6 @@ -1639,26 +1671,33 @@ L(\type\()_\taps\()_h): ext v22.16b, v20.16b, v20.16b, #4 ext v23.16b, v20.16b, v20.16b, #6 subs \h, \h, #2 - smull v16.4s, v16.4h, v0.h[0] - smlal v16.4s, v17.4h, v0.h[1] - smlal v16.4s, v18.4h, v0.h[2] - smlal v16.4s, v19.4h, v0.h[3] - smull v20.4s, v20.4h, v0.h[0] - smlal v20.4s, v21.4h, v0.h[1] - smlal v20.4s, v22.4h, v0.h[2] - smlal v20.4s, v23.4h, v0.h[3] - srshl v16.4s, v16.4s, v30.4s // -(6-intermediate_bits) - srshl v20.4s, v20.4s, v30.4s // -(6-intermediate_bits) .ifc \type, put - sqxtun v16.4h, v16.4s - sqxtun2 v16.8h, v20.4s - srshl v16.8h, v16.8h, v29.8h // -intermediate_bits + smlal v2.4s, v16.4h, v0.h[0] +.else + smull v2.4s, v16.4h, v0.h[0] +.endif + smlal v2.4s, v17.4h, v0.h[1] + smlal v2.4s, v18.4h, v0.h[2] + smlal v2.4s, v19.4h, v0.h[3] +.ifc \type, put + smlal v3.4s, v20.4h, v0.h[0] +.else + smull v3.4s, v20.4h, v0.h[0] +.endif + smlal v3.4s, v21.4h, v0.h[1] + smlal v3.4s, v22.4h, v0.h[2] + smlal v3.4s, v23.4h, v0.h[3] +.ifc \type, put + sqshrun v16.4h, v2.4s, #6 + sqshrun2 v16.8h, v3.4s, #6 umin v16.8h, v16.8h, v31.8h .else + srshl v16.4s, v2.4s, v30.4s // -(6-intermediate_bits) + srshl v20.4s, v3.4s, v30.4s // -(6-intermediate_bits) uzp1 v16.8h, v16.8h, v20.8h // Same as xtn, xtn2 sub v16.8h, v16.8h, v28.8h // PREP_BIAS .endif - st1 {v16.d}[0], [\dst], \d_strd + st1 {v16.8b}, [\dst], \d_strd st1 {v16.d}[1], [\ds2], \d_strd b.gt 4b ret @@ -1670,7 +1709,11 @@ L(\type\()_\taps\()_h): 1280: // 8xN, 16xN, 32xN, ... 
h AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] +.ifc \taps, 6tap + sub \src, \src, #4 +.else sub \src, \src, #6 +.endif add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 @@ -1689,49 +1732,67 @@ L(\type\()_\taps\()_h): 8: .ifc \taps, 6tap - ext v24.16b, v16.16b, v17.16b, #2 - ext v25.16b, v20.16b, v21.16b, #2 - smull v18.4s, v24.4h, v0.h[1] - smull2 v19.4s, v24.8h, v0.h[1] - smull v22.4s, v25.4h, v0.h[1] - smull2 v23.4s, v25.8h, v0.h[1] -.irpc i, 23456 - ext v24.16b, v16.16b, v17.16b, #(2*\i) - ext v25.16b, v20.16b, v21.16b, #(2*\i) + .ifc \type, put + mov v18.16b, v30.16b + mov v19.16b, v30.16b + smlal v18.4s, v16.4h, v0.h[1] + smlal2 v19.4s, v16.8h, v0.h[1] + mov v22.16b, v30.16b + mov v23.16b, v30.16b + smlal v22.4s, v20.4h, v0.h[1] + smlal2 v23.4s, v20.8h, v0.h[1] + .else + smull v18.4s, v16.4h, v0.h[1] + smull2 v19.4s, v16.8h, v0.h[1] + smull v22.4s, v20.4h, v0.h[1] + smull2 v23.4s, v20.8h, v0.h[1] + .endif + .irpc i, 23456 + ext v24.16b, v16.16b, v17.16b, #(2*\i-2) + ext v25.16b, v20.16b, v21.16b, #(2*\i-2) smlal v18.4s, v24.4h, v0.h[\i] smlal2 v19.4s, v24.8h, v0.h[\i] smlal v22.4s, v25.4h, v0.h[\i] smlal2 v23.4s, v25.8h, v0.h[\i] -.endr + .endr .else // 8tap + .ifc \type, put + mov v18.16b, v30.16b + mov v19.16b, v30.16b + smlal v18.4s, v16.4h, v0.h[0] + smlal2 v19.4s, v16.8h, v0.h[0] + mov v22.16b, v30.16b + mov v23.16b, v30.16b + smlal v22.4s, v20.4h, v0.h[0] + smlal2 v23.4s, v20.8h, v0.h[0] + .else smull v18.4s, v16.4h, v0.h[0] smull2 v19.4s, v16.8h, v0.h[0] smull v22.4s, v20.4h, v0.h[0] smull2 v23.4s, v20.8h, v0.h[0] -.irpc i, 1234567 + .endif + .irpc i, 1234567 ext v24.16b, v16.16b, v17.16b, #(2*\i) ext v25.16b, v20.16b, v21.16b, #(2*\i) smlal v18.4s, v24.4h, v0.h[\i] smlal2 v19.4s, v24.8h, v0.h[\i] smlal v22.4s, v25.4h, v0.h[\i] smlal2 v23.4s, v25.8h, v0.h[\i] -.endr + .endr .endif subs \mx, \mx, #8 - srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits) - srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits) - srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits) - srshl v23.4s, v23.4s, v30.4s // -(6-intermediate_bits) .ifc \type, put - sqxtun v18.4h, v18.4s - sqxtun2 v18.8h, v19.4s - sqxtun v22.4h, v22.4s - sqxtun2 v22.8h, v23.4s - srshl v18.8h, v18.8h, v29.8h // -intermediate_bits - srshl v22.8h, v22.8h, v29.8h // -intermediate_bits + sqshrun v18.4h, v18.4s, #6 + sqshrun2 v18.8h, v19.4s, #6 + sqshrun v22.4h, v22.4s, #6 + sqshrun2 v22.8h, v23.4s, #6 umin v18.8h, v18.8h, v31.8h umin v22.8h, v22.8h, v31.8h .else + srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits) + srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits) + srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits) + srshl v23.4s, v23.4s, v30.4s // -(6-intermediate_bits) uzp1 v18.8h, v18.8h, v19.8h // Same as xtn, xtn2 uzp1 v22.8h, v22.8h, v23.8h // Ditto sub v18.8h, v18.8h, v28.8h // PREP_BIAS @@ -1756,19 +1817,20 @@ L(\type\()_\taps\()_h): subs \h, \h, #2 b.gt 81b ret +endfunc -L(\type\()_\taps\()_h_tbl): - .hword L(\type\()_\taps\()_h_tbl) - 1280b - .hword L(\type\()_\taps\()_h_tbl) - 640b - .hword L(\type\()_\taps\()_h_tbl) - 320b - .hword L(\type\()_\taps\()_h_tbl) - 160b - .hword L(\type\()_\taps\()_h_tbl) - 80b - .hword L(\type\()_\taps\()_h_tbl) - 40b - .hword L(\type\()_\taps\()_h_tbl) - 20b - .hword 0 +jumptable \type\()_\taps\()_h_tbl + .word 1280b - \type\()_\taps\()_h_tbl + .word 640b - \type\()_\taps\()_h_tbl + .word 320b - \type\()_\taps\()_h_tbl + .word 160b - \type\()_\taps\()_h_tbl + .word 80b - \type\()_\taps\()_h_tbl + .word 40b - \type\()_\taps\()_h_tbl + .word 20b 
- \type\()_\taps\()_h_tbl +endjumptable -L(\type\()_\taps\()_v): +function L(\type\()_\taps\()_v) cmp \h, #4 ubfx w10, \my, #7, #7 and \my, \my, #0x7f @@ -1781,12 +1843,12 @@ L(\type\()_\taps\()_v): dup v30.4s, w12 // 6 - intermediate_bits movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif - adr x10, L(\type\()_\taps\()_v_tbl) - ldrh w9, [x10, x9, lsl #1] + movrel x10, \type\()_\taps\()_v_tbl + ldrsw x9, [x10, x9, lsl #2] .ifc \type, prep neg v30.4s, v30.4s // -(6-intermediate_bits) .endif - sub x10, x10, w9, uxtw + add x10, x10, x9 br x10 20: // 2xN v @@ -1795,8 +1857,7 @@ L(\type\()_\taps\()_v): b.gt 28f cmp \h, #2 - add \xmy, \xmy, #2 - ld1 {v0.s}[0], [\xmy] + ldur s0, [\xmy, #2] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -1873,8 +1934,7 @@ L(\type\()_\taps\()_v): // 4x2, 4x4 v cmp \h, #2 - add \xmy, \xmy, #2 - ld1 {v0.s}[0], [\xmy] + ldur s0, [\xmy, #2] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -1938,8 +1998,7 @@ L(\type\()_\taps\()_v): // 8x2, 8x4 v cmp \h, #2 - add \xmy, \xmy, #2 - ld1 {v0.s}[0], [\xmy] + ldur s0, [\xmy, #2] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -2027,8 +2086,7 @@ L(\type\()_\taps\()_v): b.gt 1680b // 16x2, 16x4 v - add \xmy, \xmy, #2 - ld1 {v0.s}[0], [\xmy] + ldur s0, [\xmy, #2] sub \src, \src, \s_strd sxtl v0.8h, v0.8b @@ -2051,18 +2109,19 @@ L(\type\()_\taps\()_v): b 16b 0: ret +endfunc -L(\type\()_\taps\()_v_tbl): - .hword L(\type\()_\taps\()_v_tbl) - 1280b - .hword L(\type\()_\taps\()_v_tbl) - 640b - .hword L(\type\()_\taps\()_v_tbl) - 320b - .hword L(\type\()_\taps\()_v_tbl) - 160b - .hword L(\type\()_\taps\()_v_tbl) - 80b - .hword L(\type\()_\taps\()_v_tbl) - 40b - .hword L(\type\()_\taps\()_v_tbl) - 20b - .hword 0 - -L(\type\()_\taps\()_hv): +jumptable \type\()_\taps\()_v_tbl + .word 1280b - \type\()_\taps\()_v_tbl + .word 640b - \type\()_\taps\()_v_tbl + .word 320b - \type\()_\taps\()_v_tbl + .word 160b - \type\()_\taps\()_v_tbl + .word 80b - \type\()_\taps\()_v_tbl + .word 40b - \type\()_\taps\()_v_tbl + .word 20b - \type\()_\taps\()_v_tbl +endjumptable + +function L(\type\()_\taps\()_hv) cmp \h, #4 ubfx w10, \my, #7, #7 and \my, \my, #0x7f @@ -2071,16 +2130,16 @@ L(\type\()_\taps\()_hv): 4: add \xmy, x11, \my, uxtw #3 - adr x10, L(\type\()_\taps\()_hv_tbl) + movrel x10, \type\()_\taps\()_hv_tbl dup v30.4s, w12 // 6 - intermediate_bits - ldrh w9, [x10, x9, lsl #1] + ldrsw x9, [x10, x9, lsl #2] neg v30.4s, v30.4s // -(6-intermediate_bits) .ifc \type, put dup v29.4s, w13 // 6 + intermediate_bits .else movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif - sub x10, x10, w9, uxtw + add x10, x10, x9 .ifc \type, put neg v29.4s, v29.4s // -(6+intermediate_bits) .endif @@ -2089,11 +2148,9 @@ L(\type\()_\taps\()_hv): 20: AARCH64_VALID_JUMP_TARGET .ifc \type, put - add \xmx, \xmx, #2 - ld1 {v0.s}[0], [\xmx] + ldur s0, [\xmx, #2] b.gt 280f - add \xmy, \xmy, #2 - ld1 {v1.s}[0], [\xmy] + ldur s1, [\xmy, #2] // 2x2, 2x4 hv sub \sr2, \src, #2 @@ -2236,11 +2293,9 @@ L(\type\()_\taps\()_filter_2): 40: AARCH64_VALID_JUMP_TARGET - add \xmx, \xmx, #2 - ld1 {v0.s}[0], [\xmx] + ldur s0, [\xmx, #2] b.gt 480f - add \xmy, \xmy, #2 - ld1 {v1.s}[0], [\xmy] + ldur s1, [\xmy, #2] sub \sr2, \src, #2 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd @@ -2293,7 +2348,7 @@ L(\type\()_\taps\()_filter_2): .endif subs \h, \h, #2 - st1 {v2.d}[0], [\dst], \d_strd + st1 {v2.8b}, [\dst], \d_strd st1 {v2.d}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b @@ -2392,7 +2447,7 @@ L(\type\()_\taps\()_filter_2): sub 
v3.8h, v3.8h, v29.8h // PREP_BIAS .endif subs \h, \h, #2 - st1 {v3.d}[0], [\dst], \d_strd + st1 {v3.8b}, [\dst], \d_strd st1 {v3.d}[1], [\ds2], \d_strd b.le 0f .ifc \taps, 8tap @@ -2436,10 +2491,13 @@ L(\type\()_\taps\()_filter_4): 320: AARCH64_VALID_JUMP_TARGET b.gt 880f - add \xmy, \xmy, #2 ld1 {v0.8b}, [\xmx] - ld1 {v1.s}[0], [\xmy] + ldur s1, [\xmy, #2] +.ifc \taps, 6tap + sub \src, \src, #4 +.else sub \src, \src, #6 +.endif sub \src, \src, \s_strd sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b @@ -2453,13 +2511,23 @@ L(\type\()_\taps\()_filter_4): lsl \s_strd, \s_strd, #1 ld1 {v27.8h, v28.8h}, [\src], \s_strd +.ifc \taps, 6tap + smull v24.4s, v27.4h, v0.h[1] + smull2 v25.4s, v27.8h, v0.h[1] + .irpc i, 23456 + ext v26.16b, v27.16b, v28.16b, #(2*\i-2) + smlal v24.4s, v26.4h, v0.h[\i] + smlal2 v25.4s, v26.8h, v0.h[\i] + .endr +.else smull v24.4s, v27.4h, v0.h[0] smull2 v25.4s, v27.8h, v0.h[0] -.irpc i, 1234567 + .irpc i, 1234567 ext v26.16b, v27.16b, v28.16b, #(2*\i) smlal v24.4s, v26.4h, v0.h[\i] smlal2 v25.4s, v26.8h, v0.h[\i] -.endr + .endr +.endif srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without @@ -2537,8 +2605,10 @@ L(\type\()_\taps\()_filter_4): AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] +.ifc \taps, 6tap + sub \src, \src, #4 +.else sub \src, \src, #6 -.ifc \taps, 8tap sub \src, \src, \s_strd .endif sub \src, \src, \s_strd, lsl #1 @@ -2555,22 +2625,21 @@ L(\type\()_\taps\()_filter_4): ld1 {v27.8h, v28.8h}, [\src], \s_strd .ifc \taps, 6tap - ext v26.16b, v27.16b, v28.16b, #2 - smull v24.4s, v26.4h, v0.h[1] - smull2 v25.4s, v26.8h, v0.h[1] -.irpc i, 23456 - ext v26.16b, v27.16b, v28.16b, #(2*\i) + smull v24.4s, v27.4h, v0.h[1] + smull2 v25.4s, v27.8h, v0.h[1] + .irpc i, 23456 + ext v26.16b, v27.16b, v28.16b, #(2*\i-2) smlal v24.4s, v26.4h, v0.h[\i] smlal2 v25.4s, v26.8h, v0.h[\i] -.endr + .endr .else // 8tap smull v24.4s, v27.4h, v0.h[0] smull2 v25.4s, v27.8h, v0.h[0] -.irpc i, 1234567 + .irpc i, 1234567 ext v26.16b, v27.16b, v28.16b, #(2*\i) smlal v24.4s, v26.4h, v0.h[\i] smlal2 v25.4s, v26.8h, v0.h[\i] -.endr + .endr .endif srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) @@ -2712,15 +2781,13 @@ L(\type\()_\taps\()_filter_8): ld1 {v4.8h, v5.8h}, [\sr2], \s_strd ld1 {v6.8h, v7.8h}, [\src], \s_strd .ifc \taps, 6tap - ext v23.16b, v4.16b, v5.16b, #2 - ext v24.16b, v6.16b, v7.16b, #2 - smull v25.4s, v23.4h, v0.h[1] - smull2 v26.4s, v23.8h, v0.h[1] - smull v27.4s, v24.4h, v0.h[1] - smull2 v28.4s, v24.8h, v0.h[1] + smull v25.4s, v4.4h, v0.h[1] + smull2 v26.4s, v4.8h, v0.h[1] + smull v27.4s, v6.4h, v0.h[1] + smull2 v28.4s, v6.8h, v0.h[1] .irpc i, 23456 - ext v23.16b, v4.16b, v5.16b, #(2*\i) - ext v24.16b, v6.16b, v7.16b, #(2*\i) + ext v23.16b, v4.16b, v5.16b, #(2*\i-2) + ext v24.16b, v6.16b, v7.16b, #(2*\i-2) smlal v25.4s, v23.4h, v0.h[\i] smlal2 v26.4s, v23.8h, v0.h[\i] smlal v27.4s, v24.4h, v0.h[\i] @@ -2747,17 +2814,17 @@ L(\type\()_\taps\()_filter_8): uzp1 v23.8h, v25.8h, v26.8h // Same as xtn, xtn2 uzp1 v24.8h, v27.8h, v28.8h // Ditto ret - -L(\type\()_\taps\()_hv_tbl): - .hword L(\type\()_\taps\()_hv_tbl) - 1280b - .hword L(\type\()_\taps\()_hv_tbl) - 640b - .hword L(\type\()_\taps\()_hv_tbl) - 320b - .hword L(\type\()_\taps\()_hv_tbl) - 160b - .hword L(\type\()_\taps\()_hv_tbl) - 80b - .hword L(\type\()_\taps\()_hv_tbl) - 40b - .hword L(\type\()_\taps\()_hv_tbl) - 20b - .hword 0 
endfunc + +jumptable \type\()_\taps\()_hv_tbl + .word 1280b - \type\()_\taps\()_hv_tbl + .word 640b - \type\()_\taps\()_hv_tbl + .word 320b - \type\()_\taps\()_hv_tbl + .word 160b - \type\()_\taps\()_hv_tbl + .word 80b - \type\()_\taps\()_hv_tbl + .word 40b - \type\()_\taps\()_hv_tbl + .word 20b - \type\()_\taps\()_hv_tbl +endjumptable .endm @@ -2787,21 +2854,21 @@ function \type\()_bilin_16bpc_neon, export=1 add w12, \bdmax, #4 // 4 + intermediate_bits cbnz \mx, L(\type\()_bilin_h) cbnz \my, L(\type\()_bilin_v) - b \type\()_neon + b \type\()_16bpc_neon L(\type\()_bilin_h): cbnz \my, L(\type\()_bilin_hv) - adr x10, L(\type\()_bilin_h_tbl) + movrel x10, \type\()_bilin_h_tbl dup v31.8h, w11 // 4 - intermediate_bits - ldrh w9, [x10, x9, lsl #1] + ldrsw x9, [x10, x9, lsl #2] neg v31.8h, v31.8h // -(4-intermediate_bits) .ifc \type, put dup v30.8h, \bdmax // intermediate_bits .else movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif - sub x10, x10, w9, uxtw + add x10, x10, x9 .ifc \type, put neg v30.8h, v30.8h // -intermediate_bits .endif @@ -2854,7 +2921,7 @@ L(\type\()_bilin_h): .else sub v4.8h, v4.8h, v29.8h .endif - st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.8b}, [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd b.gt 4b ret @@ -2958,30 +3025,31 @@ L(\type\()_bilin_h): subs \h, \h, #2 b.gt 161b ret +endfunc -L(\type\()_bilin_h_tbl): - .hword L(\type\()_bilin_h_tbl) - 1280b - .hword L(\type\()_bilin_h_tbl) - 640b - .hword L(\type\()_bilin_h_tbl) - 320b - .hword L(\type\()_bilin_h_tbl) - 160b - .hword L(\type\()_bilin_h_tbl) - 80b - .hword L(\type\()_bilin_h_tbl) - 40b - .hword L(\type\()_bilin_h_tbl) - 20b - .hword 0 +jumptable \type\()_bilin_h_tbl + .word 1280b - \type\()_bilin_h_tbl + .word 640b - \type\()_bilin_h_tbl + .word 320b - \type\()_bilin_h_tbl + .word 160b - \type\()_bilin_h_tbl + .word 80b - \type\()_bilin_h_tbl + .word 40b - \type\()_bilin_h_tbl + .word 20b - \type\()_bilin_h_tbl +endjumptable -L(\type\()_bilin_v): +function L(\type\()_bilin_v) cmp \h, #4 - adr x10, L(\type\()_bilin_v_tbl) + movrel x10, \type\()_bilin_v_tbl .ifc \type, prep dup v31.8h, w11 // 4 - intermediate_bits .endif - ldrh w9, [x10, x9, lsl #1] + ldrsw x9, [x10, x9, lsl #2] .ifc \type, prep movi v29.8h, #(PREP_BIAS >> 8), lsl #8 neg v31.8h, v31.8h // -(4-intermediate_bits) .endif - sub x10, x10, w9, uxtw + add x10, x10, x9 br x10 20: // 2xN v @@ -2994,24 +3062,24 @@ L(\type\()_bilin_v): lsl \d_strd, \d_strd, #1 // 2x2 v - ld1 {v16.s}[0], [\src], \s_strd + ld1r {v16.4s}, [\src], \s_strd b.gt 24f 22: - ld1 {v17.s}[0], [\sr2], \s_strd - ld1 {v18.s}[0], [\src], \s_strd + ld1r {v17.4s}, [\sr2], \s_strd + ld1r {v18.4s}, [\src], \s_strd trn1 v16.2s, v16.2s, v17.2s trn1 v17.2s, v17.2s, v18.2s mul v4.4h, v16.4h, v2.4h mla v4.4h, v17.4h, v3.4h urshr v4.8h, v4.8h, #4 - st1 {v4.s}[0], [\dst] + str s4, [\dst] st1 {v4.s}[1], [\ds2] ret 24: // 2x4, 2x6, 2x8, ... 
v - ld1 {v17.s}[0], [\sr2], \s_strd - ld1 {v18.s}[0], [\src], \s_strd - ld1 {v19.s}[0], [\sr2], \s_strd - ld1 {v20.s}[0], [\src], \s_strd + ld1r {v17.4s}, [\sr2], \s_strd + ld1r {v18.4s}, [\src], \s_strd + ld1r {v19.4s}, [\sr2], \s_strd + ld1r {v20.4s}, [\src], \s_strd sub \h, \h, #4 trn1 v16.2s, v16.2s, v17.2s trn1 v17.2s, v17.2s, v18.2s @@ -3056,7 +3124,7 @@ L(\type\()_bilin_v): urshl v4.8h, v4.8h, v31.8h sub v4.8h, v4.8h, v29.8h .endif - st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.8b}, [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b @@ -3156,28 +3224,29 @@ L(\type\()_bilin_v): b 1b 0: ret +endfunc -L(\type\()_bilin_v_tbl): - .hword L(\type\()_bilin_v_tbl) - 1280b - .hword L(\type\()_bilin_v_tbl) - 640b - .hword L(\type\()_bilin_v_tbl) - 320b - .hword L(\type\()_bilin_v_tbl) - 160b - .hword L(\type\()_bilin_v_tbl) - 80b - .hword L(\type\()_bilin_v_tbl) - 40b - .hword L(\type\()_bilin_v_tbl) - 20b - .hword 0 - -L(\type\()_bilin_hv): - adr x10, L(\type\()_bilin_hv_tbl) +jumptable \type\()_bilin_v_tbl + .word 1280b - \type\()_bilin_v_tbl + .word 640b - \type\()_bilin_v_tbl + .word 320b - \type\()_bilin_v_tbl + .word 160b - \type\()_bilin_v_tbl + .word 80b - \type\()_bilin_v_tbl + .word 40b - \type\()_bilin_v_tbl + .word 20b - \type\()_bilin_v_tbl +endjumptable + +function L(\type\()_bilin_hv) + movrel x10, \type\()_bilin_hv_tbl dup v31.8h, w11 // 4 - intermediate_bits - ldrh w9, [x10, x9, lsl #1] + ldrsw x9, [x10, x9, lsl #2] neg v31.8h, v31.8h // -(4-intermediate_bits) .ifc \type, put dup v30.4s, w12 // 4 + intermediate_bits .else movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif - sub x10, x10, w9, uxtw + add x10, x10, x9 .ifc \type, put neg v30.4s, v30.4s // -(4+intermediate_bits) .endif @@ -3264,7 +3333,7 @@ L(\type\()_bilin_hv): sub v4.8h, v4.8h, v29.8h .endif subs \h, \h, #2 - st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.8b}, [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd b.le 0f trn2 v16.2d, v17.2d, v17.2d @@ -3350,17 +3419,17 @@ L(\type\()_bilin_hv): b 1b 0: ret - -L(\type\()_bilin_hv_tbl): - .hword L(\type\()_bilin_hv_tbl) - 1280b - .hword L(\type\()_bilin_hv_tbl) - 640b - .hword L(\type\()_bilin_hv_tbl) - 320b - .hword L(\type\()_bilin_hv_tbl) - 160b - .hword L(\type\()_bilin_hv_tbl) - 80b - .hword L(\type\()_bilin_hv_tbl) - 40b - .hword L(\type\()_bilin_hv_tbl) - 20b - .hword 0 endfunc + +jumptable \type\()_bilin_hv_tbl + .word 1280b - \type\()_bilin_hv_tbl + .word 640b - \type\()_bilin_hv_tbl + .word 320b - \type\()_bilin_hv_tbl + .word 160b - \type\()_bilin_hv_tbl + .word 80b - \type\()_bilin_hv_tbl + .word 40b - \type\()_bilin_hv_tbl + .word 20b - \type\()_bilin_hv_tbl +endjumptable .endm make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap diff --git a/src/arm/64/mc16_sve.S b/src/arm/64/mc16_sve.S new file mode 100644 index 0000000000000000000000000000000000000000..9ebdb2187ec80e1a63cf6e0120a789b6bc9a3bc6 --- /dev/null +++ b/src/arm/64/mc16_sve.S @@ -0,0 +1,1649 @@ +/* + * Copyright © 2024, Arm Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +#define PREP_BIAS 32, lsl #8 // 8192 +#define PREP_BIAS_NEG 224, lsl #8 // -8192 + +#if HAVE_SVE2 +ENABLE_SVE +ENABLE_SVE2 + +// No spaces in these expressions, due to gas-preprocessor. It is translated by +// -1 to save the negative offset when getting the address of `mc_subpel_filters`. +#define REGULAR1 (((0*15-1)<<7)|(3*15-1)) +#define SMOOTH1 (((1*15-1)<<7)|(4*15-1)) +#define SHARP1 (((2*15-1)<<7)|(3*15-1)) + +#define FUNC_ALIGN 2 +#define JUMP_ALIGN 2 +#define LOOP_ALIGN 2 + + +// Shuffle indices to permute horizontal samples in preparation for input to +// 16-bit SDOT instructions. The 8-tap horizontal convolution uses sample +// indices in the interval of [-3, 4] relative to the current sample position. +const h_tbl_sve, align=4 + .byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 + .byte 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 +endconst + +// Vertical convolutions also use 16-bit SDOT instructions, where two 128-bit +// registers contain a transposed 4x4 matrix of values. Subsequent iterations +// of the vertical convolution can reuse the 3x4 sub-matrix from the previous +// loop iteration. These shuffle indices shift and merge this 4x4 matrix with +// the values of a new line. 
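Arithmetically, each `sdot ..., z7.h[0]` / `..., z7.h[1]` pair in the vertical loops below accumulates the first and second half of the 8-tap filter as two 4-wide 16-bit dot products into a 64-bit lane; the `tbl` shuffles described above only keep the per-column 4x4 sample matrices up to date between rows. A scalar sketch of the per-column arithmetic (the register shuffling is omitted; names are illustrative):

    #include <stdint.h>

    /* One output column of the SVE2 vertical 8-tap filter: two 4-wide
     * 16-bit dot products (taps 0-3, then taps 4-7), accumulated at
     * 64-bit precision before the later narrowing/rounding step.      */
    static int64_t vfilter8_column(const int16_t src[8], const int16_t taps[8])
    {
        int64_t acc = 0;
        for (int half = 0; half < 2; half++)      /* z7.h[0], then z7.h[1] */
            for (int k = 0; k < 4; k++)           /* one indexed sdot      */
                acc += (int64_t)src[4 * half + k] * taps[4 * half + k];
        return acc;
    }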
+const v_tbl_sve, align=4 + .byte 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 24, 25 + .byte 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 18, 19 + .byte 2, 3, 4, 5, 6, 7, 20, 21, 10, 11, 12, 13, 14, 15, 22, 23 + .byte 2, 3, 4, 5, 6, 7, 24, 25, 10, 11, 12, 13, 14, 15, 26, 27 + .byte 2, 3, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 14, 15, 30, 31 +endconst + + +.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1 +function \op\()_8tap_\type\()_16bpc_\isa, export=1, align=FUNC_ALIGN + mov x9, \type_h + mov x10, \type_v + .if \jump + b \op\()_8tap_\isa + .endif +endfunc +.endm + +.macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, xmx, xmy, ldst, lsrc, wd_strd, ws_strd +make_8tap_fn \type, sharp, SHARP1, SHARP1, \isa +make_8tap_fn \type, sharp_smooth, SHARP1, SMOOTH1, \isa +make_8tap_fn \type, sharp_regular, SHARP1, REGULAR1, \isa +make_8tap_fn \type, smooth_sharp, SMOOTH1, SHARP1, \isa +make_8tap_fn \type, smooth, SMOOTH1, SMOOTH1, \isa +make_8tap_fn \type, smooth_regular, SMOOTH1, REGULAR1, \isa +make_8tap_fn \type, regular_sharp, REGULAR1, SHARP1, \isa +make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1, \isa +make_8tap_fn \type, regular, REGULAR1, REGULAR1, \isa, jump=0 + +function \type\()_8tap_\isa, align=FUNC_ALIGN + clz w8, \w + mov w11, #0x4081 // (1<<14) | (1<<7) | 1 + ptrue p0.b, vl16 + sub w8, w8, #24 // for jump tables + movrel x12, X(mc_subpel_filters) + cbnz \mx, L(\type\()_8tap_h_hv_\isa) +.ifc \type, prep + cbz \my, prep_sve +.else // put + cbnz \my, L(\type\()_8tap_v_\isa) + mov w9, w8 + b X(put_16bpc_neon) + + .align JUMP_ALIGN +.endif + +L(\type\()_8tap_v_\isa): + madd \my, \my, w11, w10 + movrel x13, v_tbl_sve +.ifc \bdmax, w8 // put case, but skip + ld1r {v5.8h}, [sp] // loading into w8 +.endif + sub \src, \src, \s_strd // src - s_strd + ubfx w11, \my, #7, #7 + and \my, \my, #0x7F + ldr q6, [x13] + cmp \h, #4 + csel \my, \my, w11, le + sub \src, \src, \s_strd, lsl #1 // src - 3 * s_strd + add \xmy, x12, \xmy, lsl #3 // subpel V filter address + ldp q28, q29, [x13, #16] + ld1sb {z7.h}, p0/z, [\xmy] +.ifc \type, prep + clz \bdmax, \bdmax + sub \bdmax, \bdmax, #24 + dup v5.4s, \bdmax +.endif + cmp \w, #8 + b.lt 40f + + // .align JUMP_ALIGN // fallthrough +80: // V - 8xN+ + ldp q30, q31, [x13, #48] +.ifc \type, prep + add \wd_strd, \w, \w // d_strd = 2 * w +.endif + .align LOOP_ALIGN +81: + add \lsrc, \src, \s_strd, lsl #1 + + ldr q16, [\src] + ldr q17, [\src, \s_strd] + ldr q18, [\lsrc] + ldr q19, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + mov \ldst, \dst + + ldr q20, [\lsrc] + ldr q21, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + ldr q22, [\lsrc] + ldr q23, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + sub w8, \h, #1 + + zip1 v0.8h, v16.8h, v17.8h + zip2 v1.8h, v16.8h, v17.8h + zip1 v2.8h, v18.8h, v19.8h + zip2 v3.8h, v18.8h, v19.8h + + zip1 v18.8h, v20.8h, v21.8h + zip2 v21.8h, v20.8h, v21.8h + zip1 v24.8h, v22.8h, v23.8h + zip2 v27.8h, v22.8h, v23.8h + + zip1 v16.4s, v0.4s, v2.4s + zip2 v19.4s, v0.4s, v2.4s + zip1 v22.4s, v1.4s, v3.4s + zip2 v25.4s, v1.4s, v3.4s + + zip1 v17.4s, v18.4s, v24.4s + zip2 v20.4s, v18.4s, v24.4s + zip1 v23.4s, v21.4s, v27.4s + zip2 v26.4s, v21.4s, v27.4s + + .align LOOP_ALIGN +8: + ld1 {v18.16b}, [\lsrc], \s_strd + + movi v0.2d, #0 + movi v1.2d, #0 + movi v2.2d, #0 + movi v3.2d, #0 + mov v21.16b, v18.16b + mov v24.16b, v18.16b + mov v27.16b, v18.16b + + sdot z0.d, z16.h, z7.h[0] + tbl v16.16b, {v16.16b, v17.16b}, v6.16b + sdot z1.d, z19.h, z7.h[0] + tbl v19.16b, {v19.16b, 
v20.16b}, v6.16b + sdot z2.d, z22.h, z7.h[0] + tbl v22.16b, {v22.16b, v23.16b}, v6.16b + subs w8, w8, #1 + sdot z3.d, z25.h, z7.h[0] + tbl v25.16b, {v25.16b, v26.16b}, v6.16b + + sdot z0.d, z17.h, z7.h[1] + tbl v17.16b, {v17.16b, v18.16b}, v28.16b + sdot z1.d, z20.h, z7.h[1] + tbl v20.16b, {v20.16b, v21.16b}, v29.16b + sdot z2.d, z23.h, z7.h[1] + tbl v23.16b, {v23.16b, v24.16b}, v30.16b + sdot z3.d, z26.h, z7.h[1] + tbl v26.16b, {v26.16b, v27.16b}, v31.16b + + uzp1 v0.4s, v0.4s, v1.4s + uzp1 v1.4s, v2.4s, v3.4s +.ifc \type, prep + srshl v0.4s, v0.4s, v5.4s + srshl v1.4s, v1.4s, v5.4s + uzp1 v0.8h, v0.8h, v1.8h + sub z0.h, z0.h, #PREP_BIAS +.else // put + sqrshrun v0.4h, v0.4s, #6 + sqrshrun2 v0.8h, v1.4s, #6 + umin v0.8h, v0.8h, v5.8h +.endif + st1 {v0.16b}, [\ldst], \d_strd + b.gt 8b + + movi v0.2d, #0 + movi v1.2d, #0 + movi v2.2d, #0 + movi v3.2d, #0 + + sdot z0.d, z16.h, z7.h[0] + sdot z1.d, z19.h, z7.h[0] + sdot z2.d, z22.h, z7.h[0] + sdot z3.d, z25.h, z7.h[0] + + sdot z0.d, z17.h, z7.h[1] + sdot z1.d, z20.h, z7.h[1] + sdot z2.d, z23.h, z7.h[1] + sdot z3.d, z26.h, z7.h[1] + subs \w, \w, #8 + + uzp1 v0.4s, v0.4s, v1.4s + uzp1 v1.4s, v2.4s, v3.4s +.ifc \type, prep + srshl v0.4s, v0.4s, v5.4s + srshl v1.4s, v1.4s, v5.4s + uzp1 v0.8h, v0.8h, v1.8h + sub z0.h, z0.h, #PREP_BIAS +.else // put + sqrshrun v0.4h, v0.4s, #6 + sqrshrun2 v0.8h, v1.4s, #6 + umin v0.8h, v0.8h, v5.8h +.endif + str q0, [\ldst] + + add \dst, \dst, #16 + add \src, \src, #16 + b.gt 81b + ret + + .align JUMP_ALIGN +40: // V - 4xN, put only: 2xN +.ifc \type, put + lsr \d_strd, \d_strd, #1 // hword index for `st1h` + whilelt p1.h, wzr, \w // masking for writes +.endif + cmp \h, #4 + b.le 44f + + ldr d16, [\src] + ldr d17, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr d18, [\src] + ldr d19, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + ldr d20, [\src] + ldr d21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr d22, [\src] + ldr d23, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + sub \h, \h, #2 + + zip1 v0.8h, v16.8h, v17.8h + zip1 v2.8h, v18.8h, v19.8h + zip1 v18.8h, v20.8h, v21.8h + zip1 v24.8h, v22.8h, v23.8h + + zip1 v16.4s, v0.4s, v2.4s + zip2 v19.4s, v0.4s, v2.4s + zip1 v17.4s, v18.4s, v24.4s + zip2 v20.4s, v18.4s, v24.4s + + .align LOOP_ALIGN +4: + ldr d18, [\src] + ldr d24, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + movi v0.2d, #0 + movi v1.2d, #0 + movi v2.2d, #0 + movi v3.2d, #0 + mov v21.16b, v18.16b + mov v27.16b, v24.16b + + sdot z0.d, z16.h, z7.h[0] + tbl v22.16b, {v16.16b, v17.16b}, v6.16b + sdot z1.d, z19.h, z7.h[0] + tbl v25.16b, {v19.16b, v20.16b}, v6.16b + sdot z0.d, z17.h, z7.h[1] + tbl v23.16b, {v17.16b, v18.16b}, v28.16b + sdot z1.d, z20.h, z7.h[1] + tbl v26.16b, {v20.16b, v21.16b}, v29.16b + subs \h, \h, #2 + + sdot z2.d, z22.h, z7.h[0] + tbl v16.16b, {v22.16b, v23.16b}, v6.16b + sdot z3.d, z25.h, z7.h[0] + tbl v19.16b, {v25.16b, v26.16b}, v6.16b + sdot z2.d, z23.h, z7.h[1] + tbl v17.16b, {v23.16b, v24.16b}, v28.16b + sdot z3.d, z26.h, z7.h[1] + tbl v20.16b, {v26.16b, v27.16b}, v29.16b + + uzp1 v0.4s, v0.4s, v1.4s + uzp1 v1.4s, v2.4s, v3.4s +.ifc \type, prep + srshl v0.4s, v0.4s, v5.4s + srshl v1.4s, v1.4s, v5.4s + uzp1 v0.8h, v0.8h, v1.8h + sub z0.h, z0.h, #PREP_BIAS + str q0, [\dst], #16 +.else // put + sqrshrun v0.4h, v0.4s, #6 + sqrshrun v1.4h, v1.4s, #6 + umin v0.4h, v0.4h, v5.4h + umin v1.4h, v1.4h, v5.4h + st1h {z0.h}, p1, [\dst] + st1h {z1.h}, p1, [\dst, \d_strd, lsl #1] + add \dst, \dst, \d_strd, lsl #2 +.endif + b.gt 4b + + ldr d18, [\src] + + movi 
v0.2d, #0 + movi v1.2d, #0 + movi v2.2d, #0 + movi v3.2d, #0 + mov v21.16b, v18.16b + + sdot z0.d, z16.h, z7.h[0] + tbl v22.16b, {v16.16b, v17.16b}, v6.16b + sdot z1.d, z19.h, z7.h[0] + tbl v25.16b, {v19.16b, v20.16b}, v6.16b + sdot z0.d, z17.h, z7.h[1] + tbl v23.16b, {v17.16b, v18.16b}, v28.16b + sdot z1.d, z20.h, z7.h[1] + tbl v26.16b, {v20.16b, v21.16b}, v29.16b + + sdot z2.d, z22.h, z7.h[0] + sdot z3.d, z25.h, z7.h[0] + sdot z2.d, z23.h, z7.h[1] + sdot z3.d, z26.h, z7.h[1] + + uzp1 v0.4s, v0.4s, v1.4s + uzp1 v1.4s, v2.4s, v3.4s +.ifc \type, prep + srshl v0.4s, v0.4s, v5.4s + srshl v1.4s, v1.4s, v5.4s + uzp1 v0.8h, v0.8h, v1.8h + sub z0.h, z0.h, #PREP_BIAS + str q0, [\dst] +.else // put + sqrshrun v0.4h, v0.4s, #6 + sqrshrun v1.4h, v1.4s, #6 + umin v0.4h, v0.4h, v5.4h + umin v1.4h, v1.4h, v5.4h + st1h {z0.h}, p1, [\dst] + st1h {z1.h}, p1, [\dst, \d_strd, lsl #1] +.endif + ret + + .align JUMP_ALIGN +44: // V - 4x4, put only: 4x2, 2x4, 2x2 + add \src, \src, \s_strd, lsl #1 // src - s_strd + subs \h, \h, #2 + + ldr d16, [\src] + ldr d17, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr d18, [\src] + ldr d19, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + ext v7.16b, v7.16b, v7.16b, #4 // [\xmy + 2 * 2] + + zip1 v0.8h, v16.8h, v17.8h + zip1 v2.8h, v18.8h, v19.8h + zip1 v16.4s, v0.4s, v2.4s + zip2 v19.4s, v0.4s, v2.4s + +.ifc \type, put + b.eq 42f +.endif + ldr d17, [\src] + ldr d23, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + movi v0.2d, #0 + movi v1.2d, #0 + movi v2.2d, #0 + movi v3.2d, #0 + mov v20.16b, v17.16b + mov v26.16b, v23.16b + + sdot z0.d, z16.h, z7.h[0] + tbl v22.16b, {v16.16b, v17.16b}, v28.16b + sdot z1.d, z19.h, z7.h[0] + tbl v25.16b, {v19.16b, v20.16b}, v29.16b + sdot z2.d, z22.h, z7.h[0] + tbl v16.16b, {v22.16b, v23.16b}, v28.16b + sdot z3.d, z25.h, z7.h[0] + tbl v19.16b, {v25.16b, v26.16b}, v29.16b + + uzp1 v0.4s, v0.4s, v1.4s + uzp1 v1.4s, v2.4s, v3.4s +.ifc \type, prep + srshl v0.4s, v0.4s, v5.4s + srshl v1.4s, v1.4s, v5.4s + uzp1 v0.8h, v0.8h, v1.8h + sub z0.h, z0.h, #PREP_BIAS + str q0, [\dst], #16 +.else // put + sqrshrun v0.4h, v0.4s, #6 + sqrshrun v1.4h, v1.4s, #6 + umin v0.4h, v0.4h, v5.4h + umin v1.4h, v1.4h, v5.4h + st1h {z0.h}, p1, [\dst] + st1h {z1.h}, p1, [\dst, \d_strd, lsl #1] + add \dst, \dst, \d_strd, lsl #2 +.endif + +.ifc \type, put + .align JUMP_ALIGN +42: +.endif + ldr d17, [\src] + + movi v0.2d, #0 + movi v1.2d, #0 + movi v2.2d, #0 + movi v3.2d, #0 + mov v20.16b, v17.16b + + sdot z0.d, z16.h, z7.h[0] + tbl v22.16b, {v16.16b, v17.16b}, v28.16b + sdot z1.d, z19.h, z7.h[0] + tbl v25.16b, {v19.16b, v20.16b}, v29.16b + + sdot z2.d, z22.h, z7.h[0] + sdot z3.d, z25.h, z7.h[0] + + uzp1 v0.4s, v0.4s, v1.4s + uzp1 v1.4s, v2.4s, v3.4s +.ifc \type, prep + srshl v0.4s, v0.4s, v5.4s + srshl v1.4s, v1.4s, v5.4s + uzp1 v0.8h, v0.8h, v1.8h + sub z0.h, z0.h, #PREP_BIAS + str q0, [\dst] +.else // put + sqrshrun v0.4h, v0.4s, #6 + sqrshrun v1.4h, v1.4s, #6 + umin v0.4h, v0.4h, v5.4h + umin v1.4h, v1.4h, v5.4h + st1h {z0.h}, p1, [\dst] + st1h {z1.h}, p1, [\dst, \d_strd, lsl #1] +.endif + ret + + .align JUMP_ALIGN +L(\type\()_8tap_h_hv_\isa): + madd \mx, \mx, w11, w9 + movrel x13, h_tbl_sve + sub \src, \src, #6 // src - 3 * 2 + ubfx w9, \mx, #7, #7 + and \mx, \mx, #0x7F + cmp \w, #4 + csel \mx, \mx, w9, le + ldp q30, q31, [x13] + add \xmx, x12, \xmx, lsl #3 // subpel H filter address + cbz \my, L(\type\()_8tap_h_\isa) + + // HV cases + madd w14, \my, w11, w10 +.ifc \bdmax, w8 + ldr \bdmax, [sp] +.endif + ubfx w11, w14, #7, #7 + and w14, w14, #0x7F + 
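The `madd`/`ubfx`/`and`/`csel` sequences above pick the subpel filter row from the packed REGULAR1/SMOOTH1/SHARP1 constants: each constant carries two "filter_set*15 - 1" bases (the -1 folds in the 1-based subpel position), one in bits [6:0] selected when w/h <= 4 and one in bits [13:7] for larger blocks. Multiplying the 4-bit position by 0x4081 = (1<<14)|(1<<7)|1 replicates it into every 7-bit field without carries (base <= 59, position <= 15), so a single `madd` yields both candidate rows. A hedged scalar model of that selection (names are illustrative; the chosen row is later scaled by 8 to address `mc_subpel_filters`):

    #include <stdint.h>

    /* mxy: subpel position 1..15; packed_type: REGULAR1/SMOOTH1/SHARP1 */
    static unsigned subpel_filter_row(unsigned mxy, uint32_t packed_type,
                                      int small_block)
    {
        uint32_t v = mxy * 0x4081u + packed_type;   /* madd ..., w11, ...   */
        unsigned row_small = v & 0x7F;              /* and  ..., #0x7F      */
        unsigned row_large = (v >> 7) & 0x7F;       /* ubfx ..., #7, #7     */
        return small_block ? row_small : row_large; /* csel ..., le         */
    }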
ld1sb {z4.h}, p0/z, [\xmx] + cmp \h, #4 + csel w14, w14, w11, le +.ifc \type, put + dup v29.8h, \bdmax +.endif + clz \bdmax, \bdmax + add \xmy, x12, x14, lsl #3 // subpel V filter address + ld1sb {z7.h}, p0/z, [\xmy] +.ifc \type, put + mov w9, #12 + sub w9, w9, \bdmax + dup v6.4s, w9 +.endif + sub \bdmax, \bdmax, #24 + mov x15, x30 + sub \src, \src, \s_strd // src - s_strd - 3 * 2 + dup v5.4s, \bdmax + cmp w10, SHARP1 + b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1 + + // HV 8-tap cases + cmp \w, #4 + b.le 40f + + // .align JUMP_ALIGN // fallthrough +80: // HV8 - 8xN+ +.ifc \type, prep + add \wd_strd, \w, \w // d_strd = 2 * w +.endif + cmp \h, #4 + b.le 84f + sub \src, \src, \s_strd, lsl #1 // src - 3 * s_strd - 3 * 2 + + .align LOOP_ALIGN +81: + mov \lsrc, \src + mov \ldst, \dst + mov w8, \h + + bl L(\type\()_hv_filter8_\isa) + uzp1 v16.8h, v23.8h, v24.8h + bl L(\type\()_hv_filter8_\isa) + uzp1 v17.8h, v23.8h, v24.8h + bl L(\type\()_hv_filter8_\isa) + uzp1 v18.8h, v23.8h, v24.8h + bl L(\type\()_hv_filter8_\isa) + uzp1 v19.8h, v23.8h, v24.8h + bl L(\type\()_hv_filter8_\isa) + uzp1 v20.8h, v23.8h, v24.8h + bl L(\type\()_hv_filter8_\isa) + uzp1 v21.8h, v23.8h, v24.8h + bl L(\type\()_hv_filter8_\isa) + uzp1 v22.8h, v23.8h, v24.8h + + .align LOOP_ALIGN +8: + ldp q24, q28, [\lsrc] + smull v0.4s, v16.4h, v7.h[0] + smull2 v1.4s, v16.8h, v7.h[0] + mov v16.16b, v17.16b + + movi v2.2d, #0 + movi v3.2d, #0 + tbl v23.16b, {v24.16b}, v30.16b + tbl v24.16b, {v24.16b}, v31.16b + + ldur q26, [\lsrc, #8] + smlal v0.4s, v17.4h, v7.h[1] + smlal2 v1.4s, v17.8h, v7.h[1] + mov v17.16b, v18.16b + add \lsrc, \lsrc, \s_strd + + sdot z2.d, z23.h, z4.h[0] + sdot z3.d, z24.h, z4.h[0] + movi v23.2d, #0 + movi v24.2d, #0 + tbl v25.16b, {v26.16b}, v30.16b + tbl v26.16b, {v26.16b}, v31.16b + smlal v0.4s, v18.4h, v7.h[2] + smlal2 v1.4s, v18.8h, v7.h[2] + mov v18.16b, v19.16b + + sdot z23.d, z25.h, z4.h[0] + sdot z24.d, z26.h, z4.h[0] + tbl v27.16b, {v28.16b}, v30.16b + tbl v28.16b, {v28.16b}, v31.16b + smlal v0.4s, v19.4h, v7.h[3] + smlal2 v1.4s, v19.8h, v7.h[3] + mov v19.16b, v20.16b + + subs w8, w8, #1 + sdot z2.d, z25.h, z4.h[1] + sdot z3.d, z26.h, z4.h[1] + sdot z23.d, z27.h, z4.h[1] + sdot z24.d, z28.h, z4.h[1] + + smlal v0.4s, v20.4h, v7.h[4] + smlal2 v1.4s, v20.8h, v7.h[4] + mov v20.16b, v21.16b + + uzp1 v3.4s, v2.4s, v3.4s + uzp1 v24.4s, v23.4s, v24.4s + smlal v0.4s, v21.4h, v7.h[5] + smlal2 v1.4s, v21.8h, v7.h[5] + mov v21.16b, v22.16b + + srshl v23.4s, v3.4s, v5.4s + srshl v24.4s, v24.4s, v5.4s + smlal v0.4s, v22.4h, v7.h[6] + smlal2 v1.4s, v22.8h, v7.h[6] + + uzp1 v22.8h, v23.8h, v24.8h + smlal v0.4s, v22.4h, v7.h[7] + smlal2 v1.4s, v22.8h, v7.h[7] + +.ifc \type, prep + rshrn v0.4h, v0.4s, #6 + rshrn2 v0.8h, v1.4s, #6 + sub z0.h, z0.h, #PREP_BIAS +.else // put + srshl v0.4s, v0.4s, v6.4s + srshl v1.4s, v1.4s, v6.4s + sqxtun v0.4h, v0.4s + sqxtun2 v0.8h, v1.4s + umin v0.8h, v0.8h, v29.8h +.endif + st1 {v0.8h}, [\ldst], \d_strd + b.gt 8b + + subs \w, \w, #8 + add \src, \src, #16 + add \dst, \dst, #16 + b.gt 81b + ret x15 + + .align JUMP_ALIGN +40: // HV8 - 4xN, put only: 2xN +.ifc \type, put + lsr \d_strd, \d_strd, #1 // hword index for `st1h` + whilelt p1.h, wzr, \w // masking for writes +.endif + ext v4.16b, v4.16b, v4.16b, #4 // [\xmy + 2 * 2] + add \src, \src, #4 + + cmp \h, #4 + b.le 44f + + sub \src, \src, \s_strd, lsl #1 // src - 3 * s_strd - 3 * 2 + bl L(\type\()_hv_filter4_\isa) + xtn v16.4h, v0.4s + bl L(\type\()_hv_filter4_\isa) + xtn v17.4h, v0.4s + bl L(\type\()_hv_filter4_\isa) + xtn v18.4h, 
v0.4s + bl L(\type\()_hv_filter4_\isa) + xtn v19.4h, v0.4s + bl L(\type\()_hv_filter4_\isa) + xtn v20.4h, v0.4s + bl L(\type\()_hv_filter4_\isa) + xtn v21.4h, v0.4s + bl L(\type\()_hv_filter4_\isa) + xtn v22.4h, v0.4s + + .align LOOP_ALIGN +4: + ld1 {v3.16b}, [\src], \s_strd + + smull v24.4s, v16.4h, v7.h[0] + smlal v24.4s, v17.4h, v7.h[1] + tbl v2.16b, {v3.16b}, v30.16b + tbl v3.16b, {v3.16b}, v31.16b + movi v0.2d, #0 + movi v1.2d, #0 + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + smlal v24.4s, v18.4h, v7.h[2] + smlal v24.4s, v19.4h, v7.h[3] + sdot z0.d, z2.h, z4.h[0] + sdot z1.d, z3.h, z4.h[0] + mov v18.16b, v19.16b + mov v19.16b, v20.16b + uzp1 v0.4s, v0.4s, v1.4s + + smlal v24.4s, v20.4h, v7.h[4] + smlal v24.4s, v21.4h, v7.h[5] + srshl v0.4s, v0.4s, v5.4s + mov v20.16b, v21.16b + mov v21.16b, v22.16b + + subs \h, \h, #1 + smlal v24.4s, v22.4h, v7.h[6] + xtn v22.4h, v0.4s + smlal v24.4s, v22.4h, v7.h[7] + +.ifc \type, prep + rshrn v0.4h, v24.4s, #6 + sub z0.h, z0.h, #PREP_BIAS + str d0, [\dst], #8 +.else // put + srshl v0.4s, v24.4s, v6.4s + sqxtun v0.4h, v0.4s + umin v0.4h, v0.4h, v29.4h + st1h {z0.h}, p1, [\dst] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 4b + ret x15 + + .align JUMP_ALIGN +L(\type\()_6tap_hv_\isa): + cmp \w, #4 + b.le 46f + + // .align JUMP_ALIGN // fallthrough +80: // HV6 - 8xN+ +.ifc \type, prep + add \wd_strd, \w, \w // d_strd = 2 * w +.endif + cmp \h, #4 + b.le 84f + sub \src, \src, \s_strd // src - 2 * s_strd - 3 * 2 + + .align LOOP_ALIGN +81: + mov \lsrc, \src + mov \ldst, \dst + mov w8, \h + + bl L(\type\()_hv_filter8_\isa) + uzp1 v16.8h, v23.8h, v24.8h + bl L(\type\()_hv_filter8_\isa) + uzp1 v17.8h, v23.8h, v24.8h + bl L(\type\()_hv_filter8_\isa) + uzp1 v18.8h, v23.8h, v24.8h + bl L(\type\()_hv_filter8_\isa) + uzp1 v19.8h, v23.8h, v24.8h + bl L(\type\()_hv_filter8_\isa) + uzp1 v20.8h, v23.8h, v24.8h + + .align LOOP_ALIGN +8: + ldp q24, q28, [\lsrc] + + smull v0.4s, v16.4h, v7.h[1] + smull2 v1.4s, v16.8h, v7.h[1] + mov v16.16b, v17.16b + + tbl v23.16b, {v24.16b}, v30.16b + tbl v24.16b, {v24.16b}, v31.16b + movi v2.2d, #0 + movi v3.2d, #0 + + ldur q26, [\lsrc, #8] + add \lsrc, \lsrc, \s_strd + + sdot z2.d, z23.h, z4.h[0] + sdot z3.d, z24.h, z4.h[0] + tbl v25.16b, {v26.16b}, v30.16b + tbl v26.16b, {v26.16b}, v31.16b + movi v23.2d, #0 + movi v24.2d, #0 + + sdot z23.d, z25.h, z4.h[0] + sdot z24.d, z26.h, z4.h[0] + tbl v27.16b, {v28.16b}, v30.16b + tbl v28.16b, {v28.16b}, v31.16b + smlal v0.4s, v17.4h, v7.h[2] + smlal2 v1.4s, v17.8h, v7.h[2] + mov v17.16b, v18.16b + + sdot z2.d, z25.h, z4.h[1] + sdot z3.d, z26.h, z4.h[1] + sdot z23.d, z27.h, z4.h[1] + sdot z24.d, z28.h, z4.h[1] + + smlal v0.4s, v18.4h, v7.h[3] + smlal2 v1.4s, v18.8h, v7.h[3] + mov v18.16b, v19.16b + + uzp1 v3.4s, v2.4s, v3.4s + uzp1 v24.4s, v23.4s, v24.4s + smlal v0.4s, v19.4h, v7.h[4] + smlal2 v1.4s, v19.8h, v7.h[4] + mov v19.16b, v20.16b + + srshl v23.4s, v3.4s, v5.4s + srshl v24.4s, v24.4s, v5.4s + smlal v0.4s, v20.4h, v7.h[5] + smlal2 v1.4s, v20.8h, v7.h[5] + + subs w8, w8, #1 + uzp1 v20.8h, v23.8h, v24.8h + smlal v0.4s, v20.4h, v7.h[6] + smlal2 v1.4s, v20.8h, v7.h[6] + +.ifc \type, prep + rshrn v0.4h, v0.4s, #6 + rshrn2 v0.8h, v1.4s, #6 + sub z0.h, z0.h, #PREP_BIAS +.else // put + srshl v0.4s, v0.4s, v6.4s + srshl v1.4s, v1.4s, v6.4s + sqxtun v0.4h, v0.4s + sqxtun2 v0.8h, v1.4s + umin v0.8h, v0.8h, v29.8h +.endif + st1 {v0.8h}, [\ldst], \d_strd + b.gt 8b + + add \dst, \dst, #16 + subs \w, \w, #8 + add \src, \src, #16 + b.gt 81b + ret x15 + + .align LOOP_ALIGN +84: // HV4 - 8x4, 8x2 + 
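Both HV store epilogues above follow the same pattern: the `prep` variant round-shifts the accumulator to the intermediate precision and subtracts PREP_BIAS (8192) so the value fits the signed 16-bit intermediate buffer, while the `put` variant round-shifts, saturates to unsigned and clamps against the broadcast bitdepth_max. A scalar sketch, assuming the shift amounts follow the v5/v6 setup earlier in the macro (names are illustrative):

    #include <stdint.h>

    #define PREP_BIAS 8192

    static int16_t store_prep(int32_t acc, int shift)
    {
        int32_t v = (acc + (1 << (shift - 1))) >> shift; /* rshrn / srshl       */
        return (int16_t)(v - PREP_BIAS);                 /* sub ..., #PREP_BIAS */
    }

    static uint16_t store_put(int32_t acc, int shift, int bitdepth_max)
    {
        int32_t v = (acc + (1 << (shift - 1))) >> shift; /* srshl by v6          */
        if (v < 0) v = 0;                                /* sqxtun saturates low */
        if (v > bitdepth_max) v = bitdepth_max;          /* umin vs. bdmax       */
        return (uint16_t)v;
    }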
mov \lsrc, \src + mov \ldst, \dst + mov w8, \h + + bl L(\type\()_hv_filter8_\isa) + uzp1 v17.8h, v23.8h, v24.8h + bl L(\type\()_hv_filter8_\isa) + uzp1 v18.8h, v23.8h, v24.8h + bl L(\type\()_hv_filter8_\isa) + uzp1 v19.8h, v23.8h, v24.8h + + .align LOOP_ALIGN +81: + ldp q24, q28, [\lsrc] + ldur q26, [\lsrc, #8] + add \lsrc, \lsrc, \s_strd + + tbl v23.16b, {v24.16b}, v30.16b + tbl v24.16b, {v24.16b}, v31.16b + movi v2.2d, #0 + movi v3.2d, #0 + sdot z2.d, z23.h, z4.h[0] + sdot z3.d, z24.h, z4.h[0] + + tbl v25.16b, {v26.16b}, v30.16b + tbl v26.16b, {v26.16b}, v31.16b + movi v23.2d, #0 + movi v24.2d, #0 + sdot z23.d, z25.h, z4.h[0] + sdot z24.d, z26.h, z4.h[0] + + tbl v27.16b, {v28.16b}, v30.16b + tbl v28.16b, {v28.16b}, v31.16b + sdot z2.d, z25.h, z4.h[1] + sdot z3.d, z26.h, z4.h[1] + sdot z23.d, z27.h, z4.h[1] + sdot z24.d, z28.h, z4.h[1] + + smull v0.4s, v17.4h, v7.h[2] + smull2 v1.4s, v17.8h, v7.h[2] + mov v17.16b, v18.16b + + subs w8, w8, #1 + uzp1 v3.4s, v2.4s, v3.4s + uzp1 v24.4s, v23.4s, v24.4s + smlal v0.4s, v18.4h, v7.h[3] + smlal2 v1.4s, v18.8h, v7.h[3] + mov v18.16b, v19.16b + + srshl v23.4s, v3.4s, v5.4s + srshl v24.4s, v24.4s, v5.4s + smlal v0.4s, v19.4h, v7.h[4] + smlal2 v1.4s, v19.8h, v7.h[4] + + uzp1 v19.8h, v23.8h, v24.8h + smlal v0.4s, v19.4h, v7.h[5] + smlal2 v1.4s, v19.8h, v7.h[5] + +.ifc \type, prep + rshrn v0.4h, v0.4s, #6 + rshrn2 v0.8h, v1.4s, #6 + sub z0.h, z0.h, #PREP_BIAS +.else // put + srshl v0.4s, v0.4s, v6.4s + srshl v1.4s, v1.4s, v6.4s + sqxtun v0.4h, v0.4s + sqxtun2 v0.8h, v1.4s + umin v0.8h, v0.8h, v29.8h +.endif + st1 {v0.8h}, [\ldst], \d_strd + b.gt 81b + + subs \w, \w, #8 + add \dst, \dst, #16 + add \src, \src, #16 + b.gt 84b + ret x15 + + .align FUNC_ALIGN +L(\type\()_hv_filter8_\isa): + ldp q24, q28, [\lsrc] + ldur q26, [\lsrc, #8] + add \lsrc, \lsrc, \s_strd + + tbl v23.16b, {v24.16b}, v30.16b + tbl v24.16b, {v24.16b}, v31.16b + movi v2.2d, #0 + movi v3.2d, #0 + sdot z2.d, z23.h, z4.h[0] + sdot z3.d, z24.h, z4.h[0] + + tbl v25.16b, {v26.16b}, v30.16b + tbl v26.16b, {v26.16b}, v31.16b + movi v23.2d, #0 + movi v24.2d, #0 + sdot z23.d, z25.h, z4.h[0] + sdot z24.d, z26.h, z4.h[0] + + tbl v27.16b, {v28.16b}, v30.16b + tbl v28.16b, {v28.16b}, v31.16b + sdot z2.d, z25.h, z4.h[1] + sdot z3.d, z26.h, z4.h[1] + sdot z23.d, z27.h, z4.h[1] + sdot z24.d, z28.h, z4.h[1] + + uzp1 v3.4s, v2.4s, v3.4s + uzp1 v24.4s, v23.4s, v24.4s + srshl v23.4s, v3.4s, v5.4s + srshl v24.4s, v24.4s, v5.4s + ret + + .align FUNC_ALIGN +L(\type\()_hv_filter4_\isa): + ld1 {v3.16b}, [\src], \s_strd + + tbl v2.16b, {v3.16b}, v30.16b + tbl v3.16b, {v3.16b}, v31.16b + movi v0.2d, #0 + movi v1.2d, #0 + sdot z0.d, z2.h, z4.h[0] + sdot z1.d, z3.h, z4.h[0] + + uzp1 v0.4s, v0.4s, v1.4s + srshl v0.4s, v0.4s, v5.4s + ret + + .align JUMP_ALIGN +46: // H4V6 - 4xN, put only: 2xN +.ifc \type, put + lsr \d_strd, \d_strd, #1 // hword index for `st1h` + whilelt p1.h, wzr, \w // masking for writes +.endif + ext v4.16b, v4.16b, v4.16b, #4 // [\xmy + 2 * 2] + add \src, \src, #4 + + cmp \h, #4 + b.le 44f + + sub \src, \src, \s_strd // src - 2 * s_strd - 3 * 2 + bl L(\type\()_hv_filter4_\isa) + xtn v16.4h, v0.4s + bl L(\type\()_hv_filter4_\isa) + xtn v17.4h, v0.4s + bl L(\type\()_hv_filter4_\isa) + xtn v18.4h, v0.4s + bl L(\type\()_hv_filter4_\isa) + xtn v19.4h, v0.4s + bl L(\type\()_hv_filter4_\isa) + xtn v20.4h, v0.4s + + .align LOOP_ALIGN +4: + ld1 {v3.16b}, [\src], \s_strd + smull v24.4s, v16.4h, v7.h[1] + smlal v24.4s, v17.4h, v7.h[2] + + tbl v2.16b, {v3.16b}, v30.16b + tbl v3.16b, {v3.16b}, v31.16b + movi 
v0.2d, #0 + movi v1.2d, #0 + sdot z0.d, z2.h, z4.h[0] + sdot z1.d, z3.h, z4.h[0] + + mov v16.16b, v17.16b + mov v17.16b, v18.16b + smlal v24.4s, v18.4h, v7.h[3] + smlal v24.4s, v19.4h, v7.h[4] + uzp1 v0.4s, v0.4s, v1.4s + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + subs \h, \h, #1 + srshl v0.4s, v0.4s, v5.4s + smlal v24.4s, v20.4h, v7.h[5] + xtn v20.4h, v0.4s + smlal v24.4s, v20.4h, v7.h[6] + +.ifc \type, prep + rshrn v0.4h, v24.4s, #6 + sub z0.h, z0.h, #PREP_BIAS + str d0, [\dst], #8 +.else // put + srshl v0.4s, v24.4s, v6.4s + sqxtun v0.4h, v0.4s + umin v0.4h, v0.4h, v29.4h + st1h {z0.h}, p1, [\dst] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 4b + ret x15 + + .align JUMP_ALIGN +44: // H4V4 - 4x4, put only: 4x2, 2x4, 2x2 + bl L(\type\()_hv_filter4_\isa) + xtn v17.4h, v0.4s + bl L(\type\()_hv_filter4_\isa) + xtn v18.4h, v0.4s + bl L(\type\()_hv_filter4_\isa) + xtn v19.4h, v0.4s + + .align LOOP_ALIGN +4: + ld1 {v3.16b}, [\src], \s_strd + smull v24.4s, v17.4h, v7.h[2] + smlal v24.4s, v18.4h, v7.h[3] + + tbl v2.16b, {v3.16b}, v30.16b + tbl v3.16b, {v3.16b}, v31.16b + movi v0.2d, #0 + movi v1.2d, #0 + sdot z0.d, z2.h, z4.h[0] + sdot z1.d, z3.h, z4.h[0] + uzp1 v0.4s, v0.4s, v1.4s + + mov v17.16b, v18.16b + mov v18.16b, v19.16b + subs \h, \h, #1 + srshl v0.4s, v0.4s, v5.4s + smlal v24.4s, v19.4h, v7.h[4] + xtn v19.4h, v0.4s + smlal v24.4s, v19.4h, v7.h[5] + +.ifc \type, prep + rshrn v0.4h, v24.4s, #6 + sub z0.h, z0.h, #PREP_BIAS + str d0, [\dst], #8 +.else // put + srshl v0.4s, v24.4s, v6.4s + sqxtun v0.4h, v0.4s + umin v0.4h, v0.4h, v29.4h + st1h {z0.h}, p1, [\dst] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 4b + ret x15 + + .align JUMP_ALIGN +L(\type\()_8tap_h_\isa): + movrel x11, \type\()_8tap_h_\isa\()_tbl + ldrsw x12, [x11, x8, lsl #2] +.ifc \bdmax, w8 + ldr \bdmax, [sp] +.endif +.ifc \type, prep + clz \bdmax, \bdmax + sub \bdmax, \bdmax, #24 + dup v5.4s, \bdmax +.else // put + mov w9, #34 // rounding for 10-bit case + mov w10, #40 // rounding for 12-bit case + cmp \bdmax, #0xFFF + csel w9, w9, w10, ne // select rounding based on \bdmax + dup v5.8h, \bdmax + dup v6.2d, x9 +.endif + add x11, x11, x12 + ld1sb {z4.h}, p0/z, [\xmx] + br x11 + + .align JUMP_ALIGN +20: // H - 4xN, put only: 2xN +40: + AARCH64_VALID_JUMP_TARGET + add \src, \src, #4 // src - 1 * 2 + ext v4.16b, v4.16b, v4.16b, #4 // [\xmy + 2 * 2] +.ifc \type, put + lsr \d_strd, \d_strd, #1 // hword index for `st1h` + whilelt p1.h, wzr, \w // masking for writes +.endif + .align LOOP_ALIGN +4: + ldr q17, [\src] + ldr q19, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + +.ifc \type, prep + movi v0.2d, #0 + movi v1.2d, #0 + movi v2.2d, #0 + movi v3.2d, #0 +.else + mov v0.16b, v6.16b + mov v1.16b, v6.16b + mov v2.16b, v6.16b + mov v3.16b, v6.16b +.endif + tbl v16.16b, {v17.16b}, v30.16b + tbl v17.16b, {v17.16b}, v31.16b + sdot z0.d, z16.h, z4.h[0] + sdot z1.d, z17.h, z4.h[0] + subs \h, \h, #2 + tbl v18.16b, {v19.16b}, v30.16b + tbl v19.16b, {v19.16b}, v31.16b + sdot z2.d, z18.h, z4.h[0] + sdot z3.d, z19.h, z4.h[0] + + uzp1 v0.4s, v0.4s, v1.4s + uzp1 v1.4s, v2.4s, v3.4s +.ifc \type, prep + srshl v0.4s, v0.4s, v5.4s + srshl v1.4s, v1.4s, v5.4s + uzp1 v0.8h, v0.8h, v1.8h + sub z0.h, z0.h, #PREP_BIAS + str q0, [\dst], #16 +.else // put + sqshrun v0.4h, v0.4s, #6 + sqshrun v1.4h, v1.4s, #6 + umin v0.4h, v0.4h, v5.4h + umin v1.4h, v1.4h, v5.4h + st1h {z0.h}, p1, [\dst] + st1h {z1.h}, p1, [\dst, \d_strd, lsl #1] + add \dst, \dst, \d_strd, lsl #2 +.endif + b.gt 4b + ret + + .align JUMP_ALIGN +80: // H - 8xN + 
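For the narrow columns above (the `40:` horizontal block and its put-only 2xN case), one predicated store replaces separate 2xN and 4xN paths: `whilelt p1.h, wzr, \w` activates only the first `w` 16-bit lanes and `st1h {z0.h}, p1, [...]` writes just those (`\d_strd` is pre-halved because that addressing mode scales the index by the element size). A scalar sketch of the masking, assuming 128-bit vectors (names are illustrative):

    #include <stdint.h>

    /* whilelt p1.h, wzr, w   -> lane i is active iff i < w
     * st1h {z0.h}, p1, [dst] -> store only the active lanes */
    static void store_predicated(uint16_t *dst, const uint16_t lanes[8], int w)
    {
        for (int i = 0; i < 8; i++)   /* 8 halfword lanes in a 128-bit vector */
            if (i < w)                /* predicate from whilelt               */
                dst[i] = lanes[i];    /* masked st1h                          */
    }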
AARCH64_VALID_JUMP_TARGET + + .align LOOP_ALIGN +8: + ldp q17, q21, [\src] + ldur q19, [\src, #8] + +.ifc \type, prep + movi v0.2d, #0 + movi v2.2d, #0 +.else + mov v0.16b, v6.16b + mov v2.16b, v6.16b +.endif + tbl v16.16b, {v17.16b}, v30.16b + tbl v17.16b, {v17.16b}, v31.16b + add \src, \src, \s_strd + sdot z0.d, z16.h, z4.h[0] + sdot z2.d, z17.h, z4.h[0] + + tbl v18.16b, {v19.16b}, v30.16b + tbl v19.16b, {v19.16b}, v31.16b +.ifc \type, prep + movi v16.2d, #0 + movi v17.2d, #0 +.else + mov v16.16b, v6.16b + mov v17.16b, v6.16b +.endif + ldp q23, q27, [\src] + ldur q25, [\src, #8] + + sdot z16.d, z18.h, z4.h[0] + sdot z17.d, z19.h, z4.h[0] + + tbl v22.16b, {v23.16b}, v30.16b + tbl v23.16b, {v23.16b}, v31.16b +.ifc \type, prep + movi v1.2d, #0 + movi v3.2d, #0 +.else + mov v1.16b, v6.16b + mov v3.16b, v6.16b +.endif + add \src, \src, \s_strd + sdot z1.d, z22.h, z4.h[0] + sdot z3.d, z23.h, z4.h[0] + + tbl v24.16b, {v25.16b}, v30.16b + tbl v25.16b, {v25.16b}, v31.16b +.ifc \type, prep + movi v22.2d, #0 + movi v23.2d, #0 +.else + mov v22.16b, v6.16b + mov v23.16b, v6.16b +.endif + sdot z22.d, z24.h, z4.h[0] + sdot z23.d, z25.h, z4.h[0] + + tbl v20.16b, {v21.16b}, v30.16b + tbl v21.16b, {v21.16b}, v31.16b + sdot z0.d, z18.h, z4.h[1] + sdot z2.d, z19.h, z4.h[1] + tbl v26.16b, {v27.16b}, v30.16b + tbl v27.16b, {v27.16b}, v31.16b + sdot z16.d, z20.h, z4.h[1] + sdot z17.d, z21.h, z4.h[1] + + sdot z1.d, z24.h, z4.h[1] + sdot z3.d, z25.h, z4.h[1] + + sdot z22.d, z26.h, z4.h[1] + sdot z23.d, z27.h, z4.h[1] + + subs \h, \h, #2 + uzp1 v0.4s, v0.4s, v2.4s + uzp1 v2.4s, v16.4s, v17.4s + uzp1 v1.4s, v1.4s, v3.4s + uzp1 v3.4s, v22.4s, v23.4s +.ifc \type, prep + srshl v0.4s, v0.4s, v5.4s + srshl v2.4s, v2.4s, v5.4s + srshl v1.4s, v1.4s, v5.4s + srshl v3.4s, v3.4s, v5.4s + uzp1 v0.8h, v0.8h, v2.8h + uzp1 v1.8h, v1.8h, v3.8h + sub z0.h, z0.h, #PREP_BIAS + sub z1.h, z1.h, #PREP_BIAS + stp q0, q1, [\dst], #32 +.else // put + sqshrun v0.4h, v0.4s, #6 + sqshrun2 v0.8h, v2.4s, #6 + sqshrun v1.4h, v1.4s, #6 + sqshrun2 v1.8h, v3.4s, #6 + umin v0.8h, v0.8h, v5.8h + umin v1.8h, v1.8h, v5.8h + st1 {v0.16b}, [\dst], \d_strd + st1 {v1.16b}, [\dst], \d_strd +.endif + b.gt 8b + ret + + .align JUMP_ALIGN +160: // H - 16xN + AARCH64_VALID_JUMP_TARGET + + .align LOOP_ALIGN +16: + ldp q17, q21, [\src] + ldur q19, [\src, #8] + +.ifc \type, prep + movi v0.2d, #0 + movi v2.2d, #0 +.else + mov v0.16b, v6.16b + mov v2.16b, v6.16b +.endif + tbl v16.16b, {v17.16b}, v30.16b + tbl v17.16b, {v17.16b}, v31.16b + sdot z0.d, z16.h, z4.h[0] + sdot z2.d, z17.h, z4.h[0] + + tbl v18.16b, {v19.16b}, v30.16b + tbl v19.16b, {v19.16b}, v31.16b +.ifc \type, prep + movi v16.2d, #0 + movi v17.2d, #0 +.else + mov v16.16b, v6.16b + mov v17.16b, v6.16b +.endif + ldur q25, [\src, #24] + ldr q27, [\src, #32] + + sdot z16.d, z18.h, z4.h[0] + sdot z17.d, z19.h, z4.h[0] + + tbl v22.16b, {v21.16b}, v30.16b + tbl v23.16b, {v21.16b}, v31.16b +.ifc \type, prep + movi v1.2d, #0 + movi v3.2d, #0 +.else + mov v1.16b, v6.16b + mov v3.16b, v6.16b +.endif + add \src, \src, \s_strd + sdot z1.d, z22.h, z4.h[0] + sdot z3.d, z23.h, z4.h[0] + + tbl v24.16b, {v25.16b}, v30.16b + tbl v25.16b, {v25.16b}, v31.16b +.ifc \type, prep + movi v22.2d, #0 + movi v23.2d, #0 +.else + mov v22.16b, v6.16b + mov v23.16b, v6.16b +.endif + sdot z22.d, z24.h, z4.h[0] + sdot z23.d, z25.h, z4.h[0] + + tbl v20.16b, {v21.16b}, v30.16b + tbl v21.16b, {v21.16b}, v31.16b + sdot z0.d, z18.h, z4.h[1] + sdot z2.d, z19.h, z4.h[1] + tbl v26.16b, {v27.16b}, v30.16b + tbl v27.16b, {v27.16b}, v31.16b + sdot 
z16.d, z20.h, z4.h[1] + sdot z17.d, z21.h, z4.h[1] + + sdot z1.d, z24.h, z4.h[1] + sdot z3.d, z25.h, z4.h[1] + + sdot z22.d, z26.h, z4.h[1] + sdot z23.d, z27.h, z4.h[1] + + subs \h, \h, #1 + uzp1 v0.4s, v0.4s, v2.4s + uzp1 v2.4s, v16.4s, v17.4s + uzp1 v1.4s, v1.4s, v3.4s + uzp1 v3.4s, v22.4s, v23.4s +.ifc \type, prep + srshl v0.4s, v0.4s, v5.4s + srshl v2.4s, v2.4s, v5.4s + srshl v1.4s, v1.4s, v5.4s + srshl v3.4s, v3.4s, v5.4s + uzp1 v0.8h, v0.8h, v2.8h + uzp1 v1.8h, v1.8h, v3.8h + sub z0.h, z0.h, #PREP_BIAS + sub z1.h, z1.h, #PREP_BIAS + stp q0, q1, [\dst], #32 +.else // put + sqshrun v0.4h, v0.4s, #6 + sqshrun2 v0.8h, v2.4s, #6 + sqshrun v1.4h, v1.4s, #6 + sqshrun2 v1.8h, v3.4s, #6 + umin v0.8h, v0.8h, v5.8h + umin v1.8h, v1.8h, v5.8h + st1 {v0.16b, v1.16b}, [\dst], \d_strd +.endif + b.gt 16b + ret + + .align JUMP_ALIGN +320: // H - 32xN+ +640: +1280: + AARCH64_VALID_JUMP_TARGET +.ifc \type, put + sub \d_strd, \d_strd, \w, uxtw #1 +.endif + sub \s_strd, \s_strd, \w, uxtw #1 + mov w8, \w + + .align LOOP_ALIGN +32: + ldp q17, q21, [\src] + ldur q19, [\src, #8] + +.ifc \type, prep + movi v0.2d, #0 + movi v2.2d, #0 +.else + mov v0.16b, v6.16b + mov v2.16b, v6.16b +.endif + tbl v16.16b, {v17.16b}, v30.16b + tbl v17.16b, {v17.16b}, v31.16b + sdot z0.d, z16.h, z4.h[0] + sdot z2.d, z17.h, z4.h[0] + + tbl v18.16b, {v19.16b}, v30.16b + tbl v19.16b, {v19.16b}, v31.16b +.ifc \type, prep + movi v16.2d, #0 + movi v17.2d, #0 +.else + mov v16.16b, v6.16b + mov v17.16b, v6.16b +.endif + ldur q25, [\src, #24] + + sdot z16.d, z18.h, z4.h[0] + sdot z17.d, z19.h, z4.h[0] + + ldr q27, [\src, #32]! + + tbl v22.16b, {v21.16b}, v30.16b + tbl v23.16b, {v21.16b}, v31.16b +.ifc \type, prep + movi v1.2d, #0 + movi v3.2d, #0 +.else + mov v1.16b, v6.16b + mov v3.16b, v6.16b +.endif + sdot z1.d, z22.h, z4.h[0] + sdot z3.d, z23.h, z4.h[0] + + tbl v24.16b, {v25.16b}, v30.16b + tbl v25.16b, {v25.16b}, v31.16b +.ifc \type, prep + movi v22.2d, #0 + movi v23.2d, #0 +.else + mov v22.16b, v6.16b + mov v23.16b, v6.16b +.endif + sdot z22.d, z24.h, z4.h[0] + sdot z23.d, z25.h, z4.h[0] + + tbl v20.16b, {v21.16b}, v30.16b + tbl v21.16b, {v21.16b}, v31.16b + sdot z0.d, z18.h, z4.h[1] + sdot z2.d, z19.h, z4.h[1] + tbl v26.16b, {v27.16b}, v30.16b + tbl v27.16b, {v27.16b}, v31.16b + sdot z16.d, z20.h, z4.h[1] + sdot z17.d, z21.h, z4.h[1] + + sdot z1.d, z24.h, z4.h[1] + sdot z3.d, z25.h, z4.h[1] + + sdot z22.d, z26.h, z4.h[1] + sdot z23.d, z27.h, z4.h[1] + + subs w8, w8, #16 + uzp1 v0.4s, v0.4s, v2.4s + uzp1 v2.4s, v16.4s, v17.4s + uzp1 v1.4s, v1.4s, v3.4s + uzp1 v3.4s, v22.4s, v23.4s +.ifc \type, prep + srshl v0.4s, v0.4s, v5.4s + srshl v2.4s, v2.4s, v5.4s + srshl v1.4s, v1.4s, v5.4s + srshl v3.4s, v3.4s, v5.4s + uzp1 v0.8h, v0.8h, v2.8h + uzp1 v1.8h, v1.8h, v3.8h + sub z0.h, z0.h, #PREP_BIAS + sub z1.h, z1.h, #PREP_BIAS +.else // put + sqshrun v0.4h, v0.4s, #6 + sqshrun2 v0.8h, v2.4s, #6 + sqshrun v1.4h, v1.4s, #6 + sqshrun2 v1.8h, v3.4s, #6 + umin v0.8h, v0.8h, v5.8h + umin v1.8h, v1.8h, v5.8h +.endif + stp q0, q1, [\dst], #32 + b.gt 32b + + add \src, \src, \s_strd +.ifc \type, put + add \dst, \dst, \d_strd +.endif + subs \h, \h, #1 + mov w8, \w + b.gt 32b + ret +endfunc + +jumptable \type\()_8tap_h_\isa\()_tbl + .word 1280b - \type\()_8tap_h_\isa\()_tbl + .word 640b - \type\()_8tap_h_\isa\()_tbl + .word 320b - \type\()_8tap_h_\isa\()_tbl + .word 160b - \type\()_8tap_h_\isa\()_tbl + .word 80b - \type\()_8tap_h_\isa\()_tbl + .word 40b - \type\()_8tap_h_\isa\()_tbl +.ifc \type, put + .word 20b - \type\()_8tap_h_\isa\()_tbl +.endif 
+endjumptable +.endm + + +function prep_sve + movrel x9, prep_tbl + mov w6, #19 + ldrsw x8, [x9, x8, lsl #2] + sub w6, w6, w7, lsr #8 // 19 - bdmax / 256 + add x9, x9, x8 + movi v30.8h, #PREP_BIAS_NEG + dup v29.8h, w6 // 10b: 1 << 4, 12b: 1 << 2 + br x9 + + .align JUMP_ALIGN +40: // prep - 4xN + AARCH64_VALID_JUMP_TARGET + + .align LOOP_ALIGN +4: + ldr d0, [x1] + ldr d1, [x1, x2] + add x1, x1, x2, lsl #1 + subs w4, w4, #2 + mad z0.h, p0/m, z29.h, z30.h + mad z1.h, p0/m, z29.h, z30.h + stp d0, d1, [x0], #16 + b.gt 4b + ret + + .align JUMP_ALIGN +80: // prep - 8xN + AARCH64_VALID_JUMP_TARGET + + .align LOOP_ALIGN +8: + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x1], x2 + subs w4, w4, #2 + mad z0.h, p0/m, z29.h, z30.h + mad z1.h, p0/m, z29.h, z30.h + stp q0, q1, [x0], #32 + b.gt 8b + ret + + .align JUMP_ALIGN +160: // prep - 16xN + AARCH64_VALID_JUMP_TARGET + + .align LOOP_ALIGN +16: + ld1 {v0.8h, v1.8h}, [x1], x2 + mad z0.h, p0/m, z29.h, z30.h + mad z1.h, p0/m, z29.h, z30.h + subs w4, w4, #2 + ld1 {v2.8h, v3.8h}, [x1], x2 + mad z2.h, p0/m, z29.h, z30.h + mad z3.h, p0/m, z29.h, z30.h + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + add x0, x0, #64 + b.gt 16b + ret + + .align JUMP_ALIGN +320: // prep - 32xN + AARCH64_VALID_JUMP_TARGET + + .align LOOP_ALIGN +32: + ldp q0, q1, [x1] + mad z0.h, p0/m, z29.h, z30.h + mad z1.h, p0/m, z29.h, z30.h + ldp q2, q3, [x1, #32] + subs w4, w4, #1 + mad z2.h, p0/m, z29.h, z30.h + mad z3.h, p0/m, z29.h, z30.h + add x1, x1, x2 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + add x0, x0, #64 + b.gt 32b + ret + + .align JUMP_ALIGN +640: // prep - 64xN + AARCH64_VALID_JUMP_TARGET + + .align LOOP_ALIGN +64: + ldp q0, q1, [x1] + mad z0.h, p0/m, z29.h, z30.h + mad z1.h, p0/m, z29.h, z30.h + ldp q2, q3, [x1, #32] + mad z2.h, p0/m, z29.h, z30.h + mad z3.h, p0/m, z29.h, z30.h + ldp q4, q5, [x1, #64] + mad z4.h, p0/m, z29.h, z30.h + mad z5.h, p0/m, z29.h, z30.h + ldp q6, q7, [x1, #96] + add x1, x1, x2 + subs w4, w4, #1 + mad z6.h, p0/m, z29.h, z30.h + mad z7.h, p0/m, z29.h, z30.h + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x0, x0, #128 + b.gt 64b + ret + + .align JUMP_ALIGN +1280: // prep - 128xN + AARCH64_VALID_JUMP_TARGET + + .align LOOP_ALIGN +128: + ldp q0, q1, [x1] + mad z0.h, p0/m, z29.h, z30.h + mad z1.h, p0/m, z29.h, z30.h + ldp q2, q3, [x1, #32] + mad z2.h, p0/m, z29.h, z30.h + mad z3.h, p0/m, z29.h, z30.h + ldp q4, q5, [x1, #64] + mad z4.h, p0/m, z29.h, z30.h + mad z5.h, p0/m, z29.h, z30.h + ldp q6, q7, [x1, #96] + mad z6.h, p0/m, z29.h, z30.h + mad z7.h, p0/m, z29.h, z30.h + ldp q16, q17, [x1, #128] + mad z16.h, p0/m, z29.h, z30.h + mad z17.h, p0/m, z29.h, z30.h + ldp q18, q19, [x1, #160] + mad z18.h, p0/m, z29.h, z30.h + mad z19.h, p0/m, z29.h, z30.h + ldp q20, q21, [x1, #192] + mad z20.h, p0/m, z29.h, z30.h + mad z21.h, p0/m, z29.h, z30.h + ldp q22, q23, [x1, #224] + add x1, x1, x2 + mad z22.h, p0/m, z29.h, z30.h + mad z23.h, p0/m, z29.h, z30.h + subs w4, w4, #1 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + stp q16, q17, [x0, #128] + stp q18, q19, [x0, #160] + stp q20, q21, [x0, #192] + stp q22, q23, [x0, #224] + add x0, x0, #256 + b.gt 128b + ret +endfunc + +jumptable prep_tbl + .word 1280b - prep_tbl + .word 640b - prep_tbl + .word 320b - prep_tbl + .word 160b - prep_tbl + .word 80b - prep_tbl + .word 40b - prep_tbl +endjumptable + + +// dst(x0), d_strd(x9), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6), bdmax(w7) +// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), 
wd_strd(w9), ws_strd(w2) +filter_8tap_fn prep, sve2, x0, x9, x1, x2, w3, w4, w5, w6, w7, x5, x6, x5, x6, w9, w2 + +// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7), bdmax(w8) +// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1), ws_strd(w3) +filter_8tap_fn put, sve2, x0, x1, x2, x3, w4, w5, w6, w7, w8, x6, x7, x6, x7, w1, w3 + +DISABLE_SVE2 +DISABLE_SVE +#endif // HAVE_SVE2 diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index a3eef59f36f23bca4f577fcd5fe5125869603635..079ff9eb7a1261d5f87cc7b24cf7ab4ef3a2881b 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -45,32 +45,39 @@ ENABLE_DOTPROD #define LOOP_ALIGN 2 -// Lookup table used to help conversion of shifted 32-bit values to 8-bit. - .align 4 -L(hv_tbl_neon_dotprod): - .byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 - -// Shuffle indices to permute horizontal samples in preparation for input to -// SDOT instructions. The 8-tap horizontal convolution uses sample indices in the -// interval of [-3, 4] relative to the current sample position. - .align 4 -L(h_tbl_neon_dotprod): +const h_tbl_neon_dotprod, align=4 + // Shuffle indices to permute horizontal samples in preparation for + // input to SDOT instructions. The 8-tap horizontal convolution uses + // sample indices in the interval of [-3, 4] relative to the current + // sample position. .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 -// Vertical convolutions are also using SDOT instructions, where a 128-bit -// register contains a transposed 4x4 matrix of values. Subsequent iterations of -// the vertical convolution can reuse the 3x4 sub-matrix from the previous loop -// iteration. These shuffle indices shift and merge this 4x4 matrix with the -// values of a new line. - .align 4 -L(v_tbl_neon_dotprod): + // Shuffle indices to permute horizontal samples in preparation for + // input to USMMLA instructions. +#define OFFSET_USMMLA 48 + .byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 + .byte 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 + + // Lookup table used to help conversion of shifted 32-bit values to 8-bit. +#define OFFSET_CVT_32_8 80 + .byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 +endconst + +const v_tbl_neon_dotprod, align=4 + // Vertical convolutions are also using SDOT instructions, where a + // 128-bit register contains a transposed 4x4 matrix of values. + // Subsequent iterations of the vertical convolution can reuse the + // 3x4 sub-matrix from the previous loop iteration. These shuffle + // indices shift and merge this 4x4 matrix with the values of a new + // line. 
.byte 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28 .byte 1, 2, 3, 16, 5, 6, 7, 17, 9, 10, 11, 18, 13, 14, 15, 19 .byte 1, 2, 3, 20, 5, 6, 7, 21, 9, 10, 11, 22, 13, 14, 15, 23 .byte 1, 2, 3, 24, 5, 6, 7, 25, 9, 10, 11, 26, 13, 14, 15, 27 .byte 1, 2, 3, 28, 5, 6, 7, 29, 9, 10, 11, 30, 13, 14, 15, 31 +endconst .macro make_8tap_fn op, type, type_h, type_v, isa, jump=1 @@ -109,24 +116,24 @@ function \type\()_8tap_\isa, align=FUNC_ALIGN .align JUMP_ALIGN L(\type\()_8tap_v_\isa): madd \my, \my, w11, w10 - ldr q6, L(v_tbl_neon_dotprod) + movrel x13, v_tbl_neon_dotprod sub \src, \src, \s_strd .ifc \isa, neon_dotprod .ifc \type, prep - mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding + mov w8, #0x2002 // FILTER_WEIGHT * 128 + rounding dup v4.4s, w8 .else - movi v4.4s, #32, lsl 8 // FILTER_WEIGHT * 128, bias for SDOT + movi v4.4s, #32, lsl #8 // FILTER_WEIGHT * 128, bias for SDOT .endif .endif ubfx w11, \my, #7, #7 and \my, \my, #0x7F - ldr q28, L(v_tbl_neon_dotprod) + 16 + ldp q6, q28, [x13] cmp \h, #4 csel \my, \my, w11, le sub \src, \src, \s_strd, lsl #1 // src - s_strd * 3 add \xmy, x12, \xmy, lsl #3 // subpel V filter address - ldr q29, L(v_tbl_neon_dotprod) + 32 + ldr q29, [x13, #32] .ifc \isa, neon_dotprod movi v5.16b, #128 .endif @@ -137,8 +144,7 @@ L(\type\()_8tap_v_\isa): // .align JUMP_ALIGN // fallthrough 160: // V - 16xN+ - ldr q30, L(v_tbl_neon_dotprod) + 48 - ldr q31, L(v_tbl_neon_dotprod) + 64 + ldp q30, q31, [x13, #48] .ifc \type, prep add \wd_strd, \w, \w .endif @@ -676,18 +682,19 @@ L(\type\()_8tap_v_\isa): L(\type\()_8tap_h_hv_\isa): madd \mx, \mx, w11, w9 madd w14, \my, w11, w10 // for HV - ldr q28, L(h_tbl_neon_dotprod) .ifc \isa, neon_dotprod - mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding + mov w13, #0x2002 // FILTER_WEIGHT * 128 + rounding dup v27.4s, w13 // put H overrides this .endif + movrel x13, h_tbl_neon_dotprod sub \src, \src, #3 // src - 3 - ubfx w9, \mx, #7, #7 + ldr q28, [x13] // for 4-tap & 8-tap H filters + ubfx w15, \mx, #7, #7 and \mx, \mx, #0x7F ubfx w11, w14, #7, #7 // for HV and w14, w14, #0x7F // for HV cmp \w, #4 - csel \mx, \mx, w9, le + csel \mx, \mx, w15, le add \xmx, x12, \xmx, lsl #3 // subpel H filter address .ifc \isa, neon_dotprod movi v24.16b, #128 @@ -702,10 +709,10 @@ L(\type\()_8tap_h_hv_\isa): mov x15, x30 ldr d7, [\xmy] .ifc \type, put - ldr q25, L(hv_tbl_neon_dotprod) -.endif + ldr q25, [x13, #(OFFSET_CVT_32_8)] // LUT to help conversion +.endif // of 32b values to 8b sxtl v7.8h, v7.8b - cmp w10, SHARP1 + cmp w10, #SHARP1 b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1 // HV 8-tap cases @@ -718,8 +725,7 @@ L(\type\()_8tap_h_hv_\isa): // .align JUMP_ALIGN // fallthrough 80: // HV8 - 8xN+ - ldr q29, L(h_tbl_neon_dotprod) + 16 - ldr q30, L(h_tbl_neon_dotprod) + 32 + ldp q29, q30, [x13, #16] ldr d26, [\xmx] .ifc \type, prep add \wd_strd, \w, \w @@ -860,7 +866,7 @@ L(\type\()_8tap_h_hv_\isa): .align JUMP_ALIGN 40: // HV8 - 4xN - ldr s26, [\xmx, #2] + ldur s26, [\xmx, #2] add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) @@ -930,7 +936,7 @@ L(\type\()_8tap_h_hv_\isa): .ifc \type, put .align JUMP_ALIGN 20: // HV8 - 2xN - ldr s26, [\xmx, #2] + ldur s26, [\xmx, #2] add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) @@ -1005,12 +1011,91 @@ L(\type\()_6tap_hv_\isa): // .align JUMP_ALIGN // fallthrough 80: // HV6 - 8xN+ - ldr q29, L(h_tbl_neon_dotprod) + 16 - ldr q30, L(h_tbl_neon_dotprod) + 32 ldr d26, [\xmx] .ifc \type, prep add \wd_strd, \w, \w .endif +.ifc \isa, neon_i8mm + cmp w9, #SHARP1 + b.eq 88f // horizontal == SHARP1 + + ldp 
q29, q30, [x13, #(OFFSET_USMMLA)] + ext v0.8b, v26.8b, v26.8b, #7 + ins v26.d[1], v0.d[0] + + .align LOOP_ALIGN +81: + mov \lsrc, \src + mov \ldst, \dst + mov w8, \h + + bl L(\type\()_hv_filter6_neon_i8mm) + srshr v16.8h, v22.8h, #2 + bl L(\type\()_hv_filter6_neon_i8mm) + srshr v17.8h, v22.8h, #2 + bl L(\type\()_hv_filter6_neon_i8mm) + srshr v18.8h, v22.8h, #2 + bl L(\type\()_hv_filter6_neon_i8mm) + srshr v19.8h, v22.8h, #2 + bl L(\type\()_hv_filter6_neon_i8mm) + srshr v20.8h, v22.8h, #2 + + .align LOOP_ALIGN +8: + ld1 {v23.16b}, [\lsrc], \s_strd + + smull v0.4s, v16.4h, v7.h[1] + smull2 v1.4s, v16.8h, v7.h[1] + mov v16.16b, v17.16b + movi v5.4s, #0 + movi v6.4s, #0 + tbl v2.16b, {v23.16b}, v29.16b + tbl v3.16b, {v23.16b}, v30.16b + + smlal v0.4s, v17.4h, v7.h[2] + smlal2 v1.4s, v17.8h, v7.h[2] + mov v17.16b, v18.16b + + usmmla v5.4s, v2.16b, v26.16b + usmmla v6.4s, v3.16b, v26.16b + + smlal v0.4s, v18.4h, v7.h[3] + smlal2 v1.4s, v18.8h, v7.h[3] + mov v18.16b, v19.16b + subs w8, w8, #1 + + smlal v0.4s, v19.4h, v7.h[4] + smlal2 v1.4s, v19.8h, v7.h[4] + uzp1 v23.8h, v5.8h, v6.8h + mov v19.16b, v20.16b + + smlal v0.4s, v20.4h, v7.h[5] + smlal2 v1.4s, v20.8h, v7.h[5] + srshr v20.8h, v23.8h, #2 + smlal v0.4s, v20.4h, v7.h[6] + smlal2 v1.4s, v20.8h, v7.h[6] + .ifc \type, prep + rshrn v0.4h, v0.4s, #6 + rshrn2 v0.8h, v1.4s, #6 + st1 {v0.8h}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #16 + .else + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + st1 {v0.8b}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #8 + .endif + add \src, \src, #8 + subs \w, \w, #8 + b.gt 81b + ret x15 + + .align JUMP_ALIGN +88: +.endif // neon_i8mm + ldp q29, q30, [x13, #16] .align LOOP_ALIGN 81: @@ -1042,8 +1127,8 @@ L(\type\()_6tap_hv_\isa): .endif .align LOOP_ALIGN 8: - ldr q23, [\xmy] - add \xmy, \xmy, \s_strd + ldr q23, [\lsrc] + add \lsrc, \lsrc, \s_strd smull v0.4s, v16.4h, v7.h[1] smull2 v1.4s, v16.8h, v7.h[1] @@ -1130,6 +1215,20 @@ L(\type\()_hv_filter8_\isa): uzp1 v22.8h, v22.8h, v23.8h ret +.ifc \isa, neon_i8mm + .align FUNC_ALIGN +L(\type\()_hv_filter6_neon_i8mm): + ld1 {v4.16b}, [\lsrc], \s_strd + movi v22.4s, #0 + movi v23.4s, #0 + tbl v2.16b, {v4.16b}, v29.16b + tbl v3.16b, {v4.16b}, v30.16b + usmmla v22.4s, v2.16b, v26.16b + usmmla v23.4s, v3.16b, v26.16b + uzp1 v22.8h, v22.8h, v23.8h + ret +.endif + .align FUNC_ALIGN L(\type\()_hv_filter4_\isa): ld1 {v4.8b}, [\src], \s_strd @@ -1145,7 +1244,7 @@ L(\type\()_hv_filter4_\isa): .align JUMP_ALIGN 40: // HV6 - 4xN - ldr s26, [\xmx, #2] + ldur s26, [\xmx, #2] add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) @@ -1206,7 +1305,7 @@ L(\type\()_hv_filter4_\isa): .ifc \type, put .align JUMP_ALIGN 20: // HV6 - 2xN - ldr s26, [\xmx, #2] + ldur s26, [\xmx, #2] add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) @@ -1266,8 +1365,8 @@ L(\type\()_hv_filter4_\isa): .align JUMP_ALIGN L(\type\()_8tap_h_\isa): - adr x9, L(\type\()_8tap_h_\isa\()_tbl) - ldrh w8, [x9, x8, lsl #1] + movrel x11, \type\()_8tap_h_\isa\()_tbl + ldrsw x8, [x11, x8, lsl #2] .ifc \type, put .ifc \isa, neon_i8mm movi v27.4s, #34 // special rounding @@ -1276,15 +1375,15 @@ L(\type\()_8tap_h_\isa): dup v27.4s, w10 .endif .endif - sub x9, x9, x8 - br x9 + add x11, x11, x8 + br x11 .ifc \type, put .align JUMP_ALIGN 20: // H - 2xN AARCH64_VALID_JUMP_TARGET add \src, \src, #2 - ldr s26, [\xmx, #2] + ldur s26, [\xmx, #2] .align LOOP_ALIGN 2: @@ -1321,7 +1420,7 @@ L(\type\()_8tap_h_\isa): 40: // H - 4xN AARCH64_VALID_JUMP_TARGET add \src, \src, #2 - ldr s26, [\xmx, #2] + ldur s26, [\xmx, #2] 
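In the neon_i8mm 6-tap paths added above, each `usmmla` treats its first operand as a 2x8 matrix of unsigned samples (gathered with `tbl` through the OFFSET_USMMLA indices) and its second operand as a 2x8 matrix of signed filter taps used transposed, accumulating the 2x2 product, i.e. four 8-long dot products, into the four 32-bit lanes of the destination. A scalar model of just that instruction-level step, with the row layout built by the `ext`/`ins` pair and the shuffle tables left out (names are illustrative):

    #include <stdint.h>

    /* acc[i][j] += sum_k samples[i][k] * taps[j][k], unsigned * signed,
     * matching one `usmmla v.4s, vSamples.16b, vTaps.16b`.             */
    static void usmmla_model(int32_t acc[2][2],
                             const uint8_t samples[2][8],
                             const int8_t  taps[2][8])
    {
        for (int i = 0; i < 2; i++)
            for (int j = 0; j < 2; j++)
                for (int k = 0; k < 8; k++)
                    acc[i][j] += samples[i][k] * (int32_t)taps[j][k];
    }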
.align LOOP_ALIGN 4: @@ -1370,9 +1469,63 @@ L(\type\()_8tap_h_\isa): .align JUMP_ALIGN 80: // H - 8xN AARCH64_VALID_JUMP_TARGET - ldr q29, L(h_tbl_neon_dotprod) + 16 - ldr q30, L(h_tbl_neon_dotprod) + 32 ldr d26, [\xmx] +.ifc \isa, neon_i8mm + cmp w9, #SHARP1 + b.eq 88f // horizontal == SHARP1 + + ldp q29, q30, [x13, #(OFFSET_USMMLA)] + ext v0.8b, v26.8b, v26.8b, #7 + ins v26.d[1], v0.d[0] + + .align LOOP_ALIGN +8: + ldr q0, [\src] + ldr q16, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + .ifc \type, prep + movi v4.4s, #0 + movi v5.4s, #0 + movi v20.4s, #0 + movi v21.4s, #0 + .else + mov v4.16b, v27.16b + mov v5.16b, v27.16b + mov v20.16b, v27.16b + mov v21.16b, v27.16b + .endif + tbl v1.16b, {v0.16b}, v29.16b + tbl v2.16b, {v0.16b}, v30.16b + tbl v17.16b, {v16.16b}, v29.16b + tbl v18.16b, {v16.16b}, v30.16b + + usmmla v4.4s, v1.16b, v26.16b + usmmla v5.4s, v2.16b, v26.16b + usmmla v20.4s, v17.16b, v26.16b + usmmla v21.4s, v18.16b, v26.16b + + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v20.8h, v20.8h, v21.8h + .ifc \type, prep + srshr v4.8h, v4.8h, #2 + srshr v20.8h, v20.8h, #2 + subs \h, \h, #2 + stp q4, q20, [\dst], #32 + .else // put + sqshrun v4.8b, v4.8h, #6 + sqshrun v20.8b, v20.8h, #6 + subs \h, \h, #2 + str d4, [\dst] + str d20, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 + .endif + b.gt 8b + ret + + .align JUMP_ALIGN +88: +.endif // neon_i8mm + ldp q29, q30, [x13, #16] .align LOOP_ALIGN 8: @@ -1436,14 +1589,66 @@ L(\type\()_8tap_h_\isa): .align JUMP_ALIGN 160: // H - 16xN AARCH64_VALID_JUMP_TARGET - ldr q29, L(h_tbl_neon_dotprod) + 16 - ldr q30, L(h_tbl_neon_dotprod) + 32 ldr d26, [\xmx] +.ifc \isa, neon_i8mm + cmp w9, #SHARP1 + b.eq 168f // horizontal == SHARP1 + + ldp q29, q30, [x13, #(OFFSET_USMMLA)] + ext v0.8b, v26.8b, v26.8b, #7 + ins v26.d[1], v0.d[0] .align LOOP_ALIGN 16: ldr q16, [\src] - ldr q17, [\src, #12] // avoid 2 register TBL for small cores + ldur q17, [\src, #8] // avoid 2 register TBL for small cores + add \src, \src, \s_strd + .ifc \type, prep + movi v6.4s, #0 + movi v7.4s, #0 + movi v22.4s, #0 + movi v23.4s, #0 + .else + mov v6.16b, v27.16b + mov v7.16b, v27.16b + mov v22.16b, v27.16b + mov v23.16b, v27.16b + .endif + tbl v0.16b, {v16.16b}, v29.16b + tbl v1.16b, {v16.16b}, v30.16b + tbl v2.16b, {v17.16b}, v29.16b + tbl v3.16b, {v17.16b}, v30.16b + + usmmla v6.4s, v0.16b, v26.16b + usmmla v7.4s, v1.16b, v26.16b + usmmla v22.4s, v2.16b, v26.16b + usmmla v23.4s, v3.16b, v26.16b + + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v22.8h, v22.8h, v23.8h + .ifc \type, prep + srshr v6.8h, v6.8h, #2 + srshr v22.8h, v22.8h, #2 + subs \h, \h, #1 + stp q6, q22, [\dst], #32 + .else // put + sqshrun v6.8b, v6.8h, #6 + sqshrun2 v6.16b, v22.8h, #6 + subs \h, \h, #1 + st1 {v6.16b}, [\dst], \d_strd + .endif + b.gt 16b + ret + + .align JUMP_ALIGN +168: +.endif // neon_i8mm + ldp q29, q30, [x13, #16] + + .align LOOP_ALIGN +16: + ldr q16, [\src] + ldur q17, [\src, #12] // avoid 2 register TBL for small cores add \src, \src, \s_strd .ifc \type\()_\isa, prep_neon_i8mm movi v6.4s, #0 @@ -1501,8 +1706,6 @@ L(\type\()_8tap_h_\isa): 640: 1280: AARCH64_VALID_JUMP_TARGET - ldr q29, L(h_tbl_neon_dotprod) + 16 - ldr q30, L(h_tbl_neon_dotprod) + 32 ldr d26, [\xmx] .ifc \type, put sub \d_strd, \d_strd, \w, uxtw @@ -1510,10 +1713,73 @@ L(\type\()_8tap_h_\isa): sub \s_strd, \s_strd, \w, uxtw mov w8, \w +.ifc \isa, neon_i8mm + cmp w9, #SHARP1 + b.eq 328f // horizontal == SHARP1 + + ldp q29, q30, [x13, #(OFFSET_USMMLA)] + ext v0.8b, v26.8b, v26.8b, #7 + ins v26.d[1], v0.d[0] + + .align LOOP_ALIGN +32: + ldr 
q16, [\src] + ldur q17, [\src, #8] // avoid 2 register TBL for small cores + add \src, \src, #16 + .ifc \type, prep + movi v6.4s, #0 + movi v7.4s, #0 + movi v22.4s, #0 + movi v23.4s, #0 + .else + mov v6.16b, v27.16b + mov v7.16b, v27.16b + mov v22.16b, v27.16b + mov v23.16b, v27.16b + .endif + tbl v0.16b, {v16.16b}, v29.16b + tbl v1.16b, {v16.16b}, v30.16b + tbl v2.16b, {v17.16b}, v29.16b + tbl v3.16b, {v17.16b}, v30.16b + + usmmla v6.4s, v0.16b, v26.16b + usmmla v7.4s, v1.16b, v26.16b + usmmla v22.4s, v2.16b, v26.16b + usmmla v23.4s, v3.16b, v26.16b + + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v22.8h, v22.8h, v23.8h + .ifc \type, prep + srshr v6.8h, v6.8h, #2 + srshr v22.8h, v22.8h, #2 + subs w8, w8, #16 + stp q6, q22, [\dst], #32 + .else // put + sqshrun v6.8b, v6.8h, #6 + sqshrun2 v6.16b, v22.8h, #6 + subs w8, w8, #16 + str q6, [\dst], #16 + .endif + b.gt 32b + + add \src, \src, \s_strd + .ifc \type, put + add \dst, \dst, \d_strd + .endif + mov w8, \w + subs \h, \h, #1 + b.gt 32b + ret + + .align JUMP_ALIGN +328: +.endif // neon_i8mm + ldp q29, q30, [x13, #16] + .align LOOP_ALIGN 32: ldr q16, [\src] - ldr q17, [\src, #12] // avoid 2 register TBL for small cores + ldur q17, [\src, #12] // avoid 2 register TBL for small cores add \src, \src, #16 .ifc \type\()_\isa, prep_neon_i8mm movi v6.4s, #0 @@ -1573,19 +1839,19 @@ L(\type\()_8tap_h_\isa): subs \h, \h, #1 b.gt 32b ret +endfunc -L(\type\()_8tap_h_\isa\()_tbl): - .hword (L(\type\()_8tap_h_\isa\()_tbl) - 1280b) - .hword (L(\type\()_8tap_h_\isa\()_tbl) - 640b) - .hword (L(\type\()_8tap_h_\isa\()_tbl) - 320b) - .hword (L(\type\()_8tap_h_\isa\()_tbl) - 160b) - .hword (L(\type\()_8tap_h_\isa\()_tbl) - 80b) - .hword (L(\type\()_8tap_h_\isa\()_tbl) - 40b) +jumptable \type\()_8tap_h_\isa\()_tbl + .word 1280b - \type\()_8tap_h_\isa\()_tbl + .word 640b - \type\()_8tap_h_\isa\()_tbl + .word 320b - \type\()_8tap_h_\isa\()_tbl + .word 160b - \type\()_8tap_h_\isa\()_tbl + .word 80b - \type\()_8tap_h_\isa\()_tbl + .word 40b - \type\()_8tap_h_\isa\()_tbl .ifc \type, put - .hword (L(\type\()_8tap_h_\isa\()_tbl) - 20b) - .hword 0 + .word 20b - \type\()_8tap_h_\isa\()_tbl .endif -endfunc +endjumptable .endm // dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6) diff --git a/src/arm/64/refmvs.S b/src/arm/64/refmvs.S index e905682f47cdfa23cc2a24604c067bade503733b..c75c47890945bf1214da0947a2e48081b13d81d6 100644 --- a/src/arm/64/refmvs.S +++ b/src/arm/64/refmvs.S @@ -34,13 +34,13 @@ function splat_mv_neon, export=1 ld1 {v3.16b}, [x1] clz w3, w3 - adr x5, L(splat_tbl) + movrel x5, splat_tbl sub w3, w3, #26 ext v2.16b, v3.16b, v3.16b, #12 - ldrh w3, [x5, w3, uxtw #1] + ldrsw x3, [x5, w3, uxtw #2] add w2, w2, w2, lsl #1 ext v0.16b, v2.16b, v3.16b, #4 - sub x3, x5, w3, uxtw + add x3, x5, x3 ext v1.16b, v2.16b, v3.16b, #8 lsl w2, w2, #2 ext v2.16b, v2.16b, v3.16b, #12 @@ -80,16 +80,17 @@ function splat_mv_neon, export=1 st1 {v0.16b, v1.16b, v2.16b}, [x1] b.gt 1b ret - -L(splat_tbl): - .hword L(splat_tbl) - 320b - .hword L(splat_tbl) - 160b - .hword L(splat_tbl) - 80b - .hword L(splat_tbl) - 40b - .hword L(splat_tbl) - 20b - .hword L(splat_tbl) - 10b endfunc +jumptable splat_tbl + .word 320b - splat_tbl + .word 160b - splat_tbl + .word 80b - splat_tbl + .word 40b - splat_tbl + .word 20b - splat_tbl + .word 10b - splat_tbl +endjumptable + const mv_tbls, align=4 .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 .byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0 @@ -112,7 +113,7 @@ function save_tmvs_neon, export=1 movi 
v30.8b, #0 ld1 {v31.8b}, [x3] - adr x8, L(save_tmvs_tbl) + movrel x8, save_tmvs_tbl movrel x16, mask_mult movrel x13, mv_tbls ld1 {v29.8b}, [x16] @@ -137,9 +138,9 @@ function save_tmvs_neon, export=1 2: ldrb w11, [x9, #10] // cand_b->bs ld1 {v0.16b}, [x9] // cand_b->mv - add x11, x8, w11, uxtw #2 + add x11, x8, w11, uxtw #3 ldr h1, [x9, #8] // cand_b->ref - ldrh w12, [x11] // bw8 + ldr w12, [x11] // bw8 mov x15, x8 add x9, x9, w12, uxtw #1 // cand_b += bw8*2 cmp x9, x10 @@ -149,9 +150,9 @@ function save_tmvs_neon, export=1 ldrb w15, [x9, #10] // cand_b->bs add x16, x9, #8 ld1 {v4.16b}, [x9] // cand_b->mv - add x15, x8, w15, uxtw #2 + add x15, x8, w15, uxtw #3 ld1 {v1.h}[1], [x16] // cand_b->ref - ldrh w12, [x15] // bw8 + ldr w12, [x15] // bw8 add x9, x9, w12, uxtw #1 // cand_b += bw8*2 trn1 v2.2d, v0.2d, v4.2d @@ -166,12 +167,12 @@ function save_tmvs_neon, export=1 addp v1.4h, v1.4h, v1.4h // Combine condition for [1] and [0] umov w16, v1.h[0] // Extract case for first block umov w17, v1.h[1] - ldrh w11, [x11, #2] // Fetch jump table entry - ldrh w15, [x15, #2] + ldrsw x11, [x11, #4] // Fetch jump table entry + ldrsw x15, [x15, #4] ldr q1, [x13, w16, uxtw #4] // Load permutation table base on case ldr q5, [x13, w17, uxtw #4] - sub x11, x8, w11, uxtw // Find jump table target - sub x15, x8, w15, uxtw + add x11, x8, x11 // Find jump table target + add x15, x8, x15 tbl v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block tbl v4.16b, {v4.16b}, v5.16b @@ -243,50 +244,51 @@ function save_tmvs_neon, export=1 str q2, [x3, #(16*5-16)] add x3, x3, #16*5 ret - -L(save_tmvs_tbl): - .hword 16 * 12 - .hword L(save_tmvs_tbl) - 160b - .hword 16 * 12 - .hword L(save_tmvs_tbl) - 160b - .hword 8 * 12 - .hword L(save_tmvs_tbl) - 80b - .hword 8 * 12 - .hword L(save_tmvs_tbl) - 80b - .hword 8 * 12 - .hword L(save_tmvs_tbl) - 80b - .hword 8 * 12 - .hword L(save_tmvs_tbl) - 80b - .hword 4 * 12 - .hword L(save_tmvs_tbl) - 40b - .hword 4 * 12 - .hword L(save_tmvs_tbl) - 40b - .hword 4 * 12 - .hword L(save_tmvs_tbl) - 40b - .hword 4 * 12 - .hword L(save_tmvs_tbl) - 40b - .hword 2 * 12 - .hword L(save_tmvs_tbl) - 20b - .hword 2 * 12 - .hword L(save_tmvs_tbl) - 20b - .hword 2 * 12 - .hword L(save_tmvs_tbl) - 20b - .hword 2 * 12 - .hword L(save_tmvs_tbl) - 20b - .hword 2 * 12 - .hword L(save_tmvs_tbl) - 20b - .hword 1 * 12 - .hword L(save_tmvs_tbl) - 10b - .hword 1 * 12 - .hword L(save_tmvs_tbl) - 10b - .hword 1 * 12 - .hword L(save_tmvs_tbl) - 10b - .hword 1 * 12 - .hword L(save_tmvs_tbl) - 10b - .hword 1 * 12 - .hword L(save_tmvs_tbl) - 10b - .hword 1 * 12 - .hword L(save_tmvs_tbl) - 10b - .hword 1 * 12 - .hword L(save_tmvs_tbl) - 10b endfunc + +jumptable save_tmvs_tbl + .word 16 * 12 + .word 160b - save_tmvs_tbl + .word 16 * 12 + .word 160b - save_tmvs_tbl + .word 8 * 12 + .word 80b - save_tmvs_tbl + .word 8 * 12 + .word 80b - save_tmvs_tbl + .word 8 * 12 + .word 80b - save_tmvs_tbl + .word 8 * 12 + .word 80b - save_tmvs_tbl + .word 4 * 12 + .word 40b - save_tmvs_tbl + .word 4 * 12 + .word 40b - save_tmvs_tbl + .word 4 * 12 + .word 40b - save_tmvs_tbl + .word 4 * 12 + .word 40b - save_tmvs_tbl + .word 2 * 12 + .word 20b - save_tmvs_tbl + .word 2 * 12 + .word 20b - save_tmvs_tbl + .word 2 * 12 + .word 20b - save_tmvs_tbl + .word 2 * 12 + .word 20b - save_tmvs_tbl + .word 2 * 12 + .word 20b - save_tmvs_tbl + .word 1 * 12 + .word 10b - save_tmvs_tbl + .word 1 * 12 + .word 10b - save_tmvs_tbl + .word 1 * 12 + .word 10b - save_tmvs_tbl + .word 1 * 12 + .word 10b - save_tmvs_tbl + .word 1 * 12 + 
.word 10b - save_tmvs_tbl + .word 1 * 12 + .word 10b - save_tmvs_tbl + .word 1 * 12 + .word 10b - save_tmvs_tbl +endjumptable diff --git a/src/arm/arm-arch.h b/src/arm/arm-arch.h new file mode 100644 index 0000000000000000000000000000000000000000..f00b9b2fcecbd105f27a2d958be0da18ea4cc634 --- /dev/null +++ b/src/arm/arm-arch.h @@ -0,0 +1,68 @@ +/* + * Copyright © 2024, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ARM_ARM_ARCH_H +#define ARM_ARM_ARCH_H + +/* Compatibility header to define __ARM_ARCH with older compilers */ +#ifndef __ARM_ARCH + +#ifdef _M_ARM +#define __ARM_ARCH _M_ARM + +#elif defined(__ARM_ARCH_8A__) || defined(_M_ARM64) +#define __ARM_ARCH 8 + +#elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7EM__) || defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) +#define __ARM_ARCH 7 + +#elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) +#define __ARM_ARCH 6 + +#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \ + defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) +#define __ARM_ARCH 5 + +#elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) +#define __ARM_ARCH 4 + +#elif defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__) +#define __ARM_ARCH 3 + +#elif defined(__ARM_ARCH_2__) +#define __ARM_ARCH 2 + +#else +#error Unknown ARM architecture version +#endif + +#endif /* !__ARM_ARCH */ + +#endif /* ARM_ARM_ARCH_H */ diff --git a/src/arm/asm.S b/src/arm/asm.S index fed73b30483cf8b760fbb296d71dea1e8f3413b3..e3731fe732bf9eb7358fcfedf2b7a99577869f5f 100644 --- a/src/arm/asm.S +++ b/src/arm/asm.S @@ -323,6 +323,32 @@ EXTERN\name: \name: .endm +.macro jumptable name +#ifdef _WIN32 +// MS armasm64 doesn't seem to be able to create relocations for subtraction +// of labels in different sections; for armasm64 (and all of Windows for +// simplicity), write the jump table in the text section, to allow calculating +// differences at assembly time. See +// https://developercommunity.visualstudio.com/t/armasm64-unable-to-create-cross-section/10722340 +// for reference. 
(LLVM can create such relocations, but checking for _WIN32 +// for simplicity, as execute-only memory isn't relevant on Windows at the +// moment.) + function \name +#else +// For other platforms, write jump tables in a const data section, to allow +// working in environments where executable memory isn't readable. + const \name +#endif +.endm + +.macro endjumptable +#ifdef _WIN32 + endfunc +#else + endconst +#endif +.endm + #ifdef __APPLE__ #define L(x) L ## x #else diff --git a/src/arm/cpu.c b/src/arm/cpu.c index d9b1751a6ae221c3af7f8fbc108e54a748e088e5..5275b7404a0486dc192a0560ac262df1770053f5 100644 --- a/src/arm/cpu.c +++ b/src/arm/cpu.c @@ -29,9 +29,10 @@ #include "common/attributes.h" +#include "src/cpu.h" #include "src/arm/cpu.h" -#if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO) +#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO #include <sys/auxv.h> #if ARCH_AARCH64 @@ -42,7 +43,7 @@ #define HWCAP2_AARCH64_I8MM (1 << 13) COLD unsigned dav1d_get_cpu_flags_arm(void) { -#ifdef HAVE_GETAUXVAL +#if HAVE_GETAUXVAL unsigned long hw_cap = getauxval(AT_HWCAP); unsigned long hw_cap2 = getauxval(AT_HWCAP2); #else @@ -52,7 +53,7 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) { elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2)); #endif - unsigned flags = DAV1D_ARM_CPU_FLAG_NEON; + unsigned flags = dav1d_get_default_cpu_flags(); flags |= (hw_cap & HWCAP_AARCH64_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0; flags |= (hw_cap2 & HWCAP2_AARCH64_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0; flags |= (hw_cap & HWCAP_AARCH64_SVE) ? DAV1D_ARM_CPU_FLAG_SVE : 0; @@ -68,14 +69,15 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) { #define HWCAP_ARM_I8MM (1 << 27) COLD unsigned dav1d_get_cpu_flags_arm(void) { -#ifdef HAVE_GETAUXVAL +#if HAVE_GETAUXVAL unsigned long hw_cap = getauxval(AT_HWCAP); #else unsigned long hw_cap = 0; elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap)); #endif - unsigned flags = (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0; + unsigned flags = dav1d_get_default_cpu_flags(); + flags |= (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0; flags |= (hw_cap & HWCAP_ARM_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0; flags |= (hw_cap & HWCAP_ARM_I8MM) ? 
DAV1D_ARM_CPU_FLAG_I8MM : 0; return flags; @@ -95,7 +97,7 @@ static int have_feature(const char *feature) { } COLD unsigned dav1d_get_cpu_flags_arm(void) { - unsigned flags = DAV1D_ARM_CPU_FLAG_NEON; + unsigned flags = dav1d_get_default_cpu_flags(); if (have_feature("hw.optional.arm.FEAT_DotProd")) flags |= DAV1D_ARM_CPU_FLAG_DOTPROD; if (have_feature("hw.optional.arm.FEAT_I8MM")) @@ -104,17 +106,67 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) { return flags; } +#elif defined(__OpenBSD__) && ARCH_AARCH64 +#include <machine/armreg.h> +#include <machine/cpu.h> +#include <sys/types.h> +#include <sys/sysctl.h> + +COLD unsigned dav1d_get_cpu_flags_arm(void) { + unsigned flags = dav1d_get_default_cpu_flags(); + +#ifdef CPU_ID_AA64ISAR0 + int mib[2]; + uint64_t isar0; + uint64_t isar1; + size_t len; + + mib[0] = CTL_MACHDEP; + mib[1] = CPU_ID_AA64ISAR0; + len = sizeof(isar0); + if (sysctl(mib, 2, &isar0, &len, NULL, 0) != -1) { + if (ID_AA64ISAR0_DP(isar0) >= ID_AA64ISAR0_DP_IMPL) + flags |= DAV1D_ARM_CPU_FLAG_DOTPROD; + } + + mib[0] = CTL_MACHDEP; + mib[1] = CPU_ID_AA64ISAR1; + len = sizeof(isar1); + if (sysctl(mib, 2, &isar1, &len, NULL, 0) != -1) { +#ifdef ID_AA64ISAR1_I8MM_IMPL + if (ID_AA64ISAR1_I8MM(isar1) >= ID_AA64ISAR1_I8MM_IMPL) + flags |= DAV1D_ARM_CPU_FLAG_I8MM; +#endif + } +#endif + + return flags; +} + #elif defined(_WIN32) #include <windows.h> COLD unsigned dav1d_get_cpu_flags_arm(void) { - unsigned flags = DAV1D_ARM_CPU_FLAG_NEON; + unsigned flags = dav1d_get_default_cpu_flags(); #ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) flags |= DAV1D_ARM_CPU_FLAG_DOTPROD; #endif - /* No I8MM or SVE feature detection available on Windows at the time of - * writing. */ +#ifdef PF_ARM_SVE_INSTRUCTIONS_AVAILABLE + if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) + flags |= DAV1D_ARM_CPU_FLAG_SVE; +#endif +#ifdef PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE + if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)) + flags |= DAV1D_ARM_CPU_FLAG_SVE2; +#endif +#ifdef PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE + /* There's no PF_* flag that indicates whether plain I8MM is available + * or not. But if SVE_I8MM is available, that also implies that + * regular I8MM is available. */ + if (IsProcessorFeaturePresent(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE)) + flags |= DAV1D_ARM_CPU_FLAG_I8MM; +#endif return flags; } @@ -160,7 +212,8 @@ static unsigned parse_proc_cpuinfo(const char *flag) { } COLD unsigned dav1d_get_cpu_flags_arm(void) { - unsigned flags = parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0; + unsigned flags = dav1d_get_default_cpu_flags(); + flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0; flags |= parse_proc_cpuinfo("asimd") ? DAV1D_ARM_CPU_FLAG_NEON : 0; flags |= parse_proc_cpuinfo("asimddp") ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0; flags |= parse_proc_cpuinfo("i8mm") ? 
DAV1D_ARM_CPU_FLAG_I8MM : 0; @@ -174,7 +227,7 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) { #else /* Unsupported OS */ COLD unsigned dav1d_get_cpu_flags_arm(void) { - return 0; + return dav1d_get_default_cpu_flags(); } #endif diff --git a/src/arm/itx.h b/src/arm/itx.h index 2a58a31322ee6c3afb864d695f217ca544a8de9c..657f85e613231c59ca4b388d92f94db170a7a767 100644 --- a/src/arm/itx.h +++ b/src/arm/itx.h @@ -49,7 +49,9 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon)); decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon)); decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon)); -static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) { +static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc, + int *const all_simd) +{ const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; @@ -77,4 +79,5 @@ static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int assign_itx1_fn (R, 64, 16, neon); assign_itx1_fn (R, 64, 32, neon); assign_itx1_fn ( , 64, 64, neon); + *all_simd = 1; } diff --git a/src/arm/mc.h b/src/arm/mc.h index dabdab35753e9451f6f61cb4dec9b3db320468aa..bb6f7f8cae60e159613b95695d530b3ff9ac6376 100644 --- a/src/arm/mc.h +++ b/src/arm/mc.h @@ -63,6 +63,7 @@ decl_8tap_fns(neon); decl_8tap_fns(neon_dotprod); decl_8tap_fns(neon_i8mm); +decl_8tap_fns(sve2); decl_mc_fn(BF(dav1d_put_bilin, neon)); decl_mct_fn(BF(dav1d_prep_bilin, neon)); @@ -110,17 +111,27 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon); c->emu_edge = BF(dav1d_emu_edge, neon); -#if ARCH_AARCH64 && BITDEPTH == 8 +#if ARCH_AARCH64 +#if BITDEPTH == 8 #if HAVE_DOTPROD - if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return; - - init_8tap_fns(neon_dotprod); + if (flags & DAV1D_ARM_CPU_FLAG_DOTPROD) { + init_8tap_fns(neon_dotprod); + } #endif // HAVE_DOTPROD #if HAVE_I8MM - if (!(flags & DAV1D_ARM_CPU_FLAG_I8MM)) return; - - init_8tap_fns(neon_i8mm); + if (flags & DAV1D_ARM_CPU_FLAG_I8MM) { + init_8tap_fns(neon_i8mm); + } #endif // HAVE_I8MM -#endif // ARCH_AARCH64 && BITDEPTH == 8 +#endif // BITDEPTH == 8 + +#if BITDEPTH == 16 +#if HAVE_SVE2 + if (flags & DAV1D_ARM_CPU_FLAG_SVE2) { + init_8tap_fns(sve2); + } +#endif // HAVE_SVE2 +#endif // BITDEPTH == 16 +#endif // ARCH_AARCH64 } diff --git a/src/cpu.c b/src/cpu.c index 9bb85f151b1b656bc6435db4c716e5dc9c43b64e..415266711e11ebc06ce8ed9e9609c576151d0b15 100644 --- a/src/cpu.c +++ b/src/cpu.c @@ -33,20 +33,24 @@ #ifdef _WIN32 #include <windows.h> -#elif defined(__APPLE__) +#endif +#ifdef __APPLE__ #include <sys/sysctl.h> #include <sys/types.h> -#else -#include <pthread.h> +#endif +#if HAVE_UNISTD_H #include <unistd.h> #endif -#ifdef HAVE_PTHREAD_NP_H +#if HAVE_PTHREAD_GETAFFINITY_NP +#include <pthread.h> +#if HAVE_PTHREAD_NP_H #include <pthread_np.h> #endif #if defined(__FreeBSD__) #define cpu_set_t cpuset_t #endif +#endif unsigned dav1d_cpu_flags = 0U; unsigned dav1d_cpu_flags_mask = ~0U; @@ -87,7 +91,7 @@ COLD int dav1d_num_logical_processors(Dav1dContext *const c) { GetNativeSystemInfo(&system_info); return system_info.dwNumberOfProcessors; #endif -#elif defined(HAVE_PTHREAD_GETAFFINITY_NP) && defined(CPU_COUNT) +#elif HAVE_PTHREAD_GETAFFINITY_NP && defined(CPU_COUNT) cpu_set_t affinity; if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) return CPU_COUNT(&affinity); diff --git a/src/cpu.h b/src/cpu.h index 
7205e8e62ff66943cc0ba8169392333de8a6a7d8..c18b7ff1fb97155ce43c4fe541921e49187532e3 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -54,12 +54,9 @@ void dav1d_init_cpu(void); DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask); int dav1d_num_logical_processors(Dav1dContext *c); -static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) { - unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask; +static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) { + unsigned flags = 0; -#if TRIM_DSP_FUNCTIONS -/* Since this function is inlined, unconditionally setting a flag here will - * enable dead code elimination in the calling function. */ #if ARCH_AARCH64 || ARCH_ARM #if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 flags |= DAV1D_ARM_CPU_FLAG_NEON; @@ -119,6 +116,17 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) { flags |= DAV1D_X86_CPU_FLAG_SSE2; #endif #endif + + return flags; +} + +static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) { + unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask; + +#if TRIM_DSP_FUNCTIONS +/* Since this function is inlined, unconditionally setting a flag here will + * enable dead code elimination in the calling function. */ + flags |= dav1d_get_default_cpu_flags(); #endif return flags; diff --git a/src/ctx.c b/src/ctx.c new file mode 100644 index 0000000000000000000000000000000000000000..0a0fe54c7ceacee1adf1625aa96dd9983b7897c0 --- /dev/null +++ b/src/ctx.c @@ -0,0 +1,65 @@ +/* + * Copyright © 2024, VideoLAN and dav1d authors + * Copyright © 2024, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include <string.h> + +#include "ctx.h" + +static void memset_w1(void *const ptr, const int value) { + set_ctx1((uint8_t *) ptr, 0, value); +} + +static void memset_w2(void *const ptr, const int value) { + set_ctx2((uint8_t *) ptr, 0, value); +} + +static void memset_w4(void *const ptr, const int value) { + set_ctx4((uint8_t *) ptr, 0, value); +} + +static void memset_w8(void *const ptr, const int value) { + set_ctx8((uint8_t *) ptr, 0, value); +} + +static void memset_w16(void *const ptr, const int value) { + set_ctx16((uint8_t *) ptr, 0, value); +} + +static void memset_w32(void *const ptr, const int value) { + set_ctx32((uint8_t *) ptr, 0, value); +} + +const dav1d_memset_pow2_fn dav1d_memset_pow2[6] = { + memset_w1, + memset_w2, + memset_w4, + memset_w8, + memset_w16, + memset_w32 +}; diff --git a/src/ctx.h b/src/ctx.h index d0e1f310ae2d9c65637a9da8bd6c10021ff5e2bd..7dea8b68948eebac6b1581b0dbc0a7221a44b8a8 100644 --- a/src/ctx.h +++ b/src/ctx.h @@ -31,61 +31,59 @@ #include <stdint.h> #include "common/attributes.h" +#include "common/intops.h" union alias64 { uint64_t u64; uint8_t u8[8]; } ATTR_ALIAS; union alias32 { uint32_t u32; uint8_t u8[4]; } ATTR_ALIAS; union alias16 { uint16_t u16; uint8_t u8[2]; } ATTR_ALIAS; union alias8 { uint8_t u8; } ATTR_ALIAS; -#define set_ctx_rep4(type, var, off, val) do { \ - const uint64_t const_val = val; \ - ((union alias64 *) &var[off + 0])->u64 = const_val; \ - ((union alias64 *) &var[off + 8])->u64 = const_val; \ - ((union alias64 *) &var[off + 16])->u64 = const_val; \ - ((union alias64 *) &var[off + 24])->u64 = const_val; \ +typedef void (*dav1d_memset_pow2_fn)(void *ptr, int value); +EXTERN const dav1d_memset_pow2_fn dav1d_memset_pow2[6]; + +static inline void dav1d_memset_likely_pow2(void *const ptr, const int value, const int n) { + assert(n >= 1 && n <= 32); + if ((n&(n-1)) == 0) { + dav1d_memset_pow2[ulog2(n)](ptr, value); + } else { + memset(ptr, value, n); + } +} + +// For smaller sizes use multiplication to broadcast bytes. memset misbehaves on the smaller sizes. +// For the larger sizes, we want to use memset to get access to vector operations. 
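/* A rough standalone sketch of the two ideas above: broadcasting an 8-bit value
 * by multiplying with 0x01...01 constants for the small power-of-two widths, and
 * dispatching on log2(n) with a memset fallback for other sizes. The names below
 * are illustrative only (dav1d itself stores through its ATTR_ALIAS unions and
 * uses its ulog2() helper); this is not the library's code. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static void set_w1(uint8_t *p, int v) { *p = (uint8_t)v; }
static void set_w2(uint8_t *p, int v) { uint16_t x = (uint16_t)(v * 0x0101); memcpy(p, &x, 2); }
static void set_w4(uint8_t *p, int v) { uint32_t x = (uint32_t)v * 0x01010101U; memcpy(p, &x, 4); }
static void set_w8(uint8_t *p, int v) { uint64_t x = (uint64_t)v * 0x0101010101010101ULL; memcpy(p, &x, 8); }
static void set_w16(uint8_t *p, int v) { memset(p, v, 16); }
static void set_w32(uint8_t *p, int v) { memset(p, v, 32); }

typedef void (*memset_pow2_fn)(uint8_t *p, int v);
static const memset_pow2_fn memset_pow2[6] = {   /* indexed by log2(width) */
    set_w1, set_w2, set_w4, set_w8, set_w16, set_w32
};

static void memset_likely_pow2(uint8_t *p, int v, int n) {
    assert(n >= 1 && n <= 32);
    if ((n & (n - 1)) == 0) {      /* power of two: one store or a fixed-size memset */
        int l = 0;
        while ((1 << l) < n) l++;
        memset_pow2[l](p, v);
    } else {                       /* other sizes fall back to a plain memset */
        memset(p, v, n);
    }
}

int main(void) {
    uint8_t ctx[32];
    memset_likely_pow2(ctx, 0x5A, 8);      /* single 64-bit store of 0x5a bytes */
    memset_likely_pow2(ctx + 8, 0x01, 24); /* 24 is not a power of two: memset */
    return (ctx[0] == 0x5A && ctx[31] == 0x01) ? 0 : 1;
}
/* In the patch itself, calls such as dav1d_memset_pow2[t_dim->lw](&t->a->tx[bx4], ...)
 * index the table directly with a stored log2 block width, skipping the dispatch. */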
+#define set_ctx1(var, off, val) \ + ((union alias8 *) &(var)[off])->u8 = (val) * 0x01 +#define set_ctx2(var, off, val) \ + ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101 +#define set_ctx4(var, off, val) \ + ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U +#define set_ctx8(var, off, val) \ + ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL +#define set_ctx16(var, off, val) do { \ + memset(&(var)[off], val, 16); \ } while (0) -#define set_ctx_rep2(type, var, off, val) do { \ - const uint64_t const_val = val; \ - ((union alias64 *) &var[off + 0])->u64 = const_val; \ - ((union alias64 *) &var[off + 8])->u64 = const_val; \ +#define set_ctx32(var, off, val) do { \ + memset(&(var)[off], val, 32); \ } while (0) -#define set_ctx_rep1(typesz, var, off, val) \ - ((union alias##typesz *) &var[off])->u##typesz = val -#define case_set(var, dir, diridx, off) \ - switch (var) { \ - case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \ - case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \ - case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \ - case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \ - case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \ - case 32: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \ - } -#define case_set_upto16(var, dir, diridx, off) \ - switch (var) { \ - case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \ - case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \ - case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \ - case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \ - case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \ - } -#define case_set_upto32_with_default(var, dir, diridx, off) \ +#define case_set(var) \ switch (var) { \ - case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \ - case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \ - case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \ - case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \ - case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \ - case 32: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \ - default: default_memset(dir, diridx, off, var); break; \ + case 0: set_ctx(set_ctx1); break; \ + case 1: set_ctx(set_ctx2); break; \ + case 2: set_ctx(set_ctx4); break; \ + case 3: set_ctx(set_ctx8); break; \ + case 4: set_ctx(set_ctx16); break; \ + case 5: set_ctx(set_ctx32); break; \ + default: assert(0); \ } -#define case_set_upto16_with_default(var, dir, diridx, off) \ +#define case_set_upto16(var) \ switch (var) { \ - case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \ - case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \ - case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \ - case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \ - case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \ - default: default_memset(dir, diridx, off, var); break; \ + case 0: set_ctx(set_ctx1); break; \ + case 1: set_ctx(set_ctx2); break; \ + case 2: set_ctx(set_ctx4); break; \ + case 3: set_ctx(set_ctx8); break; \ + case 4: set_ctx(set_ctx16); break; \ + default: assert(0); \ } #endif 
/* DAV1D_SRC_CTX_H */ diff --git a/src/decode.c b/src/decode.c index ea371324216de34f748db67287c9bc9513019726..f5b6db95838d24695489bbf205f3cbe8a4074831 100644 --- a/src/decode.c +++ b/src/decode.c @@ -161,14 +161,8 @@ static void read_tx_tree(Dav1dTaskContext *const t, } t->by -= txsh; } else { -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txh) - case_set_upto16(t_dim->h, l., 1, by4); -#undef set_ctx -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txw) - case_set_upto16(t_dim->w, a->, 0, bx4); -#undef set_ctx + dav1d_memset_pow2[t_dim->lw](&t->a->tx[bx4], is_split ? TX_4X4 : txw); + dav1d_memset_pow2[t_dim->lh](&t->l.tx[by4], is_split ? TX_4X4 : txh); } } @@ -464,19 +458,13 @@ static void read_vartx_tree(Dav1dTaskContext *const t, { b->max_ytx = b->uvtx = TX_4X4; if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) { -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir tx, off, TX_4X4) - case_set(bh4, l., 1, by4); - case_set(bw4, a->, 0, bx4); -#undef set_ctx + dav1d_memset_pow2[b_dim[2]](&t->a->tx[bx4], TX_4X4); + dav1d_memset_pow2[b_dim[3]](&t->l.tx[by4], TX_4X4); } } else if (f->frame_hdr->txfm_mode != DAV1D_TX_SWITCHABLE || b->skip) { if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) { -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir tx, off, mul * b_dim[2 + diridx]) - case_set(bh4, l., 1, by4); - case_set(bw4, a->, 0, bx4); -#undef set_ctx + dav1d_memset_pow2[b_dim[2]](&t->a->tx[bx4], b_dim[2 + 0]); + dav1d_memset_pow2[b_dim[3]](&t->l.tx[by4], b_dim[2 + 1]); } b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout]; } else { @@ -696,8 +684,7 @@ static int decode_b(Dav1dTaskContext *const t, const enum BlockLevel bl, const enum BlockSize bs, const enum BlockPartition bp, - const enum EdgeFlags intra_edge_flags) -{ + const enum EdgeFlags intra_edge_flags) { Dav1dTileState *const ts = t->ts; const Dav1dFrameContext *const f = t->f; Av1Block b_mem, *const b = t->frame_thread.pass ? @@ -722,11 +709,13 @@ static int decode_b(Dav1dTaskContext *const t, const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ? 
DC_PRED : b->y_mode; -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \ - rep_macro(type, t->dir intra, off, mul) - case_set(bh4, l., 1, by4); - case_set(bw4, a->, 0, bx4); +#define set_ctx(rep_macro) \ + rep_macro(edge->mode, off, y_mode_nofilt); \ + rep_macro(edge->intra, off, 1) + BlockContext *edge = t->a; + for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) { + case_set(b_dim[2 + i]); + } #undef set_ctx if (IS_INTER_OR_SWITCH(f->frame_hdr)) { refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx]; @@ -742,11 +731,9 @@ static int decode_b(Dav1dTaskContext *const t, } if (has_chroma) { -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir uvmode, off, mul * b->uv_mode) - case_set(cbh4, l., 1, cby4); - case_set(cbw4, a->, 0, cbx4); -#undef set_ctx + uint8_t uv_mode = b->uv_mode; + dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], uv_mode); + dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], uv_mode); } } else { if (IS_INTER_OR_SWITCH(f->frame_hdr) /* not intrabc */ && @@ -784,13 +771,15 @@ static int decode_b(Dav1dTaskContext *const t, if (f->bd_fn.recon_b_inter(t, bs, b)) return -1; const uint8_t *const filter = dav1d_filter_dir[b->filter2d]; -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir filter[0], off, mul * filter[0]); \ - rep_macro(type, t->dir filter[1], off, mul * filter[1]); \ - rep_macro(type, t->dir intra, off, 0) - case_set(bh4, l., 1, by4); - case_set(bw4, a->, 0, bx4); + BlockContext *edge = t->a; + for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) { +#define set_ctx(rep_macro) \ + rep_macro(edge->filter[0], off, filter[0]); \ + rep_macro(edge->filter[1], off, filter[1]); \ + rep_macro(edge->intra, off, 0) + case_set(b_dim[2 + i]); #undef set_ctx + } if (IS_INTER_OR_SWITCH(f->frame_hdr)) { refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx]; @@ -808,11 +797,8 @@ static int decode_b(Dav1dTaskContext *const t, } if (has_chroma) { -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir uvmode, off, mul * DC_PRED) - case_set(cbh4, l., 1, cby4); - case_set(cbw4, a->, 0, cbx4); -#undef set_ctx + dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], DC_PRED); + dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], DC_PRED); } } return 0; @@ -1240,39 +1226,39 @@ static int decode_b(Dav1dTaskContext *const t, has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL, has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL); } - // update contexts -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir tx_intra, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \ - rep_macro(type, t->dir tx, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \ - rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \ - rep_macro(type, t->dir pal_sz, off, mul * b->pal_sz[0]); \ - rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \ - rep_macro(type, t->dir skip_mode, off, 0); \ - rep_macro(type, t->dir intra, off, mul); \ - rep_macro(type, t->dir skip, off, mul * b->skip); \ - /* see aomedia bug 2183 for why we use luma coordinates here */ \ - rep_macro(type, t->pal_sz_uv[diridx], off, mul * (has_chroma ? 
b->pal_sz[1] : 0)); \ - if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \ - rep_macro(type, t->dir comp_type, off, mul * COMP_INTER_NONE); \ - rep_macro(type, t->dir ref[0], off, mul * ((uint8_t) -1)); \ - rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) -1)); \ - rep_macro(type, t->dir filter[0], off, mul * DAV1D_N_SWITCHABLE_FILTERS); \ - rep_macro(type, t->dir filter[1], off, mul * DAV1D_N_SWITCHABLE_FILTERS); \ - } const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode; - case_set(bh4, l., 1, by4); - case_set(bw4, a->, 0, bx4); + BlockContext *edge = t->a; + for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) { + int t_lsz = ((uint8_t *) &t_dim->lw)[i]; // lw then lh +#define set_ctx(rep_macro) \ + rep_macro(edge->tx_intra, off, t_lsz); \ + rep_macro(edge->tx, off, t_lsz); \ + rep_macro(edge->mode, off, y_mode_nofilt); \ + rep_macro(edge->pal_sz, off, b->pal_sz[0]); \ + rep_macro(edge->seg_pred, off, seg_pred); \ + rep_macro(edge->skip_mode, off, 0); \ + rep_macro(edge->intra, off, 1); \ + rep_macro(edge->skip, off, b->skip); \ + /* see aomedia bug 2183 for why we use luma coordinates here */ \ + rep_macro(t->pal_sz_uv[i], off, (has_chroma ? b->pal_sz[1] : 0)); \ + if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \ + rep_macro(edge->comp_type, off, COMP_INTER_NONE); \ + rep_macro(edge->ref[0], off, ((uint8_t) -1)); \ + rep_macro(edge->ref[1], off, ((uint8_t) -1)); \ + rep_macro(edge->filter[0], off, DAV1D_N_SWITCHABLE_FILTERS); \ + rep_macro(edge->filter[1], off, DAV1D_N_SWITCHABLE_FILTERS); \ + } + case_set(b_dim[2 + i]); #undef set_ctx + } if (b->pal_sz[0]) f->bd_fn.copy_pal_block_y(t, bx4, by4, bw4, bh4); if (has_chroma) { -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir uvmode, off, mul * b->uv_mode) - case_set(cbh4, l., 1, cby4); - case_set(cbw4, a->, 0, cbx4); -#undef set_ctx + uint8_t uv_mode = b->uv_mode; + dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], uv_mode); + dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], uv_mode); if (b->pal_sz[1]) f->bd_fn.copy_pal_block_uv(t, bx4, by4, bw4, bh4); } @@ -1374,26 +1360,24 @@ static int decode_b(Dav1dTaskContext *const t, } splat_intrabc_mv(f->c, t, bs, b, bw4, bh4); - -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \ - rep_macro(type, t->dir mode, off, mul * DC_PRED); \ - rep_macro(type, t->dir pal_sz, off, 0); \ - /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \ - rep_macro(type, t->pal_sz_uv[diridx], off, 0); \ - rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \ - rep_macro(type, t->dir skip_mode, off, 0); \ - rep_macro(type, t->dir intra, off, 0); \ - rep_macro(type, t->dir skip, off, mul * b->skip) - case_set(bh4, l., 1, by4); - case_set(bw4, a->, 0, bx4); + BlockContext *edge = t->a; + for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) { +#define set_ctx(rep_macro) \ + rep_macro(edge->tx_intra, off, b_dim[2 + i]); \ + rep_macro(edge->mode, off, DC_PRED); \ + rep_macro(edge->pal_sz, off, 0); \ + /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \ + rep_macro(t->pal_sz_uv[i], off, 0); \ + rep_macro(edge->seg_pred, off, seg_pred); \ + rep_macro(edge->skip_mode, off, 0); \ + rep_macro(edge->intra, off, 0); \ + rep_macro(edge->skip, off, b->skip) + case_set(b_dim[2 + i]); #undef set_ctx + } if (has_chroma) { -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir uvmode, off, mul * DC_PRED) - 
case_set(cbh4, l., 1, cby4); - case_set(cbw4, a->, 0, cbx4); -#undef set_ctx + dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], DC_PRED); + dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], DC_PRED); } } else { // inter-specific mode/mv coding @@ -1922,32 +1906,29 @@ static int decode_b(Dav1dTaskContext *const t, splat_tworef_mv(f->c, t, bs, b, bw4, bh4); else splat_oneref_mv(f->c, t, bs, b, bw4, bh4); - -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \ - rep_macro(type, t->dir skip_mode, off, mul * b->skip_mode); \ - rep_macro(type, t->dir intra, off, 0); \ - rep_macro(type, t->dir skip, off, mul * b->skip); \ - rep_macro(type, t->dir pal_sz, off, 0); \ - /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \ - rep_macro(type, t->pal_sz_uv[diridx], off, 0); \ - rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \ - rep_macro(type, t->dir comp_type, off, mul * b->comp_type); \ - rep_macro(type, t->dir filter[0], off, mul * filter[0]); \ - rep_macro(type, t->dir filter[1], off, mul * filter[1]); \ - rep_macro(type, t->dir mode, off, mul * b->inter_mode); \ - rep_macro(type, t->dir ref[0], off, mul * b->ref[0]); \ - rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) b->ref[1])) - case_set(bh4, l., 1, by4); - case_set(bw4, a->, 0, bx4); + BlockContext *edge = t->a; + for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) { +#define set_ctx(rep_macro) \ + rep_macro(edge->seg_pred, off, seg_pred); \ + rep_macro(edge->skip_mode, off, b->skip_mode); \ + rep_macro(edge->intra, off, 0); \ + rep_macro(edge->skip, off, b->skip); \ + rep_macro(edge->pal_sz, off, 0); \ + /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \ + rep_macro(t->pal_sz_uv[i], off, 0); \ + rep_macro(edge->tx_intra, off, b_dim[2 + i]); \ + rep_macro(edge->comp_type, off, b->comp_type); \ + rep_macro(edge->filter[0], off, filter[0]); \ + rep_macro(edge->filter[1], off, filter[1]); \ + rep_macro(edge->mode, off, b->inter_mode); \ + rep_macro(edge->ref[0], off, b->ref[0]); \ + rep_macro(edge->ref[1], off, ((uint8_t) b->ref[1])) + case_set(b_dim[2 + i]); #undef set_ctx - + } if (has_chroma) { -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir uvmode, off, mul * DC_PRED) - case_set(cbh4, l., 1, cby4); - case_set(cbw4, a->, 0, cbx4); -#undef set_ctx + dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], DC_PRED); + dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], DC_PRED); } } @@ -1956,12 +1937,12 @@ static int decode_b(Dav1dTaskContext *const t, f->frame_hdr->segmentation.update_map) { uint8_t *seg_ptr = &f->cur_segmap[t->by * f->b4_stride + t->bx]; -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ +#define set_ctx(rep_macro) \ for (int y = 0; y < bh4; y++) { \ - rep_macro(type, seg_ptr, 0, mul * b->seg_id); \ + rep_macro(seg_ptr, 0, b->seg_id); \ seg_ptr += f->b4_stride; \ } - case_set(bw4, NULL, 0, 0); + case_set(b_dim[2]); #undef set_ctx } if (!b->skip) { @@ -2398,10 +2379,10 @@ static int decode_sb(Dav1dTaskContext *const t, const enum BlockLevel bl, } if (t->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) { -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->a->partition, bx8, mul * dav1d_al_part_ctx[0][bl][bp]); \ - rep_macro(type, t->l.partition, by8, mul * dav1d_al_part_ctx[1][bl][bp]) - case_set_upto16(hsz,,,); +#define set_ctx(rep_macro) \ + rep_macro(t->a->partition, bx8, dav1d_al_part_ctx[0][bl][bp]); \ + 
rep_macro(t->l.partition, by8, dav1d_al_part_ctx[1][bl][bp]) + case_set_upto16(ulog2(hsz)); #undef set_ctx } diff --git a/src/itx_1d.c b/src/itx_1d.c index 8f75c653afefb10120d2efd488a0e67648f7b773..14e89ca0c886863013976cb0d71bb41136ebf8ab 100644 --- a/src/itx_1d.c +++ b/src/itx_1d.c @@ -89,8 +89,8 @@ inv_dct4_1d_internal_c(int32_t *const c, const ptrdiff_t stride, c[3 * stride] = CLIP(t0 - t3); } -void dav1d_inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride, - const int min, const int max) +static void inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride, + const int min, const int max) { inv_dct4_1d_internal_c(c, stride, min, max, 0); } @@ -142,8 +142,8 @@ inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride, c[7 * stride] = CLIP(t0 - t7); } -void dav1d_inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride, - const int min, const int max) +static void inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride, + const int min, const int max) { inv_dct8_1d_internal_c(c, stride, min, max, 0); } @@ -237,8 +237,8 @@ inv_dct16_1d_internal_c(int32_t *const c, const ptrdiff_t stride, c[15 * stride] = CLIP(t0 - t15a); } -void dav1d_inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride, - const int min, const int max) +static void inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride, + const int min, const int max) { inv_dct16_1d_internal_c(c, stride, min, max, 0); } @@ -427,14 +427,14 @@ inv_dct32_1d_internal_c(int32_t *const c, const ptrdiff_t stride, c[31 * stride] = CLIP(t0 - t31); } -void dav1d_inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride, - const int min, const int max) +static void inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride, + const int min, const int max) { inv_dct32_1d_internal_c(c, stride, min, max, 0); } -void dav1d_inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride, - const int min, const int max) +static void inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride, + const int min, const int max) { assert(stride > 0); inv_dct32_1d_internal_c(c, stride << 1, min, max, 1); @@ -962,13 +962,13 @@ inv_adst16_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s, } #define inv_adst_1d(sz) \ -void dav1d_inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \ - const int min, const int max) \ +static void inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \ + const int min, const int max) \ { \ inv_adst##sz##_1d_internal_c(c, stride, min, max, c, stride); \ } \ -void dav1d_inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \ - const int min, const int max) \ +static void inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \ + const int min, const int max) \ { \ inv_adst##sz##_1d_internal_c(c, stride, min, max, \ &c[(sz - 1) * stride], -stride); \ @@ -980,8 +980,8 @@ inv_adst_1d(16) #undef inv_adst_1d -void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride, - const int min, const int max) +static void inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride, + const int min, const int max) { assert(stride > 0); for (int i = 0; i < 4; i++) { @@ -990,16 +990,16 @@ void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride, } } -void dav1d_inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride, - const int min, const int max) +static void inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride, + const int min, const int max) { assert(stride > 0); for (int i = 0; i < 8; i++) c[stride * i] *= 2; } -void dav1d_inv_identity16_1d_c(int32_t *const c, const 
ptrdiff_t stride, - const int min, const int max) +static void inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride, + const int min, const int max) { assert(stride > 0); for (int i = 0; i < 16; i++) { @@ -1008,14 +1008,57 @@ void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride, } } -void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride, - const int min, const int max) +static void inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride, + const int min, const int max) { assert(stride > 0); for (int i = 0; i < 32; i++) c[stride * i] *= 4; } +const itx_1d_fn dav1d_tx1d_fns[N_TX_SIZES][N_TX_1D_TYPES] = { + [TX_4X4] = { + [DCT] = inv_dct4_1d_c, + [ADST] = inv_adst4_1d_c, + [FLIPADST] = inv_flipadst4_1d_c, + [IDENTITY] = inv_identity4_1d_c, + }, [TX_8X8] = { + [DCT] = inv_dct8_1d_c, + [ADST] = inv_adst8_1d_c, + [FLIPADST] = inv_flipadst8_1d_c, + [IDENTITY] = inv_identity8_1d_c, + }, [TX_16X16] = { + [DCT] = inv_dct16_1d_c, + [ADST] = inv_adst16_1d_c, + [FLIPADST] = inv_flipadst16_1d_c, + [IDENTITY] = inv_identity16_1d_c, + }, [TX_32X32] = { + [DCT] = inv_dct32_1d_c, + [IDENTITY] = inv_identity32_1d_c, + }, [TX_64X64] = { + [DCT] = inv_dct64_1d_c, + }, +}; + +const uint8_t /* enum Tx1dType */ dav1d_tx1d_types[N_TX_TYPES][2] = { + [DCT_DCT] = { DCT, DCT }, + [ADST_DCT] = { ADST, DCT }, + [DCT_ADST] = { DCT, ADST }, + [ADST_ADST] = { ADST, ADST }, + [FLIPADST_DCT] = { FLIPADST, DCT }, + [DCT_FLIPADST] = { DCT, FLIPADST }, + [FLIPADST_FLIPADST] = { FLIPADST, FLIPADST }, + [ADST_FLIPADST] = { ADST, FLIPADST }, + [FLIPADST_ADST] = { FLIPADST, ADST }, + [IDTX] = { IDENTITY, IDENTITY }, + [V_DCT] = { DCT, IDENTITY }, + [H_DCT] = { IDENTITY, DCT }, + [V_ADST] = { ADST, IDENTITY }, + [H_ADST] = { IDENTITY, ADST }, + [V_FLIPADST] = { FLIPADST, IDENTITY }, + [H_FLIPADST] = { IDENTITY, FLIPADST }, +}; + #if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \ ARCH_AARCH64 || \ (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \ diff --git a/src/itx_1d.h b/src/itx_1d.h index b63d71b020bdac7da9cd27cdb615874476471ad3..880ac99a358cfa14ebe4ddcb11678c75720c25c0 100644 --- a/src/itx_1d.h +++ b/src/itx_1d.h @@ -28,31 +28,25 @@ #include <stddef.h> #include <stdint.h> +#include "src/levels.h" + #ifndef DAV1D_SRC_ITX_1D_H #define DAV1D_SRC_ITX_1D_H +enum Tx1dType { + DCT, + ADST, + IDENTITY, + FLIPADST, + N_TX_1D_TYPES, +}; + #define decl_itx_1d_fn(name) \ void (name)(int32_t *c, ptrdiff_t stride, int min, int max) typedef decl_itx_1d_fn(*itx_1d_fn); -decl_itx_1d_fn(dav1d_inv_dct4_1d_c); -decl_itx_1d_fn(dav1d_inv_dct8_1d_c); -decl_itx_1d_fn(dav1d_inv_dct16_1d_c); -decl_itx_1d_fn(dav1d_inv_dct32_1d_c); -decl_itx_1d_fn(dav1d_inv_dct64_1d_c); - -decl_itx_1d_fn(dav1d_inv_adst4_1d_c); -decl_itx_1d_fn(dav1d_inv_adst8_1d_c); -decl_itx_1d_fn(dav1d_inv_adst16_1d_c); - -decl_itx_1d_fn(dav1d_inv_flipadst4_1d_c); -decl_itx_1d_fn(dav1d_inv_flipadst8_1d_c); -decl_itx_1d_fn(dav1d_inv_flipadst16_1d_c); - -decl_itx_1d_fn(dav1d_inv_identity4_1d_c); -decl_itx_1d_fn(dav1d_inv_identity8_1d_c); -decl_itx_1d_fn(dav1d_inv_identity16_1d_c); -decl_itx_1d_fn(dav1d_inv_identity32_1d_c); +EXTERN const itx_1d_fn dav1d_tx1d_fns[N_TX_SIZES][N_TX_1D_TYPES]; +EXTERN const uint8_t /* enum Tx1dType */ dav1d_tx1d_types[N_TX_TYPES][2]; void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride); diff --git a/src/itx_tmpl.c b/src/itx_tmpl.c index a226223c960273a6d83d50644d7a58c2cda2662a..bafe0a86a6c0909bd3040d6d3ce8a39bbf709c36 100644 --- a/src/itx_tmpl.c +++ b/src/itx_tmpl.c @@ -29,6 +29,7 @@ 
#include <stddef.h> #include <stdint.h> +#include <stdlib.h> #include <string.h> #include "common/attributes.h" @@ -36,13 +37,17 @@ #include "src/itx.h" #include "src/itx_1d.h" +#include "src/scan.h" +#include "src/tables.h" static NOINLINE void inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff, - const int eob, const int w, const int h, const int shift, - const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn, - const int has_dconly HIGHBD_DECL_SUFFIX) + const int eob, const /*enum RectTxfmSize*/ int tx, const int shift, + const enum TxfmType txtp HIGHBD_DECL_SUFFIX) { + const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx]; + const int w = 4 * t_dim->w, h = 4 * t_dim->h; + const int has_dconly = txtp == DCT_DCT; assert(w >= 4 && w <= 64); assert(h >= 4 && h <= 64); assert(eob >= 0); @@ -64,6 +69,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff, return; } + const uint8_t *const txtps = dav1d_tx1d_types[txtp]; + const itx_1d_fn first_1d_fn = dav1d_tx1d_fns[t_dim->lw][txtps[0]]; + const itx_1d_fn second_1d_fn = dav1d_tx1d_fns[t_dim->lh][txtps[1]]; const int sh = imin(h, 32), sw = imin(w, 32); #if BITDEPTH == 8 const int row_clip_min = INT16_MIN; @@ -76,7 +84,16 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff, const int col_clip_max = ~col_clip_min; int32_t tmp[64 * 64], *c = tmp; - for (int y = 0; y < sh; y++, c += w) { + int last_nonzero_col; // in first 1d itx + if (txtps[1] == IDENTITY && txtps[0] != IDENTITY) { + last_nonzero_col = imin(sh - 1, eob); + } else if (txtps[0] == IDENTITY && txtps[1] != IDENTITY) { + last_nonzero_col = eob >> (t_dim->lw + 2); + } else { + last_nonzero_col = dav1d_last_nonzero_col_from_eob[tx][eob]; + } + assert(last_nonzero_col < sh); + for (int y = 0; y <= last_nonzero_col; y++, c += w) { if (is_rect2) for (int x = 0; x < sw; x++) c[x] = (coeff[y + x * sh] * 181 + 128) >> 8; @@ -85,6 +102,8 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff, c[x] = coeff[y + x * sh]; first_1d_fn(c, 1, row_clip_min, row_clip_max); } + if (last_nonzero_col + 1 < sh) + memset(c, 0, sizeof(*c) * (sh - last_nonzero_col - 1) * w); memset(coeff, 0, sizeof(*coeff) * sw * sh); for (int i = 0; i < w * sh; i++) @@ -99,7 +118,7 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff, dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4)); } -#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \ +#define inv_txfm_fn(type1, type2, type, pfx, w, h, shift) \ static void \ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \ const ptrdiff_t stride, \ @@ -107,57 +126,56 @@ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \ const int eob \ HIGHBD_DECL_SUFFIX) \ { \ - inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \ - dav1d_inv_##type1##w##_1d_c, dav1d_inv_##type2##h##_1d_c, \ - has_dconly HIGHBD_TAIL_SUFFIX); \ + inv_txfm_add_c(dst, stride, coeff, eob, pfx##TX_##w##X##h, shift, type \ + HIGHBD_TAIL_SUFFIX); \ } -#define inv_txfm_fn64(w, h, shift) \ -inv_txfm_fn(dct, dct, w, h, shift, 1) +#define inv_txfm_fn64(pfx, w, h, shift) \ +inv_txfm_fn(dct, dct, DCT_DCT, pfx, w, h, shift) -#define inv_txfm_fn32(w, h, shift) \ -inv_txfm_fn64(w, h, shift) \ -inv_txfm_fn(identity, identity, w, h, shift, 0) +#define inv_txfm_fn32(pfx, w, h, shift) \ +inv_txfm_fn64(pfx, w, h, shift) \ +inv_txfm_fn(identity, identity, IDTX, pfx, w, h, shift) -#define inv_txfm_fn16(w, h, shift) \ -inv_txfm_fn32(w, h, shift) \ -inv_txfm_fn(adst, dct, w, h, shift, 0) \ -inv_txfm_fn(dct, adst, w, 
h, shift, 0) \ -inv_txfm_fn(adst, adst, w, h, shift, 0) \ -inv_txfm_fn(dct, flipadst, w, h, shift, 0) \ -inv_txfm_fn(flipadst, dct, w, h, shift, 0) \ -inv_txfm_fn(adst, flipadst, w, h, shift, 0) \ -inv_txfm_fn(flipadst, adst, w, h, shift, 0) \ -inv_txfm_fn(flipadst, flipadst, w, h, shift, 0) \ -inv_txfm_fn(identity, dct, w, h, shift, 0) \ -inv_txfm_fn(dct, identity, w, h, shift, 0) \ +#define inv_txfm_fn16(pfx, w, h, shift) \ +inv_txfm_fn32(pfx, w, h, shift) \ +inv_txfm_fn(adst, dct, ADST_DCT, pfx, w, h, shift) \ +inv_txfm_fn(dct, adst, DCT_ADST, pfx, w, h, shift) \ +inv_txfm_fn(adst, adst, ADST_ADST, pfx, w, h, shift) \ +inv_txfm_fn(dct, flipadst, DCT_FLIPADST, pfx, w, h, shift) \ +inv_txfm_fn(flipadst, dct, FLIPADST_DCT, pfx, w, h, shift) \ +inv_txfm_fn(adst, flipadst, ADST_FLIPADST, pfx, w, h, shift) \ +inv_txfm_fn(flipadst, adst, FLIPADST_ADST, pfx, w, h, shift) \ +inv_txfm_fn(flipadst, flipadst, FLIPADST_FLIPADST, pfx, w, h, shift) \ +inv_txfm_fn(identity, dct, H_DCT, pfx, w, h, shift) \ +inv_txfm_fn(dct, identity, V_DCT, pfx, w, h, shift) \ -#define inv_txfm_fn84(w, h, shift) \ -inv_txfm_fn16(w, h, shift) \ -inv_txfm_fn(identity, flipadst, w, h, shift, 0) \ -inv_txfm_fn(flipadst, identity, w, h, shift, 0) \ -inv_txfm_fn(identity, adst, w, h, shift, 0) \ -inv_txfm_fn(adst, identity, w, h, shift, 0) \ +#define inv_txfm_fn84(pfx, w, h, shift) \ +inv_txfm_fn16(pfx, w, h, shift) \ +inv_txfm_fn(identity, flipadst, H_FLIPADST, pfx, w, h, shift) \ +inv_txfm_fn(flipadst, identity, V_FLIPADST, pfx, w, h, shift) \ +inv_txfm_fn(identity, adst, H_ADST, pfx, w, h, shift) \ +inv_txfm_fn(adst, identity, V_ADST, pfx, w, h, shift) \ -inv_txfm_fn84( 4, 4, 0) -inv_txfm_fn84( 4, 8, 0) -inv_txfm_fn84( 4, 16, 1) -inv_txfm_fn84( 8, 4, 0) -inv_txfm_fn84( 8, 8, 1) -inv_txfm_fn84( 8, 16, 1) -inv_txfm_fn32( 8, 32, 2) -inv_txfm_fn84(16, 4, 1) -inv_txfm_fn84(16, 8, 1) -inv_txfm_fn16(16, 16, 2) -inv_txfm_fn32(16, 32, 1) -inv_txfm_fn64(16, 64, 2) -inv_txfm_fn32(32, 8, 2) -inv_txfm_fn32(32, 16, 1) -inv_txfm_fn32(32, 32, 2) -inv_txfm_fn64(32, 64, 1) -inv_txfm_fn64(64, 16, 2) -inv_txfm_fn64(64, 32, 1) -inv_txfm_fn64(64, 64, 2) +inv_txfm_fn84( , 4, 4, 0) +inv_txfm_fn84(R, 4, 8, 0) +inv_txfm_fn84(R, 4, 16, 1) +inv_txfm_fn84(R, 8, 4, 0) +inv_txfm_fn84( , 8, 8, 1) +inv_txfm_fn84(R, 8, 16, 1) +inv_txfm_fn32(R, 8, 32, 2) +inv_txfm_fn84(R, 16, 4, 1) +inv_txfm_fn84(R, 16, 8, 1) +inv_txfm_fn16( , 16, 16, 2) +inv_txfm_fn32(R, 16, 32, 1) +inv_txfm_fn64(R, 16, 64, 2) +inv_txfm_fn32(R, 32, 8, 2) +inv_txfm_fn32(R, 32, 16, 1) +inv_txfm_fn32( , 32, 32, 2) +inv_txfm_fn64(R, 32, 64, 1) +inv_txfm_fn64(R, 64, 16, 2) +inv_txfm_fn64(R, 64, 32, 1) +inv_txfm_fn64( , 64, 64, 2) #if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \ ARCH_AARCH64 || \ @@ -190,6 +208,8 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride, #include "src/arm/itx.h" #elif ARCH_LOONGARCH64 #include "src/loongarch/itx.h" +#elif ARCH_PPC64LE +#include "src/ppc/itx.h" #elif ARCH_RISCV #include "src/riscv/itx.h" #elif ARCH_X86 @@ -267,18 +287,25 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) { assign_itx_all_fn64(64, 32, R); assign_itx_all_fn64(64, 64, ); + int all_simd = 0; #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - itx_dsp_init_arm(c, bpc); + itx_dsp_init_arm(c, bpc, &all_simd); #endif #if ARCH_LOONGARCH64 itx_dsp_init_loongarch(c, bpc); #endif +#if ARCH_PPC64LE + itx_dsp_init_ppc(c, bpc); +#endif #if ARCH_RISCV itx_dsp_init_riscv(c, bpc); #endif #if ARCH_X86 - itx_dsp_init_x86(c, bpc); + itx_dsp_init_x86(c, bpc, 
&all_simd); #endif #endif + + if (!all_simd) + dav1d_init_last_nonzero_col_from_eob_tables(); } diff --git a/src/lf_mask.c b/src/lf_mask.c index 09a5c532c4b8435db56e1f9a0add3aa27dbba55d..c81bd9b5f9bd0958e5abb3825481ebd3aef6db9e 100644 --- a/src/lf_mask.c +++ b/src/lf_mask.c @@ -64,18 +64,15 @@ static void decomp_tx(uint8_t (*const txa)[2 /* txsz, step */][32 /* y */][32 /* } else { const int lw = imin(2, t_dim->lw), lh = imin(2, t_dim->lh); -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ +#define set_ctx(rep_macro) \ for (int y = 0; y < t_dim->h; y++) { \ - rep_macro(type, txa[0][0][y], off, mul * lw); \ - rep_macro(type, txa[1][0][y], off, mul * lh); \ + rep_macro(txa[0][0][y], 0, lw); \ + rep_macro(txa[1][0][y], 0, lh); \ txa[0][1][y][0] = t_dim->w; \ } - case_set_upto16(t_dim->w,,, 0); -#undef set_ctx -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, txa[1][1][0], off, mul * t_dim->h) - case_set_upto16(t_dim->w,,, 0); + case_set_upto16(t_dim->lw); #undef set_ctx + dav1d_memset_pow2[t_dim->lw](txa[1][1][0], t_dim->h); } } @@ -196,20 +193,8 @@ static inline void mask_edges_intra(uint16_t (*const masks)[32][3][2], if (inner2) masks[1][by4 + y][thl4c][1] |= inner2; } -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, a, off, mul * thl4c) -#define default_memset(dir, diridx, off, var) \ - memset(a, thl4c, var) - case_set_upto32_with_default(w4,,, 0); -#undef default_memset -#undef set_ctx -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, l, off, mul * twl4c) -#define default_memset(dir, diridx, off, var) \ - memset(l, twl4c, var) - case_set_upto32_with_default(h4,,, 0); -#undef default_memset -#undef set_ctx + dav1d_memset_likely_pow2(a, thl4c, w4); + dav1d_memset_likely_pow2(l, twl4c, h4); } static void mask_edges_chroma(uint16_t (*const masks)[32][2][2], @@ -267,20 +252,8 @@ static void mask_edges_chroma(uint16_t (*const masks)[32][2][2], } } -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, a, off, mul * thl4c) -#define default_memset(dir, diridx, off, var) \ - memset(a, thl4c, var) - case_set_upto32_with_default(cw4,,, 0); -#undef default_memset -#undef set_ctx -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, l, off, mul * twl4c) -#define default_memset(dir, diridx, off, var) \ - memset(l, twl4c, var) - case_set_upto32_with_default(ch4,,, 0); -#undef default_memset -#undef set_ctx + dav1d_memset_likely_pow2(a, thl4c, cw4); + dav1d_memset_likely_pow2(l, twl4c, ch4); } void dav1d_create_lf_mask_intra(Av1Filter *const lflvl, diff --git a/src/lib.c b/src/lib.c index 4d9a2d30e343e3e13c44c23065347bd18030de65..6d2d80dd93aa99389132529a8a31e72739d41d69 100644 --- a/src/lib.c +++ b/src/lib.c @@ -31,7 +31,7 @@ #include <errno.h> #include <string.h> -#if defined(__linux__) && defined(HAVE_DLSYM) +#if defined(__linux__) && HAVE_DLSYM #include <dlfcn.h> #endif @@ -90,7 +90,7 @@ static void close_internal(Dav1dContext **const c_out, int flush); NO_SANITIZE("cfi-icall") // CFI is broken with dlsym() static COLD size_t get_stack_size_internal(const pthread_attr_t *const thread_attr) { -#if defined(__linux__) && defined(HAVE_DLSYM) && defined(__GLIBC__) +#if defined(__linux__) && HAVE_DLSYM && defined(__GLIBC__) /* glibc has an issue where the size of the TLS is subtracted from the stack * size instead of allocated separately. 
As a result the specified stack * size may be insufficient when used in an application with large amounts diff --git a/src/loongarch/cpu.c b/src/loongarch/cpu.c index a79ade5472e6a4fef8c8ce95e2ca89134463377b..383aa01e5de537b715245c56a66635f716f23777 100644 --- a/src/loongarch/cpu.c +++ b/src/loongarch/cpu.c @@ -26,9 +26,11 @@ #include "config.h" #include "common/attributes.h" + +#include "src/cpu.h" #include "src/loongarch/cpu.h" -#if defined(HAVE_GETAUXVAL) +#if HAVE_GETAUXVAL #include <sys/auxv.h> #define LA_HWCAP_LSX ( 1 << 4 ) @@ -36,8 +38,8 @@ #endif COLD unsigned dav1d_get_cpu_flags_loongarch(void) { - unsigned flags = 0; -#if defined(HAVE_GETAUXVAL) + unsigned flags = dav1d_get_default_cpu_flags(); +#if HAVE_GETAUXVAL unsigned long hw_cap = getauxval(AT_HWCAP); flags |= (hw_cap & LA_HWCAP_LSX) ? DAV1D_LOONGARCH_CPU_FLAG_LSX : 0; flags |= (hw_cap & LA_HWCAP_LASX) ? DAV1D_LOONGARCH_CPU_FLAG_LASX : 0; diff --git a/src/mem.c b/src/mem.c index 7e6eb4c066d1902ad6b5ee7b56c2135f921b2a17..9f0e3944728878550eef48875158c9fa0da168a6 100644 --- a/src/mem.c +++ b/src/mem.c @@ -109,16 +109,7 @@ void *dav1d_malloc(const enum AllocationType type, const size_t sz) { void *dav1d_alloc_aligned(const enum AllocationType type, const size_t sz, const size_t align) { - assert(!(align & (align - 1))); - void *ptr; -#ifdef _WIN32 - ptr = _aligned_malloc(sz + align, align); -#elif defined(HAVE_POSIX_MEMALIGN) - if (posix_memalign(&ptr, align, sz + align)) return NULL; -#else - ptr = memalign(align, sz + align); -#endif - + void *const ptr = dav1d_alloc_aligned_internal(align, sz + align); return track_alloc(type, ptr, sz, align); } @@ -140,12 +131,7 @@ void dav1d_free(void *ptr) { void dav1d_free_aligned(void *ptr) { if (ptr) { - ptr = track_free(ptr); -#ifdef _WIN32 - _aligned_free(ptr); -#else - free(ptr); -#endif + dav1d_free_aligned_internal(track_free(ptr)); } } diff --git a/src/mem.h b/src/mem.h index 0a8c18d709b85bee708f7f66eb44709dc31244dc..c8c45d314f597399468625a1d6b19ff2299e2c52 100644 --- a/src/mem.h +++ b/src/mem.h @@ -32,7 +32,7 @@ #include <stdlib.h> -#if defined(_WIN32) || !defined(HAVE_POSIX_MEMALIGN) +#if defined(_WIN32) || HAVE_MEMALIGN #include <malloc.h> #endif @@ -79,39 +79,33 @@ typedef struct Dav1dMemPool { #endif } Dav1dMemPool; - -#if TRACK_HEAP_ALLOCATIONS -void *dav1d_malloc(enum AllocationType type, size_t sz); -void *dav1d_realloc(enum AllocationType type, void *ptr, size_t sz); -void *dav1d_alloc_aligned(enum AllocationType type, size_t sz, size_t align); -void dav1d_free(void *ptr); -void dav1d_free_aligned(void *ptr); -void dav1d_log_alloc_stats(Dav1dContext *c); -#else -#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool) -#define dav1d_malloc(type, sz) malloc(sz) -#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz) -#define dav1d_free(ptr) free(ptr) +// TODO: Move this to a common location? +#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1)) /* * Allocate align-byte aligned memory. The return value can be released * by calling the dav1d_free_aligned() function. 
*/ -static inline void *dav1d_alloc_aligned(const size_t sz, const size_t align) { +static inline void *dav1d_alloc_aligned_internal(const size_t sz, const size_t align) { assert(!(align & (align - 1))); #ifdef _WIN32 return _aligned_malloc(sz, align); -#elif defined(HAVE_POSIX_MEMALIGN) +#elif HAVE_POSIX_MEMALIGN void *ptr; if (posix_memalign(&ptr, align, sz)) return NULL; return ptr; -#else +#elif HAVE_MEMALIGN return memalign(align, sz); +#elif HAVE_ALIGNED_ALLOC + // The C11 standard specifies that the size parameter + // must be an integral multiple of alignment. + return aligned_alloc(align, ROUND_UP(sz, align)); +#else +#error No aligned allocation functions are available #endif } -#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned(sz, align) -static inline void dav1d_free_aligned(void *ptr) { +static inline void dav1d_free_aligned_internal(void *ptr) { #ifdef _WIN32 _aligned_free(ptr); #else @@ -119,6 +113,20 @@ static inline void dav1d_free_aligned(void *ptr) { #endif } +#if TRACK_HEAP_ALLOCATIONS +void *dav1d_malloc(enum AllocationType type, size_t sz); +void *dav1d_realloc(enum AllocationType type, void *ptr, size_t sz); +void *dav1d_alloc_aligned(enum AllocationType type, size_t sz, size_t align); +void dav1d_free(void *ptr); +void dav1d_free_aligned(void *ptr); +void dav1d_log_alloc_stats(Dav1dContext *c); +#else +#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool) +#define dav1d_malloc(type, sz) malloc(sz) +#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz) +#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align) +#define dav1d_free(ptr) free(ptr) +#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr) #endif /* TRACK_HEAP_ALLOCATIONS */ void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf); diff --git a/src/meson.build b/src/meson.build index c754668053920f31ba8d86dcd8442db1382e53c1..8dbdc0cc53aaf8c1baa6a90fc69ced68bfef5065 100644 --- a/src/meson.build +++ b/src/meson.build @@ -30,6 +30,7 @@ libdav1d_sources = files( 'cdf.c', 'cpu.c', + 'ctx.c', 'data.c', 'decode.c', 'dequant_tables.c', @@ -119,6 +120,7 @@ if is_asm_enabled 'arm/64/loopfilter16.S', 'arm/64/looprestoration16.S', 'arm/64/mc16.S', + 'arm/64/mc16_sve.S', ) endif elif host_machine.cpu_family().startswith('arm') @@ -256,6 +258,7 @@ if is_asm_enabled )} arch_flags += {'pwr9': ['-mcpu=power9', '-DDAV1D_PWR9']} libdav1d_arch_tmpl_sources += {'pwr9': files( + 'ppc/itx_tmpl.c', 'ppc/loopfilter_tmpl.c', )} elif host_machine.cpu_family().startswith('riscv') @@ -370,7 +373,7 @@ libdav1d = library('dav1d', ) dav1d_dep = declare_dependency(link_with: libdav1d, - include_directories : include_directories('../include/dav1d') + include_directories : include_directories('../include') ) # diff --git a/src/picture.c b/src/picture.c index 94365bce8c3ee8453436e03cae223810d0a224cf..290bd095eaa8b285c101b7855029dee1c42ca924 100644 --- a/src/picture.c +++ b/src/picture.c @@ -201,16 +201,6 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f (void **) &p->progress); if (res) return res; - dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref, - c->mastering_display, c->mastering_display_ref, - c->itut_t35, c->itut_t35_ref, c->n_itut_t35, - &f->tile[0].data.m); - - // Must be removed from the context after being attached to the frame - dav1d_ref_dec(&c->itut_t35_ref); - c->itut_t35 = NULL; - c->n_itut_t35 = 0; - // Don't clear these flags from c->frame_flags if the frame is not going to be output. 
// This way they will be added to the next visible frame too. const int flags_mask = ((f->frame_hdr->show_frame || c->output_invisible_frames) && @@ -221,6 +211,22 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f p->visible = f->frame_hdr->show_frame; p->showable = f->frame_hdr->showable_frame; + + if (p->visible) { + // Only add HDR10+ and T35 metadata when show frame flag is enabled + dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref, + c->mastering_display, c->mastering_display_ref, + c->itut_t35, c->itut_t35_ref, c->n_itut_t35, + &f->tile[0].data.m); + + // Must be removed from the context after being attached to the frame + dav1d_ref_dec(&c->itut_t35_ref); + c->itut_t35 = NULL; + c->n_itut_t35 = 0; + } else { + dav1d_data_props_copy(&p->p.m, &f->tile[0].data.m); + } + if (c->n_fc > 1) { atomic_init(&p->progress[0], 0); atomic_init(&p->progress[1], 0); diff --git a/src/ppc/cpu.c b/src/ppc/cpu.c index 53287639de8eebb8c39f28cb2f15307816065433..f58e8fbf07e7f801ab453876f7509706d5696b8b 100644 --- a/src/ppc/cpu.c +++ b/src/ppc/cpu.c @@ -29,25 +29,26 @@ #include "common/attributes.h" +#include "src/cpu.h" #include "src/ppc/cpu.h" -#if (defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)) && ARCH_PPC64LE +#define HAVE_AUX ((HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO) && ARCH_PPC64LE) +#if HAVE_AUX #include <sys/auxv.h> -#define HAVE_AUX #endif COLD unsigned dav1d_get_cpu_flags_ppc(void) { - unsigned flags = 0; -#if defined(HAVE_GETAUXVAL) && ARCH_PPC64LE + unsigned flags = dav1d_get_default_cpu_flags(); +#if HAVE_GETAUXVAL && ARCH_PPC64LE unsigned long hw_cap = getauxval(AT_HWCAP); unsigned long hw_cap2 = getauxval(AT_HWCAP2); -#elif defined(HAVE_ELF_AUX_INFO) && ARCH_PPC64LE +#elif HAVE_ELF_AUX_INFO && ARCH_PPC64LE unsigned long hw_cap = 0; unsigned long hw_cap2 = 0; elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap)); elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2)); #endif -#ifdef HAVE_AUX +#if HAVE_AUX flags |= (hw_cap & PPC_FEATURE_HAS_VSX) ? DAV1D_PPC_CPU_FLAG_VSX : 0; flags |= (hw_cap2 & PPC_FEATURE2_ARCH_3_00) ? DAV1D_PPC_CPU_FLAG_PWR9 : 0; #endif diff --git a/src/ppc/itx.h b/src/ppc/itx.h new file mode 100644 index 0000000000000000000000000000000000000000..6bddf7a38fda20574ea282f6413d22f52e5d28f1 --- /dev/null +++ b/src/ppc/itx.h @@ -0,0 +1,65 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2023, Luca Barbato + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/itx.h" + +decl_itx17_fns( 4, 4, pwr9); +decl_itx16_fns( 4, 8, pwr9); +decl_itx16_fns( 4, 16, pwr9); +decl_itx16_fns( 8, 4, pwr9); +decl_itx16_fns( 8, 8, pwr9); +decl_itx16_fns( 8, 16, pwr9); +decl_itx2_fns ( 8, 32, pwr9); +decl_itx16_fns(16, 4, pwr9); +decl_itx16_fns(16, 8, pwr9); +decl_itx12_fns(16, 16, pwr9); +decl_itx2_fns (16, 32, pwr9); +decl_itx2_fns (32, 8, pwr9); +decl_itx2_fns (32, 16, pwr9); +decl_itx2_fns (32, 32, pwr9); + +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, pwr9)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, pwr9)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, pwr9)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, pwr9)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, pwr9)); + +static ALWAYS_INLINE void itx_dsp_init_ppc(Dav1dInvTxfmDSPContext *const c, const int bpc) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_PPC_CPU_FLAG_PWR9)) return; + +#if BITDEPTH == 8 + assign_itx17_fn( , 4, 4, pwr9); + assign_itx16_fn(R, 4, 8, pwr9); + assign_itx16_fn(R, 8, 4, pwr9); + assign_itx16_fn(, 8, 8, pwr9); + assign_itx16_fn(R, 4, 16, pwr9); + assign_itx16_fn(R, 16, 4, pwr9); +#endif +} diff --git a/src/ppc/itx_tmpl.c b/src/ppc/itx_tmpl.c new file mode 100644 index 0000000000000000000000000000000000000000..818065522e51ea82927fa4922521e8dca3f1cd10 --- /dev/null +++ b/src/ppc/itx_tmpl.c @@ -0,0 +1,2006 @@ +/* + * Copyright © 2024, VideoLAN and dav1d authors + * Copyright © 2024, Luca Barbato + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/ppc/dav1d_types.h" +#include "src/ppc/itx.h" +#include "src/ppc/utils.h" + +#if BITDEPTH == 8 + +#define LOAD_4(src, stride, a, b, c, d) \ +{ \ + uint8_t *s = src; \ + a = vec_xl(0, s); \ + s += stride; \ + b = vec_xl(0, s); \ + s += stride; \ + c = vec_xl(0, s); \ + s += stride; \ + d = vec_xl(0, s); \ +} + +#define LOAD_DECLARE_2_I16(src, a, b) \ + i16x8 a = vec_xl(0, src); \ + i16x8 b = vec_xl(0, src + 8); + +#define UNPACK_DECLARE_4_I16_I32(sa, sb, a, b, c, d) \ + i32x4 a = i16h_to_i32(sa); \ + i32x4 b = i16l_to_i32(sa); \ + i32x4 c = i16h_to_i32(sb); \ + i32x4 d = i16l_to_i32(sb); + +#define LOAD_COEFF_4(coeff) \ + LOAD_DECLARE_2_I16(coeff, c01, c23) \ + UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) + +#define LOAD_SCALE_COEFF_4x8(coeff, scale) \ + LOAD_DECLARE_2_I16(coeff, c04, c15) \ + LOAD_DECLARE_2_I16(coeff+16, c26, c37) \ + i16x8 c01 = (i16x8)vec_mergeh((i64x2)c04, (i64x2)c15); \ + i16x8 c23 = (i16x8)vec_mergeh((i64x2)c26, (i64x2)c37); \ + i16x8 c45 = (i16x8)vec_mergel((i64x2)c04, (i64x2)c15); \ + i16x8 c67 = (i16x8)vec_mergel((i64x2)c26, (i64x2)c37); \ + c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \ + c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \ + UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \ + c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \ + c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \ + UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7) + +#define LOAD_SCALE_COEFF_8x4(coeff, scale) \ + LOAD_DECLARE_2_I16(coeff, c01, c23) \ + LOAD_DECLARE_2_I16(coeff+16, c45, c67) \ + c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \ + c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \ + UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \ + c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \ + c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \ + UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7) + +#define LOAD_COEFF_8x8(coeff) \ + LOAD_DECLARE_2_I16(coeff, c0, c1) \ + LOAD_DECLARE_2_I16(coeff+16, c2, c3) \ + LOAD_DECLARE_2_I16(coeff+32, c4, c5) \ + LOAD_DECLARE_2_I16(coeff+48, c6, c7) \ + UNPACK_DECLARE_4_I16_I32(c0, c1, c0h, c0l, c1h, c1l) \ + UNPACK_DECLARE_4_I16_I32(c2, c3, c2h, c2l, c3h, c3l) \ + UNPACK_DECLARE_4_I16_I32(c4, c5, c4h, c4l, c5h, c5l) \ + UNPACK_DECLARE_4_I16_I32(c6, c7, c6h, c6l, c7h, c7l) \ + +#define LOAD_COEFF_4x16(coeff) \ + LOAD_DECLARE_2_I16(coeff, a0b0, c0d0) \ + LOAD_DECLARE_2_I16(coeff+16, a1b1, c1d1) \ + LOAD_DECLARE_2_I16(coeff+32, a2b2, c2d2) \ + LOAD_DECLARE_2_I16(coeff+48, a3b3, c3d3) \ + UNPACK_DECLARE_4_I16_I32(a0b0, c0d0, cA0, cB0, cC0, cD0) \ + UNPACK_DECLARE_4_I16_I32(a1b1, c1d1, cA1, cB1, cC1, cD1) \ + UNPACK_DECLARE_4_I16_I32(a2b2, c2d2, cA2, cB2, cC2, cD2) \ + UNPACK_DECLARE_4_I16_I32(a3b3, c3d3, cA3, cB3, cC3, cD3) + +#define LOAD_DECLARE_4(src, stride, a, b, c, d) \ + u8x16 a, b, c, d; \ + LOAD_4(src, stride, a, b, c, d) + +#define STORE_LEN(l, dst, stride, a, b, c, d) \ +{ \ + uint8_t *dst2 = dst; \ + vec_xst_len(a, dst2, l); \ + dst2 += stride; \ + vec_xst_len(b, dst2, l); \ + dst2 += stride; \ + vec_xst_len(c, dst2, l); \ + dst2 += stride; \ + vec_xst_len(d, dst2, l); \ +} + +#define STORE_4(dst, stride, a, b, c, d) \ + STORE_LEN(4, dst, stride, a, b, c, d) + +#define STORE_8(dst, stride, ab, cd, ef, gh) \ + STORE_LEN(8, dst, stride, ab, cd, ef, gh) + +#define STORE_16(dst, stride, l0, l1, l2, l3) \ +{ \ + uint8_t *dst##2 = dst; \ + vec_xst(l0, 0, dst##2); \ + dst##2 += stride; \ + vec_xst(l1, 0, dst##2); \ + dst##2 += stride; \ + vec_xst(l2, 0, dst##2); \ + dst##2 += stride; \ + vec_xst(l3, 0, 
dst##2); \ +} + +#define APPLY_COEFF_4(a, b, c, d, c01, c23) \ +{ \ + u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b); \ + u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d); \ + \ + c01 = vec_adds(c01, vec_splat_s16(8)); \ + c23 = vec_adds(c23, vec_splat_s16(8)); \ + c01 = vec_sra(c01, vec_splat_u16(4)); \ + c23 = vec_sra(c23, vec_splat_u16(4)); \ + \ + i16x8 abs = u8h_to_i16(ab); \ + i16x8 cds = u8h_to_i16(cd); \ + \ + abs = vec_adds(abs, c01); \ + cds = vec_adds(cds, c23); \ + \ + a = vec_packsu(abs, abs); \ + c = vec_packsu(cds, cds); \ + \ + b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a); \ + d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c); \ +} + +#define APPLY_COEFF_8x4(ab, cd, c01, c23) \ +{ \ + i16x8 abs = u8h_to_i16(ab); \ + i16x8 cds = u8h_to_i16(cd); \ + c01 = vec_adds(c01, vec_splat_s16(8)); \ + c23 = vec_adds(c23, vec_splat_s16(8)); \ + c01 = vec_sra(c01, vec_splat_u16(4)); \ + c23 = vec_sra(c23, vec_splat_u16(4)); \ + \ + abs = vec_adds(abs, c01); \ + cds = vec_adds(cds, c23); \ + \ + ab = vec_packsu(abs, abs); \ + cd = vec_packsu(cds, cds); \ +} + +#define APPLY_COEFF_16x4(a, b, c, d, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ +{ \ + i16x8 ah = u8h_to_i16(a); \ + i16x8 al = u8l_to_i16(a); \ + i16x8 bh = u8h_to_i16(b); \ + i16x8 bl = u8l_to_i16(b); \ + i16x8 ch = u8h_to_i16(c); \ + i16x8 cl = u8l_to_i16(c); \ + i16x8 dh = u8h_to_i16(d); \ + i16x8 dl = u8l_to_i16(d); \ + SCALE_ROUND_4(c00c01, c02c03, c04c05, c06c07, vec_splat_s16(8), vec_splat_u16(4)) \ + SCALE_ROUND_4(c08c09, c10c11, c12c13, c14c15, vec_splat_s16(8), vec_splat_u16(4)) \ + \ + ah = vec_adds(ah, c00c01); \ + al = vec_adds(al, c02c03); \ + bh = vec_adds(bh, c04c05); \ + bl = vec_adds(bl, c06c07); \ + ch = vec_adds(ch, c08c09); \ + cl = vec_adds(cl, c10c11); \ + dh = vec_adds(dh, c12c13); \ + dl = vec_adds(dl, c14c15); \ + \ + a = vec_packsu(ah, al); \ + b = vec_packsu(bh, bl); \ + c = vec_packsu(ch, cl); \ + d = vec_packsu(dh, dl); \ +} + +#define IDCT_4_INNER(c0, c1, c2, c3) \ +{ \ + i32x4 o0 = vec_add(c0, c2); \ + i32x4 o1 = vec_sub(c0, c2); \ + \ + i32x4 v2896 = vec_splats(2896); \ + i32x4 v1567 = vec_splats(1567); \ + i32x4 v3784 = vec_splats(3784); \ + i32x4 v2048 = vec_splats(2048); \ + \ + o0 = vec_mul(o0, v2896); \ + o1 = vec_mul(o1, v2896); \ + \ + i32x4 o2a = vec_mul(c1, v1567); \ + i32x4 o2b = vec_mul(c3, v3784); \ + i32x4 o3a = vec_mul(c1, v3784); \ + i32x4 o3b = vec_mul(c3, v1567); \ + \ + i32x4 o2 = vec_sub(o2a, o2b); \ + i32x4 o3 = vec_add(o3a, o3b); \ + \ + u32x4 v12 = vec_splat_u32(12); \ + \ + o0 = vec_add(o0, v2048); \ + o1 = vec_add(o1, v2048); \ + o2 = vec_add(o2, v2048); \ + o3 = vec_add(o3, v2048); \ + \ + o0 = vec_sra(o0, v12); \ + o1 = vec_sra(o1, v12); \ + o2 = vec_sra(o2, v12); \ + o3 = vec_sra(o3, v12); \ + \ + c0 = vec_add(o0, o3); \ + c1 = vec_add(o1, o2); \ + c2 = vec_sub(o1, o2); \ + c3 = vec_sub(o0, o3); \ + \ +} + +#define dct4_for_dct8(c0, c1, c2, c3, c03, c12) \ + IDCT_4_INNER(c0, c1, c2, c3) \ + c03 = vec_packs(c0, c3); \ + c12 = vec_packs(c1, c2); \ + +#define dct_4_in(c0, c1, c2, c3, c01, c23) \ +{ \ + IDCT_4_INNER(c0, c1, c2, c3) \ + c01 = vec_packs(c0, c1); \ + c23 = vec_packs(c2, c3); \ + c0 = i16h_to_i32(c01); \ + c1 = i16l_to_i32(c01); \ + c2 = i16h_to_i32(c23); \ + c3 = i16l_to_i32(c23); \ +} + +#define dct_4_out(c0, c1, c2, c3, c01, c23) \ + IDCT_4_INNER(c0, c1, c2, c3) \ + c01 = vec_packs(c0, c1); \ + c23 = vec_packs(c2, c3); \ + + +#define IDENTITY_4(c01, c23) \ +{ \ + i16x8 v1697 = vec_splats((int16_t)(1697*8)); \ + i16x8 o01 = vec_mradds(c01, 
v1697, vec_splat_s16(0)); \ + i16x8 o23 = vec_mradds(c23, v1697, vec_splat_s16(0)); \ + c01 = vec_adds(c01, o01); \ + c23 = vec_adds(c23, o23); \ +} + +#define identity_4_in(c0, c1, c2, c3, c01, c23) \ +{ \ + IDENTITY_4(c01, c23) \ + c0 = i16h_to_i32(c01); \ + c1 = i16l_to_i32(c01); \ + c2 = i16h_to_i32(c23); \ + c3 = i16l_to_i32(c23); \ +} + +#define identity_4_out(c0, c1, c2, c3, c01, c23) \ +{ \ + c01 = vec_packs(c0, c1); \ + c23 = vec_packs(c2, c3); \ + IDENTITY_4(c01, c23) \ +} + +#define ADST_INNER_4(c0, c1, c2, c3, oc0, oc1, oc2, oc3) \ +{ \ + i32x4 v1321 = vec_splats(1321); \ + i32x4 v3803 = vec_splats(3803); \ + i32x4 v2482 = vec_splats(2482); \ + i32x4 v3344 = vec_splats(3344); \ + i32x4 v2048 = vec_splats(2048); \ + i32x4 i0_v1321 = vec_mul(c0, v1321); \ + i32x4 i0_v2482 = vec_mul(c0, v2482); \ + i32x4 i0_v3803 = vec_mul(c0, v3803); \ + i32x4 i1 = vec_mul(c1, v3344); \ + i32x4 i2_v1321 = vec_mul(c2, v1321); \ + i32x4 i2_v2482 = vec_mul(c2, v2482); \ + i32x4 i2_v3803 = vec_mul(c2, v3803); \ + i32x4 i3_v1321 = vec_mul(c3, v1321); \ + i32x4 i3_v2482 = vec_mul(c3, v2482); \ + i32x4 i3_v3803 = vec_mul(c3, v3803); \ + \ + i32x4 n1 = vec_sub(i1, v2048); \ + i1 = vec_add(i1, v2048); \ + \ + \ + i32x4 o0 = vec_add(i0_v1321, i2_v3803); \ + i32x4 o1 = vec_sub(i0_v2482, i2_v1321); \ + i32x4 o2 = vec_sub(c0, c2); \ + i32x4 o3 = vec_add(i0_v3803, i2_v2482); \ + \ + o0 = vec_add(o0, i3_v2482); \ + o1 = vec_sub(o1, i3_v3803); \ + o2 = vec_add(o2, c3); \ + o3 = vec_sub(o3, i3_v1321); \ + \ + o0 = vec_add(o0, i1); \ + o1 = vec_add(o1, i1); \ + o2 = vec_mul(o2, v3344); \ + o3 = vec_sub(o3, n1); \ + \ + o2 = vec_add(o2, v2048); \ + \ + oc0 = vec_sra(o0, vec_splat_u32(12)); \ + oc1 = vec_sra(o1, vec_splat_u32(12)); \ + oc2 = vec_sra(o2, vec_splat_u32(12)); \ + oc3 = vec_sra(o3, vec_splat_u32(12)); \ +} + +#define adst_4_in(c0, c1, c2, c3, c01, c23) \ +{ \ + ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \ +} + +#define flipadst_4_in(c0, c1, c2, c3, c01, c23) \ +{ \ + ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \ +} + +#define adst_4_out(c0, c1, c2, c3, c01, c23) \ +{ \ + ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \ + c01 = vec_packs(c0, c1); \ + c23 = vec_packs(c2, c3); \ +} + +#define flipadst_4_out(c0, c1, c2, c3, c01, c23) \ +{ \ + ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \ + c01 = vec_packs(c0, c1); \ + c23 = vec_packs(c2, c3); \ +} + +static void dc_only_4xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift) +{ + int dc = coeff[0]; + const int rnd = (1 << shift) >> 1; + if (is_rect2) + dc = (dc * 181 + 128) >> 8; + dc = (dc * 181 + 128) >> 8; + dc = (dc + rnd) >> shift; + dc = (dc * 181 + 128 + 2048) >> 12; + + i16x8 vdc = vec_splats((int16_t)dc); + coeff[0] = 0; + for (int i = 0; i < n; i++, dst += 4 * stride) { + LOAD_DECLARE_4(dst, stride, a, b, c, d) + + i16x8 as = u8h_to_i16(a); + i16x8 bs = u8h_to_i16(b); + i16x8 cs = u8h_to_i16(c); + i16x8 ds = u8h_to_i16(d); + + as = vec_adds(as, vdc); + bs = vec_adds(bs, vdc); + cs = vec_adds(cs, vdc); + ds = vec_adds(ds, vdc); + + a = vec_packsu(as, as); + b = vec_packsu(bs, bs); + c = vec_packsu(cs, cs); + d = vec_packsu(ds, ds); + + STORE_4(dst, stride, a, b, c, d) + } +} + +static void dc_only_8xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift) +{ + int dc = coeff[0]; + const int rnd = (1 << shift) >> 1; + if (is_rect2) + dc = (dc * 181 + 128) >> 8; + dc = (dc * 181 + 128) >> 8; + dc = (dc + rnd) >> shift; + dc = (dc * 181 + 128 + 2048) >> 12; + + i16x8 vdc = 
vec_splats((int16_t)dc); + coeff[0] = 0; + + for (int i = 0; i < n; i++, dst += 4 * stride) { + LOAD_DECLARE_4(dst, stride, a, b, c, d) + + i16x8 as = u8h_to_i16(a); + i16x8 bs = u8h_to_i16(b); + i16x8 cs = u8h_to_i16(c); + i16x8 ds = u8h_to_i16(d); + + as = vec_adds(as, vdc); + bs = vec_adds(bs, vdc); + cs = vec_adds(cs, vdc); + ds = vec_adds(ds, vdc); + + a = vec_packsu(as, as); + b = vec_packsu(bs, bs); + c = vec_packsu(cs, cs); + d = vec_packsu(ds, ds); + + STORE_8(dst, stride, a, b, c, d) + } +} + +static void dc_only_16xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift) +{ + int dc = coeff[0]; + const int rnd = (1 << shift) >> 1; + if (is_rect2) + dc = (dc * 181 + 128) >> 8; + dc = (dc * 181 + 128) >> 8; + dc = (dc + rnd) >> shift; + dc = (dc * 181 + 128 + 2048) >> 12; + + i16x8 vdc = vec_splats((int16_t)dc); + coeff[0] = 0; + + for (int i = 0; i < n; i++, dst += 4 * stride) { + LOAD_DECLARE_4(dst, stride, a, b, c, d) + + i16x8 ah = u8h_to_i16(a); + i16x8 bh = u8h_to_i16(b); + i16x8 ch = u8h_to_i16(c); + i16x8 dh = u8h_to_i16(d); + i16x8 al = u8l_to_i16(a); + i16x8 bl = u8l_to_i16(b); + i16x8 cl = u8l_to_i16(c); + i16x8 dl = u8l_to_i16(d); + + ah = vec_adds(ah, vdc); + bh = vec_adds(bh, vdc); + ch = vec_adds(ch, vdc); + dh = vec_adds(dh, vdc); + al = vec_adds(al, vdc); + bl = vec_adds(bl, vdc); + cl = vec_adds(cl, vdc); + dl = vec_adds(dl, vdc); + + a = vec_packsu(ah, al); + b = vec_packsu(bh, bl); + c = vec_packsu(ch, cl); + d = vec_packsu(dh, dl); + + STORE_16(dst, stride, a, b, c, d) + } +} + +void dav1d_inv_txfm_add_dct_dct_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, + int16_t *const coeff, const int eob) +{ + assert(eob >= 0); + + if (eob < 1) { + return dc_only_4xN(dst, stride, coeff, 1, 0, 0); + } + + LOAD_COEFF_4(coeff) + + dct_4_in(c0, c1, c2, c3, c01, c23) + + TRANSPOSE4_I32(c0, c1, c2, c3) + + memset(coeff, 0, sizeof(*coeff) * 4 * 4); + + dct_4_out(c0, c1, c2, c3, c01, c23) + + LOAD_DECLARE_4(dst, stride, a, b, c, d) + + APPLY_COEFF_4(a, b, c, d, c01, c23) + + STORE_4(dst, stride, a, b, c, d) +} + +void dav1d_inv_txfm_add_wht_wht_4x4_8bpc_pwr9(pixel *dst, const ptrdiff_t stride, + coef *const coeff, const int eob) +{ + LOAD_COEFF_4(coeff) + + u32x4 v2 = vec_splat_u32(2); + + c0 = vec_sra(c0, v2); + c1 = vec_sra(c1, v2); + c2 = vec_sra(c2, v2); + c3 = vec_sra(c3, v2); + + i32x4 t0 = vec_add(c0, c1); + i32x4 t2 = vec_sub(c2, c3); + i32x4 t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1)); + i32x4 t3 = vec_sub(t4, c3); + i32x4 t1 = vec_sub(t4, c1); + c0 = vec_sub(t0, t3); + c1 = t3; + c2 = t1; + c3 = vec_add(t2, t1); + + memset(coeff, 0, sizeof(*coeff) * 4 * 4); + + TRANSPOSE4_I32(c0, c1, c2, c3) + + t0 = vec_add(c0, c1); + t2 = vec_sub(c2, c3); + t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1)); + t3 = vec_sub(t4, c3); + t1 = vec_sub(t4, c1); + c0 = vec_sub(t0, t3); + c1 = t3; + c2 = t1; + c3 = vec_add(t2, t1); + + c01 = vec_packs(c0, c1); + c23 = vec_packs(c2, c3); + + LOAD_DECLARE_4(dst, stride, a, b, c, d) + + u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b); + u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d); + + i16x8 abs = u8h_to_i16(ab); + i16x8 cds = u8h_to_i16(cd); + + abs = vec_adds(abs, c01); + cds = vec_adds(cds, c23); + + a = vec_packsu(abs, abs); + c = vec_packsu(cds, cds); + + b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a); + d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c); + + STORE_4(dst, stride, a, b, c, d) +} + +#define inv_txfm_fn4x4(type1, type2) \ +void dav1d_inv_txfm_add_##type1##_##type2##_4x4_8bpc_pwr9(uint8_t 
*dst, const ptrdiff_t stride, \ + int16_t *const coeff, const int eob) \ +{ \ + LOAD_COEFF_4(coeff) \ + type1##_4_in(c0, c1, c2, c3, c01, c23) \ + memset(coeff, 0, sizeof(*coeff) * 4 * 4); \ + TRANSPOSE4_I32(c0, c1, c2, c3) \ + type2##_4_out(c0, c1, c2, c3, c01, c23) \ + LOAD_DECLARE_4(dst, stride, a, b, c, d) \ + APPLY_COEFF_4(a, b, c, d, c01, c23) \ + STORE_4(dst, stride, a, b, c, d) \ +} + +inv_txfm_fn4x4(adst, dct ) +inv_txfm_fn4x4(dct, adst ) +inv_txfm_fn4x4(dct, flipadst) +inv_txfm_fn4x4(flipadst, dct ) +inv_txfm_fn4x4(adst, flipadst) +inv_txfm_fn4x4(flipadst, adst ) +inv_txfm_fn4x4(identity, dct ) +inv_txfm_fn4x4(dct, identity) +inv_txfm_fn4x4(identity, flipadst) +inv_txfm_fn4x4(flipadst, identity) +inv_txfm_fn4x4(identity, adst ) +inv_txfm_fn4x4(adst, identity) +inv_txfm_fn4x4(identity, identity) +inv_txfm_fn4x4(adst, adst ) +inv_txfm_fn4x4(flipadst, flipadst) + + +#define IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \ + dct4_for_dct8(c0, c2, c4, c6, c03, c12) \ + \ + i32x4 v799 = vec_splats(799); \ + i32x4 v4017 = vec_splats(4017); \ + i32x4 v3406 = vec_splats(3406); \ + i32x4 v2276 = vec_splats(2276); \ + i32x4 v2048 = vec_splats(2048); \ + u32x4 v12 = vec_splat_u32(12); \ + \ + i32x4 c1v799 = vec_mul(c1, v799); \ + i32x4 c7v4017 = vec_mul(c7, v4017); \ + i32x4 c5v3406 = vec_mul(c5, v3406); \ + i32x4 c3v2276 = vec_mul(c3, v2276); \ + i32x4 c5v2276 = vec_mul(c5, v2276); \ + i32x4 c3v3406 = vec_mul(c3, v3406); \ + i32x4 c1v4017 = vec_mul(c1, v4017); \ + i32x4 c7v799 = vec_mul(c7, v799); \ + \ + i32x4 t4a = vec_subs(c1v799, c7v4017); \ + i32x4 t5a = vec_subs(c5v3406, c3v2276); \ + i32x4 t6a = vec_adds(c5v2276, c3v3406); \ + i32x4 t7a = vec_adds(c1v4017, c7v799); \ + \ + t4a = vec_adds(t4a, v2048); \ + t5a = vec_adds(t5a, v2048); \ + t6a = vec_adds(t6a, v2048); \ + t7a = vec_adds(t7a, v2048); \ + \ + t4a = vec_sra(t4a, v12); \ + t7a = vec_sra(t7a, v12); \ + t5a = vec_sra(t5a, v12); \ + t6a = vec_sra(t6a, v12); \ + \ + i16x8 t7at4a = vec_packs(t7a, t4a); \ + i16x8 t6at5a = vec_packs(t6a, t5a); \ + \ + i16x8 t7t4 = vec_adds(t7at4a, t6at5a); \ + t6at5a = vec_subs(t7at4a, t6at5a); \ + \ + t6a = i16h_to_i32(t6at5a); \ + t5a = i16l_to_i32(t6at5a); \ + \ + i32x4 t6 = vec_add(t6a, t5a); \ + i32x4 t5 = vec_sub(t6a, t5a); \ + \ + t6 = vec_mul(t6, vec_splats(181)); \ + t5 = vec_mul(t5, vec_splats(181)); \ + t6 = vec_add(t6, vec_splats(128)); \ + t5 = vec_add(t5, vec_splats(128)); \ + \ + t6 = vec_sra(t6, vec_splat_u32(8)); \ + t5 = vec_sra(t5, vec_splat_u32(8)); \ + \ + i16x8 t6t5 = vec_packs(t6, t5); \ + \ + c74 = vec_subs(c03, t7t4); \ + c65 = vec_subs(c12, t6t5); \ + c03 = vec_adds(c03, t7t4); \ + c12 = vec_adds(c12, t6t5); \ + +#define UNPACK_4_I16_I32(t0, t1, t2, t3) \ + t0 = i16h_to_i32(t0##t1); \ + t1 = i16l_to_i32(t0##t1); \ + t2 = i16h_to_i32(t2##t3); \ + t3 = i16l_to_i32(t2##t3); + +#define UNPACK_PAIR_I16_I32(hi, lo, v) \ + hi = i16h_to_i32(v); \ + lo = i16l_to_i32(v); \ + + +#define dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, ...) 
\ +{ \ + i16x8 c0##c3, c1##c2, c7##c4, c6##c5; \ + IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c0##c3, c1##c2, c7##c4, c6##c5) \ + UNPACK_4_I16_I32(c0, c3, c1, c2) \ + UNPACK_4_I16_I32(c7, c4, c6, c5) \ +} + +#define dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ +{ \ + i16x8 c03, c12, c74, c65; \ + IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \ + c01 = (i16x8)vec_mergeh((u64x2)c03, (u64x2)c12); \ + c23 = (i16x8)vec_mergel((u64x2)c12, (u64x2)c03); \ + c45 = (i16x8)vec_mergel((u64x2)c74, (u64x2)c65); \ + c67 = (i16x8)vec_mergeh((u64x2)c65, (u64x2)c74); \ +} + +#define dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ + c0, c1, c2, c3, c4, c5, c6, c7) \ +{ \ + dct_8_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,) \ + dct_8_in(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,) \ +} + +#define dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ + c0, c1, c2, c3, c4, c5, c6, c7) \ +{ \ + i16x8 c03h, c12h, c74h, c65h; \ + i16x8 c03l, c12l, c74l, c65l; \ + { \ + IDCT_8_INNER(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, c03h, c12h, c74h, c65h) \ + } \ + { \ + IDCT_8_INNER(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, c03l, c12l, c74l, c65l) \ + } \ + c0 = (i16x8)vec_mergeh((u64x2)c03h, (u64x2)c03l); \ + c3 = (i16x8)vec_mergel((u64x2)c03h, (u64x2)c03l); \ + c1 = (i16x8)vec_mergeh((u64x2)c12h, (u64x2)c12l); \ + c2 = (i16x8)vec_mergel((u64x2)c12h, (u64x2)c12l); \ + c7 = (i16x8)vec_mergeh((u64x2)c74h, (u64x2)c74l); \ + c4 = (i16x8)vec_mergel((u64x2)c74h, (u64x2)c74l); \ + c6 = (i16x8)vec_mergeh((u64x2)c65h, (u64x2)c65l); \ + c5 = (i16x8)vec_mergel((u64x2)c65h, (u64x2)c65l); \ +} + +#define IDENTITY_8(c01, c23, c45, c67) \ +{ \ + c01 = vec_adds(c01, c01); \ + c23 = vec_adds(c23, c23); \ + c45 = vec_adds(c45, c45); \ + c67 = vec_adds(c67, c67); \ +} + +#define identity_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ +{ \ + IDENTITY_8(c01, c23, c45, c67) \ + UNPACK_PAIR_I16_I32(c0, c1, c01) \ + UNPACK_PAIR_I16_I32(c2, c3, c23) \ + UNPACK_PAIR_I16_I32(c4, c5, c45) \ + UNPACK_PAIR_I16_I32(c6, c7, c67) \ +} + +#define identity_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ + c01 = vec_packs(c0, c1); \ + c23 = vec_packs(c2, c3); \ + c45 = vec_packs(c4, c5); \ + c67 = vec_packs(c6, c7); \ + IDENTITY_8(c01, c23, c45, c67) + +#define identity_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ + c0, c1, c2, c3, c4, c5, c6, c7) \ +{ \ + IDENTITY_8(c0, c1, c2, c3) \ + IDENTITY_8(c4, c5, c6, c7) \ + UNPACK_PAIR_I16_I32(c0h, c0l, c0) \ + UNPACK_PAIR_I16_I32(c1h, c1l, c1) \ + UNPACK_PAIR_I16_I32(c2h, c2l, c2) \ + UNPACK_PAIR_I16_I32(c3h, c3l, c3) \ + UNPACK_PAIR_I16_I32(c4h, c4l, c4) \ + UNPACK_PAIR_I16_I32(c5h, c5l, c5) \ + UNPACK_PAIR_I16_I32(c6h, c6l, c6) \ + UNPACK_PAIR_I16_I32(c7h, c7l, c7) \ +} + +#define PACK_4(c0, c1, c2, c3, \ + c0h, c1h, c2h, c3h, \ + c0l, c1l, c2l, c3l) \ +{ \ + c0 = vec_packs(c0h, c0l); \ + c1 = vec_packs(c1h, c1l); \ + c2 = vec_packs(c2h, c2l); \ + c3 = vec_packs(c3h, c3l); \ +} + +#define DECLARE_PACK_4(c0, c1, c2, c3, \ + c0h, c1h, c2h, c3h, \ + c0l, c1l, c2l, c3l) \ + i16x8 c0, c1, c2, c3; \ + PACK_4(c0, c1, c2, c3, c0h, c1h, c2h, c3h, c0l, c1l, c2l, c3l); + +#define PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \ + c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ +{ \ + c0 = vec_packs(c0h, c0l); \ + c1 = vec_packs(c1h, c1l); \ + c2 = vec_packs(c2h, c2l); \ + c3 = vec_packs(c3h, c3l); \ 
+ c4 = vec_packs(c4h, c4l); \ + c5 = vec_packs(c5h, c5l); \ + c6 = vec_packs(c6h, c6l); \ + c7 = vec_packs(c7h, c7l); \ +} + +#define identity_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ + c0, c1, c2, c3, c4, c5, c6, c7) \ +{ \ + PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \ + c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ + IDENTITY_8(c0, c1, c2, c3) \ + IDENTITY_8(c4, c5, c6, c7) \ +} + +#define DECLARE_SPLAT_I32(val) \ + i32x4 v##val = vec_splats(val); + +#define DECLARE_MUL_PAIR_I32(ca, cb, va, vb) \ + i32x4 ca##va = vec_mul(ca, va); \ + i32x4 cb##vb = vec_mul(cb, vb); \ + i32x4 ca##vb = vec_mul(ca, vb); \ + i32x4 cb##va = vec_mul(cb, va); + +#define ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \ + r0 = vec_adds(ca##va, cb##vb); \ + r1 = vec_subs(ca##vb, cb##va); + +#define DECLARE_ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \ + i32x4 r0, r1; \ + ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) + +#define SCALE_ROUND_4(a, b, c, d, rnd, shift) \ + a = vec_adds(a, rnd); \ + b = vec_adds(b, rnd); \ + c = vec_adds(c, rnd); \ + d = vec_adds(d, rnd); \ + a = vec_sra(a, shift); \ + b = vec_sra(b, shift); \ + c = vec_sra(c, shift); \ + d = vec_sra(d, shift); + +#define ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \ + o0, o1, o2, o3, o4, o5, o6, o7) \ +{ \ + DECLARE_SPLAT_I32(4076) \ + DECLARE_SPLAT_I32(401) \ + \ + DECLARE_SPLAT_I32(3612) \ + DECLARE_SPLAT_I32(1931) \ + \ + DECLARE_SPLAT_I32(2598) \ + DECLARE_SPLAT_I32(3166) \ + \ + DECLARE_SPLAT_I32(1189) \ + DECLARE_SPLAT_I32(3920) \ + \ + DECLARE_SPLAT_I32(3784) \ + DECLARE_SPLAT_I32(1567) \ + \ + DECLARE_SPLAT_I32(2048) \ + u32x4 v12 = vec_splat_u32(12); \ + \ + DECLARE_MUL_PAIR_I32(c7, c0, v4076, v401) \ + DECLARE_MUL_PAIR_I32(c5, c2, v3612, v1931) \ + DECLARE_MUL_PAIR_I32(c3, c4, v2598, v3166) \ + DECLARE_MUL_PAIR_I32(c1, c6, v1189, v3920) \ + \ + DECLARE_ADD_SUB_PAIR(t0a, t1a, c7, c0, v4076, v401) \ + DECLARE_ADD_SUB_PAIR(t2a, t3a, c5, c2, v3612, v1931) \ + DECLARE_ADD_SUB_PAIR(t4a, t5a, c3, c4, v2598, v3166) \ + DECLARE_ADD_SUB_PAIR(t6a, t7a, c1, c6, v1189, v3920) \ + \ + SCALE_ROUND_4(t0a, t1a, t2a, t3a, v2048, v12) \ + SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \ + \ + i32x4 t0 = vec_add(t0a, t4a); \ + i32x4 t1 = vec_add(t1a, t5a); \ + i32x4 t2 = vec_add(t2a, t6a); \ + i32x4 t3 = vec_add(t3a, t7a); \ + i32x4 t4 = vec_sub(t0a, t4a); \ + i32x4 t5 = vec_sub(t1a, t5a); \ + i32x4 t6 = vec_sub(t2a, t6a); \ + i32x4 t7 = vec_sub(t3a, t7a); \ + \ + i16x8 t0t1 = vec_packs(t0, t1); \ + i16x8 t2t3 = vec_packs(t2, t3); \ + i16x8 t4t5 = vec_packs(t4, t5); \ + i16x8 t6t7 = vec_packs(t6, t7); \ + \ + UNPACK_4_I16_I32(t4, t5, t6, t7) \ + UNPACK_4_I16_I32(t0, t1, t2, t3) \ + \ + DECLARE_MUL_PAIR_I32(t4, t5, v3784, v1567) \ + DECLARE_MUL_PAIR_I32(t7, t6, v3784, v1567) \ + \ + ADD_SUB_PAIR(t4a, t5a, t4, t5, v3784, v1567) \ + ADD_SUB_PAIR(t7a, t6a, t7, t6, v1567, v3784) \ + \ + SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \ + \ + o0 = vec_add(t0, t2); \ + o1 = vec_add(t4a, t6a); \ + o7 = vec_add(t1, t3); \ + o6 = vec_add(t5a, t7a); \ + t2 = vec_sub(t0, t2); \ + t3 = vec_sub(t1, t3); \ + t6 = vec_sub(t4a, t6a); \ + t7 = vec_sub(t5a, t7a); \ + \ + i16x8 o7##o1 = vec_packs(o7, o1); \ + i16x8 o0##o6 = vec_packs(o0, o6); \ + t2t3 = vec_packs(t2, t3); \ + t6t7 = vec_packs(t6, t7); \ + \ + UNPACK_4_I16_I32(t2, t3, t6, t7) \ + UNPACK_4_I16_I32(o7, o1, o0, o6) \ + \ + o7 = -o7; \ + o1 = -o1; \ + \ + o3 = vec_add(t2, t3); \ + o4 = vec_sub(t2, t3); \ + o5 = vec_sub(t6, t7); \ + o2 = vec_add(t6, t7); \ + \ 
+ i32x4 v181 = vec_splats(181); \ + i32x4 v128 = vec_splats(128); \ + u32x4 v8 = vec_splat_u32(8); \ + \ + o2 = vec_mul(o2, v181); \ + o3 = vec_mul(o3, v181); \ + o4 = vec_mul(o4, v181); \ + o5 = vec_mul(o5, v181); \ + \ + SCALE_ROUND_4(o2, o3, o4, o5, v128, v8) \ + \ + o3 = -o3; \ + o5 = -o5; \ +} + +#define adst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ +{\ + ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \ + c0, c1, c2, c3, c4, c5, c6, c7) \ + c01 = vec_packs(c0, c1); \ + c23 = vec_packs(c2, c3); \ + c45 = vec_packs(c4, c5); \ + c67 = vec_packs(c6, c7); \ + UNPACK_PAIR_I16_I32(c0, c1, c01) \ + UNPACK_PAIR_I16_I32(c2, c3, c23) \ + UNPACK_PAIR_I16_I32(c4, c5, c45) \ + UNPACK_PAIR_I16_I32(c6, c7, c67) \ +} + +#define adst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ +{\ + ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \ + c0, c1, c2, c3, c4, c5, c6, c7) \ + c01 = vec_packs(c0, c1); \ + c23 = vec_packs(c2, c3); \ + c45 = vec_packs(c4, c5); \ + c67 = vec_packs(c6, c7); \ +} + +#define adst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ + c0, c1, c2, c3, c4, c5, c6, c7) \ +{ \ + ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \ + ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ +} + +#define adst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ + c0, c1, c2, c3, c4, c5, c6, c7) \ +{ \ + ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \ + ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ + PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \ + c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ +} + +#define flipadst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ +{\ + ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \ + c7, c6, c5, c4, c3, c2, c1, c0) \ + c01 = vec_packs(c0, c1); \ + c23 = vec_packs(c2, c3); \ + c45 = vec_packs(c4, c5); \ + c67 = vec_packs(c6, c7); \ + UNPACK_PAIR_I16_I32(c0, c1, c01) \ + UNPACK_PAIR_I16_I32(c2, c3, c23) \ + UNPACK_PAIR_I16_I32(c4, c5, c45) \ + UNPACK_PAIR_I16_I32(c6, c7, c67) \ +} + +#define flipadst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ +{\ + ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \ + c7, c6, c5, c4, c3, c2, c1, c0) \ + c01 = vec_packs(c0, c1); \ + c23 = vec_packs(c2, c3); \ + c45 = vec_packs(c4, c5); \ + c67 = vec_packs(c6, c7); \ +} + +#define flipadst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ + c0, c1, c2, c3, c4, c5, c6, c7) \ +{ \ + ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \ + ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ + c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \ +} + +#define flipadst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ + c0, c1, c2, c3, c4, c5, c6, c7) \ +{ \ + ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \ + ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ + c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \ + PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \ + c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ +} + +void dav1d_inv_txfm_add_dct_dct_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, + int16_t *const coeff, const int 
eob) +{ + i16x8 v = vec_splats((int16_t)(2896*8)); + + if (eob < 1) { + return dc_only_4xN(dst, stride, coeff, 2, 1, 0); + } + + LOAD_SCALE_COEFF_4x8(coeff, v) + + dct_4_in(c0, c1, c2, c3, c01, c23) + dct_4_in(c4, c5, c6, c7, c45, c67) + + + memset(coeff, 0, sizeof(*coeff) * 4 * 8); + + TRANSPOSE4_I32(c0, c1, c2, c3); + TRANSPOSE4_I32(c4, c5, c6, c7); + + dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) + + LOAD_DECLARE_4(dst, stride, a, b, cc, d) + LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh) + + APPLY_COEFF_4(a, b, cc, d, c01, c23) + APPLY_COEFF_4(e, f, g, hh, c45, c67) + + STORE_4(dst, stride, a, b, cc, d) + STORE_4(dst + 4 * stride, stride, e, f, g, hh) +} + + +#define inv_txfm_fn4x8(type1, type2) \ +void dav1d_inv_txfm_add_##type1##_##type2##_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \ + int16_t *const coeff, const int eob) \ +{ \ + i16x8 v = vec_splats((int16_t)(2896*8)); \ + LOAD_SCALE_COEFF_4x8(coeff, v) \ + type1##_4_in(c0, c1, c2, c3, c01, c23) \ + type1##_4_in(c4, c5, c6, c7, c45, c67) \ + memset(coeff, 0, sizeof(*coeff) * 4 * 8); \ + TRANSPOSE4_I32(c0, c1, c2, c3); \ + TRANSPOSE4_I32(c4, c5, c6, c7); \ + type2##_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ + LOAD_DECLARE_4(dst, stride, a, b, c, d) \ + LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \ + APPLY_COEFF_4(a, b, c, d, c01, c23) \ + APPLY_COEFF_4(e, f, g, h, c45, c67) \ + STORE_4(dst, stride, a, b, c, d) \ + STORE_4(dst + 4 * stride, stride, e, f, g, h) \ +} + +inv_txfm_fn4x8(adst, dct ) +inv_txfm_fn4x8(dct, adst ) +inv_txfm_fn4x8(dct, flipadst) +inv_txfm_fn4x8(flipadst, dct ) +inv_txfm_fn4x8(adst, flipadst) +inv_txfm_fn4x8(flipadst, adst ) +inv_txfm_fn4x8(identity, dct ) +inv_txfm_fn4x8(dct, identity) +inv_txfm_fn4x8(identity, flipadst) +inv_txfm_fn4x8(flipadst, identity) +inv_txfm_fn4x8(identity, adst ) +inv_txfm_fn4x8(adst, identity) +inv_txfm_fn4x8(identity, identity) +inv_txfm_fn4x8(adst, adst ) +inv_txfm_fn4x8(flipadst, flipadst) + + +void dav1d_inv_txfm_add_dct_dct_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, + int16_t *const coeff, const int eob) +{ + i16x8 v = vec_splats((int16_t)(2896*8)); + + if (eob < 1) { + return dc_only_8xN(dst, stride, coeff, 1, 1, 0); + } + + LOAD_SCALE_COEFF_8x4(coeff, v) + + dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) + + memset(coeff, 0, sizeof(*coeff) * 8 * 4); + + TRANSPOSE4_I32(c0, c1, c2, c3) + TRANSPOSE4_I32(c4, c5, c6, c7) + + dct_4_out(c0, c1, c2, c3, c01, c23) + dct_4_out(c4, c5, c6, c7, c45, c67) + + LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh) + + i16x8 c04 = (i16x8)vec_mergeh((u64x2)c01, (u64x2)c45); + i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45); + i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67); + i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67); + + APPLY_COEFF_8x4(ae, bf, c04, c15) + APPLY_COEFF_8x4(cg, dh, c26, c37) + + STORE_8(dst, stride, ae, bf, cg, dh) +} + + +#define inv_txfm_fn8x4(type1, type2) \ +void dav1d_inv_txfm_add_##type1##_##type2##_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \ + int16_t *const coeff, const int eob) \ +{ \ + i16x8 v = vec_splats((int16_t)(2896*8)); \ + LOAD_SCALE_COEFF_8x4(coeff, v) \ + type1##_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ + memset(coeff, 0, sizeof(*coeff) * 8 * 4); \ + TRANSPOSE4_I32(c0, c1, c2, c3) \ + TRANSPOSE4_I32(c4, c5, c6, c7) \ + type2##_4_out(c0, c1, c2, c3, c01, c23) \ + type2##_4_out(c4, c5, c6, c7, c45, c67) \ + LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh) \ + i16x8 c04 = 
(i16x8)vec_mergeh((u64x2)c01, (u64x2)c45); \ + i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45); \ + i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67); \ + i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67); \ + APPLY_COEFF_8x4(ae, bf, c04, c15) \ + APPLY_COEFF_8x4(cg, dh, c26, c37) \ + STORE_8(dst, stride, ae, bf, cg, dh) \ +} +inv_txfm_fn8x4(adst, dct ) +inv_txfm_fn8x4(dct, adst ) +inv_txfm_fn8x4(dct, flipadst) +inv_txfm_fn8x4(flipadst, dct ) +inv_txfm_fn8x4(adst, flipadst) +inv_txfm_fn8x4(flipadst, adst ) +inv_txfm_fn8x4(identity, dct ) +inv_txfm_fn8x4(dct, identity) +inv_txfm_fn8x4(identity, flipadst) +inv_txfm_fn8x4(flipadst, identity) +inv_txfm_fn8x4(identity, adst ) +inv_txfm_fn8x4(adst, identity) +inv_txfm_fn8x4(identity, identity) +inv_txfm_fn8x4(adst, adst ) +inv_txfm_fn8x4(flipadst, flipadst) + +void dav1d_inv_txfm_add_dct_dct_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, + int16_t *const coeff, const int eob) +{ + if (eob < 1) { + return dc_only_8xN(dst, stride, coeff, 2, 0, 1); + } + + LOAD_COEFF_8x8(coeff) + + dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, + c0, c1, c2, c3, c4, c5, c6, c7) + + memset(coeff, 0, sizeof(*coeff) * 8 * 8); + + SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1)) + SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1)) + SCALE_ROUND_4(c0l, c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1)) + SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1)) + + TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) + + dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, + c0, c1, c2, c3, c4, c5, c6, c7) + + LOAD_DECLARE_4(dst, stride, a, b, cc, d) + LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh) + + APPLY_COEFF_8x4(a, b, c0, c1) + APPLY_COEFF_8x4(cc, d, c2, c3) + APPLY_COEFF_8x4(e, f, c4, c5) + APPLY_COEFF_8x4(g, hh, c6, c7) + + STORE_8(dst, stride, a, b, cc, d) + STORE_8(dst + 4 * stride, stride, e, f, g, hh) +} + +#define inv_txfm_fn8x8(type1, type2) \ +void dav1d_inv_txfm_add_##type1##_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \ + int16_t *const coeff, const int eob) \ +{ \ + LOAD_COEFF_8x8(coeff) \ + type1##_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ + c0, c1, c2, c3, c4, c5, c6, c7) \ + SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1)) \ + SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1)) \ + SCALE_ROUND_4(c0l, c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1)) \ + SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1)) \ + memset(coeff, 0, sizeof(*coeff) * 8 * 8); \ + TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ + type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ + c0, c1, c2, c3, c4, c5, c6, c7) \ + LOAD_DECLARE_4(dst, stride, a, b, c, d) \ + LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \ + APPLY_COEFF_8x4(a, b, c0, c1) \ + APPLY_COEFF_8x4(c, d, c2, c3) \ + APPLY_COEFF_8x4(e, f, c4, c5) \ + APPLY_COEFF_8x4(g, h, c6, c7) \ + STORE_8(dst, stride, a, b, c, d) \ + STORE_8(dst + 4 * stride, stride, e, f, g, h) \ +} +inv_txfm_fn8x8(adst, dct ) +inv_txfm_fn8x8(dct, adst ) +inv_txfm_fn8x8(dct, flipadst) +inv_txfm_fn8x8(flipadst, dct ) +inv_txfm_fn8x8(adst, flipadst) +inv_txfm_fn8x8(flipadst, adst ) +inv_txfm_fn8x8(dct, 
identity) +inv_txfm_fn8x8(flipadst, identity) +inv_txfm_fn8x8(adst, identity) +inv_txfm_fn8x8(adst, adst ) +inv_txfm_fn8x8(flipadst, flipadst) + +// identity + scale is a no op +#define inv_txfm_fn8x8_identity(type2) \ +void dav1d_inv_txfm_add_identity_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \ + int16_t *const coeff, const int eob) \ +{ \ + LOAD_COEFF_8x8(coeff) \ + memset(coeff, 0, sizeof(*coeff) * 8 * 8); \ + TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ + type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ + c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ + c0, c1, c2, c3, c4, c5, c6, c7) \ + LOAD_DECLARE_4(dst, stride, a, b, c, d) \ + LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \ + APPLY_COEFF_8x4(a, b, c0, c1) \ + APPLY_COEFF_8x4(c, d, c2, c3) \ + APPLY_COEFF_8x4(e, f, c4, c5) \ + APPLY_COEFF_8x4(g, h, c6, c7) \ + STORE_8(dst, stride, a, b, c, d) \ + STORE_8(dst + 4 * stride, stride, e, f, g, h) \ +} +inv_txfm_fn8x8_identity(dct ) +inv_txfm_fn8x8_identity(flipadst) +inv_txfm_fn8x8_identity(adst ) +inv_txfm_fn8x8_identity(identity) + +#define CLIP16_I32_8(a, b, c, d, e, f, g, h, \ + ab, cd, ef, gh) \ +{ \ + ab = vec_packs(a, b); \ + cd = vec_packs(c, d); \ + ef = vec_packs(e, f); \ + gh = vec_packs(g, h); \ + UNPACK_PAIR_I16_I32(a, b, ab) \ + UNPACK_PAIR_I16_I32(c, d, cd) \ + UNPACK_PAIR_I16_I32(e, f, ef) \ + UNPACK_PAIR_I16_I32(g, h, gh) \ +} + +#define MUL_4_INPLACE(a, b, c, d, v) \ + a = vec_mul(a, v); \ + b = vec_mul(b, v); \ + c = vec_mul(c, v); \ + d = vec_mul(d, v); \ + +#define IDENTITY_16_V(v) \ +{ \ + i16x8 v_ = vec_adds(v, v); \ + v = vec_mradds(v, v1697_16, v_); \ +} + +#define IDENTITY_16_INNER(c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ +{ \ + i16x8 v1697_16 = vec_splats((int16_t)(1697*16)); \ + IDENTITY_16_V(c00c01) \ + IDENTITY_16_V(c02c03) \ + IDENTITY_16_V(c04c05) \ + IDENTITY_16_V(c06c07) \ + IDENTITY_16_V(c08c09) \ + IDENTITY_16_V(c10c11) \ + IDENTITY_16_V(c12c13) \ + IDENTITY_16_V(c14c15) \ +} + +#define IDENTITY_16_4_I32(a, b, c, d) \ +{ \ + i32x4 a2 = vec_add(a, a); \ + i32x4 b2 = vec_add(b, b); \ + i32x4 c2 = vec_add(c, c); \ + i32x4 d2 = vec_add(d, d); \ + MUL_4_INPLACE(a, b, c, d, v1697) \ + SCALE_ROUND_4(a, b, c, d, v1024, vec_splat_u32(11)); \ + a = vec_add(a2, a); \ + b = vec_add(b2, b); \ + c = vec_add(c2, c); \ + d = vec_add(d2, d); \ +} + + +#define identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \ + c08, c09, c10, c11, c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ +{ \ + DECLARE_SPLAT_I32(1697) \ + DECLARE_SPLAT_I32(1024) \ + IDENTITY_16_4_I32(c00, c01, c02, c03) \ + IDENTITY_16_4_I32(c04, c05, c06, c07) \ + IDENTITY_16_4_I32(c08, c09, c10, c11) \ + IDENTITY_16_4_I32(c12, c13, c14, c15) \ +} + +#define identity_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \ + c08, c09, c10, c11, c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ +{ \ + PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \ + c00, c02, c04, c06, c08, c10, c12, c14, \ + c01, c03, c05, c07, c09, c11, c13, c15) \ + IDENTITY_16_INNER(c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ +} + +#define IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, \ + c08, c09, c10, c11, c12, c13, c14, c15, \ + c00c03, c01c02, c07c04, c06c05, \ + c08c11, c09c10, c14c13, c15c12) \ + IDCT_8_INNER(c00, c02, c04, c06, c08, c10, c12, c14, \ + c00c03, c01c02, c07c04, 
c06c05) \ + DECLARE_SPLAT_I32(128) \ + DECLARE_SPLAT_I32(181) \ + DECLARE_SPLAT_I32(401) \ + DECLARE_SPLAT_I32(4076) \ + DECLARE_SPLAT_I32(3166) \ + DECLARE_SPLAT_I32(2598) \ + DECLARE_SPLAT_I32(1931) \ + DECLARE_SPLAT_I32(3612) \ + DECLARE_SPLAT_I32(3920) \ + DECLARE_SPLAT_I32(1189) \ + DECLARE_SPLAT_I32(1567) \ + DECLARE_SPLAT_I32(3784) \ +\ + DECLARE_MUL_PAIR_I32(c01, c15, v401, v4076) \ + DECLARE_MUL_PAIR_I32(c09, c07, v3166, v2598) \ + DECLARE_MUL_PAIR_I32(c05, c11, v1931, v3612) \ + DECLARE_MUL_PAIR_I32(c13, c03, v3920, v1189) \ +\ + DECLARE_ADD_SUB_PAIR(t15a, t08a, c01, c15, v4076, v401) \ + DECLARE_ADD_SUB_PAIR(t14a, t09a, c09, c07, v2598, v3166) \ + DECLARE_ADD_SUB_PAIR(t13a, t10a, c05, c11, v3612, v1931) \ + DECLARE_ADD_SUB_PAIR(t12a, t11a, c13, c03, v1189, v3920) \ +\ + SCALE_ROUND_4(t15a, t08a, t14a, t09a, v2048, v12) \ + SCALE_ROUND_4(t13a, t10a, t12a, t11a, v2048, v12) \ +\ + CLIP16_I32_8(t15a, t08a, t14a, t09a, \ + t13a, t10a, t12a, t11a, \ + c08c11, c09c10, c14c13, c15c12) \ + DECLARE_ADD_SUB_PAIR(t08, t09, t08a, t09a,,) \ + DECLARE_ADD_SUB_PAIR(t11, t10, t11a, t10a,,) \ + DECLARE_ADD_SUB_PAIR(t12, t13, t12a, t13a,,) \ + DECLARE_ADD_SUB_PAIR(t15, t14, t15a, t14a,,) \ +\ + CLIP16_I32_8(t08, t09, t11, t10, \ + t12, t13, t15, t14, \ + c08c11, c09c10, c14c13, c15c12) \ +\ + DECLARE_MUL_PAIR_I32(t14, t09, v1567, v3784) \ + DECLARE_MUL_PAIR_I32(t13, t10, v1567, v3784) \ + \ + ADD_SUB_PAIR(t14a, t09a, t14, t09, v3784, v1567) \ + ADD_SUB_PAIR(t10a, t13a, t13, t10, v3784, v1567) \ + t10a = -t10a; \ +\ + SCALE_ROUND_4(t14a, t09a, t13a, t10a, v2048, v12) \ +\ + ADD_SUB_PAIR(t08a, t11a, t08, t11,,) \ + ADD_SUB_PAIR(t09, t10, t09a, t10a,,) \ + ADD_SUB_PAIR(t15a, t12a, t15, t12,,) \ + ADD_SUB_PAIR(t14, t13, t14a, t13a,,) \ +\ + CLIP16_I32_8(t08a, t11a, t09, t10, \ + t15a, t12a, t14, t13, \ + c08c11, c09c10, c14c13, c15c12) \ + ADD_SUB_PAIR(t13a, t10a, t13, t10,,); \ + ADD_SUB_PAIR(t12, t11, t12a, t11a,,); \ +\ + MUL_4_INPLACE(t13a, t10a, t12, t11, v181); \ + SCALE_ROUND_4(t13a, t10a, t12, t11, v128, vec_splat_u32(8)) \ +\ + DECLARE_PACK_4(t15at12, t14t13a, t08at11, t09t10a, \ + t15a, t14, t08a, t09, \ + t12, t13a, t11, t10a) \ +\ + c15c12 = vec_subs(c00c03, t15at12); \ + c14c13 = vec_subs(c01c02, t14t13a); \ + c08c11 = vec_subs(c07c04, t08at11); \ + c09c10 = vec_subs(c06c05, t09t10a); \ + c00c03 = vec_adds(c00c03, t15at12); \ + c01c02 = vec_adds(c01c02, t14t13a); \ + c07c04 = vec_adds(c07c04, t08at11); \ + c06c05 = vec_adds(c06c05, t09t10a); \ + +#define dct_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \ + c08, c09, c10, c11, c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \ +\ + i16x8 c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12; \ + IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ + c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \ + c00c01 = (i16x8)vec_mergeh((u64x2)c00c03, (u64x2)c01c02); \ + c02c03 = (i16x8)vec_mergel((u64x2)c01c02, (u64x2)c00c03); \ + c04c05 = (i16x8)vec_mergel((u64x2)c07c04, (u64x2)c06c05); \ + c06c07 = (i16x8)vec_mergeh((u64x2)c06c05, (u64x2)c07c04); \ + c08c09 = (i16x8)vec_mergeh((u64x2)c08c11, (u64x2)c09c10); \ + c10c11 = (i16x8)vec_mergel((u64x2)c09c10, (u64x2)c08c11); \ + c12c13 = (i16x8)vec_mergel((u64x2)c15c12, (u64x2)c14c13); \ + c14c15 = (i16x8)vec_mergeh((u64x2)c14c13, (u64x2)c15c12); \ + +#define dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \ + c08, c09, c10, c11, c12, c13, c14, c15, \ + c00c03, c01c02, c07c04, c06c05, c08c11, 
c09c10, c14c13, c15c12) \ +\ + IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ + c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \ + UNPACK_PAIR_I16_I32(c00, c03, c00c03) \ + UNPACK_PAIR_I16_I32(c01, c02, c01c02) \ + UNPACK_PAIR_I16_I32(c07, c04, c07c04) \ + UNPACK_PAIR_I16_I32(c06, c05, c06c05) \ + UNPACK_PAIR_I16_I32(c08, c11, c08c11) \ + UNPACK_PAIR_I16_I32(c09, c10, c09c10) \ + UNPACK_PAIR_I16_I32(c14, c13, c14c13) \ + UNPACK_PAIR_I16_I32(c15, c12, c15c12) \ + + +#define dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \ + cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \ + a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \ + dct_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \ + dct_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \ + dct_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \ + dct_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3) + + +#define PACK_4x4(c00, c01, c02, c03, \ + c04, c05, c06, c07, \ + c08, c09, c10, c11, \ + c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ +{ \ + c00c01 = vec_packs(c00, c04); c02c03 = vec_packs(c08, c12); \ + c04c05 = vec_packs(c01, c05); c06c07 = vec_packs(c09, c13); \ + c08c09 = vec_packs(c02, c06); c10c11 = vec_packs(c10, c14); \ + c12c13 = vec_packs(c03, c07); c14c15 = vec_packs(c11, c15); \ +} + + + +#define dct_4x4_out(c00, c01, c02, c03, \ + c04, c05, c06, c07, \ + c08, c09, c10, c11, \ + c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ +{ \ + IDCT_4_INNER(c00, c01, c02, c03) \ + IDCT_4_INNER(c04, c05, c06, c07) \ + IDCT_4_INNER(c08, c09, c10, c11) \ + IDCT_4_INNER(c12, c13, c14, c15) \ +\ + PACK_4x4(c00, c01, c02, c03, \ + c04, c05, c06, c07, \ + c08, c09, c10, c11, \ + c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ +} + +#define IDENTITY_4_I32(a, b, c, d) \ +{ \ + DECLARE_SPLAT_I32(5793) \ + DECLARE_SPLAT_I32(2048) \ + MUL_4_INPLACE(a, b, c, d, v5793) \ + SCALE_ROUND_4(a, b, c, d, v2048, vec_splat_u32(12)) \ +} + +#define identity_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \ + cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \ + a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \ +{ \ + IDENTITY_4_I32(cA0, cA1, cA2, cA3) \ + IDENTITY_4_I32(cB0, cB1, cB2, cB3) \ + IDENTITY_4_I32(cC0, cC1, cC2, cC3) \ + IDENTITY_4_I32(cD0, cD1, cD2, cD3) \ +} + +#define identity_4x4_out(c00, c01, c02, c03, \ + c04, c05, c06, c07, \ + c08, c09, c10, c11, \ + c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ +{ \ + PACK_4x4(c00, c01, c02, c03, \ + c04, c05, c06, c07, \ + c08, c09, c10, c11, \ + c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ + IDENTITY_4(c00c01, c02c03) \ + IDENTITY_4(c04c05, c06c07) \ + IDENTITY_4(c08c09, c10c11) \ + IDENTITY_4(c12c13, c14c15) \ +} + +#define adst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \ + cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \ + a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \ + adst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \ + adst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \ + adst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \ + adst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3) + +#define adst_4x4_out(c00, c01, c02, c03, \ + c04, c05, c06, c07, \ + c08, c09, c10, c11, \ + c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ +{ \ + ADST_INNER_4(c00, c01, c02, c03, c00, c01, c02, c03) \ + ADST_INNER_4(c04, c05, c06, c07, c04, c05, c06, c07) \ + ADST_INNER_4(c08, 
c09, c10, c11, c08, c09, c10, c11) \ + ADST_INNER_4(c12, c13, c14, c15, c12, c13, c14, c15) \ +\ + PACK_4x4(c00, c01, c02, c03, \ + c04, c05, c06, c07, \ + c08, c09, c10, c11, \ + c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ +} + +#define flipadst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \ + cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \ + a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \ + flipadst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \ + flipadst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \ + flipadst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \ + flipadst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3) + +#define flipadst_4x4_out(c00, c01, c02, c03, \ + c04, c05, c06, c07, \ + c08, c09, c10, c11, \ + c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ +{ \ + ADST_INNER_4(c00, c01, c02, c03, c03, c02, c01, c00) \ + ADST_INNER_4(c04, c05, c06, c07, c07, c06, c05, c04) \ + ADST_INNER_4(c08, c09, c10, c11, c11, c10, c09, c08) \ + ADST_INNER_4(c12, c13, c14, c15, c15, c14, c13, c12) \ +\ + PACK_4x4(c00, c01, c02, c03, \ + c04, c05, c06, c07, \ + c08, c09, c10, c11, \ + c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ +} + +#define ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, \ + c08, c09, c10, c11, c12, c13, c14, c15, \ + o00, o01, o02, o03, o04, o05, o06, o07, \ + o08, o09, o10, o11, o12, o13, o14, o15, \ + c00c01, c02c03, c04c05, c06c07) \ + DECLARE_SPLAT_I32(2048); \ + u32x4 v12 = vec_splat_u32(12); \ + DECLARE_SPLAT_I32(4091) \ + DECLARE_SPLAT_I32(201) \ + DECLARE_SPLAT_I32(3973) \ + DECLARE_SPLAT_I32(995) \ + DECLARE_SPLAT_I32(3703) \ + DECLARE_SPLAT_I32(1751) \ + DECLARE_SPLAT_I32(3290) \ + DECLARE_SPLAT_I32(2440) \ + DECLARE_SPLAT_I32(2751) \ + DECLARE_SPLAT_I32(3035) \ + DECLARE_SPLAT_I32(2106) \ + DECLARE_SPLAT_I32(3513) \ + DECLARE_SPLAT_I32(1380) \ + DECLARE_SPLAT_I32(3857) \ + DECLARE_SPLAT_I32(601) \ + DECLARE_SPLAT_I32(4052) \ +\ + DECLARE_MUL_PAIR_I32(c15, c00, v4091, v201) \ + DECLARE_MUL_PAIR_I32(c13, c02, v3973, v995) \ + DECLARE_MUL_PAIR_I32(c11, c04, v3703, v1751) \ + DECLARE_MUL_PAIR_I32(c09, c06, v3290, v2440) \ + DECLARE_MUL_PAIR_I32(c07, c08, v2751, v3035) \ + DECLARE_MUL_PAIR_I32(c05, c10, v2106, v3513) \ + DECLARE_MUL_PAIR_I32(c03, c12, v1380, v3857) \ + DECLARE_MUL_PAIR_I32(c01, c14, v601, v4052) \ +\ + DECLARE_ADD_SUB_PAIR(t00, t01, c15, c00, v4091, v201);\ + DECLARE_ADD_SUB_PAIR(t02, t03, c13, c02, v3973, v995) \ + DECLARE_ADD_SUB_PAIR(t04, t05, c11, c04, v3703, v1751) \ + DECLARE_ADD_SUB_PAIR(t06, t07, c09, c06, v3290, v2440) \ + DECLARE_ADD_SUB_PAIR(t08, t09, c07, c08, v2751, v3035) \ + DECLARE_ADD_SUB_PAIR(t10, t11, c05, c10, v2106, v3513) \ + DECLARE_ADD_SUB_PAIR(t12, t13, c03, c12, v1380, v3857) \ + DECLARE_ADD_SUB_PAIR(t14, t15, c01, c14, v601, v4052) \ +\ + SCALE_ROUND_4(t00, t01, t02, t03, v2048, v12) \ + SCALE_ROUND_4(t04, t05, t06, t07, v2048, v12) \ + SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \ + SCALE_ROUND_4(t12, t13, t14, t15, v2048, v12) \ +\ + DECLARE_ADD_SUB_PAIR(t00a, t08a, t00, t08,,) \ + DECLARE_ADD_SUB_PAIR(t01a, t09a, t01, t09,,) \ + DECLARE_ADD_SUB_PAIR(t02a, t10a, t02, t10,,) \ + DECLARE_ADD_SUB_PAIR(t03a, t11a, t03, t11,,) \ + DECLARE_ADD_SUB_PAIR(t04a, t12a, t04, t12,,) \ + DECLARE_ADD_SUB_PAIR(t05a, t13a, t05, t13,,) \ + DECLARE_ADD_SUB_PAIR(t06a, t14a, t06, t14,,) \ + DECLARE_ADD_SUB_PAIR(t07a, t15a, t07, t15,,) \ +\ + CLIP16_I32_8(t00a, t08a, t01a, t09a, t02a, t10a, t03a, t11a, \ + c00c01, c02c03, c04c05, 
c06c07); \ + CLIP16_I32_8(t04a, t12a, t05a, t13a, t06a, t14a, t07a, t15a, \ + c00c01, c02c03, c04c05, c06c07); \ +\ + DECLARE_SPLAT_I32(4017) \ + DECLARE_SPLAT_I32(799) \ + DECLARE_SPLAT_I32(2276) \ + DECLARE_SPLAT_I32(3406) \ +\ + DECLARE_MUL_PAIR_I32(t08a, t09a, v4017, v799); \ + DECLARE_MUL_PAIR_I32(t10a, t11a, v2276, v3406); \ + DECLARE_MUL_PAIR_I32(t13a, t12a, v799, v4017); \ + DECLARE_MUL_PAIR_I32(t15a, t14a, v3406, v2276); \ +\ + ADD_SUB_PAIR(t08, t09, t08a, t09a, v4017, v799); \ + ADD_SUB_PAIR(t10, t11, t10a, t11a, v2276, v3406); \ + ADD_SUB_PAIR(t13, t12, t13a, t12a, v799, v4017); \ + ADD_SUB_PAIR(t15, t14, t15a, t14a, v3406, v2276); \ +\ + SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \ + SCALE_ROUND_4(t13, t12, t15, t14, v2048, v12) \ +\ + ADD_SUB_PAIR(t00, t04, t00a, t04a,,); \ + ADD_SUB_PAIR(t01, t05, t01a, t05a,,); \ + ADD_SUB_PAIR(t02, t06, t02a, t06a,,); \ + ADD_SUB_PAIR(t03, t07, t03a, t07a,,); \ + ADD_SUB_PAIR(t08a, t12a, t08, t12,,); \ + ADD_SUB_PAIR(t09a, t13a, t09, t13,,); \ + ADD_SUB_PAIR(t10a, t14a, t10, t14,,); \ + ADD_SUB_PAIR(t11a, t15a, t11, t15,,); \ +\ + CLIP16_I32_8(t00, t04, t01, t05, t02, t06, t03, t07, \ + c00c01, c02c03, c04c05, c06c07) \ + CLIP16_I32_8(t08a, t12a, t09a, t13a, t10a, t14a, t11a, t15a, \ + c00c01, c02c03, c04c05, c06c07) \ +\ + DECLARE_SPLAT_I32(3784) \ + DECLARE_SPLAT_I32(1567) \ +\ + DECLARE_MUL_PAIR_I32(t04, t05, v3784, v1567) \ + DECLARE_MUL_PAIR_I32(t07, t06, v1567, v3784) \ + DECLARE_MUL_PAIR_I32(t12a, t13a, v3784, v1567) \ + DECLARE_MUL_PAIR_I32(t15a, t14a, v1567, v3784) \ +\ + ADD_SUB_PAIR(t04a, t05a, t04, t05, v3784, v1567) \ + ADD_SUB_PAIR(t07a, t06a, t07, t06, v1567, v3784) \ + ADD_SUB_PAIR(t12, t13, t12a, t13a, v3784, v1567) \ + ADD_SUB_PAIR(t15, t14, t15a, t14a, v1567, v3784) \ +\ + SCALE_ROUND_4(t04a, t05a, t07a, t06a, v2048, v12) \ + SCALE_ROUND_4(t12, t13, t15, t14, v2048, v12) \ +\ + ADD_SUB_PAIR(o00, t02a, t00, t02,,) \ + ADD_SUB_PAIR(o15, t03a, t01, t03,,) \ + ADD_SUB_PAIR(o03, t06, t04a, t06a,,) \ + ADD_SUB_PAIR(o12, t07, t05a, t07a,,) \ + ADD_SUB_PAIR(o01, t10, t08a, t10a,,) \ + ADD_SUB_PAIR(o14, t11, t09a, t11a,,) \ + ADD_SUB_PAIR(o02, t14a, t12, t14,,) \ + ADD_SUB_PAIR(o13, t15a, t13, t15,,) \ +\ + CLIP16_I32_8(o00, t02a, o15, t03a, o03, t06, o12, t07, \ + c00c01, c02c03, c04c05, c06c07) \ + CLIP16_I32_8(o01, t10, o14, t11, o02, t14a, o13, t15a, \ + c00c01, c02c03, c04c05, c06c07) \ +\ + DECLARE_SPLAT_I32(181) \ + DECLARE_SPLAT_I32(128) \ + u32x4 v8 = vec_splat_u32(8); \ +\ + ADD_SUB_PAIR(o07, o08, t02a, t03a,,) \ + ADD_SUB_PAIR(o04, o11, t06, t07,,) \ + ADD_SUB_PAIR(o06, o09, t10, t11,,) \ + ADD_SUB_PAIR(o05, o10, t14a, t15a,,) \ +\ + MUL_4_INPLACE(o07, o08, o04, o11, v181) \ + MUL_4_INPLACE(o06, o09, o05, o10, v181) \ +\ + SCALE_ROUND_4(o07, o08, o04, o11, v128, v8) \ + SCALE_ROUND_4(o06, o09, o05, o10, v128, v8) \ +\ + o01 = -o01; \ + o03 = -o03; \ + o05 = -o05; \ + o07 = -o07; \ + o09 = -o09; \ + o11 = -o11; \ + o13 = -o13; \ + o15 = -o15; \ + +#define adst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \ + c08, c09, c10, c11, c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \ +{ \ + ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ + c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07) \ +} + +#define adst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \ + c08, c09, c10, c11, c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \ +{ \ 
+ ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ + c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07) \ + PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \ + c00, c02, c04, c06, c08, c10, c12, c14, \ + c01, c03, c05, c07, c09, c11, c13, c15) \ +} + +#define flipadst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \ + c08, c09, c10, c11, c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \ +{ \ + ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ + c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \ + c00c01, c02c03, c04c05, c06c07) \ +} + +#define flipadst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \ + c08, c09, c10, c11, c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \ +{ \ + ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ + c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \ + c00c01, c02c03, c04c05, c06c07) \ + PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \ + c00, c02, c04, c06, c08, c10, c12, c14, \ + c01, c03, c05, c07, c09, c11, c13, c15) \ +} + + +void dav1d_inv_txfm_add_dct_dct_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, + int16_t *const coeff, const int eob + HIGHBD_DECL_SUFFIX) +{ + if (eob < 1) { + return dc_only_4xN(dst, stride, coeff, 4, 0, 1); + } + + LOAD_COEFF_4x16(coeff) + + dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, + cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, + a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) + + memset(coeff, 0, sizeof(*coeff) * 4 * 16); + + SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1)) + SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), vec_splat_u32(1)) + SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1)) + SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1)) + TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, + cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3) + + dct_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, + cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, + a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) + + LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03) + LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07) + LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11) + LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15) + + APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0); + APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1); + APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2); + APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3); + + STORE_4(dst, stride, l00, l01, l02, l03); + STORE_4(dst + 4 * stride, stride, l04, l05, l06, l07); + STORE_4(dst + 8 * stride, stride, l08, l09, l10, l11); + STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15); +} + +#define inv_txfm_fn4x16(type1, type2) \ +void dav1d_inv_txfm_add_##type1##_##type2##_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \ + int16_t *const coeff, const int eob) \ +{ \ + LOAD_COEFF_4x16(coeff) \ + type1##_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \ + cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \ + a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \ + memset(coeff, 0, sizeof(*coeff) * 4 * 16); \ + SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1)) \ + SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), 
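dav1d_inv_txfm_add_dct_dct_4x16_8bpc_pwr9 above shows the two-pass layout shared by the 4x16 functions: a 4-point transform across rows, a memset that clears the coefficient block for the next use, a one-bit rounded downshift of the intermediates (the SCALE_ROUND_4 calls with splats of 1), a transpose, the 16-point column transform, and finally an add-and-clamp into the destination. The inter-pass rounding in scalar form (illustrative helper name):

    #include <stdint.h>

    /* models SCALE_ROUND_4(..., vec_splat_s32(1), vec_splat_u32(1)) */
    static inline int32_t interpass_round(int32_t v) {
        return (v + 1) >> 1;
    }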
vec_splat_u32(1)) \ + SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1)) \ + SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1)) \ + TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \ + cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3) \ + type2##_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \ + cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \ + a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \ + LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03) \ + LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07) \ + LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11) \ + LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15) \ + APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0); \ + APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1); \ + APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2); \ + APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3); \ + STORE_4(dst, stride, l00, l01, l02, l03); \ + STORE_4(dst + 4 * stride, stride, l04, l05, l06, l07); \ + STORE_4(dst + 8 * stride, stride, l08, l09, l10, l11); \ + STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15); \ +} +inv_txfm_fn4x16(adst, dct ) +inv_txfm_fn4x16(dct, adst ) +inv_txfm_fn4x16(dct, flipadst) +inv_txfm_fn4x16(flipadst, dct ) +inv_txfm_fn4x16(adst, flipadst) +inv_txfm_fn4x16(flipadst, adst ) +inv_txfm_fn4x16(identity, dct ) +inv_txfm_fn4x16(dct, identity) +inv_txfm_fn4x16(identity, flipadst) +inv_txfm_fn4x16(flipadst, identity) +inv_txfm_fn4x16(identity, adst ) +inv_txfm_fn4x16(adst, identity) +inv_txfm_fn4x16(identity, identity) +inv_txfm_fn4x16(adst, adst ) +inv_txfm_fn4x16(flipadst, flipadst) + +void dav1d_inv_txfm_add_dct_dct_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, + int16_t *const coeff, const int eob) +{ + + if (eob < 1) { + return dc_only_16xN(dst, stride, coeff, 1, 0, 1); + } + + LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \ + LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \ + LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \ + LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \ + UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) + UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) + UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) + UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) + + dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07, + c08, c09, c10, c11, c12, c13, c14, c15, + c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) + memset(coeff, 0, sizeof(*coeff) * 16 * 4); + SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) + SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) + SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) + SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) + + TRANSPOSE4_I32(c00, c01, c02, c03); + TRANSPOSE4_I32(c04, c05, c06, c07); + TRANSPOSE4_I32(c08, c09, c10, c11); + TRANSPOSE4_I32(c12, c13, c14, c15); + + dct_4x4_out(c00, c01, c02, c03, + c04, c05, c06, c07, + c08, c09, c10, c11, + c12, c13, c14, c15, + c00c01, c02c03, c04c05, c06c07, + c08c09, c10c11, c12c13, c14c15) + + LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) + + APPLY_COEFF_16x4(l0, l1, l2, l3, + c00c01, c02c03, c04c05, c06c07, + c08c09, c10c11, c12c13, c14c15) + + STORE_16(dst, stride, l0, l1, l2, l3) +} + +#define inv_txfm_fn16x4(type1, type2) \ +void dav1d_inv_txfm_add_##type1##_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \ + int16_t *const coeff, const int eob) \ +{ \ + LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \ + LOAD_DECLARE_2_I16(coeff+16, c04c05, 
c06c07) \ + LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \ + LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \ + UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \ + UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \ + UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \ + UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \ + type1##_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \ + c08, c09, c10, c11, c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \ + memset(coeff, 0, sizeof(*coeff) * 16 * 4); \ + SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \ + SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \ + SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \ + SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \ + TRANSPOSE4_I32(c00, c01, c02, c03); \ + TRANSPOSE4_I32(c04, c05, c06, c07); \ + TRANSPOSE4_I32(c08, c09, c10, c11); \ + TRANSPOSE4_I32(c12, c13, c14, c15); \ + type2##_4x4_out(c00, c01, c02, c03, \ + c04, c05, c06, c07, \ + c08, c09, c10, c11, \ + c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15); \ + LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) \ + APPLY_COEFF_16x4(l0, l1, l2, l3, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ + STORE_16(dst, stride, l0, l1, l2, l3) \ +} + +inv_txfm_fn16x4(adst, dct ) +inv_txfm_fn16x4(dct, adst ) +inv_txfm_fn16x4(dct, flipadst) +inv_txfm_fn16x4(flipadst, dct ) +inv_txfm_fn16x4(adst, flipadst) +inv_txfm_fn16x4(flipadst, adst ) +inv_txfm_fn16x4(dct, identity) +inv_txfm_fn16x4(flipadst, identity) +inv_txfm_fn16x4(adst, identity) +inv_txfm_fn16x4(identity, identity) +inv_txfm_fn16x4(adst, adst ) +inv_txfm_fn16x4(flipadst, flipadst) + +#define inv_txfm_fn16x4_identity(type2) \ +void dav1d_inv_txfm_add_identity_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \ + int16_t *const coeff, const int eob) \ +{ \ + LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \ + LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \ + LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \ + LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \ + UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \ + UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \ + UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \ + UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \ + identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \ + c08, c09, c10, c11, c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \ + memset(coeff, 0, sizeof(*coeff) * 16 * 4); \ + SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \ + SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \ + SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \ + SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \ + CLIP16_I32_8(c00, c01, c02, c03, c04, c05, c06, c07, c00c01, c02c03, c04c05, c06c07) \ + CLIP16_I32_8(c08, c09, c10, c11, c12, c13, c14, c15, c08c09, c10c11, c12c13, c14c15) \ + TRANSPOSE4_I32(c00, c01, c02, c03); \ + TRANSPOSE4_I32(c04, c05, c06, c07); \ + TRANSPOSE4_I32(c08, c09, c10, c11); \ + TRANSPOSE4_I32(c12, c13, c14, c15); \ + type2##_4x4_out(c00, c01, c02, c03, \ + c04, c05, c06, c07, \ + c08, c09, c10, c11, \ + c12, c13, c14, c15, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15); \ + LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) \ 
+ APPLY_COEFF_16x4(l0, l1, l2, l3, \ + c00c01, c02c03, c04c05, c06c07, \ + c08c09, c10c11, c12c13, c14c15) \ + STORE_16(dst, stride, l0, l1, l2, l3) \ +} + +inv_txfm_fn16x4_identity(dct) +inv_txfm_fn16x4_identity(adst) +inv_txfm_fn16x4_identity(flipadst) + +#endif // BITDEPTH diff --git a/src/ppc/loopfilter_tmpl.c b/src/ppc/loopfilter_tmpl.c index 4e658a701a886d70e8ec7b7c7c0c71feb5f7933b..107192f8361241ee5652a89a1e567ed61566b7c4 100644 --- a/src/ppc/loopfilter_tmpl.c +++ b/src/ppc/loopfilter_tmpl.c @@ -342,8 +342,7 @@ static inline void store_h_8(u8x16 outa, u8x16 outb, uint8_t *dst, int stridea) static inline void loop_filter_h_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, - const ptrdiff_t stridea, b32x4 apply - HIGHBD_DECL_SUFFIX) + const ptrdiff_t stridea, b32x4 apply) { dst -= 2; uint8_t *dst2 = dst; @@ -428,8 +427,7 @@ loop_filter_h_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, static inline void loop_filter_h_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, - const ptrdiff_t stridea, b32x4 apply, b32x4 m6 - HIGHBD_DECL_SUFFIX) + const ptrdiff_t stridea, b32x4 apply, b32x4 m6) { uint8_t *dst2 = dst - 2; dst -= 3; @@ -572,8 +570,7 @@ loop_filter_h_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, static inline void loop_filter_h_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, - const ptrdiff_t stridea, b32x4 apply, b32x4 m8 - HIGHBD_DECL_SUFFIX) + const ptrdiff_t stridea, b32x4 apply, b32x4 m8) { uint8_t *dst2 = dst - 3; dst -= 4; @@ -718,8 +715,7 @@ loop_filter_h_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, static inline void loop_filter_h_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, - const ptrdiff_t stridea, b32x4 apply, b32x4 m8, b32x4 m16 - HIGHBD_DECL_SUFFIX) + const ptrdiff_t stridea, b32x4 apply, b32x4 m8, b32x4 m16) { uint8_t *dst2 = dst -6 ; dst -= 7; @@ -960,8 +956,7 @@ loop_filter_h_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, static inline void loop_filter_v_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, - const ptrdiff_t strideb, b32x4 apply - HIGHBD_DECL_SUFFIX) + const ptrdiff_t strideb, b32x4 apply) { uint8_t *p1d = dst + strideb * -2; uint8_t *p0d = dst + strideb * -1; @@ -1007,8 +1002,7 @@ loop_filter_v_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, static inline void loop_filter_v_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, - const ptrdiff_t strideb, b32x4 apply, b32x4 m6 - HIGHBD_DECL_SUFFIX) + const ptrdiff_t strideb, b32x4 apply, b32x4 m6) { uint8_t *p2d = dst + strideb * -3; uint8_t *p1d = dst + strideb * -2; @@ -1114,9 +1108,7 @@ loop_filter_v_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, static inline void loop_filter_v_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, - const ptrdiff_t strideb, b32x4 apply, b32x4 m8 - HIGHBD_DECL_SUFFIX) - + const ptrdiff_t strideb, b32x4 apply, b32x4 m8) { uint8_t *p3d = dst + strideb * -4; @@ -1216,9 +1208,7 @@ loop_filter_v_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, static inline void loop_filter_v_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, - const ptrdiff_t strideb, b32x4 apply, b32x4 m8, b32x4 m16 - HIGHBD_DECL_SUFFIX) - + const ptrdiff_t strideb, b32x4 apply, b32x4 m8, b32x4 m16) { uint8_t *p6d = dst + strideb * -7; @@ -1373,8 +1363,7 @@ loop_filter_v_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, void LPF(h_sb_y)(pixel *dst, const ptrdiff_t stride, const uint32_t *const vmask, const uint8_t (*l)[4], ptrdiff_t b4_stride, - const Av1FilterLUT *lut, const int h - HIGHBD_DECL_SUFFIX) + const Av1FilterLUT *lut, const int h) { unsigned vm = vmask[0] | vmask[1] | vmask[2]; @@ -1449,11 +1438,11 @@ void 
LPF(h_sb_y)(pixel *dst, const ptrdiff_t stride, apply = vec_and(m4, apply); if (vec_any_ne(wd16, zero)) { - loop_filter_h_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16 HIGHBD_TAIL_SUFFIX); + loop_filter_h_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16); } else if (vec_any_ne(wd8, zero)) { - loop_filter_h_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8 HIGHBD_TAIL_SUFFIX); + loop_filter_h_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8); } else { // wd4 == 0 already tested - loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX); + loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply); } } } @@ -1461,8 +1450,7 @@ void LPF(h_sb_y)(pixel *dst, const ptrdiff_t stride, void LPF(v_sb_y)(pixel *dst, const ptrdiff_t stride, const uint32_t *const vmask, const uint8_t (*l)[4], ptrdiff_t b4_stride, - const Av1FilterLUT *lut, const int w - HIGHBD_DECL_SUFFIX) + const Av1FilterLUT *lut, const int w) { unsigned vm = vmask[0] | vmask[1] | vmask[2]; @@ -1530,11 +1518,11 @@ void LPF(v_sb_y)(pixel *dst, const ptrdiff_t stride, apply = vec_and(apply, m4); if (vec_any_ne(wd16, zero)) { - loop_filter_v_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16 HIGHBD_TAIL_SUFFIX); + loop_filter_v_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16); } else if (vec_any_ne(wd8, zero)) { - loop_filter_v_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8 HIGHBD_TAIL_SUFFIX); + loop_filter_v_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8); } else { - loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX); + loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply); } } @@ -1543,8 +1531,7 @@ void LPF(v_sb_y)(pixel *dst, const ptrdiff_t stride, void LPF(h_sb_uv)(pixel *dst, const ptrdiff_t stride, const uint32_t *const vmask, const uint8_t (*l)[4], ptrdiff_t b4_stride, - const Av1FilterLUT *lut, const int h - HIGHBD_DECL_SUFFIX) + const Av1FilterLUT *lut, const int h) { unsigned vm = vmask[0] | vmask[1]; u32x4 vm0 = vec_splats(vm); @@ -1614,10 +1601,10 @@ void LPF(h_sb_uv)(pixel *dst, const ptrdiff_t stride, apply = vec_and(m4, apply); if (vec_any_ne(wd6, zero)) { - loop_filter_h_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6 HIGHBD_TAIL_SUFFIX); + loop_filter_h_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6); // loop_filter_h_8 } else { // wd4 == 0 already tested - loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX); + loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply); // loop_filter_h_4 } @@ -1628,8 +1615,7 @@ void LPF(h_sb_uv)(pixel *dst, const ptrdiff_t stride, void LPF(v_sb_uv)(pixel *dst, const ptrdiff_t stride, const uint32_t *const vmask, const uint8_t (*l)[4], ptrdiff_t b4_stride, - const Av1FilterLUT *lut, const int w - HIGHBD_DECL_SUFFIX) + const Av1FilterLUT *lut, const int w) { unsigned vm = vmask[0] | vmask[1]; @@ -1694,9 +1680,9 @@ void LPF(v_sb_uv)(pixel *dst, const ptrdiff_t stride, apply = vec_and(apply, m4); if (vec_any_ne(wd6, zero)) { - loop_filter_v_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6 HIGHBD_TAIL_SUFFIX); + loop_filter_v_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6); } else { - loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX); + loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply); } } } diff --git a/src/ppc/looprestoration.h b/src/ppc/looprestoration.h index 3fe16318bd53f1a5fec46297fb6680a1feb2b451..614234abfc897180c46371f869cba4f2301ba658 100644 --- a/src/ppc/looprestoration.h +++ b/src/ppc/looprestoration.h @@ -35,7 +35,7 @@ void 
dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride, const uint8_t *lpf, const int w, const int h, const LooprestorationParams *const params, - const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); + const enum LrEdgeFlags edges); static ALWAYS_INLINE void loop_restoration_dsp_init_ppc(Dav1dLoopRestorationDSPContext *const c, const int bpc) { const unsigned flags = dav1d_get_cpu_flags(); diff --git a/src/ppc/looprestoration_tmpl.c b/src/ppc/looprestoration_tmpl.c index c0c64e180023cfd1939be3727d0fa8988edf282f..76c1d07f802b110bc50a1c702d8a1404226703ab 100644 --- a/src/ppc/looprestoration_tmpl.c +++ b/src/ppc/looprestoration_tmpl.c @@ -305,7 +305,7 @@ void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride, const uint8_t *lpf, const int w, const int h, const LooprestorationParams *const params, - const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) + const enum LrEdgeFlags edges) { const int16_t (*const filter)[8] = params->filter; diff --git a/src/ppc/utils.h b/src/ppc/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..3b0a5445bca2bbe195a92164740e7171cca5bc5c --- /dev/null +++ b/src/ppc/utils.h @@ -0,0 +1,105 @@ +/* + * Copyright © 2024, VideoLAN and dav1d authors + * Copyright © 2024, Luca Barbato + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_PPC_UTILS_H +#define DAV1D_SRC_PPC_UTILS_H + +#include "src/ppc/dav1d_types.h" + +#define assert_eq(a, b) \ + if ((a) != (b)) \ + printf("%d: %d vs %d\n", __LINE__, a, b); \ + assert((a) == (b)); + +#define MERGE_I32(a, b, h, l) \ +{ \ + h = vec_mergeh(a, b); \ + l = vec_mergel(a, b); \ +} + +#define DECLARE_MERGE_I32(a, b, h, l) \ + i32x4 h, l; \ + MERGE_I32(a, b, h, l) + + +// Transpose a 4x4 matrix of i32x4 vectors +#define TRANSPOSE4_I32(c0, c1, c2, c3) \ +{ \ + DECLARE_MERGE_I32(c0, c2, m02h, m02l) \ + DECLARE_MERGE_I32(c1, c3, m13h, m13l) \ +\ + MERGE_I32(m02h, m13h, c0, c1) \ + MERGE_I32(m02l, m13l, c2, c3) \ +} + +// Transpose a 8x8 matrix of i32x4 vectors +#define TRANSPOSE8_I32(c0, c1, c2, c3, c4, c5, c6, c7, \ + c8, c9, cA, cB, cC, cD, cE, cF) \ +{ \ + DECLARE_MERGE_I32(c0, c2, m02h, m02l) \ + DECLARE_MERGE_I32(c1, c3, m13h, m13l) \ + DECLARE_MERGE_I32(c4, c6, m46h, m46l) \ + DECLARE_MERGE_I32(c5, c7, m57h, m57l) \ + DECLARE_MERGE_I32(c8, cA, m8Ah, m8Al) \ + DECLARE_MERGE_I32(c9, cB, m9Bh, m9Bl) \ + DECLARE_MERGE_I32(cC, cE, mCEh, mCEl) \ + DECLARE_MERGE_I32(cD, cF, mDFh, mDFl) \ +\ + MERGE_I32(m02h, m13h, c0, c1) \ + MERGE_I32(m02l, m13l, c2, c3) \ + MERGE_I32(m46h, m57h, c8, c9) \ + MERGE_I32(m46l, m57l, cA, cB) \ + MERGE_I32(m8Ah, m9Bh, c4, c5) \ + MERGE_I32(m8Al, m9Bl, c6, c7) \ + MERGE_I32(mCEh, mDFh, cC, cD) \ + MERGE_I32(mCEl, mDFl, cE, cF) \ +} + +// Transpose a 4x16 matrix of i32x4 vectors +#define TRANSPOSE4x16_I32(c0, c1, c2, c3, c4, c5, c6, c7, \ + c8, c9, cA, cB, cC, cD, cE, cF) \ +{ \ + DECLARE_MERGE_I32(c0, c2, m02h, m02l) \ + DECLARE_MERGE_I32(c1, c3, m13h, m13l) \ + DECLARE_MERGE_I32(c4, c6, m46h, m46l) \ + DECLARE_MERGE_I32(c5, c7, m57h, m57l) \ + DECLARE_MERGE_I32(c8, cA, m8Ah, m8Al) \ + DECLARE_MERGE_I32(c9, cB, m9Bh, m9Bl) \ + DECLARE_MERGE_I32(cC, cE, mCEh, mCEl) \ + DECLARE_MERGE_I32(cD, cF, mDFh, mDFl) \ +\ + MERGE_I32(m02h, m13h, c0, c1) \ + MERGE_I32(m02l, m13l, c2, c3) \ + MERGE_I32(m46h, m57h, c4, c5) \ + MERGE_I32(m46l, m57l, c6, c7) \ + MERGE_I32(m8Ah, m9Bh, c8, c9) \ + MERGE_I32(m8Al, m9Bl, cA, cB) \ + MERGE_I32(mCEh, mDFh, cC, cD) \ + MERGE_I32(mCEl, mDFl, cE, cF) \ +} + +#endif // DAV1D_SRC_PPC_UTILS_H diff --git a/src/recon_tmpl.c b/src/recon_tmpl.c index 0afd06c16bc763b1f55a6c87cbc5d28e6feb9859..426fa406ed1d1a29243c8d15158e90a1985e99e1 100644 --- a/src/recon_tmpl.c +++ b/src/recon_tmpl.c @@ -402,7 +402,8 @@ static int decode_coefs(Dav1dTaskContext *const t, // find end-of-block (eob) int eob_bin; - const int tx2dszctx = imin(t_dim->lw, TX_32X32) + imin(t_dim->lh, TX_32X32); + const int slw = imin(t_dim->lw, TX_32X32), slh = imin(t_dim->lh, TX_32X32); + const int tx2dszctx = slw + slh; const enum TxClass tx_class = dav1d_tx_type_class[*txtp]; const int is_1d = tx_class != TX_CLASS_2D; switch (tx2dszctx) { @@ -449,10 +450,9 @@ static int decode_coefs(Dav1dTaskContext *const t, if (eob) { uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma]; uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok - const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8); /* eob */ - unsigned ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4); + unsigned ctx = 1 + (eob > 2 << tx2dszctx) + (eob > 4 << tx2dszctx); int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2); int tok = eob_tok + 1; int level_tok = tok * 0x41; @@ -460,6 +460,7 @@ static int decode_coefs(Dav1dTaskContext *const t, #define DECODE_COEFS_CLASS(tx_class) \ unsigned x, y; \ + uint8_t *level; \ if (tx_class == 
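TRANSPOSE4_I32 in the new src/ppc/utils.h above transposes a 4x4 block of int32 values with two rounds of vec_mergeh/vec_mergel interleaves. A scalar model of the same network (helper names are illustrative; element order follows the generic mergeh/mergel definition):

    #include <stdint.h>

    /* merge_hi/merge_lo mimic vec_mergeh/vec_mergel on 4 x int32. */
    static void merge_hi(const int32_t a[4], const int32_t b[4], int32_t out[4]) {
        out[0] = a[0]; out[1] = b[0]; out[2] = a[1]; out[3] = b[1];
    }
    static void merge_lo(const int32_t a[4], const int32_t b[4], int32_t out[4]) {
        out[0] = a[2]; out[1] = b[2]; out[2] = a[3]; out[3] = b[3];
    }

    /* Same interleave network as TRANSPOSE4_I32: after two rounds of merges,
     * row i of c holds what was column i. */
    static void transpose4_i32(int32_t c[4][4]) {
        int32_t m02h[4], m02l[4], m13h[4], m13l[4];
        merge_hi(c[0], c[2], m02h); merge_lo(c[0], c[2], m02l);
        merge_hi(c[1], c[3], m13h); merge_lo(c[1], c[3], m13l);
        merge_hi(m02h, m13h, c[0]); merge_lo(m02h, m13h, c[1]);
        merge_hi(m02l, m13l, c[2]); merge_lo(m02l, m13l, c[3]);
    }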
TX_CLASS_2D) \ rc = scan[eob], x = rc >> shift, y = rc & mask; \ else if (tx_class == TX_CLASS_H) \ @@ -480,7 +481,11 @@ static int decode_coefs(Dav1dTaskContext *const t, ts->msac.rng); \ } \ cf[rc] = tok << 11; \ - levels[x * stride + y] = (uint8_t) level_tok; \ + if (TX_CLASS_2D) \ + level = levels + rc; \ + else \ + level = levels + x * stride + y; \ + *level = (uint8_t) level_tok; \ for (int i = eob - 1; i > 0; i--) { /* ac */ \ unsigned rc_i; \ if (tx_class == TX_CLASS_2D) \ @@ -490,7 +495,10 @@ static int decode_coefs(Dav1dTaskContext *const t, else /* tx_class == TX_CLASS_V */ \ x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \ assert(x < 32 && y < 32); \ - uint8_t *const level = levels + x * stride + y; \ + if (TX_CLASS_2D) \ + level = levels + rc; \ + else \ + level = levels + x * stride + y; \ ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \ if (tx_class == TX_CLASS_2D) \ y |= x; \ @@ -547,26 +555,26 @@ static int decode_coefs(Dav1dTaskContext *const t, const uint8_t (*const lo_ctx_offsets)[5] = dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)]; scan = dav1d_scans[tx]; - const ptrdiff_t stride = 4 * sh; - const unsigned shift = t_dim->lh < 4 ? t_dim->lh + 2 : 5, shift2 = 0; - const unsigned mask = 4 * sh - 1; - memset(levels, 0, stride * (4 * sw + 2)); + const ptrdiff_t stride = 4 << slh; + const unsigned shift = slh + 2, shift2 = 0; + const unsigned mask = (4 << slh) - 1; + memset(levels, 0, stride * ((4 << slw) + 2)); DECODE_COEFS_CLASS(TX_CLASS_2D); } case TX_CLASS_H: { const uint8_t (*const lo_ctx_offsets)[5] = NULL; const ptrdiff_t stride = 16; - const unsigned shift = t_dim->lh + 2, shift2 = 0; - const unsigned mask = 4 * sh - 1; - memset(levels, 0, stride * (4 * sh + 2)); + const unsigned shift = slh + 2, shift2 = 0; + const unsigned mask = (4 << slh) - 1; + memset(levels, 0, stride * ((4 << slh) + 2)); DECODE_COEFS_CLASS(TX_CLASS_H); } case TX_CLASS_V: { const uint8_t (*const lo_ctx_offsets)[5] = NULL; const ptrdiff_t stride = 16; - const unsigned shift = t_dim->lw + 2, shift2 = t_dim->lh + 2; - const unsigned mask = 4 * sw - 1; - memset(levels, 0, stride * (4 * sw + 2)); + const unsigned shift = slw + 2, shift2 = slh + 2; + const unsigned mask = (4 << slw) - 1; + memset(levels, 0, stride * ((4 << slw) + 2)); DECODE_COEFS_CLASS(TX_CLASS_V); } #undef DECODE_COEFS_CLASS @@ -785,21 +793,15 @@ static void read_coef_tree(Dav1dTaskContext *const t, if (DEBUG_BLOCK_INFO) printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n", ytx, txtp, eob, ts->msac.rng); -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir lcoef, off, mul * cf_ctx) -#define default_memset(dir, diridx, off, sz) \ - memset(&t->dir lcoef[off], cf_ctx, sz) - case_set_upto16_with_default(imin(txh, f->bh - t->by), l., 1, by4); - case_set_upto16_with_default(imin(txw, f->bw - t->bx), a->, 0, bx4); -#undef default_memset -#undef set_ctx -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ + dav1d_memset_likely_pow2(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx)); + dav1d_memset_likely_pow2(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by)); +#define set_ctx(rep_macro) \ for (int y = 0; y < txh; y++) { \ - rep_macro(type, txtp_map, 0, mul * txtp); \ + rep_macro(txtp_map, 0, txtp); \ txtp_map += 32; \ } uint8_t *txtp_map = &t->scratch.txtp_map[by4 * 32 + bx4]; - case_set_upto16(txw,,,); + case_set_upto16(t_dim->lw); #undef set_ctx if (t->frame_thread.pass == 1) *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp; @@ -838,18 +840,16 @@ void 
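The decode_coefs hunks above, together with the earlier eob-context rewrite, replace imin(t_dim->w, 8)-style multiplies with shifts by the clamped log2 sizes slw/slh. A hedged sketch of why the two forms agree, assuming t_dim->w == 1 << t_dim->lw and TX_32X32 == 3 as in dav1d's transform tables (function names are illustrative):

    /* sw == 1 << slw and sh == 1 << slh, so
     * sw * sh * 2 == 2 << (slw + slh) == 2 << tx2dszctx. */
    static inline unsigned eob_ctx_old(int eob, int sw, int sh) {
        return 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
    }
    static inline unsigned eob_ctx_new(int eob, int tx2dszctx) {
        /* << binds tighter than >, so this reads eob > (2 << tx2dszctx) */
        return 1 + (eob > 2 << tx2dszctx) + (eob > 4 << tx2dszctx);
    }
    /* eob_ctx_old(eob, 1 << slw, 1 << slh) == eob_ctx_new(eob, slw + slh) */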
bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t, (bh4 > ss_ver || t->by & 1); if (b->skip) { -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir lcoef, off, mul * 0x40) - case_set(bh4, l., 1, by4); - case_set(bw4, a->, 0, bx4); -#undef set_ctx + BlockContext *const a = t->a; + dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40); + dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40); if (has_chroma) { -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \ - rep_macro(type, t->dir ccoef[1], off, mul * 0x40) - case_set(cbh4, l., 1, cby4); - case_set(cbw4, a->, 0, cbx4); -#undef set_ctx + dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)]; + dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)]; + memset_cw(&a->ccoef[0][cbx4], 0x40); + memset_cw(&a->ccoef[1][cbx4], 0x40); + memset_ch(&t->l.ccoef[0][cby4], 0x40); + memset_ch(&t->l.ccoef[1][cby4], 0x40); } return; } @@ -890,16 +890,8 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t, b->tx, txtp, eob, ts->msac.rng); *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp; ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir lcoef, off, mul * cf_ctx) -#define default_memset(dir, diridx, off, sz) \ - memset(&t->dir lcoef[off], cf_ctx, sz) - case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by), - l., 1, by4 + y); - case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx), - a->, 0, bx4 + x); -#undef default_memset -#undef set_ctx + dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx)); + dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by)); } } t->bx -= x; @@ -933,18 +925,10 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t, pl, b->uvtx, txtp, eob, ts->msac.rng); *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp; ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16; -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx) -#define default_memset(dir, diridx, off, sz) \ - memset(&t->dir ccoef[pl][off], cf_ctx, sz) - case_set_upto16_with_default( \ - imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver), - l., 1, cby4 + y); - case_set_upto16_with_default( \ - imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor), - a->, 0, cbx4 + x); -#undef default_memset -#undef set_ctx + int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor); + int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver); + dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw); + dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth); } t->bx -= x << ss_hor; } @@ -1329,16 +1313,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize if (DEBUG_BLOCK_INFO) printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n", b->tx, txtp, eob, ts->msac.rng); -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir lcoef, off, mul * cf_ctx) -#define default_memset(dir, diridx, off, sz) \ - memset(&t->dir lcoef[off], cf_ctx, sz) - case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by), \ - l., 1, by4 + y); - case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx), \ - a->, 0, bx4 + x); -#undef default_memset -#undef set_ctx + dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx)); + 
dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by)); } if (eob >= 0) { if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) @@ -1353,11 +1329,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize t_dim->w * 4, t_dim->h * 4, "recon"); } } else if (!t->frame_thread.pass) { -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir lcoef, off, mul * 0x40) - case_set_upto16(t_dim->h, l., 1, by4 + y); - case_set_upto16(t_dim->w, a->, 0, bx4 + x); -#undef set_ctx + dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0x40); + dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0x40); } dst += 4 * t_dim->w; } @@ -1554,18 +1527,10 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize printf("Post-uv-cf-blk[pl=%d,tx=%d," "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n", pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4); -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx) -#define default_memset(dir, diridx, off, sz) \ - memset(&t->dir ccoef[pl][off], cf_ctx, sz) - case_set_upto16_with_default( \ - imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver), - l., 1, cby4 + y); - case_set_upto16_with_default( \ - imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor), - a->, 0, cbx4 + x); -#undef default_memset -#undef set_ctx + int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor); + int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver); + dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw); + dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth); } if (eob >= 0) { if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) @@ -1579,11 +1544,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize uv_t_dim->h * 4, "recon"); } } else if (!t->frame_thread.pass) { -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir ccoef[pl], off, mul * 0x40) - case_set_upto16(uv_t_dim->h, l., 1, cby4 + y); - case_set_upto16(uv_t_dim->w, a->, 0, cbx4 + x); -#undef set_ctx + dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0x40); + dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0x40); } dst += uv_t_dim->w * 4; } @@ -1921,18 +1883,16 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize if (b->skip) { // reset coef contexts -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir lcoef, off, mul * 0x40) - case_set(bh4, l., 1, by4); - case_set(bw4, a->, 0, bx4); -#undef set_ctx + BlockContext *const a = t->a; + dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40); + dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40); if (has_chroma) { -#define set_ctx(type, dir, diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \ - rep_macro(type, t->dir ccoef[1], off, mul * 0x40) - case_set(cbh4, l., 1, cby4); - case_set(cbw4, a->, 0, cbx4); -#undef set_ctx + dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)]; + dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)]; + memset_cw(&a->ccoef[0][cbx4], 0x40); + memset_cw(&a->ccoef[1][cbx4], 0x40); + memset_ch(&t->l.ccoef[0][cby4], 0x40); + memset_ch(&t->l.ccoef[1][cby4], 0x40); } return 0; } @@ -1998,18 +1958,10 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize printf("Post-uv-cf-blk[pl=%d,tx=%d," "txtp=%d,eob=%d]: r=%d\n", pl, b->uvtx, txtp, eob, ts->msac.rng); -#define set_ctx(type, dir, 
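The recon hunks above replace the set_ctx/case_set macro machinery with small fixed-size memset helpers selected by the log2 of the run length. The real dav1d_memset_pow2[] / dav1d_memset_likely_pow2 definitions live elsewhere in the tree; a minimal sketch of the dispatch idea under that assumption:

    #include <string.h>

    /* Sketch only; the real dav1d helpers may differ in signature and in how
     * the table index is derived. */
    typedef void (*memset_pow2_fn)(void *ptr, int value);

    #define DEF_MEMSET_POW2(n) \
        static void memset_##n(void *ptr, int value) { memset(ptr, value, n); }
    DEF_MEMSET_POW2(1) DEF_MEMSET_POW2(2) DEF_MEMSET_POW2(4)
    DEF_MEMSET_POW2(8) DEF_MEMSET_POW2(16) DEF_MEMSET_POW2(32)

    /* indexed by log2 of the (power-of-two) length */
    static const memset_pow2_fn memset_pow2_sketch[6] = {
        memset_1, memset_2, memset_4, memset_8, memset_16, memset_32
    };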
diridx, off, mul, rep_macro) \ - rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx) -#define default_memset(dir, diridx, off, sz) \ - memset(&t->dir ccoef[pl][off], cf_ctx, sz) - case_set_upto16_with_default( \ - imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver), - l., 1, cby4 + y); - case_set_upto16_with_default( \ - imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor), - a->, 0, cbx4 + x); -#undef default_memset -#undef set_ctx + int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor); + int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver); + dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw); + dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth); } if (eob >= 0) { if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) diff --git a/src/refmvs.h b/src/refmvs.h index d29feedce5ab5aaccd43dd82ffd620710d908c60..2c429844468710b099b9d5e845fc929e75bffde0 100644 --- a/src/refmvs.h +++ b/src/refmvs.h @@ -43,22 +43,26 @@ PACKED(typedef struct refmvs_temporal_block { mv mv; int8_t ref; }) refmvs_temporal_block; +CHECK_SIZE(refmvs_temporal_block, 5); -typedef union refmvs_refpair { +PACKED(typedef union refmvs_refpair { int8_t ref[2]; // [0] = 0: intra=1, [1] = -1: comp=0 uint16_t pair; -} refmvs_refpair; +}) ALIGN(refmvs_refpair, 2); +CHECK_SIZE(refmvs_refpair, 2); typedef union refmvs_mvpair { mv mv[2]; uint64_t n; } refmvs_mvpair; +CHECK_SIZE(refmvs_mvpair, 8); PACKED(typedef struct refmvs_block { refmvs_mvpair mv; refmvs_refpair ref; uint8_t bs, mf; // 1 = globalmv+affine, 2 = newmv }) ALIGN(refmvs_block, 4); +CHECK_SIZE(refmvs_block, 12); typedef struct refmvs_frame { const Dav1dFrameHeader *frm_hdr; diff --git a/src/riscv/cpu.c b/src/riscv/cpu.c index 30e135435960b18df0e98a3a71b5880f2d78eb6a..345ff079e11ea15915a2b83303098aceb582cc65 100644 --- a/src/riscv/cpu.c +++ b/src/riscv/cpu.c @@ -29,9 +29,10 @@ #include "common/attributes.h" +#include "src/cpu.h" #include "src/riscv/cpu.h" -#if defined(HAVE_GETAUXVAL) +#if HAVE_GETAUXVAL #include <sys/auxv.h> #define HWCAP_RVV (1 << ('v' - 'a')) @@ -41,8 +42,8 @@ int dav1d_has_compliant_rvv(void); COLD unsigned dav1d_get_cpu_flags_riscv(void) { - unsigned flags = 0; -#if defined(HAVE_GETAUXVAL) + unsigned flags = dav1d_get_default_cpu_flags(); +#if HAVE_GETAUXVAL unsigned long hw_cap = getauxval(AT_HWCAP); flags |= (hw_cap & HWCAP_RVV) && dav1d_has_compliant_rvv() ? 
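The CHECK_SIZE() annotations added to refmvs.h above pin the packed struct sizes at compile time, so a layout change fails the build instead of silently breaking the code that relies on those sizes. The actual macro is defined in dav1d's common headers; a generic sketch of the same kind of check, using a placeholder type:

    #include <assert.h>
    #include <stdint.h>

    /* Placeholder type, not dav1d's refmvs structs. */
    typedef struct {
        int16_t x, y;
    } packed_pair_sketch;

    static_assert(sizeof(packed_pair_sketch) == 4,
                  "size must match the layout the rest of the code assumes");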
DAV1D_RISCV_CPU_FLAG_V : 0; #endif diff --git a/src/scan.c b/src/scan.c index 5261ccd3d1ad91dd91fafce1db832f89bfccec2b..6f9dc03691e4563d100bc6b19ca2a70f792c3296 100644 --- a/src/scan.c +++ b/src/scan.c @@ -28,7 +28,10 @@ #include "config.h" #include "common/attributes.h" +#include "common/intops.h" + #include "src/scan.h" +#include "src/thread.h" static const uint16_t ALIGN(scan_4x4[], 32) = { 0, 4, 1, 2, @@ -297,3 +300,76 @@ const uint16_t *const dav1d_scans[N_RECT_TX_SIZES] = { [RTX_16X64] = scan_16x32, [RTX_64X16] = scan_32x16, }; + +static uint8_t last_nonzero_col_from_eob_4x4[16]; +static uint8_t last_nonzero_col_from_eob_8x8[64]; +static uint8_t last_nonzero_col_from_eob_16x16[256]; +static uint8_t last_nonzero_col_from_eob_32x32[1024]; +static uint8_t last_nonzero_col_from_eob_4x8[32]; +static uint8_t last_nonzero_col_from_eob_8x4[32]; +static uint8_t last_nonzero_col_from_eob_8x16[128]; +static uint8_t last_nonzero_col_from_eob_16x8[128]; +static uint8_t last_nonzero_col_from_eob_16x32[512]; +static uint8_t last_nonzero_col_from_eob_32x16[512]; +static uint8_t last_nonzero_col_from_eob_4x16[64]; +static uint8_t last_nonzero_col_from_eob_16x4[64]; +static uint8_t last_nonzero_col_from_eob_8x32[256]; +static uint8_t last_nonzero_col_from_eob_32x8[256]; + +static COLD void init_tbl(uint8_t *const last_nonzero_col_from_eob, + const uint16_t *const scan, const int w, const int h) +{ + int max_col = 0; + for (int y = 0, n = 0; y < h; y++) { + for (int x = 0; x < w; x++, n++) { + const int rc = scan[n]; + const int rcx = rc & (h - 1); + max_col = imax(max_col, rcx); + last_nonzero_col_from_eob[n] = max_col; + } + } +} + +static COLD void init_internal(void) { + init_tbl(last_nonzero_col_from_eob_4x4, scan_4x4, 4, 4); + init_tbl(last_nonzero_col_from_eob_8x8, scan_8x8, 8, 8); + init_tbl(last_nonzero_col_from_eob_16x16, scan_16x16, 16, 16); + init_tbl(last_nonzero_col_from_eob_32x32, scan_32x32, 32, 32); + init_tbl(last_nonzero_col_from_eob_4x8, scan_4x8, 4, 8); + init_tbl(last_nonzero_col_from_eob_8x4, scan_8x4, 8, 4); + init_tbl(last_nonzero_col_from_eob_8x16, scan_8x16, 8, 16); + init_tbl(last_nonzero_col_from_eob_16x8, scan_16x8, 16, 8); + init_tbl(last_nonzero_col_from_eob_16x32, scan_16x32, 16, 32); + init_tbl(last_nonzero_col_from_eob_32x16, scan_32x16, 32, 16); + init_tbl(last_nonzero_col_from_eob_4x16, scan_4x16, 4, 16); + init_tbl(last_nonzero_col_from_eob_16x4, scan_16x4, 16, 4); + init_tbl(last_nonzero_col_from_eob_8x32, scan_8x32, 8, 32); + init_tbl(last_nonzero_col_from_eob_32x8, scan_32x8, 32, 8); +} + +COLD void dav1d_init_last_nonzero_col_from_eob_tables(void) { + static pthread_once_t initted = PTHREAD_ONCE_INIT; + pthread_once(&initted, init_internal); +} + +const uint8_t *const dav1d_last_nonzero_col_from_eob[N_RECT_TX_SIZES] = { + [ TX_4X4 ] = last_nonzero_col_from_eob_4x4, + [ TX_8X8 ] = last_nonzero_col_from_eob_8x8, + [ TX_16X16] = last_nonzero_col_from_eob_16x16, + [ TX_32X32] = last_nonzero_col_from_eob_32x32, + [ TX_64X64] = last_nonzero_col_from_eob_32x32, + [RTX_4X8 ] = last_nonzero_col_from_eob_4x8, + [RTX_8X4 ] = last_nonzero_col_from_eob_8x4, + [RTX_8X16 ] = last_nonzero_col_from_eob_8x16, + [RTX_16X8 ] = last_nonzero_col_from_eob_16x8, + [RTX_16X32] = last_nonzero_col_from_eob_16x32, + [RTX_32X16] = last_nonzero_col_from_eob_32x16, + [RTX_32X64] = last_nonzero_col_from_eob_32x32, + [RTX_64X32] = last_nonzero_col_from_eob_32x32, + [RTX_4X16 ] = last_nonzero_col_from_eob_4x16, + [RTX_16X4 ] = last_nonzero_col_from_eob_16x4, + [RTX_8X32 ] = 
last_nonzero_col_from_eob_8x32, + [RTX_32X8 ] = last_nonzero_col_from_eob_32x8, + [RTX_16X64] = last_nonzero_col_from_eob_16x32, + [RTX_64X16] = last_nonzero_col_from_eob_32x16, +}; diff --git a/src/scan.h b/src/scan.h index 09df9887799efbc8d7b8cb631791358944e2e21b..2bd0b5b84e9521f77fac6b4bf23d4279f8f6d7b0 100644 --- a/src/scan.h +++ b/src/scan.h @@ -33,5 +33,8 @@ #include "src/levels.h" EXTERN const uint16_t *const dav1d_scans[N_RECT_TX_SIZES]; +EXTERN const uint8_t *const dav1d_last_nonzero_col_from_eob[N_RECT_TX_SIZES]; + +void dav1d_init_last_nonzero_col_from_eob_tables(void); #endif /* DAV1D_SRC_SCAN_H */ diff --git a/src/thread.h b/src/thread.h index c44de736c3a788166c61bb3c8cef6f3306559a67..459aaced66292722b6f844102ab12d9ef28eb192 100644 --- a/src/thread.h +++ b/src/thread.h @@ -132,6 +132,14 @@ static inline int pthread_cond_broadcast(pthread_cond_t *const cond) { #else #include <pthread.h> +#if defined(__FreeBSD__) + /* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */ +#define _SYS_PARAM_H_ +#include <sys/types.h> +#endif +#if HAVE_PTHREAD_NP_H +#include <pthread_np.h> +#endif #define dav1d_init_thread() do {} while (0) @@ -145,29 +153,28 @@ static inline void dav1d_set_thread_name(const char *const name) { prctl(PR_SET_NAME, name); } -#elif defined(__APPLE__) +#elif HAVE_PTHREAD_SETNAME_NP && defined(__APPLE__) static inline void dav1d_set_thread_name(const char *const name) { pthread_setname_np(name); } -#elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__) +#elif HAVE_PTHREAD_SETNAME_NP && defined(__NetBSD__) -#if defined(__FreeBSD__) - /* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */ -#define _SYS_PARAM_H_ -#include <sys/types.h> -#endif -#include <pthread_np.h> +static inline void dav1d_set_thread_name(const char *const name) { + pthread_setname_np(pthread_self(), "%s", (void*)name); +} + +#elif HAVE_PTHREAD_SETNAME_NP static inline void dav1d_set_thread_name(const char *const name) { - pthread_set_name_np(pthread_self(), name); + pthread_setname_np(pthread_self(), name); } -#elif defined(__NetBSD__) +#elif HAVE_PTHREAD_SET_NAME_NP static inline void dav1d_set_thread_name(const char *const name) { - pthread_setname_np(pthread_self(), "%s", (void*)name); + pthread_set_name_np(pthread_self(), name); } #elif defined(__HAIKU__) diff --git a/src/x86/cpu.c b/src/x86/cpu.c index f570fd7f391c535d19f84313a39e645007030423..80f91e16eb08fe97649ad923087b3727983bd1be 100644 --- a/src/x86/cpu.c +++ b/src/x86/cpu.c @@ -32,6 +32,7 @@ #include "common/attributes.h" +#include "src/cpu.h" #include "src/x86/cpu.h" typedef struct { @@ -52,7 +53,7 @@ COLD unsigned dav1d_get_cpu_flags_x86(void) { }; } cpu; dav1d_cpu_cpuid(&cpu.r, 0, 0); - unsigned flags = 0; + unsigned flags = dav1d_get_default_cpu_flags(); if (cpu.max_leaf >= 1) { CpuidRegisters r; diff --git a/src/x86/itx.h b/src/x86/itx.h index 23d7a73806e19ab9e7549b8e8a8a0065b596da0d..a8a490fa47d751af940e5b21f5edfee74fe13fc5 100644 --- a/src/x86/itx.h +++ b/src/x86/itx.h @@ -107,7 +107,9 @@ decl_itx_fns(ssse3); decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2); decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2)); -static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) { +static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, + const int bpc, int *const all_simd) +{ #define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ 
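dav1d_init_last_nonzero_col_from_eob_tables() in the scan.c hunk above builds the new eob lookup tables lazily but exactly once, guarded by pthread_once. The same pattern in isolation (illustrative names):

    #include <pthread.h>

    static void init_tables_once(void) {
        /* fill the lookup tables here */
    }

    static void ensure_tables(void) {
        static pthread_once_t once = PTHREAD_ONCE_INIT;
        pthread_once(&once, init_tables_once);   /* runs init exactly once */
    }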
BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext) @@ -167,6 +169,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons assign_itx1_fn (R, 64, 16, ssse3); assign_itx1_fn (R, 64, 32, ssse3); assign_itx1_fn ( , 64, 64, ssse3); + *all_simd = 1; #endif if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return; @@ -192,6 +195,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons assign_itx1_fn (R, 64, 16, sse4); assign_itx1_fn (R, 64, 32, sse4); assign_itx1_fn (, 64, 64, sse4); + *all_simd = 1; } #endif diff --git a/src/x86/mc16_sse.asm b/src/x86/mc16_sse.asm index b0c42597f7293c5ec5e7321f76498c1a52173b58..319dd45544c0064835f86faa19ecd38584c95097 100644 --- a/src/x86/mc16_sse.asm +++ b/src/x86/mc16_sse.asm @@ -67,6 +67,8 @@ pw_m512: times 8 dw -512 pd_63: times 4 dd 63 pd_64: times 4 dd 64 pd_512: times 4 dd 512 +pd_2560: times 2 dd 2560 +pd_8704: times 2 dd 8704 pd_m524256: times 4 dd -524256 ; -8192 << 6 + 32 pd_0x3ff: times 4 dd 0x3ff pd_0x4000: times 4 dd 0x4000 @@ -1158,7 +1160,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 -%macro FN 4 ; prefix, type, type_h, type_v +%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to cglobal %1_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 @@ -1166,8 +1168,8 @@ cglobal %1_%2_16bpc %else mov t1d, FILTER_%4 %endif -%ifnidn %2, regular ; skip the jump in the last filter - jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX) +%if %0 == 5 ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%5 %+ SUFFIX) %endif %endmacro @@ -1180,40 +1182,25 @@ DECLARE_REG_TMP 7, 8, 8 %endif %define PUT_8TAP_FN FN put_8tap, -PUT_8TAP_FN sharp, SHARP, SHARP -PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH -PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP -PUT_8TAP_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_16bpc +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_16bpc +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_16bpc PUT_8TAP_FN regular, REGULAR, REGULAR +cglobal put_6tap_16bpc, 0, 9, 0, dst, ds, src, ss, w, h, mx, my + %define base t2-put_ssse3 %if ARCH_X86_32 -cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my -%define mxb r0b -%define mxd r0 -%define mxq r0 -%define myb r1b -%define myd r1 -%define myq r1 -%define m8 [esp+16*0] -%define m9 [esp+16*1] -%define m10 [esp+16*2] -%define m11 [esp+16*3] -%define m12 [esp+16*4] -%define m13 [esp+16*5] -%define m14 [esp+16*6] -%define m15 [esp+16*7] -%else -cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my -%endif -%define base t2-put_ssse3 + %define mxb r0b + %define mxd r0 + %define mxq r0 + %define myb r1b + %define myd r1 + %define myq r1 +%endif imul mxd, mxm, 0x010101 - add mxd, t0d ; 8tap_h, mx, 4tap_h + add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 - add myd, t1d ; 8tap_v, my, 4tap_v + add myd, t1d ; 6tap_v, my, 4tap_v LEA t2, put_ssse3 movifnidn wd, wm movifnidn srcq, srcmp @@ -1223,6 +1210,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my jnz .h test myd, 0xf00 jnz .v +.put: tzcnt wd, wd movzx wd, word [base+put_ssse3_table+wq*2] movifnidn dstq, dstmp @@ -1233,24 +1221,6 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my pop r7 %endif jmp wq -.h: - test myd, 0xf00 - jnz .hv 
- mov myd, r8m - movd m5, r8m - shr myd, 11 - movddup m4, [base+put_8tap_h_rnd+myq*8] - movifnidn dsq, dsmp - pshufb m5, [base+pw_256] - cmp wd, 4 - jg .h_w8 - movzx mxd, mxb - lea srcq, [srcq-2] - movq m3, [base+subpel_filters+mxq*8] - movifnidn dstq, dstmp - punpcklbw m3, m3 - psraw m3, 8 ; sign-extend - je .h_w4 .h_w2: mova m2, [base+spel_h_shuf2] pshufd m3, m3, q2121 @@ -1277,89 +1247,111 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my jg .h_w2_loop RET .h_w4: - WIN64_SPILL_XMM 8 - mova m6, [base+spel_h_shufA] - mova m7, [base+spel_h_shufB] + movzx mxd, mxb + lea srcq, [srcq-2] + movq m3, [base+subpel_filters+mxq*8] + movifnidn dstq, dstmp + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + jl .h_w2 + WIN64_SPILL_XMM 9 + mova m7, [base+spel_h_shufA] +%if ARCH_X86_32 + %define m8 [base+spel_h_shufB] +%else + mova m8, [base+spel_h_shufB] +%endif pshufd m2, m3, q1111 pshufd m3, m3, q2222 .h_w4_loop: - movu m1, [srcq] - add srcq, ssq - pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 - pshufb m1, m7 ; 2 3 3 4 4 5 5 6 - pmaddwd m0, m2 + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m6, m0, m7 ; 0 1 1 2 2 3 3 4 + pmaddwd m6, m2 + pshufb m0, m8 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m3 + paddd m0, m6 + pshufb m6, m1, m7 + pmaddwd m6, m2 + pshufb m1, m8 pmaddwd m1, m3 paddd m0, m4 - paddd m0, m1 + paddd m6, m4 + paddd m1, m6 psrad m0, 6 - packssdw m0, m0 + psrad m1, 6 + packssdw m0, m1 pxor m1, m1 pminsw m0, m5 pmaxsw m0, m1 - movq [dstq], m0 - add dstq, dsq - dec hd + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 jg .h_w4_loop RET -.h_w8: - WIN64_SPILL_XMM 12 +.h: + RESET_STACK_STATE + test myd, 0xf00 + jnz .hv + mov myd, r8m + movd m5, r8m + shr myd, 11 + movddup m4, [base+put_8tap_h_rnd+myq*8] + movifnidn dsq, dsmp + pshufb m5, [base+pw_256] + sub wd, 4 + jle .h_w4 + WIN64_SPILL_XMM 11 shr mxd, 16 - movq m3, [base+subpel_filters+mxq*8] + movq m2, [base+subpel_filters+1+mxq*8] movifnidn dstq, dstmp mova m6, [base+spel_h_shufA] mova m7, [base+spel_h_shufB] -%if UNIX64 - mov wd, wd -%endif lea srcq, [srcq+wq*2] - punpcklbw m3, m3 + punpcklbw m2, m2 lea dstq, [dstq+wq*2] - psraw m3, 8 + psraw m2, 8 neg wq %if ARCH_X86_32 - ALLOC_STACK -16*4 - pshufd m0, m3, q0000 - pshufd m1, m3, q1111 - pshufd m2, m3, q2222 - pshufd m3, m3, q3333 + ALLOC_STACK -16*3 + %define m8 [rsp+16*0] + %define m9 [rsp+16*1] + %define m10 [rsp+16*2] + pshufd m0, m2, q0000 + pshufd m1, m2, q1111 + pshufd m2, m2, q2222 mova m8, m0 mova m9, m1 mova m10, m2 - mova m11, m3 %else - pshufd m8, m3, q0000 - pshufd m9, m3, q1111 - pshufd m10, m3, q2222 - pshufd m11, m3, q3333 + pshufd m8, m2, q0000 + pshufd m9, m2, q1111 + pshufd m10, m2, q2222 %endif .h_w8_loop0: mov r6, wq .h_w8_loop: - movu m0, [srcq+r6*2- 6] - movu m1, [srcq+r6*2+ 2] - pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4 - pshufb m0, m7 ; 2 3 3 4 4 5 5 6 - pmaddwd m2, m8 ; abcd0 - pmaddwd m0, m9 ; abcd1 - pshufb m3, m1, m6 ; 4 5 5 6 6 7 7 8 - pshufb m1, m7 ; 6 7 7 8 8 9 9 a - paddd m2, m4 - paddd m0, m2 + movu m3, [srcq+r6*2-4] + movu m2, [srcq+r6*2+8] + pshufb m0, m3, m6 ; 01 12 23 34 + pmaddwd m0, m8 ; abcd0 + pshufb m3, m7 ; 23 34 45 56 + pmaddwd m1, m9, m3 ; abcd1 + paddd m0, m1 + pshufb m1, m2, m6 ; 67 78 89 9a + shufpd m3, m1, 0x01 ; 45 56 67 78 + pmaddwd m1, m9 ; efgh1 + pshufb m2, m7 ; 89 9a ab bc + pmaddwd m2, m10 ; efgh2 + paddd m1, m2 pmaddwd m2, m10, m3 ; abcd2 pmaddwd m3, m8 ; efgh0 + paddd m0, m4 + paddd m1, m4 paddd m0, m2 - pmaddwd m2, m11, m1 ; abcd3 - pmaddwd m1, m9 ; efgh1 - paddd m0, m2 - 
movu m2, [srcq+r6*2+10] - paddd m3, m4 - paddd m1, m3 - pshufb m3, m2, m6 ; 8 9 9 a a b b c - pshufb m2, m7 ; a b b c c d d e - pmaddwd m3, m10 ; efgh2 - pmaddwd m2, m11 ; efgh3 paddd m1, m3 - paddd m1, m2 psrad m0, 6 psrad m1, 6 packssdw m0, m1 @@ -1379,78 +1371,71 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my shr myd, 16 cmp hd, 6 cmovb myd, mxd - movq m3, [base+subpel_filters+myq*8] - WIN64_SPILL_XMM 15 - movd m7, r8m + movq m2, [base+subpel_filters+1+myq*8] + WIN64_SPILL_XMM 11, 16 + movd m5, r8m movifnidn dstq, dstmp movifnidn dsq, dsmp - punpcklbw m3, m3 - pshufb m7, [base+pw_256] - psraw m3, 8 ; sign-extend + punpcklbw m2, m2 + pshufb m5, [base+pw_256] + psraw m2, 8 ; sign-extend %if ARCH_X86_32 - ALLOC_STACK -16*7 - pshufd m0, m3, q0000 - pshufd m1, m3, q1111 - pshufd m2, m3, q2222 - pshufd m3, m3, q3333 + ALLOC_STACK -16*4 + pshufd m0, m2, q0000 + mov r6, ssq + pshufd m1, m2, q1111 + neg r6 + pshufd m2, m2, q2222 mova m8, m0 mova m9, m1 mova m10, m2 - mova m11, m3 -%else - pshufd m8, m3, q0000 - pshufd m9, m3, q1111 - pshufd m10, m3, q2222 - pshufd m11, m3, q3333 -%endif - lea r6, [ssq*3] - sub srcq, r6 cmp wd, 2 jne .v_w4 +%else + mov r6, ssq + pshufd m8, m2, q0000 + neg r6 + cmp wd, 4 + jg .v_w8 + pshufd m9, m2, q1111 + pshufd m10, m2, q2222 + je .v_w4 +%endif .v_w2: - movd m1, [srcq+ssq*0] + movd m1, [srcq+r6 *2] + movd m3, [srcq+r6 *1] + movd m2, [srcq+ssq*0] movd m4, [srcq+ssq*1] - movd m2, [srcq+ssq*2] - add srcq, r6 - movd m5, [srcq+ssq*0] - movd m3, [srcq+ssq*1] - movd m6, [srcq+ssq*2] - add srcq, r6 + lea srcq, [srcq+ssq*2] movd m0, [srcq+ssq*0] - punpckldq m1, m4 ; 0 1 - punpckldq m4, m2 ; 1 2 - punpckldq m2, m5 ; 2 3 - punpckldq m5, m3 ; 3 4 - punpckldq m3, m6 ; 4 5 - punpckldq m6, m0 ; 5 6 - punpcklwd m1, m4 ; 01 12 - punpcklwd m2, m5 ; 23 34 - punpcklwd m3, m6 ; 45 56 + punpckldq m1, m3 ; 0 1 + punpckldq m3, m2 ; 1 2 + punpckldq m2, m4 ; 2 3 + punpckldq m4, m0 ; 3 4 + punpcklwd m1, m3 ; 01 12 + punpcklwd m2, m4 ; 23 34 pxor m6, m6 .v_w2_loop: - movd m4, [srcq+ssq*1] + movd m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - pmaddwd m5, m8, m1 ; a0 b0 + pmaddwd m4, m8, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m9 ; a1 b1 - paddd m5, m2 - mova m2, m3 - pmaddwd m3, m10 ; a2 b2 - paddd m5, m3 - punpckldq m3, m0, m4 ; 6 7 + paddd m4, m2 + punpckldq m2, m0, m3 ; 4 5 movd m0, [srcq+ssq*0] - punpckldq m4, m0 ; 7 8 - punpcklwd m3, m4 ; 67 78 - pmaddwd m4, m11, m3 ; a3 b3 - paddd m5, m4 - psrad m5, 5 - packssdw m5, m5 - pmaxsw m5, m6 - pavgw m5, m6 - pminsw m5, m7 - movd [dstq+dsq*0], m5 - pshuflw m5, m5, q3232 - movd [dstq+dsq*1], m5 + punpckldq m3, m0 ; 5 6 + punpcklwd m2, m3 ; 67 78 + pmaddwd m3, m10, m2 ; a2 b2 + paddd m4, m3 + psrad m4, 5 + packssdw m4, m4 + pmaxsw m4, m6 + pavgw m4, m6 + pminsw m4, m5 + movd [dstq+dsq*0], m4 + pshuflw m4, m4, q3232 + movd [dstq+dsq*1], m4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop @@ -1458,563 +1443,1991 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my .v_w4: %if ARCH_X86_32 shl wd, 14 -%if STACK_ALIGNMENT < 16 - mov [esp+4*29], srcq - mov [esp+4*30], dstq -%else - mov srcmp, srcq -%endif + lea srcq, [srcq+r6*2] lea wd, [wq+hq-(1<<16)] -%else - shl wd, 6 - mov r7, srcq - mov r8, dstq - lea wd, [wq+hq-(1<<8)] +%if STACK_ALIGNMENT < 16 + %define dstmp [esp+16*3] %endif .v_w4_loop0: + mov dstmp, dstq movq m1, [srcq+ssq*0] movq m2, [srcq+ssq*1] - movq m3, [srcq+ssq*2] - add srcq, r6 - movq m4, [srcq+ssq*0] - movq m5, [srcq+ssq*1] - movq m6, [srcq+ssq*2] - add srcq, r6 - movq m0, [srcq+ssq*0] + lea r6, [srcq+ssq*2] + 
movq m3, [r6 +ssq*0] + movq m4, [r6 +ssq*1] + lea r6, [r6 +ssq*2] +%else + movq m1, [srcq+r6 *2] + movq m2, [srcq+r6 *1] + lea r6, [srcq+ssq*2] + movq m3, [srcq+ssq*0] + movq m4, [srcq+ssq*1] +%endif + movq m0, [r6 +ssq*0] punpcklwd m1, m2 ; 01 punpcklwd m2, m3 ; 12 punpcklwd m3, m4 ; 23 - punpcklwd m4, m5 ; 34 - punpcklwd m5, m6 ; 45 - punpcklwd m6, m0 ; 56 -%if ARCH_X86_32 - jmp .v_w4_loop_start + punpcklwd m4, m0 ; 34 .v_w4_loop: - mova m1, m12 - mova m2, m13 - mova m3, m14 -.v_w4_loop_start: - pmaddwd m1, m8 ; a0 - pmaddwd m2, m8 ; b0 - mova m12, m3 - mova m13, m4 + pmaddwd m6, m8, m1 ; a0 + pmaddwd m7, m8, m2 ; b0 + mova m1, m3 pmaddwd m3, m9 ; a1 + mova m2, m4 pmaddwd m4, m9 ; b1 - paddd m1, m3 - paddd m2, m4 - mova m14, m5 - mova m4, m6 - pmaddwd m5, m10 ; a2 - pmaddwd m6, m10 ; b2 - paddd m1, m5 - paddd m2, m6 - movq m6, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - punpcklwd m5, m0, m6 ; 67 - movq m0, [srcq+ssq*0] - pmaddwd m3, m11, m5 ; a3 - punpcklwd m6, m0 ; 78 - paddd m1, m3 - pmaddwd m3, m11, m6 ; b3 - paddd m2, m3 - psrad m1, 5 - psrad m2, 5 - packssdw m1, m2 - pxor m2, m2 - pmaxsw m1, m2 - pavgw m1, m2 - pminsw m1, m7 - movq [dstq+dsq*0], m1 - movhps [dstq+dsq*1], m1 + paddd m6, m3 + movq m3, [r6+ssq*0] + paddd m7, m4 + movq m4, [r6+ssq*1] + lea r6, [r6+ssq*2] + movq m0, [r6+ssq*0] + punpcklwd m3, m4 ; 45 + punpcklwd m4, m0 ; 56 + pmaddwd m0, m10, m3 ; a2 + paddd m6, m0 + pmaddwd m0, m10, m4 ; b2 + paddd m7, m0 + psrad m6, 5 + psrad m7, 5 + packssdw m6, m7 + pxor m7, m7 + pmaxsw m6, m7 + pavgw m6, m7 + pminsw m6, m5 + movq [dstq+dsq*0], m6 + movhps [dstq+dsq*1], m6 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop -%if STACK_ALIGNMENT < 16 - mov srcq, [esp+4*29] - mov dstq, [esp+4*30] - movzx hd, ww - add srcq, 8 - add dstq, 8 - mov [esp+4*29], srcq - mov [esp+4*30], dstq -%else - mov srcq, srcmp +%if ARCH_X86_32 mov dstq, dstmp - movzx hd, ww add srcq, 8 + movzx hd, ww add dstq, 8 - mov srcmp, srcq - mov dstmp, dstq -%endif sub wd, 1<<16 + jg .v_w4_loop0 + RET %else -.v_w4_loop: - pmaddwd m12, m8, m1 ; a0 - pmaddwd m13, m8, m2 ; b0 - mova m1, m3 - mova m2, m4 - pmaddwd m3, m9 ; a1 - pmaddwd m4, m9 ; b1 - paddd m12, m3 - paddd m13, m4 - mova m3, m5 - mova m4, m6 - pmaddwd m5, m10 ; a2 - pmaddwd m6, m10 ; b2 - paddd m12, m5 - paddd m13, m6 - movq m6, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - punpcklwd m5, m0, m6 ; 67 - movq m0, [srcq+ssq*0] - pmaddwd m14, m11, m5 ; a3 - punpcklwd m6, m0 ; 78 - paddd m12, m14 - pmaddwd m14, m11, m6 ; b3 - paddd m13, m14 - psrad m12, 5 - psrad m13, 5 - packssdw m12, m13 - pxor m13, m13 - pmaxsw m12, m13 - pavgw m12, m13 - pminsw m12, m7 - movq [dstq+dsq*0], m12 - movhps [dstq+dsq*1], m12 - lea dstq, [dstq+dsq*2] + RET +.v_w8: + mova r6m, m8 + shl wd, 5 + pshufd m6, m2, q1111 + lea wd, [wq+hq-(1<<8)] + pshufd m7, m2, q2222 + WIN64_PUSH_XMM 16 +.v_w8_loop0: + movu m9, [srcq+ r6*2] + movu m11, [srcq+ r6*1] + lea r7, [srcq+ssq*2] + movu m13, [srcq+ssq*0] + movu m15, [srcq+ssq*1] + mov r8, dstq + movu m4, [r7 +ssq*0] + punpcklwd m8, m9, m11 ; 01 + punpckhwd m9, m11 + punpcklwd m10, m11, m13 ; 12 + punpckhwd m11, m13 + punpcklwd m12, m13, m15 ; 23 + punpckhwd m13, m15 + punpcklwd m14, m15, m4 ; 34 + punpckhwd m15, m4 +.v_w8_loop: + mova m3, r6m + pmaddwd m0, m8, m3 ; a0 + pmaddwd m2, m9, m3 ; a0' + pmaddwd m1, m10, m3 ; b0 + pmaddwd m3, m11 ; b0' + mova m8, m12 + pmaddwd m12, m6 ; a1 + mova m9, m13 + pmaddwd m13, m6 ; a1' + mova m10, m14 + pmaddwd m14, m6 ; b1 + mova m11, m15 + pmaddwd m15, m6 ; b1' + paddd m0, m12 + paddd m2, m13 + movu m13, [r7+ssq*0] + 
paddd m1, m14 + paddd m3, m15 + movu m15, [r7+ssq*1] + lea r7, [r7+ssq*2] + movu m4, [r7+ssq*0] + punpcklwd m12, m13, m15 ; 45 + punpckhwd m13, m15 + punpcklwd m14, m15, m4 ; 56 + punpckhwd m15, m4 + pmaddwd m4, m7, m12 ; a2 + paddd m0, m4 + pmaddwd m4, m7, m13 ; a2' + paddd m2, m4 + pmaddwd m4, m7, m14 ; b2 + paddd m1, m4 + pmaddwd m4, m7, m15 ; b2' + paddd m3, m4 + REPX {psrad x, 5}, m0, m2, m1, m3 + packssdw m0, m2 + packssdw m1, m3 + pxor m2, m2 + pmaxsw m0, m2 + pmaxsw m1, m2 + pavgw m0, m2 + pavgw m1, m2 + pminsw m0, m5 + pminsw m1, m5 + mova [r8+dsq*0], m0 + mova [r8+dsq*1], m1 + lea r8, [r8+dsq*2] sub hd, 2 - jg .v_w4_loop - add r7, 8 - add r8, 8 + jg .v_w8_loop + add srcq, 16 + add dstq, 16 movzx hd, wb - mov srcq, r7 - mov dstq, r8 sub wd, 1<<8 -%endif - jg .v_w4_loop0 + jg .v_w8_loop0 RET +%endif .hv: - RESET_STACK_STATE + cmp wd, 4 + jg .hv_w8 + WIN64_SPILL_XMM 12, 16 %if ARCH_X86_32 - movd m4, r8m - mova m6, [base+pd_512] - pshufb m4, [base+pw_256] + movd m3, r8m + pshufb m3, [base+pw_256] %else -%if WIN64 - ALLOC_STACK 16*6, 16 -%endif - movd m15, r8m - pshufb m15, [base+pw_256] + movd m11, r8m + pshufb m11, [base+pw_256] %endif - cmp wd, 4 - jg .hv_w8 movzx mxd, mxb - je .hv_w4 movq m0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovb myd, mxd - movq m3, [base+subpel_filters+myq*8] -%if ARCH_X86_32 - mov dstq, dstmp - mov dsq, dsmp - mova m5, [base+spel_h_shuf2] - ALLOC_STACK -16*8 -%else - mova m6, [base+pd_512] - mova m9, [base+spel_h_shuf2] -%endif + movq m2, [base+subpel_filters+1+myq*8] + movddup m7, [base+pd_8704] + sub srcq, 2 pshuflw m0, m0, q2121 - pxor m7, m7 - punpcklbw m7, m0 - punpcklbw m3, m3 - psraw m3, 8 ; sign-extend + pxor m6, m6 + punpcklbw m6, m0 + punpcklbw m2, m2 + psraw m2, 8 ; sign-extend test dword r8m, 0x800 jz .hv_w2_10bpc - psraw m7, 2 - psllw m3, 2 + movddup m7, [base+pd_2560] + psraw m6, 2 + psllw m2, 2 .hv_w2_10bpc: - lea r6, [ssq*3] - sub srcq, 2 - sub srcq, r6 %if ARCH_X86_32 - pshufd m0, m3, q0000 - pshufd m1, m3, q1111 - pshufd m2, m3, q2222 - pshufd m3, m3, q3333 - mova m9, m5 - mova m11, m0 - mova m12, m1 - mova m13, m2 - mova m14, m3 - mova m15, m4 +%assign regs_used 2 + ALLOC_STACK -16*7 +%assign regs_used 7 + mov dstq, r0mp + mov dsq, r1mp + %define m11 [esp+16*4] + pshufd m0, m2, q0000 + pshufd m1, m2, q1111 + pshufd m2, m2, q2222 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + neg ssq + movu m3, [srcq+ssq*2] + movu m4, [srcq+ssq*1] + neg ssq %else - pshufd m11, m3, q0000 - pshufd m12, m3, q1111 - pshufd m13, m3, q2222 - pshufd m14, m3, q3333 + pshufd m8, m2, q0000 + mov r6, ssq + pshufd m9, m2, q1111 + neg r6 + pshufd m10, m2, q2222 + movu m3, [srcq+r6 *2] + movu m4, [srcq+r6 *1] %endif + movu m1, [srcq+ssq*0] + movu m0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] movu m2, [srcq+ssq*0] - movu m3, [srcq+ssq*1] - movu m1, [srcq+ssq*2] - add srcq, r6 - movu m4, [srcq+ssq*0] -%if ARCH_X86_32 - REPX {pshufb x, m5}, m2, m3, m1, m4 -%else - REPX {pshufb x, m9}, m2, m3, m1, m4 -%endif - REPX {pmaddwd x, m7}, m2, m3, m1, m4 - phaddd m2, m3 ; 0 1 - phaddd m1, m4 ; 2 3 - movu m3, [srcq+ssq*1] - movu m4, [srcq+ssq*2] - add srcq, r6 - movu m0, [srcq+ssq*0] -%if ARCH_X86_32 - REPX {pshufb x, m5}, m3, m4, m0 -%else - REPX {pshufb x, m9}, m3, m4, m0 -%endif - REPX {pmaddwd x, m7}, m3, m4, m0 - phaddd m3, m4 ; 4 5 - phaddd m0, m0 ; 6 6 - REPX {paddd x, m6}, m2, m1, m3, m0 - REPX {psrad x, 10}, m2, m1, m3, m0 - packssdw m2, m1 ; 0 1 2 3 - packssdw m3, m0 ; 4 5 6 _ - palignr m4, m3, m2, 4 ; 1 2 3 4 - pshufd m5, m3, q0321 ; 5 
6 _ _ + cmp wd, 4 + je .hv_w4 + mova m5, [base+spel_h_shuf2] + REPX {pshufb x, m5}, m3, m4, m0, m1, m2 + REPX {pmaddwd x, m6}, m3, m0, m4, m1, m2 + phaddd m3, m0 ; 0 3 + phaddd m4, m1 ; 1 2 + phaddd m0, m2 ; 3 4 + REPX {paddd x, m7}, m3, m4, m0 + REPX {psrad x, 10}, m3, m4, m0 + packssdw m3, m4 ; 0 3 1 2 + packssdw m4, m0 ; 1 2 3 4 + pshufd m2, m3, q1320 ; 0 1 2 3 punpcklwd m1, m2, m4 ; 01 12 punpckhwd m2, m4 ; 23 34 - punpcklwd m3, m5 ; 45 56 .hv_w2_loop: - movu m4, [srcq+ssq*1] + movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - movu m5, [srcq+ssq*0] - pshufb m4, m9 - pshufb m5, m9 - pmaddwd m4, m7 - pmaddwd m5, m7 - phaddd m4, m5 - pmaddwd m5, m11, m1 ; a0 b0 + movu m4, [srcq+ssq*0] + pshufb m3, m5 + pshufb m4, m5 + pmaddwd m3, m6 + pmaddwd m4, m6 + phaddd m3, m4 + pmaddwd m4, m8, m1 ; a0 b0 mova m1, m2 - pmaddwd m2, m12 ; a1 b1 - paddd m5, m2 - mova m2, m3 - pmaddwd m3, m13 ; a2 b2 + pmaddwd m2, m9 ; a1 b1 + paddd m4, m2 + paddd m3, m7 + psrad m3, 10 ; 5 6 + packssdw m0, m3 + pshufd m2, m0, q2103 + punpckhwd m2, m0 ; 45 56 + mova m0, m3 + pmaddwd m3, m10, m2 ; a2 b2 + paddd m4, m3 + psrad m4, 10 + packssdw m4, m4 + pxor m3, m3 + pminsw m4, m11 + pmaxsw m4, m3 + movd [dstq+dsq*0], m4 + pshuflw m4, m4, q1032 + movd [dstq+dsq*1], m4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: +%if ARCH_X86_32 + %define m12 [esp+16*5] + %define m13 [esp+16*6] + %define m14 [base+spel_h_shufA] + %define m15 [base+spel_h_shufB] + pshufd m5, m6, q0000 + pshufd m6, m6, q1111 + mova m12, m5 + mova m13, m6 +%else + WIN64_PUSH_XMM 16 + mova m14, [base+spel_h_shufA] + mova m15, [base+spel_h_shufB] + pshufd m12, m6, q0000 + pshufd m13, m6, q1111 +%endif +%macro HV_H_W4_6TAP 3-4 m15 ; dst, src, tmp, shufB + pshufb %3, %2, m14 + pmaddwd %3, m12 + pshufb %2, %4 + pmaddwd %2, m13 + paddd %3, m7 + paddd %1, %2, %3 +%endmacro + HV_H_W4_6TAP m3, m3, m5 + HV_H_W4_6TAP m4, m4, m5 + HV_H_W4_6TAP m5, m1, m5 + HV_H_W4_6TAP m0, m0, m1 + HV_H_W4_6TAP m2, m2, m1 + REPX {psrad x, 10}, m3, m5, m4, m0, m2 + packssdw m3, m5 ; 0 2 + packssdw m4, m0 ; 1 3 + packssdw m5, m2 ; 2 4 + punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + punpcklwd m2, m4, m5 ; 12 + punpckhwd m4, m5 ; 34 +.hv_w4_loop: + movu m0, [srcq+ssq*1] + pmaddwd m5, m8, m1 ; a0 + lea srcq, [srcq+ssq*2] + pmaddwd m6, m8, m2 ; b0 + mova m1, m3 + pmaddwd m3, m9 ; a1 + mova m2, m4 + pmaddwd m4, m9 ; b1 paddd m5, m3 - paddd m4, m6 - psrad m4, 10 ; 7 8 - packssdw m0, m4 - pshufd m3, m0, q2103 - punpckhwd m3, m0 ; 67 78 - mova m0, m4 - pmaddwd m4, m14, m3 ; a3 b3 - paddd m5, m6 - paddd m5, m4 + movu m3, [srcq+ssq*0] + paddd m6, m4 + HV_H_W4_6TAP m0, m0, m4 + HV_H_W4_6TAP m3, m3, m4 + psrad m4, m2, 16 + psrad m0, 10 + psrad m3, 10 + packssdw m4, m0 ; 4 5 + packssdw m0, m3 ; 5 6 + punpcklwd m3, m4, m0 ; 45 + punpckhwd m4, m0 ; 56 + pmaddwd m0, m10, m3 ; a2 + paddd m5, m0 + pmaddwd m0, m10, m4 ; b2 + paddd m6, m0 psrad m5, 10 - packssdw m5, m5 - pxor m4, m4 - pminsw m5, m15 - pmaxsw m5, m4 - movd [dstq+dsq*0], m5 - pshuflw m5, m5, q3232 - movd [dstq+dsq*1], m5 + psrad m6, 10 + packssdw m5, m6 + pxor m6, m6 + pminsw m5, m11 + pmaxsw m5, m6 + movq [dstq+dsq*0], m5 + movhps [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 - jg .hv_w2_loop + jg .hv_w4_loop RET .hv_w8: + RESET_STACK_STATE shr mxd, 16 -.hv_w4: - movq m2, [base+subpel_filters+mxq*8] + movq m2, [base+subpel_filters+1+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovb myd, mxd - movq m3, [base+subpel_filters+myq*8] -%if ARCH_X86_32 - RESET_STACK_STATE - mov dstq, dstmp - mov dsq, dsmp - mova m0, 
[base+spel_h_shufA] - mova m1, [base+spel_h_shufB] - ALLOC_STACK -16*15 - mova m8, m0 - mova m9, m1 - mova m14, m6 -%else - mova m8, [base+spel_h_shufA] - mova m9, [base+spel_h_shufB] -%endif + movq m1, [base+subpel_filters+1+myq*8] + movd m3, r8m + movddup m4, [base+pd_8704] + pshufb m3, [base+pw_256] pxor m0, m0 punpcklbw m0, m2 - punpcklbw m3, m3 - psraw m3, 8 + punpcklbw m1, m1 + sub srcq, 4 + psraw m1, 8 ; sign-extend test dword r8m, 0x800 - jz .hv_w4_10bpc + jz .hv_w8_10bpc + movddup m4, [base+pd_2560] psraw m0, 2 - psllw m3, 2 -.hv_w4_10bpc: - lea r6, [ssq*3] - sub srcq, 6 - sub srcq, r6 + psllw m1, 2 +.hv_w8_10bpc: +%if ARCH_X86_32 +%assign regs_used 2 + ALLOC_STACK -16*9 +%assign regs_used 7 + mov dstq, r0mp + mov dsq, r1mp + mova [rsp+16*7], m4 +%else + ALLOC_STACK 16*7, 16 +%endif + mova [rsp+16*6], m3 + pshufd m2, m0, q0000 + mova [rsp+16*0], m2 + pshufd m2, m0, q1111 + mova [rsp+16*1], m2 + pshufd m0, m0, q2222 + mova [rsp+16*2], m0 + pshufd m2, m1, q0000 + mova [rsp+16*3], m2 + pshufd m2, m1, q1111 + mova [rsp+16*4], m2 + pshufd m1, m1, q2222 + mova [rsp+16*5], m1 + mov r6, ssq + neg r6 %if ARCH_X86_32 - %define tmp esp+16*8 shl wd, 14 + lea r4d, [wq+hq-(1<<16)] %if STACK_ALIGNMENT < 16 - mov [esp+4*61], srcq - mov [esp+4*62], dstq -%else + %define srcmp [esp+16*8+4*0] + %define dstmp [esp+16*8+4*1] +%endif +%macro HV_H_6TAP 3-6 [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-2], mul[1-3] + punpcklwd %1, %2, %3 ; 01 12 23 34 + punpckhwd %2, %3 ; 45 56 67 78 + pmaddwd %3, %4, %1 ; a0 + shufpd %1, %2, 0x01 ; 23 34 45 56 + pmaddwd %2, %6 ; a2 + pmaddwd %1, %5 ; a1 + paddd %2, %3 + paddd %1, %2 +%endmacro +.hv_w8_loop0: mov srcmp, srcq -%endif - mova [tmp+16*5], m4 - lea wd, [wq+hq-(1<<16)] - pshufd m1, m0, q0000 - pshufd m2, m0, q1111 - pshufd m5, m0, q2222 - pshufd m0, m0, q3333 - mova m10, m1 - mova m11, m2 - mova m12, m5 - mova m13, m0 -%else -%if WIN64 - %define tmp rsp -%else - %define tmp rsp-104 ; red zone -%endif - shl wd, 6 - mov r7, srcq - mov r8, dstq - lea wd, [wq+hq-(1<<8)] - pshufd m10, m0, q0000 - pshufd m11, m0, q1111 - pshufd m12, m0, q2222 - pshufd m13, m0, q3333 - mova [tmp+16*5], m15 -%endif - pshufd m0, m3, q0000 - pshufd m1, m3, q1111 - pshufd m2, m3, q2222 - pshufd m3, m3, q3333 - mova [tmp+16*1], m0 - mova [tmp+16*2], m1 - mova [tmp+16*3], m2 - mova [tmp+16*4], m3 -%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512] - pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4 - pshufb m%1, m9 ; 2 3 3 4 4 5 5 6 - pmaddwd m%3, m10 - pmaddwd m%1, m11 - paddd m%3, %5 - paddd m%1, m%3 - pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8 - pshufb m%2, m9 ; 6 7 7 8 8 9 9 a - pmaddwd m%3, m12 - pmaddwd m%2, m13 - paddd m%1, m%3 - paddd m%1, m%2 - psrad m%1, %4 -%endmacro -.hv_w4_loop0: -%if ARCH_X86_64 - mova m14, [pd_512] -%endif - movu m4, [srcq+ssq*0+0] - movu m1, [srcq+ssq*0+8] + mov dstmp, dstq + movu m5, [srcq+r6*2+0] + movu m6, [srcq+r6*2+2] + mova m7, [rsp+16*0] + mova m1, [rsp+16*1] + mova m0, [rsp+16*2] + HV_H_6TAP m2, m5, m6, m7, m1, m0 + movu m5, [srcq+r6*1+0] + movu m6, [srcq+r6*1+2] + HV_H_6TAP m3, m5, m6, m7, m1, m0 + movu m5, [srcq+ssq*0+0] + movu m6, [srcq+ssq*0+2] + HV_H_6TAP m4, m5, m6, m7, m1, m0 movu m5, [srcq+ssq*1+0] - movu m2, [srcq+ssq*1+8] - movu m6, [srcq+ssq*2+0] - movu m3, [srcq+ssq*2+8] - add srcq, r6 - PUT_8TAP_HV_H 4, 1, 0, 10 - PUT_8TAP_HV_H 5, 2, 0, 10 - PUT_8TAP_HV_H 6, 3, 0, 10 - movu m7, [srcq+ssq*0+0] - movu m2, [srcq+ssq*0+8] - movu m1, [srcq+ssq*1+0] - movu m3, [srcq+ssq*1+8] - PUT_8TAP_HV_H 7, 2, 0, 10 - PUT_8TAP_HV_H 1, 3, 0, 10 - movu 
m2, [srcq+ssq*2+0] - movu m3, [srcq+ssq*2+8] - add srcq, r6 - PUT_8TAP_HV_H 2, 3, 0, 10 - packssdw m4, m7 ; 0 3 - packssdw m5, m1 ; 1 4 - movu m0, [srcq+ssq*0+0] - movu m1, [srcq+ssq*0+8] - PUT_8TAP_HV_H 0, 1, 3, 10 - packssdw m6, m2 ; 2 5 - packssdw m7, m0 ; 3 6 - punpcklwd m1, m4, m5 ; 01 - punpckhwd m4, m5 ; 34 - punpcklwd m2, m5, m6 ; 12 - punpckhwd m5, m6 ; 45 - punpcklwd m3, m6, m7 ; 23 - punpckhwd m6, m7 ; 56 -%if ARCH_X86_32 - jmp .hv_w4_loop_start -.hv_w4_loop: - mova m1, [tmp+16*6] - mova m2, m15 -.hv_w4_loop_start: - mova m7, [tmp+16*1] - pmaddwd m1, m7 ; a0 - pmaddwd m2, m7 ; b0 - mova m7, [tmp+16*2] - mova [tmp+16*6], m3 - pmaddwd m3, m7 ; a1 - mova m15, m4 - pmaddwd m4, m7 ; b1 - mova m7, [tmp+16*3] - paddd m1, m3 - paddd m2, m4 - mova m3, m5 - pmaddwd m5, m7 ; a2 - mova m4, m6 - pmaddwd m6, m7 ; b2 - paddd m1, m5 - paddd m2, m6 - movu m7, [srcq+ssq*1+0] - movu m5, [srcq+ssq*1+8] + movu m6, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] - PUT_8TAP_HV_H 7, 5, 6, 10 - packssdw m0, m7 ; 6 7 - mova [tmp+16*0], m0 - movu m0, [srcq+ssq*0+0] - movu m5, [srcq+ssq*0+8] - PUT_8TAP_HV_H 0, 5, 6, 10 - mova m6, [tmp+16*0] - packssdw m7, m0 ; 7 8 - punpcklwd m5, m6, m7 ; 67 - punpckhwd m6, m7 ; 78 - pmaddwd m7, m5, [tmp+16*4] - paddd m1, m7 ; a3 - pmaddwd m7, m6, [tmp+16*4] - paddd m2, m7 ; b3 - psrad m1, 9 - psrad m2, 9 - packssdw m1, m2 - pxor m7, m7 - pmaxsw m1, m7 - pavgw m7, m1 - pminsw m7, [tmp+16*5] - movq [dstq+dsq*0], m7 - movhps [dstq+dsq*1], m7 + HV_H_6TAP m0, m5, m6, m7, m1 + movu m5, [srcq+ssq*0+0] + movu m6, [srcq+ssq*0+2] + HV_H_6TAP m1, m5, m6, m7 + mova m5, [rsp+16*7] + REPX {paddd x, m5}, m2, m3, m4, m0, m1 + REPX {psrad x, 10}, m2, m4, m3, m0, m1 + packssdw m2, m4 ; 0 2 + packssdw m3, m0 ; 1 3 + packssdw m4, m1 ; 2 4 + punpcklwd m0, m2, m3 ; 01 + punpckhwd m2, m3 ; 23 + punpcklwd m1, m3, m4 ; 12 + punpckhwd m3, m4 ; 34 +.hv_w8_loop: + mova m5, [rsp+16*3] + mova m6, [rsp+16*4] + pmaddwd m4, m0, m5 ; a0 + pmaddwd m5, m1 ; b0 + mova m0, m2 + pmaddwd m2, m6 ; a1 + mova m1, m3 + pmaddwd m3, m6 ; b1 + paddd m4, m2 + movu m2, [srcq+ssq*1+0] + paddd m5, m3 + movu m3, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + HV_H_6TAP m6, m2, m3 + movu m2, [srcq+ssq*0+0] + movu m3, [srcq+ssq*0+2] + HV_H_6TAP m7, m2, m3 + mova m2, [rsp+16*7] + psrad m3, m1, 16 + paddd m6, m2 + paddd m7, m2 + psrad m6, 10 + psrad m7, 10 + packssdw m3, m6 ; 4 5 + packssdw m6, m7 ; 5 6 + mova m7, [rsp+16*5] + punpcklwd m2, m3, m6 ; 45 + punpckhwd m3, m6 ; 56 + pmaddwd m6, m2, m7 ; a2 + pmaddwd m7, m3 ; b2 + paddd m4, m6 + paddd m5, m7 + psrad m4, 10 + psrad m5, 10 + packssdw m4, m5 + pxor m5, m5 + pminsw m4, [rsp+16*6] + pmaxsw m4, m5 + movq [dstq+dsq*0], m4 + movhps [dstq+dsq*1], m4 lea dstq, [dstq+dsq*2] sub hd, 2 - jg .hv_w4_loop -%if STACK_ALIGNMENT < 16 - mov srcq, [esp+4*61] - mov dstq, [esp+4*62] - add srcq, 8 - add dstq, 8 - mov [esp+4*61], srcq - mov [esp+4*62], dstq -%else + jg .hv_w8_loop mov srcq, srcmp mov dstq, dstmp + movzx hd, r4w add srcq, 8 add dstq, 8 - mov srcmp, srcq - mov dstmp, dstq -%endif - movzx hd, ww - sub wd, 1<<16 + sub r4d, 1<<16 %else -.hv_w4_loop: - mova m15, [tmp+16*1] - pmaddwd m14, m15, m1 ; a0 - pmaddwd m15, m2 ; b0 - mova m7, [tmp+16*2] - mova m1, m3 - pmaddwd m3, m7 ; a1 - mova m2, m4 - pmaddwd m4, m7 ; b1 - mova m7, [tmp+16*3] - paddd m14, m3 - paddd m15, m4 - mova m3, m5 - pmaddwd m5, m7 ; a2 - mova m4, m6 - pmaddwd m6, m7 ; b2 - paddd m14, m5 - paddd m15, m6 - movu m7, [srcq+ssq*1+0] - movu m5, [srcq+ssq*1+8] - lea srcq, [srcq+ssq*2] - PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512] - packssdw 
m0, m7 ; 6 7 - mova [tmp+16*0], m0 - movu m0, [srcq+ssq*0+0] - movu m5, [srcq+ssq*0+8] - PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512] - mova m6, [tmp+16*0] - packssdw m7, m0 ; 7 8 - punpcklwd m5, m6, m7 ; 67 - punpckhwd m6, m7 ; 78 - pmaddwd m7, m5, [tmp+16*4] - paddd m14, m7 ; a3 - pmaddwd m7, m6, [tmp+16*4] - paddd m15, m7 ; b3 - psrad m14, 9 - psrad m15, 9 - packssdw m14, m15 - pxor m7, m7 - pmaxsw m14, m7 - pavgw m7, m14 - pminsw m7, [tmp+16*5] - movq [dstq+dsq*0], m7 - movhps [dstq+dsq*1], m7 - lea dstq, [dstq+dsq*2] + shl wd, 5 + lea r8d, [wq+hq-256] +%macro HV_H_6TAP 5-9 [spel_h_shufA], [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-3], shift, shuf, mul[1-3] +%ifid %6 + REPX {pshufb x, %6}, %2, %3, %4 +%else + mova %1, %6 + pshufb %2, %1 ; 01 12 23 34 + pshufb %3, %1 ; 45 56 67 78 + pshufb %4, %1 ; 89 9a ab bc +%endif + pmaddwd %1, %7, %2 + shufpd %2, %3, 0x01 ; 23 34 45 56 + pmaddwd %2, %8 + paddd %1, %2 + pmaddwd %2, %9, %3 + paddd %1, %2 + pmaddwd %2, %7, %3 + shufpd %3, %4, 0x01 ; 67 78 89 9a + pmaddwd %4, %9 + pmaddwd %3, %8 + paddd %1, m4 + paddd %2, m4 + paddd %3, %4 + paddd %2, %3 + psrad %1, %5 + psrad %2, %5 + packssdw %1, %2 +%endmacro +.hv_w8_loop0: + mova m5, [spel_h_shufA] + movu m0, [srcq+r6*2+ 0] + mova m6, [rsp+16*0] + movu m1, [srcq+r6*2+ 8] + mova m7, [rsp+16*1] + movu m2, [srcq+r6*2+16] + mova m8, [rsp+16*2] + HV_H_6TAP m9, m0, m1, m2, 10, m5, m6, m7, m8 + movu m0, [srcq+r6*1+ 0] + movu m1, [srcq+r6*1+ 8] + movu m2, [srcq+r6*1+16] + lea r4, [srcq+ssq*2] + HV_H_6TAP m11, m0, m1, m2, 10, m5, m6, m7, m8 + movu m0, [srcq+ssq*0+ 0] + movu m1, [srcq+ssq*0+ 8] + movu m2, [srcq+ssq*0+16] + mov r7, dstq + HV_H_6TAP m13, m0, m1, m2, 10, m5, m6, m7, m8 + movu m0, [srcq+ssq*1+ 0] + movu m1, [srcq+ssq*1+ 8] + movu m2, [srcq+ssq*1+16] + HV_H_6TAP m15, m0, m1, m2, 10, m5, m6, m7, m8 + movu m0, [r4+ssq*0+ 0] + movu m1, [r4+ssq*0+ 8] + movu m2, [r4+ssq*0+16] + HV_H_6TAP m5, m0, m1, m2, 10, m5, m6, m7, m8 + punpcklwd m8, m9, m11 ; 01 + punpckhwd m9, m11 + punpcklwd m10, m11, m13 ; 12 + punpckhwd m11, m13 + punpcklwd m12, m13, m15 ; 23 + punpckhwd m13, m15 + punpcklwd m14, m15, m5 ; 34 + punpckhwd m15, m5 +.hv_w8_loop: + mova m3, [rsp+16*3] + mova m7, [rsp+16*4] + pmaddwd m0, m8, m3 ; a0 + mova m8, m12 + pmaddwd m2, m9, m3 ; a0' + mova m9, m13 + pmaddwd m1, m10, m3 ; b0 + mova m10, m14 + pmaddwd m3, m11 ; b0' + mova m11, m15 + REPX {pmaddwd x, m7}, m12, m13, m14, m15 + movu m6, [r4+ssq*1+ 0] + paddd m0, m12 + movu m7, [r4+ssq*1+ 8] + paddd m2, m13 + movu m12, [r4+ssq*1+16] + paddd m1, m14 + lea r4, [r4+ssq*2] + paddd m3, m15 + HV_H_6TAP m15, m6, m7, m12, 10 + movu m6, [r4+ssq*0+ 0] + movu m7, [r4+ssq*0+ 8] + movu m14, [r4+ssq*0+16] + punpcklwd m12, m5, m15 ; 45 + punpckhwd m13, m5, m15 + HV_H_6TAP m5, m6, m7, m14, 10 + mova m7, [rsp+16*5] + punpcklwd m14, m15, m5 ; 56 + punpckhwd m15, m5 + pmaddwd m6, m12, m7 ; a2 + paddd m0, m6 + pmaddwd m6, m13, m7 ; a2' + paddd m2, m6 + pmaddwd m6, m14, m7 ; b2 + pmaddwd m7, m15 ; b2' + paddd m1, m6 + mova m6, [rsp+16*6] + paddd m3, m7 + REPX {psrad x, 10}, m0, m2, m1, m3 + packssdw m0, m2 + packssdw m1, m3 + pxor m2, m2 + pminsw m0, m6 + pminsw m1, m6 + pmaxsw m0, m2 + pmaxsw m1, m2 + mova [r7+dsq*0], m0 + mova [r7+dsq*1], m1 + lea r7, [r7+dsq*2] sub hd, 2 - jg .hv_w4_loop - add r7, 8 - add r8, 8 - movzx hd, wb - mov srcq, r7 - mov dstq, r8 - sub wd, 1<<8 + jg .hv_w8_loop + add srcq, 16 + add dstq, 16 + movzx hd, r8b + sub r8d, 1<<8 %endif - jg .hv_w4_loop0 + jg .hv_w8_loop0 RET -%undef tmp +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_16bpc 
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_16bpc +PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_16bpc +PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_16bpc +PUT_8TAP_FN sharp, SHARP, SHARP + +cglobal put_8tap_16bpc, 0, 9, 0, dst, ds, src, ss, w, h, mx, my %if ARCH_X86_32 -DECLARE_REG_TMP 2, 1, 6, 4 -%elif WIN64 -DECLARE_REG_TMP 6, 4, 7, 4 -%else -DECLARE_REG_TMP 6, 7, 7, 8 + %define mxb r0b + %define mxd r0 + %define mxq r0 + %define myb r1b + %define myd r1 + %define myq r1 + %define m8 [esp+16*0] + %define m9 [esp+16*1] + %define m10 [esp+16*2] + %define m11 [esp+16*3] + %define m12 [esp+16*4] + %define m13 [esp+16*5] + %define m14 [esp+16*6] + %define m15 [esp+16*7] %endif + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + LEA t2, put_ssse3 + movifnidn wd, wm + movifnidn srcq, srcmp + movifnidn ssq, ssmp + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jz mangle(private_prefix %+ _put_6tap_16bpc_ssse3).put +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] + WIN64_SPILL_XMM 15 + movd m7, r8m + movifnidn dstq, dstmp + movifnidn dsq, dsmp + punpcklbw m3, m3 + pshufb m7, [base+pw_256] + psraw m3, 8 ; sign-extend +%if ARCH_X86_32 + ALLOC_STACK -16*7 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, q2222 + pshufd m11, m3, q3333 +%endif + lea r6, [ssq*3] + sub srcq, r6 + cmp wd, 2 + jne .v_w4 +.v_w2: + movd m1, [srcq+ssq*0] + movd m4, [srcq+ssq*1] + movd m2, [srcq+ssq*2] + add srcq, r6 + movd m5, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + movd m6, [srcq+ssq*2] + add srcq, r6 + movd m0, [srcq+ssq*0] + punpckldq m1, m4 ; 0 1 + punpckldq m4, m2 ; 1 2 + punpckldq m2, m5 ; 2 3 + punpckldq m5, m3 ; 3 4 + punpckldq m3, m6 ; 4 5 + punpckldq m6, m0 ; 5 6 + punpcklwd m1, m4 ; 01 12 + punpcklwd m2, m5 ; 23 34 + punpcklwd m3, m6 ; 45 56 + pxor m6, m6 +.v_w2_loop: + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m5, m8, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m9 ; a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m10 ; a2 b2 + paddd m5, m3 + punpckldq m3, m0, m4 ; 6 7 + movd m0, [srcq+ssq*0] + punpckldq m4, m0 ; 7 8 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m11, m3 ; a3 b3 + paddd m5, m4 + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m6 + pavgw m5, m6 + pminsw m5, m7 + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q3232 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: +%if ARCH_X86_32 + shl wd, 14 +%if STACK_ALIGNMENT < 16 + mov [esp+4*29], srcq + mov [esp+4*30], dstq +%else + mov srcmp, srcq +%endif + lea wd, [wq+hq-(1<<16)] +%else + shl wd, 6 + mov r7, srcq + mov r8, dstq + lea wd, [wq+hq-(1<<8)] +%endif +.v_w4_loop0: + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + movq m3, [srcq+ssq*2] + add srcq, r6 + movq m4, [srcq+ssq*0] + movq m5, [srcq+ssq*1] + movq m6, [srcq+ssq*2] + add srcq, r6 + movq m0, [srcq+ssq*0] + punpcklwd m1, m2 ; 01 + punpcklwd m2, m3 ; 12 + punpcklwd m3, m4 ; 23 + punpcklwd m4, m5 ; 34 + punpcklwd m5, m6 ; 45 + punpcklwd m6, m0 ; 56 +%if ARCH_X86_32 + jmp .v_w4_loop_start +.v_w4_loop: + mova m1, m12 + mova m2, m13 + mova m3, m14 +.v_w4_loop_start: + pmaddwd m1, m8 ; a0 + pmaddwd m2, m8 ; b0 + mova m12, m3 + mova m13, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m1, m3 + paddd m2, m4 + mova m14, m5 + mova 
m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m1, m5 + paddd m2, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m3, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m1, m3 + pmaddwd m3, m11, m6 ; b3 + paddd m2, m3 + psrad m1, 5 + psrad m2, 5 + packssdw m1, m2 + pxor m2, m2 + pmaxsw m1, m2 + pavgw m1, m2 + pminsw m1, m7 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop +%if STACK_ALIGNMENT < 16 + mov srcq, [esp+4*29] + mov dstq, [esp+4*30] + movzx hd, ww + add srcq, 8 + add dstq, 8 + mov [esp+4*29], srcq + mov [esp+4*30], dstq +%else + mov srcq, srcmp + mov dstq, dstmp + movzx hd, ww + add srcq, 8 + add dstq, 8 + mov srcmp, srcq + mov dstmp, dstq +%endif + sub wd, 1<<16 +%else +.v_w4_loop: + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + paddd m13, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m14, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m14 + psrad m12, 5 + psrad m13, 5 + packssdw m12, m13 + pxor m13, m13 + pmaxsw m12, m13 + pavgw m12, m13 + pminsw m12, m7 + movq [dstq+dsq*0], m12 + movhps [dstq+dsq*1], m12 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + add r7, 8 + add r8, 8 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 +%endif + jg .v_w4_loop0 + RET +.h: + RESET_STACK_STATE + test myd, 0xf00 + jnz .hv + mov myd, r8m + movd m5, r8m + shr myd, 11 + movddup m4, [base+put_8tap_h_rnd+myq*8] + movifnidn dsq, dsmp + pshufb m5, [base+pw_256] + cmp wd, 4 + jle mangle(private_prefix %+ _put_6tap_16bpc_ssse3).h_w4 + WIN64_SPILL_XMM 12 + shr mxd, 16 + movq m3, [base+subpel_filters+mxq*8] + movifnidn dstq, dstmp + mova m6, [base+spel_h_shufA] + mova m7, [base+spel_h_shufB] +%if UNIX64 + mov wd, wd +%endif + lea srcq, [srcq+wq*2] + punpcklbw m3, m3 + lea dstq, [dstq+wq*2] + psraw m3, 8 + neg wq +%if ARCH_X86_32 + ALLOC_STACK -16*4 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, q2222 + pshufd m11, m3, q3333 +%endif +.h_w8_loop0: + mov r6, wq +.h_w8_loop: + movu m0, [srcq+r6*2- 6] + movu m1, [srcq+r6*2+ 2] + pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4 + pshufb m0, m7 ; 2 3 3 4 4 5 5 6 + pmaddwd m2, m8 ; abcd0 + pmaddwd m0, m9 ; abcd1 + pshufb m3, m1, m6 ; 4 5 5 6 6 7 7 8 + pshufb m1, m7 ; 6 7 7 8 8 9 9 a + paddd m2, m4 + paddd m0, m2 + pmaddwd m2, m10, m3 ; abcd2 + pmaddwd m3, m8 ; efgh0 + paddd m0, m2 + pmaddwd m2, m11, m1 ; abcd3 + pmaddwd m1, m9 ; efgh1 + paddd m0, m2 + movu m2, [srcq+r6*2+10] + paddd m3, m4 + paddd m1, m3 + pshufb m3, m2, m6 ; 8 9 9 a a b b c + pshufb m2, m7 ; a b b c c d d e + pmaddwd m3, m10 ; efgh2 + pmaddwd m2, m11 ; efgh3 + paddd m1, m3 + paddd m1, m2 + psrad m0, 6 + psrad m1, 6 + packssdw m0, m1 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + mova [dstq+r6*2], m0 + add r6, 8 + jl .h_w8_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w8_loop0 + RET +.hv: + RESET_STACK_STATE +%if ARCH_X86_32 + movd m4, r8m + pshufb m4, [base+pw_256] +%else +%if WIN64 + ALLOC_STACK 16*6, 16 +%endif + movd m15, r8m + pshufb m15, [base+pw_256] +%endif + cmp wd, 4 
+ jg .hv_w8 + movzx mxd, mxb + je .hv_w4 + movq m0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] + movddup m6, [base+pd_8704] + pshuflw m0, m0, q2121 + pxor m7, m7 + punpcklbw m7, m0 + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + test dword r8m, 0x800 + jz .hv_w2_10bpc + movddup m6, [base+pd_2560] + psraw m7, 2 + psllw m3, 2 +.hv_w2_10bpc: +%if ARCH_X86_32 + mov dstq, dstmp + mov dsq, dsmp + mova m5, [base+spel_h_shuf2] + ALLOC_STACK -16*8 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m9, m5 + mova m11, m0 + mova m12, m1 + mova m13, m2 + mova m14, m3 + mova m15, m4 +%else + mova m9, [base+spel_h_shuf2] + pshufd m11, m3, q0000 + pshufd m12, m3, q1111 + pshufd m13, m3, q2222 + pshufd m14, m3, q3333 +%endif + lea r6, [ssq*3] + sub srcq, 2 + sub srcq, r6 + movu m2, [srcq+ssq*0] + movu m3, [srcq+ssq*1] + movu m1, [srcq+ssq*2] + add srcq, r6 + movu m4, [srcq+ssq*0] +%if ARCH_X86_32 + REPX {pshufb x, m5}, m2, m3, m1, m4 +%else + REPX {pshufb x, m9}, m2, m3, m1, m4 +%endif + REPX {pmaddwd x, m7}, m2, m3, m1, m4 + phaddd m2, m3 ; 0 1 + phaddd m1, m4 ; 2 3 + movu m3, [srcq+ssq*1] + movu m4, [srcq+ssq*2] + add srcq, r6 + movu m0, [srcq+ssq*0] +%if ARCH_X86_32 + REPX {pshufb x, m5}, m3, m4, m0 +%else + REPX {pshufb x, m9}, m3, m4, m0 +%endif + REPX {pmaddwd x, m7}, m3, m4, m0 + phaddd m3, m4 ; 4 5 + phaddd m0, m0 ; 6 6 + REPX {paddd x, m6}, m2, m1, m3, m0 + REPX {psrad x, 10}, m2, m1, m3, m0 + packssdw m2, m1 ; 0 1 2 3 + packssdw m3, m0 ; 4 5 6 _ + palignr m4, m3, m2, 4 ; 1 2 3 4 + pshufd m5, m3, q0321 ; 5 6 _ _ + punpcklwd m1, m2, m4 ; 01 12 + punpckhwd m2, m4 ; 23 34 + punpcklwd m3, m5 ; 45 56 +.hv_w2_loop: + movu m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movu m5, [srcq+ssq*0] + pshufb m4, m9 + pshufb m5, m9 + pmaddwd m4, m7 + pmaddwd m5, m7 + phaddd m4, m5 + pmaddwd m5, m11, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m12 ; a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m13 ; a2 b2 + paddd m5, m3 + paddd m4, m6 + psrad m4, 10 ; 7 8 + packssdw m0, m4 + pshufd m3, m0, q2103 + punpckhwd m3, m0 ; 67 78 + mova m0, m4 + pmaddwd m4, m14, m3 ; a3 b3 + paddd m5, m4 + psrad m5, 10 + packssdw m5, m5 + pxor m4, m4 + pminsw m5, m15 + pmaxsw m5, m4 + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q3232 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w8: + shr mxd, 16 +.hv_w4: + movq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 + RESET_STACK_STATE + mov dstq, dstmp + mov dsq, dsmp + mova m0, [base+spel_h_shufA] + mova m1, [base+spel_h_shufB] + mova m6, [base+pd_512] + ALLOC_STACK -16*15 + mova m8, m0 + mova m9, m1 + mova m14, m6 +%else + mova m8, [base+spel_h_shufA] + mova m9, [base+spel_h_shufB] +%endif + pxor m0, m0 + punpcklbw m0, m2 + punpcklbw m3, m3 + psraw m3, 8 + test dword r8m, 0x800 + jz .hv_w4_10bpc + psraw m0, 2 + psllw m3, 2 +.hv_w4_10bpc: + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 +%if ARCH_X86_32 + %define tmp esp+16*8 + shl wd, 14 +%if STACK_ALIGNMENT < 16 + mov [esp+4*61], srcq + mov [esp+4*62], dstq +%else + mov srcmp, srcq +%endif + mova [tmp+16*5], m4 + lea wd, [wq+hq-(1<<16)] + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + pshufd m5, m0, q2222 + pshufd m0, m0, q3333 + mova m10, m1 + mova m11, m2 + mova m12, m5 + mova m13, m0 +%else +%if WIN64 + %define tmp rsp +%else + %define tmp rsp-104 ; red zone +%endif + shl wd, 6 + 
mov r7, srcq + mov r8, dstq + lea wd, [wq+hq-(1<<8)] + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, q3333 + mova [tmp+16*5], m15 +%endif + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [tmp+16*1], m0 + mova [tmp+16*2], m1 + mova [tmp+16*3], m2 + mova [tmp+16*4], m3 +%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512] + pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4 + pshufb m%1, m9 ; 2 3 3 4 4 5 5 6 + pmaddwd m%3, m10 + pmaddwd m%1, m11 + paddd m%3, %5 + paddd m%1, m%3 + pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8 + pshufb m%2, m9 ; 6 7 7 8 8 9 9 a + pmaddwd m%3, m12 + pmaddwd m%2, m13 + paddd m%1, m%3 + paddd m%1, m%2 + psrad m%1, %4 +%endmacro +.hv_w4_loop0: +%if ARCH_X86_64 + mova m14, [pd_512] +%endif + movu m4, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + movu m5, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + movu m6, [srcq+ssq*2+0] + movu m3, [srcq+ssq*2+8] + add srcq, r6 + PUT_8TAP_HV_H 4, 1, 0, 10 + PUT_8TAP_HV_H 5, 2, 0, 10 + PUT_8TAP_HV_H 6, 3, 0, 10 + movu m7, [srcq+ssq*0+0] + movu m2, [srcq+ssq*0+8] + movu m1, [srcq+ssq*1+0] + movu m3, [srcq+ssq*1+8] + PUT_8TAP_HV_H 7, 2, 0, 10 + PUT_8TAP_HV_H 1, 3, 0, 10 + movu m2, [srcq+ssq*2+0] + movu m3, [srcq+ssq*2+8] + add srcq, r6 + PUT_8TAP_HV_H 2, 3, 0, 10 + packssdw m4, m7 ; 0 3 + packssdw m5, m1 ; 1 4 + movu m0, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 1, 3, 10 + packssdw m6, m2 ; 2 5 + packssdw m7, m0 ; 3 6 + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + punpcklwd m3, m6, m7 ; 23 + punpckhwd m6, m7 ; 56 +%if ARCH_X86_32 + jmp .hv_w4_loop_start +.hv_w4_loop: + mova m1, [tmp+16*6] + mova m2, m15 +.hv_w4_loop_start: + mova m7, [tmp+16*1] + pmaddwd m1, m7 ; a0 + pmaddwd m2, m7 ; b0 + mova m7, [tmp+16*2] + mova [tmp+16*6], m3 + pmaddwd m3, m7 ; a1 + mova m15, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m1, m3 + paddd m2, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m1, m5 + paddd m2, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 10 + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 10 + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m1, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m2, m7 ; b3 + psrad m1, 9 + psrad m2, 9 + packssdw m1, m2 + pxor m7, m7 + pmaxsw m1, m7 + pavgw m7, m1 + pminsw m7, [tmp+16*5] + movq [dstq+dsq*0], m7 + movhps [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop +%if STACK_ALIGNMENT < 16 + mov srcq, [esp+4*61] + mov dstq, [esp+4*62] + add srcq, 8 + add dstq, 8 + mov [esp+4*61], srcq + mov [esp+4*62], dstq +%else + mov srcq, srcmp + mov dstq, dstmp + add srcq, 8 + add dstq, 8 + mov srcmp, srcq + mov dstmp, dstq +%endif + movzx hd, ww + sub wd, 1<<16 +%else +.hv_w4_loop: + mova m15, [tmp+16*1] + pmaddwd m14, m15, m1 ; a0 + pmaddwd m15, m2 ; b0 + mova m7, [tmp+16*2] + mova m1, m3 + pmaddwd m3, m7 ; a1 + mova m2, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m14, m3 + paddd m15, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m14, m5 + paddd m15, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512] + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, 
[srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512] + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m14, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m15, m7 ; b3 + psrad m14, 9 + psrad m15, 9 + packssdw m14, m15 + pxor m7, m7 + pmaxsw m14, m7 + pavgw m7, m14 + pminsw m7, [tmp+16*5] + movq [dstq+dsq*0], m7 + movhps [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + add r7, 8 + add r8, 8 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 +%endif + jg .hv_w4_loop0 + RET +%undef tmp + +%if ARCH_X86_32 +DECLARE_REG_TMP 2, 1, 6, 4 +%elif WIN64 +DECLARE_REG_TMP 6, 4, 7, 4 +%else +DECLARE_REG_TMP 6, 7, 7, 8 +%endif + +%define PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_16bpc +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_16bpc +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_16bpc +PREP_8TAP_FN regular, REGULAR, REGULAR + +cglobal prep_6tap_16bpc, 0, 8, 0, tmp, src, ss, w, h, mx, my + %define base t2-prep_ssse3 +%if ARCH_X86_32 + %define mxb r0b + %define mxd r0 + %define mxq r0 + %define myb r2b + %define myd r2 + %define myq r2 +%endif + imul mxd, mxm, 0x010101 + add mxd, t0d ; 6tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 6tap_v, my, 4tap_v + LEA t2, prep_ssse3 + movifnidn wd, wm + movifnidn hd, hm + movifnidn srcq, srcmp + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v +.prep: + tzcnt wd, wd + mov myd, r7m ; bitdepth_max + movzx wd, word [base+prep_ssse3_table+wq*2] + mova m5, [base+pw_8192] + shr myd, 11 + add wq, t2 + movddup m4, [base+prep_mul+myq*8] + movifnidn ssq, ssmp + movifnidn tmpq, tmpmp + lea r6, [ssq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h: + RESET_STACK_STATE + test myd, 0xf00 + jnz .hv + movifnidn ssq, r2mp + movddup m5, [base+prep_8tap_1d_rnd] + cmp wd, 4 + je mangle(private_prefix %+ _prep_8tap_16bpc_ssse3).h_w4 + WIN64_SPILL_XMM 10 + shr mxd, 16 + movq m2, [base+subpel_filters+1+mxq*8] + movifnidn tmpq, r0mp + mova m4, [base+spel_h_shufA] + add wd, wd + mova m6, [base+spel_h_shufB] + add srcq, wq + punpcklbw m2, m2 + add tmpq, wq + psraw m2, 8 + neg wq + test dword r7m, 0x800 + jnz .h_w8_12bpc + psllw m2, 2 +.h_w8_12bpc: + pshufd m7, m2, q0000 +%if ARCH_X86_32 + ALLOC_STACK -16*2 + %define m8 [rsp+16*0] + %define m9 [rsp+16*1] + pshufd m0, m2, q1111 + pshufd m1, m2, q2222 + mova m8, m0 + mova m9, m1 +%else + pshufd m8, m2, q1111 + pshufd m9, m2, q2222 +%endif +.h_w8_loop0: + mov r6, wq +.h_w8_loop: + movu m3, [srcq+r6-4] + movu m2, [srcq+r6+8] + pshufb m0, m3, m4 ; 01 12 23 34 + pmaddwd m0, m7 ; abcd0 + pshufb m3, m6 ; 23 34 45 56 + pmaddwd m1, m8, m3 ; abcd1 + paddd m0, m1 + pshufb m1, m2, m4 ; 67 78 89 9a + shufpd m3, m1, 0x01; 45 56 67 78 + pmaddwd m1, m8 ; efgh1 + pshufb m2, m6 ; 89 9a ab bc + pmaddwd m2, m9 ; efgh2 + paddd m1, m2 + pmaddwd m2, m9 , m3 ; abcd2 + pmaddwd m3, m7 ; efgh0 + paddd m0, m5 + paddd m1, m5 + paddd m0, m2 + paddd m1, m3 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq+r6], m0 + add r6, 16 + jl .h_w8_loop + add srcq, ssq + sub tmpq, wq + dec hd + jg .h_w8_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movddup m5, [base+prep_8tap_1d_rnd] + movq m2, [base+subpel_filters+1+myq*8] + WIN64_SPILL_XMM 11, 16 + movifnidn ssq, r2mp + movifnidn tmpq, r0mp + punpcklbw m2, m2 + sub srcq, ssq + psraw m2, 8 ; sign-extend + test dword r7m, 0x800 + jnz .v_12bpc + psllw m2, 2 +.v_12bpc: + sub srcq, ssq +%if 
ARCH_X86_32 + ALLOC_STACK -16*4 + pshufd m0, m2, q0000 + mov r6d, wd + pshufd m1, m2, q1111 + shl r6d, 14 + pshufd m2, m2, q2222 + lea r6d, [r6+hq-(1<<16)] + mova m8, m0 + mova m9, m1 + mova m10, m2 +%if STACK_ALIGNMENT < 16 + %define srcmp [esp+16*3+4*0] + %define tmpmp [esp+16*3+4*1] +%endif +.v_w4_loop0: + mov srcmp, srcq + mov tmpmp, tmpq +%else + pshufd m8, m2, q0000 + and wd, -8 + jnz .v_w8 + pshufd m9, m2, q1111 + pshufd m10, m2, q2222 +%endif + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m3, [srcq+ssq*0] + movq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m0, [srcq+ssq*0] + punpcklwd m1, m2 ; 01 + punpcklwd m2, m3 ; 12 + punpcklwd m3, m4 ; 23 + punpcklwd m4, m0 ; 34 +.v_w4_loop: + pmaddwd m6, m8, m1 ; a0 + pmaddwd m7, m8, m2 ; b0 + mova m1, m3 + pmaddwd m3, m9 ; a1 + mova m2, m4 + pmaddwd m4, m9 ; b1 + paddd m6, m3 + movq m3, [srcq+ssq*0] + paddd m7, m4 + movq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m0, [srcq+ssq*0] + punpcklwd m3, m4 ; 45 + punpcklwd m4, m0 ; 56 + pmaddwd m0, m10, m3 ; a2 + paddd m6, m5 + paddd m6, m0 + pmaddwd m0, m10, m4 ; b2 + paddd m7, m5 + paddd m7, m0 + psrad m6, 4 + psrad m7, 4 + packssdw m6, m7 +%if ARCH_X86_32 + movq [tmpq+wq*0], m6 + movhps [tmpq+wq*2], m6 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .v_w4_loop + mov srcq, srcmp + mov tmpq, tmpmp + movzx hd, r6w + add srcq, 8 + add tmpq, 8 + sub r6d, 1<<16 + jg .v_w4_loop0 + RET +%else + mova [tmpq], m6 + add tmpq, 16 + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + mova r6m, m8 + lea r6d, [wq*4-(1<<5)] + pshufd m6, m2, q1111 + lea r6d, [hq+r6*8] + pshufd m7, m2, q2222 + WIN64_PUSH_XMM 16 +.v_w8_loop0: + movu m9, [srcq+ssq*0] + lea r5, [srcq+ssq*2] + movu m11, [srcq+ssq*1] + mov r7, tmpq + movu m13, [r5+ssq*0] + movu m15, [r5+ssq*1] + lea r5, [r5+ssq*2] + movu m4, [r5+ssq*0] + punpcklwd m8, m9, m11 ; 01 + punpckhwd m9, m11 + punpcklwd m10, m11, m13 ; 12 + punpckhwd m11, m13 + punpcklwd m12, m13, m15 ; 23 + punpckhwd m13, m15 + punpcklwd m14, m15, m4 ; 34 + punpckhwd m15, m4 +.v_w8_loop: + mova m3, r6m + pmaddwd m0, m8, m3 ; a0 + pmaddwd m2, m9, m3 ; a0' + pmaddwd m1, m10, m3 ; b0 + pmaddwd m3, m11 ; b0' + mova m8, m12 + pmaddwd m12, m6 ; a1 + mova m9, m13 + pmaddwd m13, m6 ; a1' + mova m10, m14 + pmaddwd m14, m6 ; b1 + mova m11, m15 + pmaddwd m15, m6 ; b1' + paddd m0, m12 + paddd m2, m13 + movu m13, [r5+ssq*0] + paddd m1, m14 + paddd m3, m15 + movu m15, [r5+ssq*1] + lea r5, [r5+ssq*2] + movu m4, [r5+ssq*0] + REPX {paddd x, m5}, m0, m2, m1, m3 + punpcklwd m12, m13, m15 ; 45 + punpckhwd m13, m15 + punpcklwd m14, m15, m4 ; 56 + punpckhwd m15, m4 + pmaddwd m4, m7, m12 ; a2 + paddd m0, m4 + pmaddwd m4, m7, m13 ; a2' + paddd m2, m4 + pmaddwd m4, m7, m14 ; b2 + paddd m1, m4 + pmaddwd m4, m7, m15 ; b2' + paddd m3, m4 + REPX {psrad x, 4}, m0, m2, m1, m3 + packssdw m0, m2 + packssdw m1, m3 + mova [r7+wq*0], m0 + mova [r7+wq*2], m1 + lea r7, [r7+wq*4] + sub hd, 2 + jg .v_w8_loop + add srcq, 16 + add tmpq, 16 + movzx hd, r6b + sub r6d, 1<<8 + jg .v_w8_loop0 + RET +%endif +.hv: + and wd, -8 + jnz .hv_w8 + movzx mxd, mxb + movq m0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m2, [base+subpel_filters+1+myq*8] + WIN64_SPILL_XMM 15 + movifnidn ssq, r2mp + movifnidn tmpq, r0mp + mova m7, [base+prep_8tap_2d_rnd] + sub srcq, 2 + pshuflw m0, m0, q2121 + pxor m6, m6 + punpcklbw m6, m0 + punpcklbw m2, m2 + psraw m6, 4 + psraw m2, 8 + test dword r7m, 0x800 + jz .hv_w4_10bpc + psraw m6, 2 +.hv_w4_10bpc: +%if ARCH_X86_32 +%assign regs_used 4 
+ ALLOC_STACK -16*7 +%assign regs_used 7 + %define m10 [esp+16*3] + %define m12 [esp+16*5] + %define m13 [esp+16*6] + %define m14 [base+spel_h_shufA] + %define m11 [base+spel_h_shufB] + pshufd m0, m2, q0000 + pshufd m1, m2, q1111 + pshufd m2, m2, q2222 + pshufd m5, m6, q0000 + pshufd m6, m6, q1111 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m12, m5 + mova m13, m6 + neg ssq + movu m3, [srcq+ssq*2] + movu m4, [srcq+ssq*1] + neg ssq +%else + mov r6, ssq + pshufd m8, m2, q0000 + neg r6 + pshufd m9, m2, q1111 + movu m3, [srcq+r6 *2] + pshufd m10, m2, q2222 + movu m4, [srcq+r6 *1] + pshufd m12, m6, q0000 + mova m14, [base+spel_h_shufA] + pshufd m13, m6, q1111 + mova m11, [base+spel_h_shufB] +%endif + movu m1, [srcq+ssq*0] + movu m0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movu m2, [srcq+ssq*0] + HV_H_W4_6TAP m3, m3, m5, m11 + HV_H_W4_6TAP m4, m4, m5, m11 + HV_H_W4_6TAP m5, m1, m5, m11 + HV_H_W4_6TAP m0, m0, m1, m11 + HV_H_W4_6TAP m2, m2, m1, m11 + REPX {psrad x, 6}, m3, m5, m4, m0, m2 + packssdw m3, m5 ; 0 2 + packssdw m4, m0 ; 1 3 + packssdw m5, m2 ; 2 4 + punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + punpcklwd m2, m4, m5 ; 12 + punpckhwd m4, m5 ; 34 +.hv_w4_loop: + movu m0, [srcq+ssq*1] + pmaddwd m5, m8, m1 ; a0 + lea srcq, [srcq+ssq*2] + pmaddwd m6, m8, m2 ; b0 + mova m1, m3 + pmaddwd m3, m9 ; a1 + mova m2, m4 + pmaddwd m4, m9 ; b1 + paddd m5, m3 + movu m3, [srcq+ssq*0] + paddd m6, m4 + HV_H_W4_6TAP m0, m0, m4, m11 + HV_H_W4_6TAP m3, m3, m4, m11 + psrad m4, m2, 16 + psrad m0, 6 + psrad m3, 6 + packssdw m4, m0 ; 4 5 + packssdw m0, m3 ; 5 6 + punpcklwd m3, m4, m0 ; 45 + punpckhwd m4, m0 ; 56 + pmaddwd m0, m10, m3 ; a2 + paddd m5, m7 + paddd m5, m0 + pmaddwd m0, m10, m4 ; b2 + paddd m6, m7 + paddd m6, m0 + psrad m5, 6 + psrad m6, 6 + packssdw m5, m6 + mova [tmpq], m5 + add tmpq, 16 + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + RESET_STACK_STATE + shr mxd, 16 + movq m2, [base+subpel_filters+1+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m1, [base+subpel_filters+1+myq*8] + movifnidn ssq, r2mp + mova m4, [base+prep_8tap_2d_rnd] + pxor m0, m0 + punpcklbw m0, m2 + punpcklbw m1, m1 + sub srcq, 4 + psraw m0, 4 + psraw m1, 8 + test dword r7m, 0x800 + jz .hv_w8_10bpc + psraw m0, 2 +.hv_w8_10bpc: +%if ARCH_X86_32 +%assign regs_used 1 + ALLOC_STACK -16*9 +%assign regs_used 7 + mov tmpq, r0mp + mova [rsp+16*7], m4 +%else +%if WIN64 + PUSH r8 +%assign regs_used 9 +%endif + ALLOC_STACK 16*6, 16 +%endif + pshufd m2, m0, q0000 + mova [rsp+16*0], m2 + pshufd m2, m0, q1111 + mova [rsp+16*1], m2 + pshufd m0, m0, q2222 + mova [rsp+16*2], m0 + pshufd m2, m1, q0000 + mova [rsp+16*3], m2 + pshufd m2, m1, q1111 + mova [rsp+16*4], m2 + pshufd m1, m1, q2222 + mova [rsp+16*5], m1 + mov r6, ssq + neg r6 +%if ARCH_X86_32 + mov r5d, wd + shl r5d, 14 + lea r5d, [r5+hq-(1<<16)] +%if STACK_ALIGNMENT < 16 + %define srcmp [esp+16*8+4*0] + %define tmpmp [esp+16*8+4*1] +%endif +.hv_w8_loop0: + mov srcmp, srcq + mov tmpmp, tmpq + movu m5, [srcq+r6*2+0] + movu m6, [srcq+r6*2+2] + mova m7, [rsp+16*0] + mova m1, [rsp+16*1] + mova m0, [rsp+16*2] + HV_H_6TAP m2, m5, m6, m7, m1, m0 + movu m5, [srcq+r6*1+0] + movu m6, [srcq+r6*1+2] + HV_H_6TAP m3, m5, m6, m7, m1, m0 + movu m5, [srcq+ssq*0+0] + movu m6, [srcq+ssq*0+2] + HV_H_6TAP m4, m5, m6, m7, m1, m0 + movu m5, [srcq+ssq*1+0] + movu m6, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + HV_H_6TAP m0, m5, m6, m7, m1 + movu m5, [srcq+ssq*0+0] + movu m6, [srcq+ssq*0+2] + HV_H_6TAP m1, m5, m6, m7 + mova m5, [rsp+16*7] + REPX {paddd x, m5}, m2, m3, m4, m0, m1 + 
REPX {psrad x, 6 }, m2, m4, m3, m0, m1 + packssdw m2, m4 ; 0 2 + packssdw m3, m0 ; 1 3 + packssdw m4, m1 ; 2 4 + punpcklwd m0, m2, m3 ; 01 + punpckhwd m2, m3 ; 23 + punpcklwd m1, m3, m4 ; 12 + punpckhwd m3, m4 ; 34 +.hv_w8_loop: + mova m5, [rsp+16*3] + mova m6, [rsp+16*4] + pmaddwd m4, m0, m5 ; a0 + pmaddwd m5, m1 ; b0 + mova m0, m2 + pmaddwd m2, m6 ; a1 + mova m1, m3 + pmaddwd m3, m6 ; b1 + paddd m4, m2 + movu m2, [srcq+ssq*1+0] + paddd m5, m3 + movu m3, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + HV_H_6TAP m6, m2, m3 + movu m2, [srcq+ssq*0+0] + movu m3, [srcq+ssq*0+2] + HV_H_6TAP m7, m2, m3 + mova m2, [rsp+16*7] + psrad m3, m1, 16 + REPX {paddd x, m2}, m6, m7, m4, m5 + psrad m6, 6 + psrad m7, 6 + packssdw m3, m6 ; 4 5 + packssdw m6, m7 ; 5 6 + mova m7, [rsp+16*5] + punpcklwd m2, m3, m6 ; 45 + punpckhwd m3, m6 ; 56 + pmaddwd m6, m2, m7 ; a2 + pmaddwd m7, m3 ; b2 + paddd m4, m6 + paddd m5, m7 + psrad m4, 6 + psrad m5, 6 + packssdw m4, m5 + movq [tmpq+wq*0], m4 + movhps [tmpq+wq*2], m4 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .hv_w8_loop + mov srcq, srcmp + mov tmpq, tmpmp + movzx hd, r5w + add srcq, 8 + add tmpq, 8 + sub r5d, 1<<16 +%else + lea r8d, [wq*4-(1<<5)] + lea r8d, [hq+r8*8] +.hv_w8_loop0: + mova m5, [spel_h_shufA] + movu m0, [srcq+r6*2+ 0] + mova m6, [rsp+16*0] + movu m1, [srcq+r6*2+ 8] + mova m7, [rsp+16*1] + movu m2, [srcq+r6*2+16] + mova m8, [rsp+16*2] + HV_H_6TAP m9, m0, m1, m2, 6, m5, m6, m7, m8 + movu m0, [srcq+r6*1+ 0] + movu m1, [srcq+r6*1+ 8] + movu m2, [srcq+r6*1+16] + lea r5, [srcq+ssq*2] + HV_H_6TAP m11, m0, m1, m2, 6, m5, m6, m7, m8 + movu m0, [srcq+ssq*0+ 0] + movu m1, [srcq+ssq*0+ 8] + movu m2, [srcq+ssq*0+16] + mov r7, tmpq + HV_H_6TAP m13, m0, m1, m2, 6, m5, m6, m7, m8 + movu m0, [srcq+ssq*1+ 0] + movu m1, [srcq+ssq*1+ 8] + movu m2, [srcq+ssq*1+16] + HV_H_6TAP m15, m0, m1, m2, 6, m5, m6, m7, m8 + movu m0, [r5+ssq*0+ 0] + movu m1, [r5+ssq*0+ 8] + movu m2, [r5+ssq*0+16] + HV_H_6TAP m5, m0, m1, m2, 6, m5, m6, m7, m8 + punpcklwd m8, m9, m11 ; 01 + punpckhwd m9, m11 + punpcklwd m10, m11, m13 ; 12 + punpckhwd m11, m13 + punpcklwd m12, m13, m15 ; 23 + punpckhwd m13, m15 + punpcklwd m14, m15, m5 ; 34 + punpckhwd m15, m5 +.hv_w8_loop: + mova m3, [rsp+16*3] + mova m7, [rsp+16*4] + pmaddwd m0, m8, m3 ; a0 + mova m8, m12 + pmaddwd m2, m9, m3 ; a0' + mova m9, m13 + pmaddwd m1, m10, m3 ; b0 + mova m10, m14 + pmaddwd m3, m11 ; b0' + mova m11, m15 + REPX {pmaddwd x, m7}, m12, m13, m14, m15 + movu m6, [r5+ssq*1+ 0] + paddd m0, m12 + movu m7, [r5+ssq*1+ 8] + paddd m2, m13 + movu m12, [r5+ssq*1+16] + paddd m1, m14 + lea r5, [r5+ssq*2] + paddd m3, m15 + HV_H_6TAP m15, m6, m7, m12, 6 + movu m6, [r5+ssq*0+ 0] + movu m7, [r5+ssq*0+ 8] + movu m14, [r5+ssq*0+16] + punpcklwd m12, m5, m15 ; 45 + punpckhwd m13, m5, m15 + HV_H_6TAP m5, m6, m7, m14, 6 + mova m7, [rsp+16*5] + REPX {paddd x, m4}, m0, m2, m1, m3 + punpcklwd m14, m15, m5 ; 56 + punpckhwd m15, m5 + pmaddwd m6, m12, m7 ; a2 + paddd m0, m6 + pmaddwd m6, m13, m7 ; a2' + paddd m2, m6 + pmaddwd m6, m14, m7 ; b2 + pmaddwd m7, m15 ; b2' + paddd m1, m6 + paddd m3, m7 + REPX {psrad x, 6}, m0, m2, m1, m3 + packssdw m0, m2 + packssdw m1, m3 + mova [r7+wq*0], m0 + mova [r7+wq*2], m1 + lea r7, [r7+wq*4] + sub hd, 2 + jg .hv_w8_loop + add srcq, 16 + add tmpq, 16 + movzx hd, r8b + sub r8d, 1<<8 +%endif + jg .hv_w8_loop0 + RET -%define PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_16bpc +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_16bpc +PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_16bpc 
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_16bpc PREP_8TAP_FN sharp, SHARP, SHARP -PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH -PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH -PREP_8TAP_FN regular, REGULAR, REGULAR +cglobal prep_8tap_16bpc, 0, 8, 0, tmp, src, ss, w, h, mx, my %if ARCH_X86_32 -cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my -%define mxb r0b -%define mxd r0 -%define mxq r0 -%define myb r2b -%define myd r2 -%define myq r2 -%else -cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my + %define mxb r0b + %define mxd r0 + %define mxq r0 + %define myb r2b + %define myd r2 + %define myq r2 + %define m8 [esp+16*0] + %define m9 [esp+16*1] + %define m10 [esp+16*2] + %define m11 [esp+16*3] + %define m12 [esp+16*4] + %define m13 [esp+16*5] + %define m14 [esp+16*6] + %define m15 [esp+16*7] %endif -%define base t2-prep_ssse3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 @@ -2026,138 +3439,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my jnz .h movifnidn hd, hm test myd, 0xf00 - jnz .v - tzcnt wd, wd - mov myd, r7m ; bitdepth_max - movzx wd, word [base+prep_ssse3_table+wq*2] - mova m5, [base+pw_8192] - shr myd, 11 - add wq, t2 - movddup m4, [base+prep_mul+myq*8] - movifnidn ssq, ssmp - movifnidn tmpq, tmpmp - lea r6, [ssq*3] -%if WIN64 - pop r7 -%endif - jmp wq -.h: - test myd, 0xf00 - jnz .hv - movifnidn ssq, r2mp - movifnidn hd, r4m - movddup m5, [base+prep_8tap_1d_rnd] - cmp wd, 4 - jne .h_w8 - movzx mxd, mxb - movq m0, [base+subpel_filters+mxq*8] - mova m3, [base+spel_h_shufA] - mova m4, [base+spel_h_shufB] - movifnidn tmpq, tmpmp - sub srcq, 2 - WIN64_SPILL_XMM 8 - punpcklbw m0, m0 - psraw m0, 8 - test dword r7m, 0x800 - jnz .h_w4_12bpc - psllw m0, 2 -.h_w4_12bpc: - pshufd m6, m0, q1111 - pshufd m7, m0, q2222 -.h_w4_loop: - movu m1, [srcq+ssq*0] - movu m2, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 - pshufb m1, m4 ; 2 3 3 4 4 5 5 6 - pmaddwd m0, m6 - pmaddwd m1, m7 - paddd m0, m5 - paddd m0, m1 - pshufb m1, m2, m3 - pshufb m2, m4 - pmaddwd m1, m6 - pmaddwd m2, m7 - paddd m1, m5 - paddd m1, m2 - psrad m0, 4 - psrad m1, 4 - packssdw m0, m1 - mova [tmpq], m0 - add tmpq, 16 - sub hd, 2 - jg .h_w4_loop - RET -.h_w8: - WIN64_SPILL_XMM 11 - shr mxd, 16 - movq m2, [base+subpel_filters+mxq*8] - mova m4, [base+spel_h_shufA] - mova m6, [base+spel_h_shufB] - movifnidn tmpq, r0mp - add wd, wd - punpcklbw m2, m2 - add srcq, wq - psraw m2, 8 - add tmpq, wq - neg wq - test dword r7m, 0x800 - jnz .h_w8_12bpc - psllw m2, 2 -.h_w8_12bpc: - pshufd m7, m2, q0000 -%if ARCH_X86_32 - ALLOC_STACK -16*3 - pshufd m0, m2, q1111 - pshufd m1, m2, q2222 - pshufd m2, m2, q3333 - mova m8, m0 - mova m9, m1 - mova m10, m2 -%else - pshufd m8, m2, q1111 - pshufd m9, m2, q2222 - pshufd m10, m2, q3333 -%endif -.h_w8_loop0: - mov r6, wq -.h_w8_loop: - movu m0, [srcq+r6- 6] - movu m1, [srcq+r6+ 2] - pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4 - pshufb m0, m6 ; 2 3 3 4 4 5 5 6 - pmaddwd m2, m7 ; abcd0 - pmaddwd m0, m8 ; abcd1 - pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8 - pshufb m1, m6 ; 6 7 7 8 8 9 9 a - paddd m2, m5 - paddd m0, m2 - pmaddwd m2, m9, m3 ; abcd2 - pmaddwd m3, m7 ; efgh0 - paddd m0, m2 - pmaddwd m2, m10, m1 ; abcd3 - pmaddwd m1, m8 ; efgh1 - paddd m0, m2 - movu m2, [srcq+r6+10] - paddd m3, m5 - paddd m1, 
m3 - pshufb m3, m2, m4 ; a b b c c d d e - pshufb m2, m6 ; 8 9 9 a a b b c - pmaddwd m3, m9 ; efgh2 - pmaddwd m2, m10 ; efgh3 - paddd m1, m3 - paddd m1, m2 - psrad m0, 4 - psrad m1, 4 - packssdw m0, m1 - mova [tmpq+r6], m0 - add r6, 16 - jl .h_w8_loop - add srcq, ssq - sub tmpq, wq - dec hd - jg .h_w8_loop0 - RET + jz mangle(private_prefix %+ _prep_6tap_16bpc_ssse3).prep .v: movzx mxd, myb shr myd, 16 @@ -2315,6 +3597,125 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my sub wd, 1<<8 jg .v_loop0 RET +.h: + RESET_STACK_STATE + test myd, 0xf00 + jnz .hv + movifnidn ssq, r2mp + movifnidn hd, r4m + movddup m5, [base+prep_8tap_1d_rnd] + cmp wd, 4 + jne .h_w8 +.h_w4: + movzx mxd, mxb + movq m0, [base+subpel_filters+mxq*8] + mova m3, [base+spel_h_shufA] + mova m4, [base+spel_h_shufB] + movifnidn tmpq, tmpmp + sub srcq, 2 + WIN64_SPILL_XMM 8 + punpcklbw m0, m0 + psraw m0, 8 + test dword r7m, 0x800 + jnz .h_w4_12bpc + psllw m0, 2 +.h_w4_12bpc: + pshufd m6, m0, q1111 + pshufd m7, m0, q2222 +.h_w4_loop: + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 + pshufb m1, m4 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m6 + pmaddwd m1, m7 + paddd m0, m5 + paddd m0, m1 + pshufb m1, m2, m3 + pshufb m2, m4 + pmaddwd m1, m6 + pmaddwd m2, m7 + paddd m1, m5 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq], m0 + add tmpq, 16 + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + WIN64_SPILL_XMM 11 + shr mxd, 16 + movq m2, [base+subpel_filters+mxq*8] + mova m4, [base+spel_h_shufA] + mova m6, [base+spel_h_shufB] + movifnidn tmpq, r0mp + add wd, wd + punpcklbw m2, m2 + add srcq, wq + psraw m2, 8 + add tmpq, wq + neg wq + test dword r7m, 0x800 + jnz .h_w8_12bpc + psllw m2, 2 +.h_w8_12bpc: + pshufd m7, m2, q0000 +%if ARCH_X86_32 + ALLOC_STACK -16*3 + pshufd m0, m2, q1111 + pshufd m1, m2, q2222 + pshufd m2, m2, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 +%else + pshufd m8, m2, q1111 + pshufd m9, m2, q2222 + pshufd m10, m2, q3333 +%endif +.h_w8_loop0: + mov r6, wq +.h_w8_loop: + movu m0, [srcq+r6- 6] + movu m1, [srcq+r6+ 2] + pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4 + pshufb m0, m6 ; 2 3 3 4 4 5 5 6 + pmaddwd m2, m7 ; abcd0 + pmaddwd m0, m8 ; abcd1 + pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8 + pshufb m1, m6 ; 6 7 7 8 8 9 9 a + paddd m2, m5 + paddd m0, m2 + pmaddwd m2, m9, m3 ; abcd2 + pmaddwd m3, m7 ; efgh0 + paddd m0, m2 + pmaddwd m2, m10, m1 ; abcd3 + pmaddwd m1, m8 ; efgh1 + paddd m0, m2 + movu m2, [srcq+r6+10] + paddd m3, m5 + paddd m1, m3 + pshufb m3, m2, m4 ; a b b c c d d e + pshufb m2, m6 ; 8 9 9 a a b b c + pmaddwd m3, m9 ; efgh2 + pmaddwd m2, m10 ; efgh3 + paddd m1, m3 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq+r6], m0 + add r6, 16 + jl .h_w8_loop + add srcq, ssq + sub tmpq, wq + dec hd + jg .h_w8_loop0 + RET .hv: RESET_STACK_STATE movzx t3d, mxb @@ -6427,16 +7828,18 @@ DECLARE_REG_TMP 6, 8 %else DECLARE_REG_TMP 1, 2 %endif + +%define PUT_8TAP_SCALED_FN FN put_8tap_scaled, BILIN_SCALED_FN put -FN put_8tap_scaled, sharp, SHARP, SHARP -FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH -FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP -FN put_8tap_scaled, smooth, SMOOTH, SMOOTH -FN put_8tap_scaled, sharp_regular, SHARP, REGULAR -FN put_8tap_scaled, regular_sharp, REGULAR, SHARP -FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR -FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH -FN put_8tap_scaled, regular, REGULAR, REGULAR +PUT_8TAP_SCALED_FN sharp, SHARP, SHARP, put_8tap_scaled_16bpc +PUT_8TAP_SCALED_FN 
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, put_8tap_scaled_16bpc
+PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
 MC_8TAP_SCALED put
 %if WIN64
@@ -6446,16 +7849,18 @@ DECLARE_REG_TMP 6, 7
 %else
 DECLARE_REG_TMP 1, 2
 %endif
+
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
 BILIN_SCALED_FN prep
-FN prep_8tap_scaled, sharp, SHARP, SHARP
-FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH
-FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP
-FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH
-FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR
-FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP
-FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR
-FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
-FN prep_8tap_scaled, regular, REGULAR, REGULAR
+PREP_8TAP_SCALED_FN sharp, SHARP, SHARP, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, prep_8tap_scaled_16bpc
+PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
 MC_8TAP_SCALED prep
 %if ARCH_X86_64
diff --git a/tests/checkasm/arm/checkasm_32.S b/tests/checkasm/arm/checkasm_32.S
index a186ef8fc25ebedafd8a5326b481fedd37c16e15..09b88aa34601f9e7ef2d51fe084d5f180f73f5a7 100644
--- a/tests/checkasm/arm/checkasm_32.S
+++ b/tests/checkasm/arm/checkasm_32.S
@@ -101,9 +101,10 @@ function checked_call_\variant, export=1
         mov r12, r0
         mov r0, r2
         mov r1, r3
-        ldrd r2, r3, [sp, #ARG_STACK_A + pushed]
+        ldr r2, [sp, #ARG_STACK_A + pushed]
+        ldr r3, [sp, #ARG_STACK_A + pushed + 4]
         @ Call the target function
-        blx r12
+        v4blx r12
         @ Load the number of stack parameters, stack canary and its reference
         ldr r12, [sp, #ARG_STACK_A + pushed + 8 + 4*(MAX_ARGS-4)]
@@ -120,7 +121,8 @@ function checked_call_\variant, export=1
         movrel r12, register_init
 .ifc \variant, vfp
 .macro check_reg_vfp, dreg, offset
-        ldrd r2, r3, [r12, #8 * (\offset)]
+        ldr r2, [r12, #(8 * (\offset))]
+        ldr r3, [r12, #(8 * (\offset)) + 4]
         vmov r0, lr, \dreg
         eor r2, r2, r0
         eor r3, r3, lr
@@ -148,7 +150,8 @@ function checked_call_\variant, export=1
         @ keep track of the checked GPR
         mov r1, #4
 .macro check_reg reg1, reg2=
-        ldrd r2, r3, [r12], #8
+        ldr r2, [r12], #4
+        ldr r3, [r12], #4
         eors r2, r2, \reg1
         bne 2f
         add r1, r1, #1
@@ -198,4 +201,5 @@ function checked_call_\variant, export=1
 endfunc
 .endm
+clobbercheck novfp
 clobbercheck vfp
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 2faa01f4a65940f05a73a1d9768fb9d9a4ab4c44..2115ed3e7a60a2ee2ae1ae94ca07be2d355854ba 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -44,12 +44,16 @@
 #define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x04
 #endif
 #else
-#include <unistd.h>
 #include <time.h>
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#if HAVE_PTHREAD_SETAFFINITY_NP
 #include <pthread.h>
-#ifdef HAVE_PTHREAD_NP_H
+#if HAVE_PTHREAD_NP_H
 #include <pthread_np.h>
 #endif
+#endif
 #ifdef __APPLE__
 #include <mach/mach_time.h>
 #endif
@@ -732,7 +736,7 @@ int main(int argc, char *argv[]) {
     } else {
         fprintf(stderr, "checkasm: running on cpu %lu\n", affinity);
     }
-#elif defined(HAVE_PTHREAD_SETAFFINITY_NP) && defined(CPU_SET)
+#elif HAVE_PTHREAD_SETAFFINITY_NP && defined(CPU_SET)
     cpu_set_t set;
     CPU_ZERO(&set);
     CPU_SET(affinity, &set);
@@ -832,6 +836,14 @@ int main(int argc, char *argv[]) {
     state.simd_warmup = checkasm_warmup_avx2;
     checkasm_simd_warmup();
 #endif
+#if ARCH_ARM
+    void checkasm_checked_call_vfp(void *func, int dummy, ...);
+    void checkasm_checked_call_novfp(void *func, int dummy, ...);
+    if (cpu_flags & DAV1D_ARM_CPU_FLAG_NEON)
+        checkasm_checked_call_ptr = checkasm_checked_call_vfp;
+    else
+        checkasm_checked_call_ptr = checkasm_checked_call_novfp;
+#endif
 #if ARCH_X86
     unsigned checkasm_init_x86(char *name);
     char name[48];
@@ -1126,3 +1138,7 @@ void checkasm_simd_warmup(void)
         state.simd_warmup();
 }
 #endif
+
+#if ARCH_ARM
+void (*checkasm_checked_call_ptr)(void *func, int dummy, ...);
+#endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 29de9b64b3386a9f7dc43ed5ddb910064ca064c5..07ce4da581fd0e806bc4bfd89bc1cbebc961f9ef 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -68,6 +68,10 @@ typedef sigjmp_buf checkasm_context;
 #include "include/common/bitdepth.h"
 #include "include/common/intops.h"
+#if ARCH_ARM
+#include "src/arm/arm-arch.h"
+#endif
+
 int xor128_rand(void);
 #define rnd xor128_rand
@@ -254,7 +258,7 @@ void checkasm_simd_warmup(void);
  * handled orthogonally from integer parameters passed in GPR registers. */
 #define IGNORED_FP_ARGS 8
 #endif
-#ifdef HAVE_C11_GENERIC
+#if HAVE_C11_GENERIC
 #define clobber_type(arg) _Generic((void (*)(void*, arg))NULL,\
     void (*)(void*, int32_t ): clobber_mask |= 1 << mpos++,\
     void (*)(void*, uint32_t): clobber_mask |= 1 << mpos++,\
@@ -302,12 +306,12 @@ void checkasm_simd_warmup(void);
 /* Use a dummy argument, to offset the real parameters by 2, not only 1.
  * This makes sure that potential 8-byte-alignment of parameters is kept
  * the same even when the extra parameters have been removed. */
-void checkasm_checked_call_vfp(void *func, int dummy, ...);
+extern void (*checkasm_checked_call_ptr)(void *func, int dummy, ...);
 #define declare_new(ret, ...)\
     ret (*checked_call)(void *, int dummy, __VA_ARGS__,\
                         int, int, int, int, int, int, int, int,\
                         int, int, int, int, int, int, int) =\
-        (void *)checkasm_checked_call_vfp;
+        (void *)checkasm_checked_call_ptr;
 #define call_new(...)\
     (checkasm_set_signal_handler_state(1),\
      checked_call(func_new, 0, __VA_ARGS__, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0));\
diff --git a/tests/checkasm/itx.c b/tests/checkasm/itx.c
index c7cc411ff5343c4ba1b3e39ed1b38ac045eed1c7..b0de65dddc5d89e86ab87e98890547b50c64bc26 100644
--- a/tests/checkasm/itx.c
+++ b/tests/checkasm/itx.c
@@ -130,7 +130,8 @@ static void fwht4_1d(double *const out, const double *const in)
 static int copy_subcoefs(coef *coeff,
                          const enum RectTxfmSize tx, const enum TxfmType txtp,
-                         const int sw, const int sh, const int subsh)
+                         const int sw, const int sh, const int subsh,
+                         int *const max_eob)
 {
     /* copy the topleft coefficients such that the return value (being the
      * coefficient scantable index for the eob token) guarantees that only
@@ -160,6 +161,7 @@ static int copy_subcoefs(coef *coeff,
         } else if (!eob && (rcx > sub_low || rcy > sub_low))
             eob = n; /* lower boundary */
     }
+    *max_eob = n - 1;
     if (eob)
         eob += rnd() % (n - eob - 1);
@@ -182,7 +184,7 @@ static int copy_subcoefs(coef *coeff,
 static int ftx(coef *const buf, const enum RectTxfmSize tx,
                const enum TxfmType txtp, const int w, const int h,
-               const int subsh, const int bitdepth_max)
+               const int subsh, int *const max_eob, const int bitdepth_max)
 {
     double out[64 * 64], temp[64 * 64];
     const double scale = scaling_factors[ctz(w * h) - 4];
@@ -236,7 +238,7 @@ static int ftx(coef *const buf, const enum RectTxfmSize tx,
         for (int x = 0; x < sw; x++)
             buf[y * sw + x] = (coef) (out[y * w + x] + 0.5);
-    return copy_subcoefs(buf, tx, txtp, sw, sh, subsh);
+    return copy_subcoefs(buf, tx, txtp, sw, sh, subsh, max_eob);
 }
 static void check_itxfm_add(Dav1dInvTxfmDSPContext *const c,
@@ -272,7 +274,9 @@ static void check_itxfm_add(Dav1dInvTxfmDSPContext *const c,
                          bpc))
         {
             const int bitdepth_max = (1 << bpc) - 1;
-            const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max);
+            int max_eob;
+            const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, &max_eob,
+                                bitdepth_max);
             memcpy(coeff[1], coeff[0], sizeof(*coeff));
             CLEAR_PIXEL_RECT(c_dst);
@@ -295,7 +299,7 @@ static void check_itxfm_add(Dav1dInvTxfmDSPContext *const c,
                     fail();
                 bench_new(alternate(c_dst, a_dst), a_dst_stride,
-                          alternate(coeff[0], coeff[1]), eob HIGHBD_TAIL_SUFFIX);
+                          alternate(coeff[0], coeff[1]), max_eob HIGHBD_TAIL_SUFFIX);
             }
         }
         report("add_%dx%d", w, h);
diff --git a/tests/dav1d_argon.bash b/tests/dav1d_argon.bash
index ead3e6ed2dc1812d432cea04edfd32098a9dc0c1..229eab5a8ff1988287806060a6d72eb7e38c341c 100755
--- a/tests/dav1d_argon.bash
+++ b/tests/dav1d_argon.bash
@@ -6,6 +6,7 @@ FILMGRAIN=1
 CPUMASK=-1
 THREADS=1
 JOBS=0
+WRAP=""
 usage() {
     NAME=$(basename "$0")
@@ -20,7 +21,8 @@ usage() {
        printf " -g \$num enable filmgrain (default: 1)\n"
        printf " -c \$mask use restricted cpumask (default: -1)\n"
        printf " -t \$num number of threads per dav1d (default: 1)\n"
-       printf " -j \$num number of parallel dav1d processes (default: 0)\n\n"
+       printf " -j \$num number of parallel dav1d processes (default: 0)\n"
+       printf " -w tool execute dav1d with a wrapper tool\n\n"
     } >&2
     exit 1
 }
@@ -79,7 +81,7 @@ if [ -d "$tests_dir/argon" ]; then
     ARGON_DIR="$tests_dir/argon"
 fi
-while getopts ":d:a:g:c:t:j:" opt; do
+while getopts ":d:a:g:c:t:j:w:" opt; do
     case "$opt" in
         d)
             DAV1D="$OPTARG"
@@ -99,6 +101,9 @@ while getopts ":d:a:g:c:t:j:" opt; do
         j)
             JOBS="$OPTARG"
             ;;
+        w)
+            WRAP="$OPTARG"
+            ;;
         \?)
             printf "Error! Invalid option: -%s\n" "$OPTARG" >&2
             usage
@@ -158,7 +163,7 @@ for i in "${!files[@]}"; do
     md5=${md5/ */}
     printf '\033[1K\r[%3d%% %*d/%d] Verifying %s' "$(((i+1)*100/num_files))" "${#num_files}" "$((i+1))" "$num_files" "${f#"$ARGON_DIR"/}"
-    cmd=("$DAV1D" -i "$f" --filmgrain "$FILMGRAIN" --verify "$md5" --cpumask "$CPUMASK" --threads "$THREADS" -q)
+    cmd=($WRAP "$DAV1D" -i "$f" --filmgrain "$FILMGRAIN" --verify "$md5" --cpumask "$CPUMASK" --threads "$THREADS" -q)
     if [ "$JOBS" -gt 1 ]; then
         "${cmd[@]}" 2>/dev/null &
         p=$!
diff --git a/tests/seek_stress.c b/tests/seek_stress.c
index a85ec86886c31d0fc52e843c250d0b6ee555f44e..7f75ea86e5c6c994a23b9cf9227a49ddd10930d0 100644
--- a/tests/seek_stress.c
+++ b/tests/seek_stress.c
@@ -60,7 +60,7 @@ static unsigned get_seed(void) {
 static unsigned get_seed(void) {
 #ifdef __APPLE__
     return (unsigned) mach_absolute_time();
-#elif defined(HAVE_CLOCK_GETTIME)
+#elif HAVE_CLOCK_GETTIME
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC, &ts);
     return (unsigned) (1000000000ULL * ts.tv_sec + ts.tv_nsec);
diff --git a/tools/compat/getopt.c b/tools/compat/getopt.c
index ac1fda426ecc59c017d4a56da7cb4ceb728e60b0..ab375bdb17e2add76ef884b1877ad93731dee3c6 100644
--- a/tools/compat/getopt.c
+++ b/tools/compat/getopt.c
@@ -55,7 +55,11 @@
 #include <getopt.h>
 #include <stdarg.h>
 #include <stdio.h>
+#ifdef _WIN32
 #include <windows.h>
+#else
+#include <err.h>
+#endif
 #define REPLACE_GETOPT /* use this getopt as the system getopt(3) */
@@ -80,12 +84,6 @@ char *optarg; /* argument associated with option */
 #define BADARG ((*options == ':') ? (int)':' : (int)'?')
 #define INORDER (int)1
-#ifndef __CYGWIN__
-#define __progname __argv[0]
-#else
-extern char __declspec(dllimport) *__progname;
-#endif
-
 #ifdef __CYGWIN__
 static char EMSG[] = "";
 #else
@@ -113,6 +111,13 @@ static const char noarg[] = "option doesn't take an argument -- %.*s";
 static const char illoptchar[] = "unknown option -- %c";
 static const char illoptstring[] = "unknown option -- %s";
+#ifdef _WIN32
+#ifndef __CYGWIN__
+#define __progname __argv[0]
+#else
+extern char __declspec(dllimport) *__progname;
+#endif
+
 static void
 _vwarnx(const char *fmt,va_list ap)
 {
@@ -130,6 +135,7 @@ warnx(const char *fmt,...)
     _vwarnx(fmt,ap);
     va_end(ap);
 }
+#endif
 /*
  * Compute the greatest common divisor of a and b.
diff --git a/tools/dav1d.c b/tools/dav1d.c
index 4d8d072debd7b9bfef29cd1f093f133b2ed887b9..eb19a80b358b74accf384796f6f2175c180c8cb1 100644
--- a/tools/dav1d.c
+++ b/tools/dav1d.c
@@ -38,10 +38,10 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
-#ifdef HAVE_UNISTD_H
+#if HAVE_UNISTD_H
 # include <unistd.h>
 #endif
-#ifdef HAVE_IO_H
+#if HAVE_IO_H
 # include <io.h>
 #endif
 #ifdef _WIN32
@@ -68,7 +68,7 @@ static uint64_t get_time_nanos(void) {
     uint64_t seconds = t.QuadPart / frequency.QuadPart;
     uint64_t fractions = t.QuadPart % frequency.QuadPart;
     return 1000000000 * seconds + 1000000000 * fractions / frequency.QuadPart;
-#elif defined(HAVE_CLOCK_GETTIME)
+#elif HAVE_CLOCK_GETTIME
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC, &ts);
    return 1000000000ULL * ts.tv_sec + ts.tv_nsec;
diff --git a/tools/dav1d_cli_parse.c b/tools/dav1d_cli_parse.c
index f4259643542ad227b95792af1a446b3efdaa84e5..134be46b2fb08a0da587f60125b480264bcb9354 100644
--- a/tools/dav1d_cli_parse.c
+++ b/tools/dav1d_cli_parse.c
@@ -35,7 +35,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#ifdef HAVE_UNISTD_H
+#if HAVE_UNISTD_H
 # include <unistd.h>
 #endif
diff --git a/tools/input/parse.h b/tools/input/parse.h
index f5805e8ca45323363d4c2017583c3f13f8f2a85a..f39f80f2c69d9e964d7bb9aafe258ce721092669 100644
--- a/tools/input/parse.h
+++ b/tools/input/parse.h
@@ -89,6 +89,8 @@ static inline int parse_obu_header(const uint8_t *buf, int buf_size,
     buf_size--;
     if (extension_flag) {
+        if (!buf_size)
+            return -1;
         buf++;
         buf_size--;
         // ignore fields
diff --git a/tools/input/section5.c b/tools/input/section5.c
index db1b34c227419be85baed4ed31859bba77b0d859..99cb7615d6caca56faf5d7b10294202ad0669531 100644
--- a/tools/input/section5.c
+++ b/tools/input/section5.c
@@ -32,7 +32,9 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
+#if HAVE_SYS_TYPES_H
 #include <sys/types.h>
+#endif
 #include "dav1d/headers.h"
diff --git a/tools/output/md5.c b/tools/output/md5.c
index 7d192c2459dc84c607c9e1cfd408c52fc4aab1cd..cfad4f0bfb9696254078eb034f953dea45cd1b2d 100644
--- a/tools/output/md5.c
+++ b/tools/output/md5.c
@@ -31,7 +31,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <sys/stat.h>
 #include "common/intops.h"
diff --git a/tools/output/y4m2.c b/tools/output/y4m2.c
index 8766f64868231e8003117d91e7882c8c34b3cb22..40411d15abf55a5dda71efaaa614e11485de5b75 100644
--- a/tools/output/y4m2.c
+++ b/tools/output/y4m2.c
@@ -32,7 +32,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <sys/stat.h>
 #include "output/muxer.h"
diff --git a/tools/output/yuv.c b/tools/output/yuv.c
index 406f28418893217022c8daad824b8c5557ae42e9..e0c0ec47d914f7bf5cd97aecb88119f77f67f39c 100644
--- a/tools/output/yuv.c
+++ b/tools/output/yuv.c
@@ -31,7 +31,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <sys/stat.h>
 #include "output/muxer.h"