
Compare revisions

Changes are shown as if the source revision were being merged into the target revision.

Commits on Source (195)
Showing 1559 additions and 1908 deletions
......@@ -4,56 +4,56 @@ stages:
- test
.debian-amd64-common:
image: registry.videolan.org/dav1d-debian-unstable:20240406142551
image: registry.videolan.org/dav1d-debian-unstable:20250207200301
stage: build
tags:
- docker
- amd64
.debian-amd64-minimum:
image: registry.videolan.org/dav1d-debian-minimum:20240406142551
image: registry.videolan.org/dav1d-debian-minimum:20250207200301
stage: build
tags:
- docker
- amd64
.debian-llvm-mingw-common:
image: registry.videolan.org/vlc-debian-llvm-msvcrt:20240415145055
image: registry.videolan.org/vlc-debian-llvm-msvcrt:20250305204125
stage: build
tags:
- docker
- amd64
.debian-aarch64-common:
image: registry.videolan.org/dav1d-debian-bookworm-aarch64:20240401050239
image: registry.videolan.org/dav1d-debian-bookworm-aarch64:20250215002814
stage: build
tags:
- docker
- aarch64
.debian-armv7-common:
image: registry.videolan.org/dav1d-debian-bookworm-armv7:20240401050040
image: registry.videolan.org/dav1d-debian-bookworm-armv7:20250215014239
stage: build
tags:
- docker
- armv7
.debian-ppc64le-common:
image: registry.videolan.org/dav1d-debian-unstable-ppc64le:20240401050321
image: registry.videolan.org/dav1d-debian-unstable-ppc64le:20250215003029
stage: build
tags:
- docker
- ppc64le
.android-common:
image: registry.videolan.org/vlc-debian-android:20240406142551
image: registry.videolan.org/vlc-debian-android:20241118101328
stage: build
tags:
- docker
- amd64
.debian-wasm-emscripten-common:
image: registry.videolan.org/vlc-debian-wasm-emscripten:20240313095757
image: registry.videolan.org/vlc-debian-wasm-emscripten:20250207201514
stage: build
tags:
- docker
......@@ -192,7 +192,7 @@ build-debian-avx:
variables:
CFLAGS: '-mavx'
script:
- meson setup build --buildtype debug
- meson setup build --buildtype debugoptimized
--werror
- ninja -C build
- cd build
......@@ -215,7 +215,7 @@ build-debian-avx512:
variables:
CFLAGS: '-mavx'
script:
- meson setup build --buildtype debug
- meson setup build --buildtype debugoptimized
--werror
- ninja -C build
- cd build
......@@ -410,6 +410,8 @@ build-debian-armv7-clang-5:
build-debian-ppc64le:
extends: .debian-ppc64le-common
variables:
CC: gcc-13
script:
- meson setup build --buildtype release
-Dtrim_dsp=false
......@@ -719,6 +721,8 @@ test-debian-ppc64le:
extends:
- .debian-ppc64le-common
- .test-common
variables:
CC: gcc-13
needs: ["build-debian-ppc64le"]
script:
- meson setup build --buildtype release
......@@ -740,7 +744,7 @@ test-debian-riscv64:
-Dtrim_dsp=false
--cross-file package/crossfiles/riscv64-linux.meson
- ninja -C build
- cd build && time meson test -v --timeout-multiplier 4
- cd build && time meson test -v --timeout-multiplier 10
variables:
QEMU_LD_PREFIX: /usr/riscv64-linux-gnu/
parallel:
......@@ -762,7 +766,7 @@ test-debian-aarch64-qemu:
-Dtrim_dsp=false
--cross-file package/crossfiles/aarch64-linux.meson
- ninja -C build
- cd build && time meson test -v --timeout-multiplier 4
- cd build && time meson test -v --timeout-multiplier 10
variables:
QEMU_LD_PREFIX: /usr/aarch64-linux-gnu/
parallel:
......@@ -808,7 +812,7 @@ test-debian-loongarch64:
-Dtrim_dsp=false
--cross-file package/crossfiles/loongarch64-linux.meson
- ninja -C build
- cd build && time meson test -v --timeout-multiplier 4
- cd build && time meson test -v --timeout-multiplier 10
.test-argon-script: &test-argon-script
- meson setup build --buildtype release
......
Changes for 1.5.1 'Sonic':
--------------------------
1.5.1 is a minor release of dav1d, focusing on optimizations and stack reduction:
- Rewrite of the looprestoration (SGR, wiener) to reduce stack usage
- Rewrite of {put,prep}_scaled functions
Now, the required stack space for dav1d should be 62 KB on x86_64 and
58 KB on arm and aarch64.
- Improvements on the SSSE3 SGR
- Improvements on ARM32/ARM64 looprestoration optimizations
- RISC-V: blend optimizations for high bitdepth
- Power9: blend optimizations for 8bpc
- Port RISC-V to POSIX/non-Linux OS
- AArch64: Add Neon implementation of load_tmvs
- Fix a rare but possible deadlock in flush()
Changes for 1.5.0 'Sonic':
--------------------------
1.5.0 is a major release of dav1d that:
- WARNING: we removed some of the SSE2 optimizations, so if you care about
systems without SSSE3, you should be careful when updating!
- Add Arm OpenBSD run-time CPU feature detection
- Optimize index offset calculations for decode_coefs
- picture: copy HDR10+ and T35 metadata only to visible frames
- SSSE3 new optimizations for 6-tap (8bit and hbd)
- AArch64/SVE: Add HBD subpel filters using 128-bit SVE2
- AArch64: Add USMMLA implementation for 6-tap H/HV
- AArch64: Optimize Armv8.0 NEON for HBD horizontal filters and 6-tap filters
- Power9: Optimized ITX up to 16x4.
- Loongarch: numerous optimizations
- RISC-V optimizations for pal, cdef_filter, ipred, mc_blend, mc_bdir, itx
- Allow playing videos in full-screen mode in dav1dplay
Changes for 1.4.3 'Road Runner':
--------------------------------
1.4.3 is a small release focused on security issues
- AArch64: Fix potential out of bounds access in DotProd H/HV filters
- cli: Prevent buffer over-read
Changes for 1.4.2 'Road Runner':
--------------------------------
......
......@@ -120,6 +120,7 @@ static void dp_settings_print_usage(const char *const app,
" --highquality: enable high quality rendering\n"
" --zerocopy/-z: enable zero copy upload path\n"
" --gpugrain/-g: enable GPU grain synthesis\n"
" --fullscreen/-f: enable full screen mode\n"
" --version/-v: print version and exit\n"
" --renderer/-r: select renderer backend (default: auto)\n");
exit(1);
......@@ -144,7 +145,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
Dav1dSettings *lib_settings = &rd_ctx->lib_settings;
// Short options
static const char short_opts[] = "i:vuzgr:";
static const char short_opts[] = "i:vuzgfr:";
enum {
ARG_THREADS = 256,
......@@ -162,6 +163,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
{ "highquality", 0, NULL, ARG_HIGH_QUALITY },
{ "zerocopy", 0, NULL, 'z' },
{ "gpugrain", 0, NULL, 'g' },
{ "fullscreen", 0, NULL, 'f'},
{ "renderer", 0, NULL, 'r'},
{ NULL, 0, NULL, 0 },
};
......@@ -186,6 +188,9 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
case 'g':
settings->gpugrain = true;
break;
case 'f':
settings->fullscreen = true;
break;
case 'r':
settings->renderer_name = optarg;
break;
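The new flag is wired through getopt_long in three places: the short-option string ("i:vuzgfr:") gains 'f', the long-option table gains "fullscreen", and the option switch gains a case that sets the setting. A minimal, self-contained sketch of the same pattern (a hypothetical stand-alone program, not dav1dplay's actual parser):

#include <getopt.h>
#include <stdbool.h>
#include <stdio.h>

int main(int argc, char **argv) {
    bool fullscreen = false;
    static const char short_opts[] = "f";           // 'f' takes no argument
    static const struct option long_opts[] = {
        { "fullscreen", no_argument, NULL, 'f' },
        { NULL, 0, NULL, 0 },
    };
    int o;
    while ((o = getopt_long(argc, argv, short_opts, long_opts, NULL)) != -1) {
        if (o == 'f')
            fullscreen = true;                      // both -f and --fullscreen end up here
    }
    printf("fullscreen: %s\n", fullscreen ? "on" : "off");
    return 0;
}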
......@@ -240,35 +245,37 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv)
return NULL;
}
// Parse and validate arguments
dav1d_default_settings(&rd_ctx->lib_settings);
memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings));
dp_rd_ctx_parse_args(rd_ctx, argc, argv);
// Init SDL2 library
if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) < 0) {
fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
goto fail;
}
// Register a custom event to notify our SDL main thread
// about new frames
rd_ctx->event_types = SDL_RegisterEvents(3);
if (rd_ctx->event_types == UINT32_MAX) {
fprintf(stderr, "Failure to create custom SDL event types!\n");
free(rd_ctx);
return NULL;
goto fail;
}
rd_ctx->fifo = dp_fifo_create(5);
if (rd_ctx->fifo == NULL) {
fprintf(stderr, "Failed to create FIFO for output pictures!\n");
free(rd_ctx);
return NULL;
goto fail;
}
rd_ctx->lock = SDL_CreateMutex();
if (rd_ctx->lock == NULL) {
fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError());
dp_fifo_destroy(rd_ctx->fifo);
free(rd_ctx);
return NULL;
goto fail;
}
// Parse and validate arguments
dav1d_default_settings(&rd_ctx->lib_settings);
memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings));
dp_rd_ctx_parse_args(rd_ctx, argc, argv);
// Select renderer
renderer_info = dp_get_renderer(rd_ctx->settings.renderer_name);
......@@ -279,15 +286,21 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv)
printf("Using %s renderer\n", renderer_info->name);
}
rd_ctx->rd_priv = (renderer_info) ? renderer_info->create_renderer() : NULL;
rd_ctx->rd_priv = (renderer_info) ? renderer_info->create_renderer(&rd_ctx->settings) : NULL;
if (rd_ctx->rd_priv == NULL) {
SDL_DestroyMutex(rd_ctx->lock);
dp_fifo_destroy(rd_ctx->fifo);
free(rd_ctx);
return NULL;
goto fail;
}
return rd_ctx;
fail:
if (rd_ctx->lock)
SDL_DestroyMutex(rd_ctx->lock);
if (rd_ctx->fifo)
dp_fifo_destroy(rd_ctx->fifo);
free(rd_ctx);
SDL_Quit();
return NULL;
}
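The hunk above replaces the repeated per-error cleanup (destroy mutex, destroy FIFO, free, return NULL at every failure site) with a single fail: label that releases only the resources that were actually created, plus SDL_Quit(). A minimal sketch of that goto-cleanup pattern, with plain malloc/free standing in for the SDL and FIFO calls:

#include <stdlib.h>

struct ctx { void *lock, *fifo; };                  // stand-ins for the SDL mutex and picture FIFO

static struct ctx *ctx_create(void) {
    struct ctx *c = calloc(1, sizeof(*c));          // zeroed, so unset members stay NULL
    if (c == NULL)
        return NULL;
    if ((c->lock = malloc(16)) == NULL)             // e.g. SDL_CreateMutex()
        goto fail;
    if ((c->fifo = malloc(16)) == NULL)             // e.g. dp_fifo_create()
        goto fail;
    return c;
fail:
    // Single cleanup path: release only what was actually created.
    if (c->fifo) free(c->fifo);
    if (c->lock) free(c->lock);
    free(c);
    return NULL;
}

int main(void) {
    struct ctx *c = ctx_create();
    if (c) { free(c->fifo); free(c->lock); free(c); }
    return 0;
}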
/**
......@@ -662,10 +675,6 @@ int main(int argc, char **argv)
return 1;
}
// Init SDL2 library
if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) < 0)
return 10;
// Create render context
Dav1dPlayRenderContext *rd_ctx = dp_rd_ctx_create(argc, argv);
if (rd_ctx == NULL) {
......@@ -711,9 +720,7 @@ int main(int argc, char **argv)
if (e->type == SDL_QUIT) {
dp_rd_ctx_request_shutdown(rd_ctx);
dp_fifo_flush(rd_ctx->fifo, destroy_pic);
SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME);
SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_SEEK_FRAME);
num_frame_events = 0;
goto out;
} else if (e->type == SDL_WINDOWEVENT) {
if (e->window.event == SDL_WINDOWEVENT_SIZE_CHANGED) {
// TODO: Handle window resizes
......@@ -724,6 +731,10 @@ int main(int argc, char **argv)
SDL_KeyboardEvent *kbde = (SDL_KeyboardEvent *)e;
if (kbde->keysym.sym == SDLK_SPACE) {
dp_rd_ctx_toggle_pause(rd_ctx);
} else if (kbde->keysym.sym == SDLK_ESCAPE) {
dp_rd_ctx_request_shutdown(rd_ctx);
dp_fifo_flush(rd_ctx->fifo, destroy_pic);
goto out;
} else if (kbde->keysym.sym == SDLK_LEFT ||
kbde->keysym.sym == SDLK_RIGHT)
{
......@@ -776,5 +787,6 @@ out:;
int decoder_ret = 0;
SDL_WaitThread(decoder_thread, &decoder_ret);
dp_rd_ctx_destroy(rd_ctx);
SDL_Quit();
return decoder_ret;
}
......@@ -30,22 +30,32 @@
#include "dav1d/dav1d.h"
#include <SDL.h>
#ifdef HAVE_PLACEBO
#if HAVE_PLACEBO
# include <libplacebo/config.h>
#endif
// Check libplacebo Vulkan rendering
#if defined(HAVE_VULKAN) && defined(SDL_VIDEO_VULKAN)
#if HAVE_VULKAN && defined(SDL_VIDEO_VULKAN)
# if defined(PL_HAVE_VULKAN) && PL_HAVE_VULKAN
# define HAVE_RENDERER_PLACEBO
# define HAVE_PLACEBO_VULKAN
# define HAVE_RENDERER_PLACEBO 1
# define HAVE_PLACEBO_VULKAN 1
# endif
#endif
// Check libplacebo OpenGL rendering
#if defined(PL_HAVE_OPENGL) && PL_HAVE_OPENGL
# define HAVE_RENDERER_PLACEBO
# define HAVE_PLACEBO_OPENGL
# define HAVE_RENDERER_PLACEBO 1
# define HAVE_PLACEBO_OPENGL 1
#endif
#ifndef HAVE_RENDERER_PLACEBO
#define HAVE_RENDERER_PLACEBO 0
#endif
#ifndef HAVE_PLACEBO_VULKAN
#define HAVE_PLACEBO_VULKAN 0
#endif
#ifndef HAVE_PLACEBO_OPENGL
#define HAVE_PLACEBO_OPENGL 0
#endif
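This header now defines every HAVE_* macro to either 0 or 1 and tests it with #if instead of #ifdef. The practical difference: with #ifdef, a macro defined to 0 (for example via -DHAVE_PLACEBO=0 from the build system) still counts as "present", while #if honours the value, and -Wundef can flag misspelled macro names. A small illustration with a hypothetical HAVE_FEATURE macro:

#include <stdio.h>

#define HAVE_FEATURE 0          // hypothetical; the build system sets this to 0 or 1

int main(void) {
#ifdef HAVE_FEATURE             // true even though the value is 0
    puts("ifdef: enabled");
#endif
#if HAVE_FEATURE                // correctly disabled when the macro is defined to 0
    puts("#if: enabled");
#else
    puts("#if: disabled");
#endif
    return 0;
}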
/**
......@@ -61,6 +71,7 @@ typedef struct {
int untimed;
int zerocopy;
int gpugrain;
int fullscreen;
} Dav1dPlaySettings;
#define WINDOW_WIDTH 910
......@@ -82,7 +93,7 @@ typedef struct rdr_info
// Cookie passed to the renderer implementation callbacks
void *cookie;
// Callback to create the renderer
void* (*create_renderer)(void);
void* (*create_renderer)(const Dav1dPlaySettings *settings);
// Callback to destroy the renderer
void (*destroy_renderer)(void *cookie);
// Callback to the render function that renders a previously sent frame
......
......@@ -26,17 +26,17 @@
#include "dp_renderer.h"
#ifdef HAVE_RENDERER_PLACEBO
#if HAVE_RENDERER_PLACEBO
#include <assert.h>
#include <libplacebo/renderer.h>
#include <libplacebo/utils/dav1d.h>
#ifdef HAVE_PLACEBO_VULKAN
#if HAVE_PLACEBO_VULKAN
# include <libplacebo/vulkan.h>
# include <SDL_vulkan.h>
#endif
#ifdef HAVE_PLACEBO_OPENGL
#if HAVE_PLACEBO_OPENGL
# include <libplacebo/opengl.h>
# include <SDL_opengl.h>
#endif
......@@ -53,7 +53,7 @@ typedef struct renderer_priv_ctx
pl_log log;
// Placebo renderer
pl_renderer renderer;
#ifdef HAVE_PLACEBO_VULKAN
#if HAVE_PLACEBO_VULKAN
// Placebo Vulkan handle
pl_vulkan vk;
// Placebo Vulkan instance
......@@ -61,9 +61,11 @@ typedef struct renderer_priv_ctx
// Vulkan surface
VkSurfaceKHR surf;
#endif
#ifdef HAVE_PLACEBO_OPENGL
#if HAVE_PLACEBO_OPENGL
// Placebo OpenGL handle
pl_opengl gl;
// SDL OpenGL context
SDL_GLContext gl_context;
#endif
// Placebo GPU
pl_gpu gpu;
......@@ -77,13 +79,18 @@ typedef struct renderer_priv_ctx
} Dav1dPlayRendererPrivateContext;
static Dav1dPlayRendererPrivateContext*
placebo_renderer_create_common(int window_flags)
placebo_renderer_create_common(const Dav1dPlaySettings *settings, int window_flags)
{
if (settings->fullscreen)
window_flags |= SDL_WINDOW_FULLSCREEN_DESKTOP;
// Create Window
SDL_Window *sdlwin = dp_create_sdl_window(window_flags | SDL_WINDOW_RESIZABLE);
if (sdlwin == NULL)
return NULL;
SDL_ShowCursor(0);
// Alloc
Dav1dPlayRendererPrivateContext *const rd_priv_ctx =
calloc(1, sizeof(Dav1dPlayRendererPrivateContext));
......@@ -118,24 +125,25 @@ static Dav1dPlayRendererPrivateContext*
return rd_priv_ctx;
}
#ifdef HAVE_PLACEBO_OPENGL
static void *placebo_renderer_create_gl(void)
#if HAVE_PLACEBO_OPENGL
static void *placebo_renderer_create_gl(const Dav1dPlaySettings *settings)
{
SDL_Window *sdlwin = NULL;
SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG);
// Common init
Dav1dPlayRendererPrivateContext *rd_priv_ctx =
placebo_renderer_create_common(SDL_WINDOW_OPENGL);
placebo_renderer_create_common(settings, SDL_WINDOW_OPENGL);
if (rd_priv_ctx == NULL)
return NULL;
sdlwin = rd_priv_ctx->win;
SDL_GLContext glcontext = SDL_GL_CreateContext(sdlwin);
SDL_GL_MakeCurrent(sdlwin, glcontext);
rd_priv_ctx->gl_context = SDL_GL_CreateContext(sdlwin);
SDL_GL_MakeCurrent(sdlwin, rd_priv_ctx->gl_context);
rd_priv_ctx->gl = pl_opengl_create(rd_priv_ctx->log, pl_opengl_params(
.allow_software = true,
#ifndef NDEBUG
.debug = true,
#endif
......@@ -173,14 +181,14 @@ static void *placebo_renderer_create_gl(void)
}
#endif
#ifdef HAVE_PLACEBO_VULKAN
static void *placebo_renderer_create_vk(void)
#if HAVE_PLACEBO_VULKAN
static void *placebo_renderer_create_vk(const Dav1dPlaySettings *settings)
{
SDL_Window *sdlwin = NULL;
// Common init
Dav1dPlayRendererPrivateContext *rd_priv_ctx =
placebo_renderer_create_common(SDL_WINDOW_VULKAN);
placebo_renderer_create_common(settings, SDL_WINDOW_VULKAN);
if (rd_priv_ctx == NULL)
return NULL;
......@@ -270,16 +278,18 @@ static void placebo_renderer_destroy(void *cookie)
for (int i = 0; i < 3; i++)
pl_tex_destroy(rd_priv_ctx->gpu, &(rd_priv_ctx->plane_tex[i]));
#ifdef HAVE_PLACEBO_VULKAN
#if HAVE_PLACEBO_VULKAN
if (rd_priv_ctx->vk) {
pl_vulkan_destroy(&(rd_priv_ctx->vk));
vkDestroySurfaceKHR(rd_priv_ctx->vk_inst->instance, rd_priv_ctx->surf, NULL);
pl_vk_inst_destroy(&(rd_priv_ctx->vk_inst));
}
#endif
#ifdef HAVE_PLACEBO_OPENGL
#if HAVE_PLACEBO_OPENGL
if (rd_priv_ctx->gl)
pl_opengl_destroy(&(rd_priv_ctx->gl));
if (rd_priv_ctx->gl_context)
SDL_GL_DeleteContext(rd_priv_ctx->gl_context);
#endif
SDL_DestroyWindow(rd_priv_ctx->win);
......@@ -382,7 +392,7 @@ static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
SDL_UnlockMutex(rd_priv_ctx->lock);
}
#ifdef HAVE_PLACEBO_VULKAN
#if HAVE_PLACEBO_VULKAN
const Dav1dPlayRenderInfo rdr_placebo_vk = {
.name = "placebo-vk",
.create_renderer = placebo_renderer_create_vk,
......@@ -397,7 +407,7 @@ const Dav1dPlayRenderInfo rdr_placebo_vk = {
const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL };
#endif
#ifdef HAVE_PLACEBO_OPENGL
#if HAVE_PLACEBO_OPENGL
const Dav1dPlayRenderInfo rdr_placebo_gl = {
.name = "placebo-gl",
.create_renderer = placebo_renderer_create_gl,
......
......@@ -43,12 +43,18 @@ typedef struct renderer_priv_ctx
SDL_Texture *tex;
} Dav1dPlayRendererPrivateContext;
static void *sdl_renderer_create(void)
static void *sdl_renderer_create(const Dav1dPlaySettings *settings)
{
SDL_Window *win = dp_create_sdl_window(0);
int window_flags = 0;
if (settings->fullscreen)
window_flags |= SDL_WINDOW_FULLSCREEN_DESKTOP;
SDL_Window *win = dp_create_sdl_window(window_flags);
if (win == NULL)
return NULL;
SDL_ShowCursor(0);
// Alloc
Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext));
if (rd_priv_ctx == NULL) {
......@@ -79,7 +85,9 @@ static void sdl_renderer_destroy(void *cookie)
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
SDL_DestroyTexture(rd_priv_ctx->tex);
SDL_DestroyRenderer(rd_priv_ctx->renderer);
SDL_DestroyWindow(rd_priv_ctx->win);
SDL_DestroyMutex(rd_priv_ctx->lock);
free(rd_priv_ctx);
}
......@@ -142,6 +150,7 @@ static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic,
if (texture == NULL) {
texture = SDL_CreateTexture(rd_priv_ctx->renderer, SDL_PIXELFORMAT_IYUV,
SDL_TEXTUREACCESS_STREAMING, width, height);
SDL_RenderSetLogicalSize(rd_priv_ctx->renderer, width, height);
}
SDL_UpdateYUVTexture(texture, NULL,
......
......@@ -48,19 +48,23 @@ if sdl2_dependency.found()
placebo_dependency = dependency('libplacebo', version: '>= 4.160.0', required: false)
if placebo_dependency.found()
have_vulkan = false
have_placebo = placebo_dependency.found()
if have_placebo
dav1dplay_deps += placebo_dependency
dav1dplay_cflags += '-DHAVE_PLACEBO'
# If libplacebo is found, we might be able to use Vulkan
# with it, in which case we need the Vulkan library too.
vulkan_dependency = dependency('vulkan', required: false)
if vulkan_dependency.found()
dav1dplay_deps += vulkan_dependency
dav1dplay_cflags += '-DHAVE_VULKAN'
have_vulkan = true
endif
endif
dav1dplay_cflags += '-DHAVE_PLACEBO=' + (have_placebo ? '1' : '0')
dav1dplay_cflags += '-DHAVE_VULKAN=' + (have_vulkan ? '1' : '0')
dav1dplay = executable('dav1dplay',
dav1dplay_sources,
rev_target,
......
......@@ -123,6 +123,12 @@
#define EXTERN extern
#endif
#if ARCH_X86_64 && __has_attribute(model)
#define ATTR_MCMODEL_SMALL __attribute__((model("small")))
#else
#define ATTR_MCMODEL_SMALL
#endif
#ifdef __clang__
#define NO_SANITIZE(x) __attribute__((no_sanitize(x)))
#else
......@@ -189,9 +195,13 @@ static inline int clzll(const unsigned long long mask) {
#ifndef static_assert
#define CHECK_OFFSET(type, field, name) \
struct check_##type##_##field { int x[(name == offsetof(type, field)) ? 1 : -1]; }
#define CHECK_SIZE(type, size) \
struct check_##type##_size { int x[(size == sizeof(type)) ? 1 : -1]; }
#else
#define CHECK_OFFSET(type, field, name) \
static_assert(name == offsetof(type, field), #field)
#define CHECK_SIZE(type, size) \
static_assert(size == sizeof(type), #type)
#endif
#ifdef _MSC_VER
......
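CHECK_SIZE mirrors the existing CHECK_OFFSET macro: with C11 it expands to a static_assert, and without it it declares a struct whose array member gets length -1 (a compile error) when the size check fails. A hedged usage sketch with a hypothetical Example type (assumes the usual layout where two uint32_t members occupy 8 bytes):

#include <assert.h>             // static_assert (C11)
#include <stdint.h>

typedef struct { uint32_t a; uint32_t b; } Example; // hypothetical type

// Pre-C11 fallback used by CHECK_SIZE: a negative array length fails to compile.
struct check_Example_size { int x[(sizeof(Example) == 8) ? 1 : -1]; };

// With C11 the same check is simply:
static_assert(sizeof(Example) == 8, "Example");

int main(void) { return 0; }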
......@@ -65,11 +65,11 @@ static inline int apply_sign64(const int v, const int64_t s) {
}
static inline int ulog2(const unsigned v) {
return 31 - clz(v);
return 31 ^ clz(v);
}
static inline int u64log2(const uint64_t v) {
return 63 - clzll(v);
return 63 ^ clzll(v);
}
static inline unsigned inv_recenter(const unsigned r, const unsigned v) {
......
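The ulog2/u64log2 change swaps subtraction for XOR. Since 31 is all ones in the low five bits, 31 - x never borrows for x in [0, 31], so 31 - clz(v) == 31 ^ clz(v) for every nonzero v; the result is identical, but the XOR form can let compilers emit a bit-scan instruction (e.g. x86 bsr) directly. A quick check of the identity, using __builtin_clz as a stand-in for dav1d's clz() wrapper:

#include <assert.h>

static int ulog2_sub(unsigned v) { return 31 - __builtin_clz(v); }  // old form
static int ulog2_xor(unsigned v) { return 31 ^ __builtin_clz(v); }  // new form

int main(void) {
    for (unsigned v = 1; v; v <<= 1)                 // every power of two
        assert(ulog2_sub(v) == ulog2_xor(v));
    assert(ulog2_xor(1) == 0 && ulog2_xor(0x80000000u) == 31);
    return 0;
}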
......@@ -13,7 +13,9 @@
#define __GETOPT_H__
/* All the headers include this file. */
#ifdef _WIN32
#include <crtdefs.h>
#endif
#ifdef __cplusplus
extern "C" {
......
......@@ -187,14 +187,10 @@ typedef struct Dav1dContentLightLevel {
} Dav1dContentLightLevel;
typedef struct Dav1dMasteringDisplay {
///< 0.16 fixed point
uint16_t primaries[3][2];
///< 0.16 fixed point
uint16_t white_point[2];
///< 24.8 fixed point
uint32_t max_luminance;
///< 18.14 fixed point
uint32_t min_luminance;
uint16_t primaries[3][2]; ///< 0.16 fixed point
uint16_t white_point[2]; ///< 0.16 fixed point
uint32_t max_luminance; ///< 24.8 fixed point
uint32_t min_luminance; ///< 18.14 fixed point
} Dav1dMasteringDisplay;
typedef struct Dav1dITUTT35 {
......
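The reflowed comments keep the same fixed-point formats: primaries and white_point are 0.16 (divide by 1 << 16), max_luminance is 24.8 (divide by 1 << 8), and min_luminance is 18.14 (divide by 1 << 14). A small conversion sketch using those factors (the fx() helper and the sample values are illustrative, not part of the dav1d API):

#include <stdint.h>
#include <stdio.h>

// Illustrative helper: convert a fixed-point field with frac_bits fractional bits.
static double fx(uint32_t v, int frac_bits) { return v / (double)(1u << frac_bits); }

int main(void) {
    uint16_t white_point_x = 20493;              // ~0.3127 (D65 x) in 0.16 fixed point
    uint32_t max_luminance = 1000u << 8;         // 1000 cd/m^2 in 24.8 fixed point
    uint32_t min_luminance = 82;                 // ~0.005 cd/m^2 in 18.14 fixed point
    printf("white x = %.4f, max = %.2f cd/m^2, min = %.4f cd/m^2\n",
           fx(white_point_x, 16), fx(max_luminance, 8), fx(min_luminance, 14));
    return 0;
}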
......@@ -22,24 +22,15 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# installed version.h header generation
version_h_data = configuration_data()
version_h_data.set('DAV1D_API_VERSION_MAJOR', dav1d_api_version_major)
version_h_data.set('DAV1D_API_VERSION_MINOR', dav1d_api_version_minor)
version_h_data.set('DAV1D_API_VERSION_PATCH', dav1d_api_version_revision)
version_h_target = configure_file(input: 'version.h.in',
output: 'version.h',
configuration: version_h_data)
dav1d_api_headers = [
'common.h',
'data.h',
'dav1d.h',
'headers.h',
'picture.h',
'version.h',
]
# install headers
install_headers(dav1d_api_headers,
version_h_target,
subdir : 'dav1d')
/*
* Copyright © 2019, VideoLAN and dav1d authors
* Copyright © 2019-2024, VideoLAN and dav1d authors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
......@@ -31,9 +31,9 @@
extern "C" {
#endif
#define DAV1D_API_VERSION_MAJOR @DAV1D_API_VERSION_MAJOR@
#define DAV1D_API_VERSION_MINOR @DAV1D_API_VERSION_MINOR@
#define DAV1D_API_VERSION_PATCH @DAV1D_API_VERSION_PATCH@
#define DAV1D_API_VERSION_MAJOR 7
#define DAV1D_API_VERSION_MINOR 0
#define DAV1D_API_VERSION_PATCH 0
/**
* Extract version components from the value returned by
......
# Copyright © 2018-2022, VideoLAN and dav1d authors
# Copyright © 2018-2024, VideoLAN and dav1d authors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
......@@ -23,19 +23,13 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '1.4.2',
version: '1.5.1',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.49.0')
dav1d_soname_version = '7.0.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
dav1d_api_version_revision = dav1d_api_version_array[2]
dav1d_src_root = meson.current_source_dir()
cc = meson.get_compiler('c')
......@@ -48,7 +42,18 @@ cdata_asm = configuration_data()
# Include directories
dav1d_inc_dirs = include_directories(['.', 'include/dav1d', 'include'])
dav1d_api_version_major = cc.get_define('DAV1D_API_VERSION_MAJOR',
prefix: '#include "dav1d/version.h"',
include_directories: dav1d_inc_dirs).strip()
dav1d_api_version_minor = cc.get_define('DAV1D_API_VERSION_MINOR',
prefix: '#include "dav1d/version.h"',
include_directories: dav1d_inc_dirs).strip()
dav1d_api_version_revision = cc.get_define('DAV1D_API_VERSION_PATCH',
prefix: '#include "dav1d/version.h"',
include_directories: dav1d_inc_dirs).strip()
dav1d_soname_version = '@0@.@1@.@2@'.format(dav1d_api_version_major,
dav1d_api_version_minor,
dav1d_api_version_revision)
#
# Option handling
......@@ -98,6 +103,10 @@ if host_machine.system() in ['linux', 'gnu', 'emscripten']
add_project_arguments('-D_GNU_SOURCE', language: 'c')
endif
have_clock_gettime = false
have_posix_memalign = false
have_memalign = false
have_aligned_alloc = false
if host_machine.system() == 'windows'
cdata.set('_WIN32_WINNT', '0x0601')
cdata.set('UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs
......@@ -138,27 +147,32 @@ if host_machine.system() == 'windows'
rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
rc_data.set('COPYRIGHT_YEARS', '2018-2024')
rc_data.set('COPYRIGHT_YEARS', '2018-2025')
else
thread_dependency = dependency('threads')
thread_compat_dep = []
rt_dependency = []
if cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args)
cdata.set('HAVE_CLOCK_GETTIME', 1)
have_clock_gettime = true
elif host_machine.system() not in ['darwin', 'ios', 'tvos']
rt_dependency = cc.find_library('rt', required: false)
if not cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args, dependencies : rt_dependency)
error('clock_gettime not found')
endif
cdata.set('HAVE_CLOCK_GETTIME', 1)
have_clock_gettime = true
endif
if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
cdata.set('HAVE_POSIX_MEMALIGN', 1)
endif
have_posix_memalign = cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
have_memalign = cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
have_aligned_alloc = cc.has_function('aligned_alloc', prefix : '#include <stdlib.h>', args : test_args)
endif
cdata.set10('HAVE_CLOCK_GETTIME', have_clock_gettime)
cdata.set10('HAVE_POSIX_MEMALIGN', have_posix_memalign)
cdata.set10('HAVE_MEMALIGN', have_memalign)
cdata.set10('HAVE_ALIGNED_ALLOC', have_aligned_alloc)
# check for fseeko on android. It is not always available if _FILE_OFFSET_BITS is defined to 64
have_fseeko = true
if host_machine.system() == 'android'
......@@ -175,12 +189,12 @@ if host_machine.system() == 'android'
endif
libdl_dependency = []
have_dlsym = false
if host_machine.system() == 'linux'
libdl_dependency = cc.find_library('dl', required : false)
if cc.has_function('dlsym', prefix : '#include <dlfcn.h>', args : test_args, dependencies : libdl_dependency)
cdata.set('HAVE_DLSYM', 1)
endif
have_dlsym = cc.has_function('dlsym', prefix : '#include <dlfcn.h>', args : test_args, dependencies : libdl_dependency)
endif
cdata.set10('HAVE_DLSYM', have_dlsym)
libm_dependency = cc.find_library('m', required: false)
......@@ -209,19 +223,13 @@ if host_machine.cpu_family().startswith('wasm')
stdatomic_dependencies += thread_dependency.partial_dependency(compile_args: true)
endif
if cc.check_header('unistd.h')
cdata.set('HAVE_UNISTD_H', 1)
endif
if cc.check_header('io.h')
cdata.set('HAVE_IO_H', 1)
endif
if cc.check_header('pthread_np.h')
cdata.set('HAVE_PTHREAD_NP_H', 1)
test_args += '-DHAVE_PTHREAD_NP_H'
endif
cdata.set10('HAVE_SYS_TYPES_H', cc.check_header('sys/types.h'))
cdata.set10('HAVE_UNISTD_H', cc.check_header('unistd.h'))
cdata.set10('HAVE_IO_H', cc.check_header('io.h'))
have_pthread_np = cc.check_header('pthread_np.h')
cdata.set10('HAVE_PTHREAD_NP_H', have_pthread_np)
test_args += '-DHAVE_PTHREAD_NP_H=' + (have_pthread_np ? '1' : '0')
# Function checks
......@@ -234,35 +242,32 @@ else
getopt_dependency = []
endif
have_getauxval = false
have_elf_aux_info = false
if (host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm') or
host_machine.cpu_family().startswith('loongarch') or
host_machine.cpu() == 'ppc64le' or
host_machine.cpu_family().startswith('riscv'))
if cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
cdata.set('HAVE_GETAUXVAL', 1)
endif
if cc.has_function('elf_aux_info', prefix : '#include <sys/auxv.h>', args : test_args)
cdata.set('HAVE_ELF_AUX_INFO', 1)
endif
have_getauxval = cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
have_elf_aux_info = cc.has_function('elf_aux_info', prefix : '#include <sys/auxv.h>', args : test_args)
endif
cdata.set10('HAVE_GETAUXVAL', have_getauxval)
cdata.set10('HAVE_ELF_AUX_INFO', have_elf_aux_info)
pthread_np_prefix = '''
#include <pthread.h>
#ifdef HAVE_PTHREAD_NP_H
#if HAVE_PTHREAD_NP_H
#include <pthread_np.h>
#endif
'''
if cc.has_function('pthread_getaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
cdata.set('HAVE_PTHREAD_GETAFFINITY_NP', 1)
endif
if cc.has_function('pthread_setaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
cdata.set('HAVE_PTHREAD_SETAFFINITY_NP', 1)
endif
cdata.set10('HAVE_PTHREAD_GETAFFINITY_NP', cc.has_function('pthread_getaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency))
cdata.set10('HAVE_PTHREAD_SETAFFINITY_NP', cc.has_function('pthread_setaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency))
cdata.set10('HAVE_PTHREAD_SETNAME_NP', cc.has_function('pthread_setname_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency))
cdata.set10('HAVE_PTHREAD_SET_NAME_NP', cc.has_function('pthread_set_name_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency))
if cc.compiles('int x = _Generic(0, default: 0);', name: '_Generic', args: test_args)
cdata.set('HAVE_C11_GENERIC', 1)
endif
cdata.set10('HAVE_C11_GENERIC', cc.compiles('int x = _Generic(0, default: 0);', name: '_Generic', args: test_args))
# Compiler flag tests
......@@ -341,8 +346,49 @@ if host_machine.cpu_family().startswith('x86')
cdata_asm.set('STACK_ALIGNMENT', stack_alignment)
endif
#
# ASM specific stuff
#
use_gaspp = false
if (is_asm_enabled and
(host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm')) and
cc.get_argument_syntax() == 'msvc' and
(cc.get_id() != 'clang-cl' or meson.version().version_compare('<0.58.0')))
gaspp = find_program('gas-preprocessor.pl')
use_gaspp = true
gaspp_args = [
'-as-type', 'armasm',
'-arch', host_machine.cpu_family(),
'--',
host_machine.cpu_family() == 'aarch64' ? 'armasm64' : 'armasm',
'-nologo',
'-I@0@'.format(dav1d_src_root),
'-I@0@/'.format(meson.current_build_dir()),
]
gaspp_gen = generator(gaspp,
output: '@BASENAME@.obj',
arguments: gaspp_args + [
'@INPUT@',
'-c',
'-o', '@OUTPUT@'
])
endif
cdata.set10('ARCH_AARCH64', host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64')
cdata.set10('ARCH_ARM', host_machine.cpu_family().startswith('arm') and host_machine.cpu() != 'arm64')
have_as_func = false
have_as_arch = false
aarch64_extensions = {
'dotprod': 'udot v0.4s, v0.16b, v0.16b',
'i8mm': 'usdot v0.4s, v0.16b, v0.16b',
'sve': 'whilelt p0.s, x0, x1',
'sve2': 'sqrdmulh z0.s, z0.s, z0.s',
}
supported_aarch64_archexts = []
supported_aarch64_instructions = []
if (is_asm_enabled and
(host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm')))
......@@ -353,7 +399,6 @@ if (is_asm_enabled and
);
'''
have_as_func = cc.compiles(as_func_code)
cdata.set10('HAVE_AS_FUNC', have_as_func)
# fedora package build infrastructure uses a gcc specs file to enable
# '-fPIE' by default. The chosen way only adds '-fPIE' to the C compiler
......@@ -374,7 +419,6 @@ if (is_asm_enabled and
if host_machine.cpu_family() == 'aarch64'
have_as_arch = cc.compiles('''__asm__ (".arch armv8-a");''')
cdata.set10('HAVE_AS_ARCH_DIRECTIVE', have_as_arch)
as_arch_str = ''
if have_as_arch
as_arch_level = 'armv8-a'
......@@ -403,36 +447,53 @@ if (is_asm_enabled and
cdata.set('AS_ARCH_LEVEL', as_arch_level)
as_arch_str = '".arch ' + as_arch_level + '\\n"'
endif
extensions = {
'dotprod': 'udot v0.4s, v0.16b, v0.16b',
'i8mm': 'usdot v0.4s, v0.16b, v0.16b',
'sve': 'whilelt p0.s, x0, x1',
'sve2': 'sqrdmulh z0.s, z0.s, z0.s',
}
foreach name, instr : extensions
# Test for support for the various extensions. First test if
# the assembler supports the .arch_extension directive for
# enabling/disabling the extension, then separately check whether
# the instructions themselves are supported. Even if .arch_extension
# isn't supported, we may be able to assemble the instructions
# if the .arch level includes support for them.
code = '__asm__ (' + as_arch_str
code += '".arch_extension ' + name + '\\n"'
code += ');'
supports_archext = cc.compiles(code)
cdata.set10('HAVE_AS_ARCHEXT_' + name.to_upper() + '_DIRECTIVE', supports_archext)
code = '__asm__ (' + as_arch_str
if supports_archext
if use_gaspp
python3 = import('python').find_installation()
endif
foreach name, instr : aarch64_extensions
if use_gaspp
f = configure_file(
command: [python3, '-c', 'import sys; print(sys.argv[1])', '@0@'.format(instr)],
output: 'test-@0@.S'.format(name),
capture: true)
r = run_command(gaspp, gaspp_args, f, '-c', '-o', meson.current_build_dir() / 'test-' + name + '.obj', check: false)
message('Checking for gaspp/armasm64 ' + name.to_upper() + ': ' + (r.returncode() == 0 ? 'YES' : 'NO'))
if r.returncode() == 0
supported_aarch64_instructions += name
endif
else
# Test for support for the various extensions. First test if
# the assembler supports the .arch_extension directive for
# enabling/disabling the extension, then separately check whether
# the instructions themselves are supported. Even if .arch_extension
# isn't supported, we may be able to assemble the instructions
# if the .arch level includes support for them.
code = '__asm__ (' + as_arch_str
code += '".arch_extension ' + name + '\\n"'
code += ');'
supports_archext = cc.compiles(code)
code = '__asm__ (' + as_arch_str
if supports_archext
supported_aarch64_archexts += name
code += '".arch_extension ' + name + '\\n"'
endif
code += '"' + instr + '\\n"'
code += ');'
if cc.compiles(code, name: name.to_upper())
supported_aarch64_instructions += name
endif
endif
code += '"' + instr + '\\n"'
code += ');'
supports_instr = cc.compiles(code, name: name.to_upper())
cdata.set10('HAVE_' + name.to_upper(), supports_instr)
endforeach
endif
endif
cdata.set10('HAVE_AS_FUNC', have_as_func)
cdata.set10('HAVE_AS_ARCH_DIRECTIVE', have_as_arch)
foreach name, _ : aarch64_extensions
cdata.set10('HAVE_AS_ARCHEXT_' + name.to_upper() + '_DIRECTIVE', name in supported_aarch64_archexts)
cdata.set10('HAVE_' + name.to_upper(), name in supported_aarch64_instructions)
endforeach
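For the non-gaspp path above, each iteration builds a tiny compile test: an __asm__ block that starts with the detected .arch level, adds the .arch_extension directive when the assembler accepts it, and then tries the probe instruction itself. For the dotprod entry the generated test is roughly the following C snippet (it only assembles on an AArch64 toolchain whose assembler knows the instruction, which is exactly what the check relies on):

// Approximate compile test generated for the 'dotprod' entry.
__asm__ (
    ".arch armv8-a\n"                 // as_arch_str, when the .arch directive is usable
    ".arch_extension dotprod\n"       // only included if the directive itself compiles
    "udot v0.4s, v0.16b, v0.16b\n"    // the probe instruction for dotprod
);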
cdata.set10('ARCH_X86', host_machine.cpu_family().startswith('x86'))
cdata.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64')
cdata.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86')
......@@ -466,9 +527,6 @@ if (host_machine.system() in ['darwin', 'ios', 'tvos'] or
cdata_asm.set10('PREFIX', true)
endif
#
# ASM specific stuff
#
if is_asm_enabled and host_machine.cpu_family().startswith('x86')
# NASM compiler support
......@@ -519,30 +577,6 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
])
endif
use_gaspp = false
if (is_asm_enabled and
(host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm')) and
cc.get_argument_syntax() == 'msvc' and
(cc.get_id() != 'clang-cl' or meson.version().version_compare('<0.58.0')))
gaspp = find_program('gas-preprocessor.pl')
use_gaspp = true
gaspp_gen = generator(gaspp,
output: '@BASENAME@.obj',
arguments: [
'-as-type', 'armasm',
'-arch', host_machine.cpu_family(),
'--',
host_machine.cpu_family() == 'aarch64' ? 'armasm64' : 'armasm',
'-nologo',
'-I@0@'.format(dav1d_src_root),
'-I@0@/'.format(meson.current_build_dir()),
'@INPUT@',
'-c',
'-o', '@OUTPUT@'
])
endif
if is_asm_enabled and host_machine.cpu_family().startswith('riscv')
as_option_code = '''__asm__ (
".option arch, +v\n"
......
[binaries]
c = ['clang', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk']
cpp = ['clang++', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk']
objc = ['clang', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk']
objcpp = ['clang++', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk']
ar = 'ar'
strip = 'strip'
[built-in options]
c_args = ['-miphoneos-version-min=11.0']
cpp_args = ['-miphoneos-version-min=11.0']
c_link_args = ['-miphoneos-version-min=11.0']
cpp_link_args = ['-miphoneos-version-min=11.0']
objc_args = ['-miphoneos-version-min=11.0']
objcpp_args = ['-miphoneos-version-min=11.0']
[properties]
root = '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer'
needs_exe_wrapper = true
[host_machine]
system = 'darwin'
subsystem = 'ios'
kernel = 'xnu'
cpu_family = 'aarch64'
cpu = 'aarch64'
endian = 'little'
[binaries]
c = ['clang', '-arch', 'x86_64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk']
cpp = ['clang++', '-arch', 'x86_64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk']
objc = ['clang', '-arch', 'x86_64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk']
objcpp = ['clang++', '-arch', 'x86_64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk']
ar = 'ar'
strip = 'strip'
[built-in options]
c_args = ['-miphoneos-version-min=11.0']
cpp_args = ['-miphoneos-version-min=11.0']
c_link_args = ['-miphoneos-version-min=11.0']
cpp_link_args = ['-miphoneos-version-min=11.0']
objc_args = ['-miphoneos-version-min=11.0']
objcpp_args = ['-miphoneos-version-min=11.0']
[properties]
root = '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer'
needs_exe_wrapper = true
[host_machine]
system = 'darwin'
subsystem = 'ios-simulator'
kernel = 'xnu'
cpu_family = 'x86_64'
cpu = 'x86_64'
endian = 'little'
......@@ -41,691 +41,540 @@ right_ext_mask:
endconst
// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[8], intptr_t w,
// int h, enum LrEdgeFlags edges);
// const pixel *src, const int16_t fh[8],
// const int w,
// const enum LrEdgeFlags edges);
function wiener_filter_h_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
ldrd r6, r7, [sp, #108]
mov r8, r5
vld1.16 {q0}, [r4, :128]
movw r9, #(1 << 14) - (1 << 2)
vdup.16 q14, r9
push {r4-r5,lr}
ldrd r4, r5, [sp, #12]
vld1.16 {q0}, [r3, :128]
movw r12, #(1 << 14) - (1 << 2)
vdup.16 q14, r12
vmov.s16 q15, #2048
// Calculate mid_stride
add r10, r5, #7
bic r10, r10, #7
lsl r10, r10, #1
// Set up pointers for reading/writing alternate rows
add r12, r0, r10
lsl r10, r10, #1
add lr, r2, r3
lsl r3, r3, #1
// Subtract the aligned width from mid_stride
add r11, r5, #7
bic r11, r11, #7
sub r10, r10, r11, lsl #1
// Subtract the number of pixels read from the source stride
add r11, r11, #8
sub r3, r3, r11
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst r7, #1 // LR_HAVE_LEFT
beq 2f
tst r5, #1 // LR_HAVE_LEFT
beq 1f
// LR_HAVE_LEFT
cmp r1, #0
bne 0f
// left == NULL
sub r2, r2, #3
sub lr, lr, #3
b 1f
0: // LR_HAVE_LEFT, left != NULL
2: // !LR_HAVE_LEFT, increase the stride.
// For this case we don't read the left 3 pixels from the src pointer,
// but shift it as if we had done that.
add r3, r3, #3
1: // Loop vertically
vld1.8 {q2}, [r2]!
vld1.8 {q9}, [lr]!
b 2f
tst r7, #1 // LR_HAVE_LEFT
beq 0f
cmp r1, #0
beq 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.32 {d3[1]}, [r1]!
// Move r2/lr back to account for the last 3 bytes we loaded earlier,
vld1.8 {q2}, [r2]!
vld1.32 {d3[1]}, [r1]
// Move r2 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub r2, r2, #3
sub lr, lr, #3
vld1.32 {d17[1]}, [r1]!
vext.8 q2, q1, q2, #13
vext.8 q9, q8, q9, #13
b 2f
0:
1:
vld1.8 {q2}, [r2]!
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q2 to have 3x the first byte at the front.
vdup.8 q1, d4[0]
vdup.8 q8, d18[0]
// Move r2 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub r2, r2, #3
sub lr, lr, #3
vext.8 q2, q1, q2, #13
vext.8 q9, q8, q9, #13
2:
vmovl.u8 q1, d4
vmovl.u8 q2, d5
vmovl.u8 q8, d18
vmovl.u8 q9, d19
tst r7, #2 // LR_HAVE_RIGHT
tst r5, #2 // LR_HAVE_RIGHT
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub r9, r5, #14
ldrb r11, [r2, r9]
ldrb r9, [lr, r9]
// Fill q12/q13 with the right padding pixel
vdup.16 q12, r11
vdup.16 q13, r9
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r5, #11
cmp r4, #11
bge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// The padding pixel is q1/2.h[w+2]. r2 points at the next input, ie
// q1/2.h[16]. Thus read from r2[w-14] to find the padding pixel.
sub r12, r4, #14
// Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel_local r4, right_ext_mask, -6
sub r4, r4, r5, lsl #1
vld1.8 {q10, q11}, [r4]
movrel_local r3, right_ext_mask, -6
ldrb r12, [r2, r12]
sub r3, r3, r4, lsl #1
vdup.16 q13, r12
vld1.8 {q10, q11}, [r3]
vbit q1, q12, q10
vbit q2, q12, q11
vbit q8, q13, q10
vbit q9, q13, q11
vbit q1, q13, q10
vbit q2, q13, q11
4: // Loop horizontally
vext.8 q11, q1, q2, #4
vext.8 q5, q1, q2, #8
vext.8 q10, q1, q2, #2
vext.8 q6, q1, q2, #10
vext.8 q7, q1, q2, #12
vext.8 q4, q1, q2, #6
vadd.i16 q5, q5, q11
vadd.i16 q6, q6, q10
vadd.i16 q7, q7, q1
vmul.s16 q3, q4, d0[3]
vmla.s16 q3, q5, d1[0]
vmla.s16 q3, q6, d1[1]
vmla.s16 q3, q7, d1[2]
vext.8 q4, q8, q9, #4
vext.8 q6, q8, q9, #8
vext.8 q11, q8, q9, #2
vext.8 q7, q8, q9, #10
vadd.i16 q6, q6, q4
vext.8 q4, q8, q9, #12
vext.8 q5, q8, q9, #6
vadd.i16 q7, q7, q11
vadd.i16 q4, q4, q8
vmul.s16 q10, q5, d0[3]
vmla.s16 q10, q6, d1[0]
vmla.s16 q10, q7, d1[1]
vmla.s16 q10, q4, d1[2]
vext.8 q1, q1, q2, #6
vext.8 q8, q8, q9, #6
vshl.s16 q1, q1, #7
vshl.s16 q8, q8, #7
vext.8 q10, q1, q2, #4
vext.8 q11, q1, q2, #8
vext.8 q9, q1, q2, #2
vext.8 q12, q1, q2, #10
vext.8 q13, q1, q2, #12
vext.8 q8, q1, q2, #6
vadd.i16 q10, q10, q11
vadd.i16 q9, q9, q12
vadd.i16 q13, q13, q1
vshl.s16 q1, q8, #7
vmul.s16 q3, q8, d0[3]
vmla.s16 q3, q10, d1[0]
vmla.s16 q3, q9, d1[1]
vmla.s16 q3, q13, d1[2]
vsub.s16 q1, q1, q14
vsub.s16 q8, q8, q14
vqadd.s16 q3, q3, q1
vqadd.s16 q10, q10, q8
vshr.s16 q3, q3, #3
vshr.s16 q10, q10, #3
vadd.s16 q3, q3, q15
vadd.s16 q10, q10, q15
subs r5, r5, #8
subs r4, r4, #8
vst1.16 {q3}, [r0, :128]!
vst1.16 {q10}, [r12, :128]!
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vmov q1, q2
vmov q8, q9
vld1.8 {d4}, [r2]!
vld1.8 {d18}, [lr]!
tst r5, #2 // LR_HAVE_RIGHT
vmovl.u8 q2, d4
vmovl.u8 q9, d18
bne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
9:
subs r6, r6, #2
ble 0f
// Jump to the next row and loop horizontally
add r0, r0, r10
add r12, r12, r10
add r2, r2, r3
add lr, lr, r3
mov r5, r8
b 1b
0:
vpop {q4-q7}
pop {r4-r11,pc}
pop {r4-r5,pc}
endfunc
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[8], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, int16_t **ptrs,
// const int16_t fv[8], const int w);
function wiener_filter_v_8bpc_neon, export=1
push {r4-r7,lr}
push {r4-r9,lr}
vpush {q4-q6}
ldrd r4, r5, [sp, #68]
ldrd r6, r7, [sp, #76]
mov lr, r4
vld1.16 {q0}, [r5, :128]
// Calculate the number of rows to move back when looping vertically
mov r12, r4
tst r6, #4 // LR_HAVE_TOP
beq 0f
sub r2, r2, r7, lsl #1
add r12, r12, #2
0:
tst r6, #8 // LR_HAVE_BOTTOM
beq 1f
add r12, r12, #2
1: // Start of horizontal loop; start one vertical filter slice.
// Load rows into q8-q11 and pad properly.
tst r6, #4 // LR_HAVE_TOP
vld1.16 {q8}, [r2, :128], r7
beq 2f
// LR_HAVE_TOP
vld1.16 {q10}, [r2, :128], r7
vmov q9, q8
vld1.16 {q11}, [r2, :128], r7
b 3f
2: // !LR_HAVE_TOP
vmov q9, q8
vmov q10, q8
vmov q11, q8
3:
cmp r4, #4
blt 5f
// Start filtering normally; fill in q12-q14 with unique rows.
vld1.16 {q12}, [r2, :128], r7
vld1.16 {q13}, [r2, :128], r7
vld1.16 {q14}, [r2, :128], r7
4:
.macro filter compare
subs r4, r4, #1
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
vadd.i16 q4, q10, q12
vadd.i16 q5, q9, q13
vadd.i16 q6, q8, q14
vmull.s16 q2, d22, d0[3]
vmlal.s16 q2, d8, d1[0]
vmlal.s16 q2, d10, d1[1]
vmlal.s16 q2, d12, d1[2]
vmull.s16 q3, d23, d0[3]
vmlal.s16 q3, d9, d1[0]
vmlal.s16 q3, d11, d1[1]
vmlal.s16 q3, d13, d1[2]
vqrshrun.s32 d4, q2, #11
vqrshrun.s32 d5, q3, #11
vqmovun.s16 d4, q2
vst1.8 {d4}, [r0, :64], r1
.if \compare
cmp r4, #4
.else
ble 9f
.endif
vmov q8, q9
vmov q9, q10
vmov q10, q11
vmov q11, q12
vmov q12, q13
vmov q13, q14
.endm
filter 1
blt 7f
vld1.16 {q14}, [r2, :128], r7
b 4b
5: // Less than 4 rows in total; not all of q12-q13 are filled yet.
tst r6, #8 // LR_HAVE_BOTTOM
beq 6f
// LR_HAVE_BOTTOM
cmp r4, #2
// We load at least 2 rows in all cases.
vld1.16 {q12}, [r2, :128], r7
vld1.16 {q13}, [r2, :128], r7
bgt 53f // 3 rows in total
beq 52f // 2 rows in total
51: // 1 row in total, q11 already loaded, load edge into q12-q14.
vmov q13, q12
b 8f
52: // 2 rows in total, q11 already loaded, load q12 with content data
// and 2 rows of edge.
vld1.16 {q14}, [r2, :128], r7
vmov q15, q14
b 8f
53:
// 3 rows in total, q11 already loaded, load q12 and q13 with content
// and 2 rows of edge.
vld1.16 {q14}, [r2, :128], r7
vld1.16 {q15}, [r2, :128], r7
vmov q1, q15
b 8f
6:
// !LR_HAVE_BOTTOM
cmp r4, #2
bgt 63f // 3 rows in total
beq 62f // 2 rows in total
61: // 1 row in total, q11 already loaded, pad that into q12-q14.
vmov q12, q11
vmov q13, q11
vmov q14, q11
b 8f
62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
vld1.16 {q12}, [r2, :128], r7
vmov q13, q12
vmov q14, q12
vmov q15, q12
b 8f
63:
// 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
vld1.16 {q12}, [r2, :128], r7
vld1.16 {q13}, [r2, :128], r7
vmov q14, q13
vmov q15, q13
vmov q1, q13
b 8f
7:
// All registers up to q13 are filled already, 3 valid rows left.
// < 4 valid rows left; fill in padding and filter the last
// few rows.
tst r6, #8 // LR_HAVE_BOTTOM
beq 71f
// LR_HAVE_BOTTOM; load 2 rows of edge.
vld1.16 {q14}, [r2, :128], r7
vld1.16 {q15}, [r2, :128], r7
vmov q1, q15
b 8f
71:
// !LR_HAVE_BOTTOM, pad 3 rows
vmov q14, q13
vmov q15, q13
vmov q1, q13
8: // At this point, all registers up to q14-15,q1 are loaded with
// edge/padding (depending on how many rows are left).
filter 0 // This branches to 9f when done
vmov q14, q15
vmov q15, q1
b 8b
9: // End of one vertical slice.
subs r3, r3, #8
ble 0f
// Move pointers back up to the top and loop horizontally.
mls r0, r1, lr, r0
mls r2, r7, r12, r2
add r0, r0, #8
add r2, r2, #16
mov r4, lr
b 1b
0:
vpop {q4-q6}
pop {r4-r7,pc}
.purgem filter
endfunc
vld1.16 {q0}, [r2, :128]
#define SUM_STRIDE (384+16)
ldrd r4, r5, [r1]
ldrd r6, r7, [r1, #8]
ldrd r8, r9, [r1, #16]
#include "looprestoration_tmpl.S"
1:
vld1.16 {q1, q2}, [r4, :128]!
vld1.16 {q8, q9}, [r9, :128]!
// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
ldrd r6, r7, [sp, #108]
add r5, r5, #2 // w += 2
vld1.16 {q5, q6}, [r5, :128]!
vld1.16 {q10, q11}, [r6, :128]!
vld1.16 {q12, q13}, [r8, :128]!
vld1.16 {q14, q15}, [r7, :128]!
subs r3, r3, #16
vadd.i16 q1, q1, q8
vadd.i16 q2, q2, q9
vadd.i16 q5, q5, q8
vadd.i16 q6, q6, q9
vadd.i16 q10, q10, q12
vadd.i16 q11, q11, q13
vmull.s16 q3, d28, d0[3]
vmlal.s16 q3, d2, d0[0]
vmlal.s16 q3, d10, d0[1]
vmlal.s16 q3, d20, d0[2]
// Set up pointers for reading/writing alternate rows
add r10, r0, #(4*SUM_STRIDE) // sumsq
add r11, r1, #(2*SUM_STRIDE) // sum
add r12, r3, r4 // src
lsl r4, r4, #1
mov r9, #(2*2*SUM_STRIDE) // double sum stride
vmull.s16 q4, d29, d0[3]
vmlal.s16 q4, d3, d0[0]
vmlal.s16 q4, d11, d0[1]
vmlal.s16 q4, d21, d0[2]
// Subtract the aligned width from the output stride.
add lr, r5, #7
bic lr, lr, #7
sub r9, r9, lr, lsl #1
vmull.s16 q8, d30, d0[3]
vmlal.s16 q8, d4, d0[0]
vmlal.s16 q8, d12, d0[1]
vmlal.s16 q8, d22, d0[2]
// Store the width for the vertical loop
mov r8, r5
vmull.s16 q9, d31, d0[3]
vmlal.s16 q9, d5, d0[0]
vmlal.s16 q9, d13, d0[1]
vmlal.s16 q9, d23, d0[2]
// Subtract the number of pixels read from the input from the stride
add lr, lr, #8
sub r4, r4, lr
vqrshrun.s32 d6, q3, #11
vqrshrun.s32 d7, q4, #11
vqrshrun.s32 d16, q8, #11
vqrshrun.s32 d17, q9, #11
vqmovun.s16 d6, q3
vqmovun.s16 d7, q8
vst1.8 {q3}, [r0, :128]!
bgt 1b
// Shift the pointers, but only update the first 5; the 6th pointer is
// kept as it was before (and the 7th is implicitly identical to the
// 6th).
ldrd r4, r5, [r1, #4]
ldrd r6, r7, [r1, #12]
ldr r8, [r1, #20]
strd r4, r5, [r1]
strd r6, r7, [r1, #8]
str r8, [r1, #16]
vpop {q4-q6}
pop {r4-r9,pc}
endfunc
// void dav1d_wiener_filter_hv_8bpc_neon(pixel *dst, const pixel (*left)[4],
// const pixel *src,
// const int16_t filter[2][8],
// const int w,
// const enum LrEdgeFlags edges,
// int16_t **ptrs);
function wiener_filter_hv_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
ldr lr, [sp, #108]
vld1.16 {q0, q1}, [r3, :128]
movw r12, #(1 << 14) - (1 << 2)
vdup.16 q14, r12
vmov.s16 q15, #2048
ldrd r6, r7, [lr]
ldrd r8, r9, [lr, #8]
ldrd r10, r11, [lr, #16]
ldr r12, [lr, #24]
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst r7, #1 // LR_HAVE_LEFT
beq 2f
tst r5, #1 // LR_HAVE_LEFT
beq 1f
// LR_HAVE_LEFT
cmp r2, #0
cmp r1, #0
bne 0f
// left == NULL
sub r3, r3, #2
sub r12, r12, #2
b 1f
0: // LR_HAVE_LEFT, left != NULL
2: // !LR_HAVE_LEFT, increase the stride.
// For this case we don't read the left 2 pixels from the src pointer,
// but shift it as if we had done that.
add r4, r4, #2
sub r2, r2, #3
vld1.8 {q2}, [r2]!
b 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.8 {q2}, [r2]!
vld1.32 {d3[1]}, [r1]
// Move r2 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub r2, r2, #3
vext.8 q2, q1, q2, #13
b 2f
1: // Loop vertically
vld1.8 {q0}, [r3]!
vld1.8 {q4}, [r12]!
1:
vld1.8 {q2}, [r2]!
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q2 to have 3x the first byte at the front.
vdup.8 q3, d4[0]
// Move r2 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub r2, r2, #3
vext.8 q2, q3, q2, #13
tst r7, #1 // LR_HAVE_LEFT
beq 0f
2:
vmovl.u8 q3, d5
vmovl.u8 q2, d4
tst r5, #2 // LR_HAVE_RIGHT
bne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r4, #11
bge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// The padding pixel is q1/2.h[w+2]. r2 points at the next input, ie
// q1/2.h[16]. Thus read from r2[w-14] to find the padding pixel.
sub lr, r4, #14
// Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel_local r3, right_ext_mask, -6
ldrb lr, [r2, lr]
sub r3, r3, r4, lsl #1
vdup.16 q13, lr
vld1.8 {q10, q11}, [r3]
vbit q2, q13, q10
vbit q3, q13, q11
4: // Loop horizontally
vext.8 q10, q2, q3, #4
vext.8 q11, q2, q3, #8
vext.8 q9, q2, q3, #2
vext.8 q12, q2, q3, #10
vext.8 q13, q2, q3, #12
vext.8 q8, q2, q3, #6
vadd.i16 q10, q10, q11
vadd.i16 q9, q9, q12
vadd.i16 q13, q13, q2
vld1.16 {q6}, [r7, :128]!
vshl.s16 q2, q8, #7
vld1.16 {q11}, [r11, :128]!
vsub.s16 q2, q2, q14
vld1.16 {q7}, [r8, :128]!
vmul.s16 q4, q8, d0[3]
vmla.s16 q4, q10, d1[0]
vmla.s16 q4, q9, d1[1]
vmla.s16 q4, q13, d1[2]
vld1.16 {q10}, [r10, :128]!
vqadd.s16 q4, q4, q2
vld1.16 {q9}, [r9, :128]!
vshr.s16 q4, q4, #3
vld1.16 {q5}, [r6, :128]!
vadd.s16 q4, q4, q15
vadd.s16 q6, q6, q11
vadd.s16 q7, q7, q10
vadd.s16 q5, q5, q4
vmull.s16 q8, d18, d2[3]
vmlal.s16 q8, d12, d2[1]
vmlal.s16 q8, d14, d2[2]
vmlal.s16 q8, d10, d2[0]
vmull.s16 q9, d19, d2[3]
vmlal.s16 q9, d13, d2[1]
vmlal.s16 q9, d15, d2[2]
vmlal.s16 q9, d11, d2[0]
vqrshrun.s32 d16, q8, #11
vqrshrun.s32 d17, q9, #11
vst1.16 {q4}, [r12, :128]!
vqmovun.s16 d16, q8
subs r4, r4, #8
vst1.8 {d16}, [r0, :64]!
ble 9f
vmov q2, q3
vld1.8 {d6}, [r2]!
tst r5, #2 // LR_HAVE_RIGHT
vmovl.u8 q3, d6
bne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
9:
// Reload ptrs from arguments on the stack
ldr lr, [sp, #108]
// Rotate the window of pointers. Shift the 6 pointers downwards one step.
ldrd r6, r7, [lr, #4]
ldrd r8, r9, [lr, #12]
ldrd r10, r11, [lr, #20]
strd r6, r7, [lr]
strd r8, r9, [lr, #8]
strd r10, r11, [lr, #16]
// The topmost pointer, ptrs[6], which isn't used as input, is set to
// ptrs[0], which will be used as output for the next _hv call.
// At the start of the filtering, the caller may set ptrs[6] to the
// right next buffer to fill in, instead.
str r6, [lr, #24]
vpop {q4-q7}
pop {r4-r11,pc}
endfunc
#include "looprestoration_tmpl.S"
// void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const int w,
// const enum LrEdgeFlags edges);
function sgr_box3_row_h_8bpc_neon, export=1
push {r4-r5,lr}
ldrd r4, r5, [sp, #12]
add r4, r4, #2 // w += 2
tst r5, #1 // LR_HAVE_LEFT
beq 1f
cmp r2, #0
beq 2f
bne 0f
// LR_HAVE_LEFT && left == NULL
sub r3, r3, #2
vld1.8 {q0}, [r3]!
b 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.32 {d3[]}, [r2]!
// Move r3/r12 back to account for the last 2 bytes we loaded earlier,
vld1.8 {q0}, [r3]!
vld1.32 {d3[]}, [r2]
// Move r3 back to account for the last 2 bytes we loaded earlier,
// which we'll shift out.
sub r3, r3, #2
sub r12, r12, #2
vld1.32 {d11[]}, [r2]!
vext.8 q0, q1, q0, #14
vext.8 q4, q5, q4, #14
b 2f
0:
1:
vld1.8 {q0}, [r3]!
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q0 to have 2x the first byte at the front.
vdup.8 q1, d0[0]
vdup.8 q5, d8[0]
// Move r3 back to account for the last 2 bytes we loaded before,
// which we shifted out.
sub r3, r3, #2
sub r12, r12, #2
vext.8 q0, q1, q0, #14
vext.8 q4, q5, q4, #14
2:
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
vmull.u8 q5, d8, d8
vmull.u8 q6, d9, d9
tst r7, #2 // LR_HAVE_RIGHT
tst r5, #2 // LR_HAVE_RIGHT
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub lr, r5, #(2 + 16 - 2 + 1)
ldrb r11, [r3, lr]
ldrb lr, [r12, lr]
// Fill q14/q15 with the right padding pixel
vdup.8 q14, r11
vdup.8 q15, lr
// Restore r11 after using it for a temporary value
add r11, r1, #(2*SUM_STRIDE)
sub lr, r4, #(2 + 16 - 2 + 1)
ldrb lr, [r3, lr]
// Fill q14 with the right padding pixel
vdup.8 q14, lr
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r5, #10
cmp r4, #10
bge 4f // If w >= 10, all used input pixels are valid
// 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called
// again; it's not strictly needed in those cases (we pad enough here),
// but keeping the code as simple as possible.
// Insert padding in q0/4.b[w] onwards
// Insert padding in q0.b[w] onwards
movrel_local lr, right_ext_mask
sub lr, lr, r5
sub lr, lr, r4
vld1.8 {q13}, [lr]
vbit q0, q14, q13
vbit q4, q15, q13
// Update the precalculated squares
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
vmull.u8 q5, d8, d8
vmull.u8 q6, d9, d9
4: // Loop horizontally
vext.8 d16, d0, d1, #1
vext.8 d17, d0, d1, #2
vext.8 d18, d8, d9, #1
vext.8 d19, d8, d9, #2
vaddl.u8 q3, d0, d16
vext.8 q9, q1, q2, #2
vaddw.u8 q3, q3, d17
vaddl.u8 q7, d8, d18
vaddw.u8 q7, q7, d19
vext.8 q8, q1, q2, #2
vext.8 q9, q1, q2, #4
vext.8 q10, q5, q6, #2
vext.8 q11, q5, q6, #4
vext.8 q10, q1, q2, #4
vaddl.u16 q12, d2, d16
vaddl.u16 q13, d3, d17
vaddw.u16 q12, q12, d18
vaddw.u16 q13, q13, d19
vaddl.u16 q12, d2, d18
vaddl.u16 q13, d3, d19
vaddw.u16 q12, q12, d20
vaddw.u16 q13, q13, d21
vaddl.u16 q8, d10, d20
vaddl.u16 q9, d11, d21
vaddw.u16 q8, q8, d22
vaddw.u16 q9, q9, d23
subs r5, r5, #8
subs r4, r4, #8
vst1.16 {q3}, [r1, :128]!
vst1.16 {q7}, [r11, :128]!
vst1.32 {q12, q13}, [r0, :128]!
vst1.32 {q8, q9}, [r10, :128]!
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
tst r5, #2 // LR_HAVE_RIGHT
vld1.8 {d6}, [r3]!
vld1.8 {d14}, [r12]!
vmov q1, q2
vmov q5, q6
vext.8 q0, q0, q3, #8
vext.8 q4, q4, q7, #8
vmull.u8 q2, d6, d6
vmull.u8 q6, d14, d14
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
9:
subs r6, r6, #2
ble 0f
// Jump to the next row and loop horizontally
add r0, r0, r9, lsl #1
add r10, r10, r9, lsl #1
add r1, r1, r9
add r11, r11, r9
add r3, r3, r4
add r12, r12, r4
mov r5, r8
b 1b
0:
vpop {q4-q7}
pop {r4-r11,pc}
pop {r4-r5,pc}
endfunc
// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
ldrd r6, r7, [sp, #108]
add r5, r5, #2 // w += 2
// Set up pointers for reading/writing alternate rows
add r10, r0, #(4*SUM_STRIDE) // sumsq
add r11, r1, #(2*SUM_STRIDE) // sum
add r12, r3, r4 // src
lsl r4, r4, #1
mov r9, #(2*2*SUM_STRIDE) // double sum stride
// Subtract the aligned width from the output stride.
add lr, r5, #7
bic lr, lr, #7
sub r9, r9, lr, lsl #1
add lr, lr, #8
sub r4, r4, lr
// Store the width for the vertical loop
mov r8, r5
// void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const int w,
// const enum LrEdgeFlags edges);
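// Same idea as the box3 row helper above, but over five taps; roughly:
//     sum[x]   = s[x] + s[x+1] + s[x+2] + s[x+3] + s[x+4]
//     sumsq[x] = the corresponding sum of squares
// with the left edge extended by 3 pixels instead of 2.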
function sgr_box5_row_h_8bpc_neon, export=1
push {r4-r5,lr}
ldrd r4, r5, [sp, #12]
add r4, r4, #2 // w += 2
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst r7, #1 // LR_HAVE_LEFT
beq 2f
// LR_HAVE_LEFT
tst r5, #1 // LR_HAVE_LEFT
beq 1f
cmp r2, #0
bne 0f
// left == NULL
// LR_HAVE_LEFT && left == NULL
sub r3, r3, #3
sub r12, r12, #3
b 1f
0: // LR_HAVE_LEFT, left != NULL
2: // !LR_HAVE_LEFT, increase the stride.
// For this case we don't read the left 3 pixels from the src pointer,
// but shift it as if we had done that.
add r4, r4, #3
1: // Loop vertically
vld1.8 {q0}, [r3]!
vld1.8 {q4}, [r12]!
b 2f
tst r7, #1 // LR_HAVE_LEFT
beq 0f
cmp r2, #0
beq 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.32 {d3[]}, [r2]!
// Move r3/r12 back to account for the last 3 bytes we loaded earlier,
vld1.8 {q0}, [r3]!
vld1.32 {d3[]}, [r2]
// Move r3 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub r3, r3, #3
sub r12, r12, #3
vld1.32 {d11[]}, [r2]!
vext.8 q0, q1, q0, #13
vext.8 q4, q5, q4, #13
b 2f
0:
1:
vld1.8 {q0}, [r3]!
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q0 to have 3x the first byte at the front.
vdup.8 q1, d0[0]
vdup.8 q5, d8[0]
// Move r3 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub r3, r3, #3
sub r12, r12, #3
vext.8 q0, q1, q0, #13
vext.8 q4, q5, q4, #13
2:
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
vmull.u8 q5, d8, d8
vmull.u8 q6, d9, d9
tst r7, #2 // LR_HAVE_RIGHT
tst r5, #2 // LR_HAVE_RIGHT
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub lr, r5, #(2 + 16 - 3 + 1)
ldrb r11, [r3, lr]
ldrb lr, [r12, lr]
// Fill q14/q15 with the right padding pixel
vdup.8 q14, r11
vdup.8 q15, lr
// Restore r11 after using it for a temporary value
add r11, r1, #(2*SUM_STRIDE)
sub lr, r4, #(2 + 16 - 3 + 1)
ldrb lr, [r3, lr]
// Fill q14 with the right padding pixel
vdup.8 q14, lr
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r5, #11
cmp r4, #11
bge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in q0/4.b[w+1] onwards; fuse the +1 into the
// Insert padding in q0.b[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel_local lr, right_ext_mask, -1
sub lr, lr, r5
sub lr, lr, r4
vld1.8 {q13}, [lr]
vbit q0, q14, q13
vbit q4, q15, q13
// Update the precalculated squares
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
vmull.u8 q5, d8, d8
vmull.u8 q6, d9, d9
4: // Loop horizontally
vext.8 d16, d0, d1, #1
vext.8 d17, d0, d1, #2
vext.8 d18, d0, d1, #3
vext.8 d19, d0, d1, #4
vext.8 d20, d8, d9, #1
vext.8 d21, d8, d9, #2
vext.8 d22, d8, d9, #3
vext.8 d23, d8, d9, #4
vaddl.u8 q3, d0, d16
vaddl.u8 q12, d17, d18
vaddl.u8 q7, d8, d20
vaddl.u8 q13, d21, d22
vaddw.u8 q3, q3, d19
vaddw.u8 q7, q7, d23
vadd.u16 q3, q3, q12
vadd.u16 q7, q7, q13
vext.8 q8, q1, q2, #2
vext.8 q9, q1, q2, #4
@@ -739,53 +588,142 @@ function sgr_box5_h_8bpc_neon, export=1
vaddw.u16 q13, q13, d23
vadd.i32 q12, q12, q8
vadd.i32 q13, q13, q9
vext.8 q8, q5, q6, #2
vext.8 q9, q5, q6, #4
vext.8 q10, q5, q6, #6
vext.8 q11, q5, q6, #8
vaddl.u16 q1, d10, d16
vaddl.u16 q5, d11, d17
vaddl.u16 q8, d18, d20
vaddl.u16 q9, d19, d21
vaddw.u16 q1, q1, d22
vaddw.u16 q5, q5, d23
vadd.i32 q10, q1, q8
vadd.i32 q11, q5, q9
subs r5, r5, #8
subs r4, r4, #8
vst1.16 {q3}, [r1, :128]!
vst1.16 {q7}, [r11, :128]!
vst1.32 {q12, q13}, [r0, :128]!
vst1.32 {q10, q11}, [r10, :128]!
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
tst r5, #2 // LR_HAVE_RIGHT
vld1.8 {d6}, [r3]!
vld1.8 {d14}, [r12]!
vmov q1, q2
vmov q5, q6
vext.8 q0, q0, q3, #8
vext.8 q4, q4, q7, #8
vmull.u8 q2, d6, d6
vmull.u8 q6, d14, d14
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
9:
subs r6, r6, #2
ble 0f
// Jump to the next row and loop horizontally
add r0, r0, r9, lsl #1
add r10, r10, r9, lsl #1
add r1, r1, r9
add r11, r11, r9
add r3, r3, r4
add r12, r12, r4
mov r5, r8
b 1b
pop {r4-r5,pc}
endfunc
// void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3,
// int32_t *sumsq5, int16_t *sum5,
// const pixel (*left)[4],
// const pixel *src, const int w,
// const enum LrEdgeFlags edges);
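// Combined helper producing both the box3 and the box5 row sums in a single
// pass over the source; a sketch of the per-pixel outputs:
//     sum3[x] = s[x+1] + s[x+2] + s[x+3]
//     sum5[x] = sum3[x] + s[x] + s[x+4]
// (likewise for the squared sums), i.e. the box3 window is the centered
// subset of the box5 window.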
function sgr_box35_row_h_8bpc_neon, export=1
push {r4-r7,lr}
ldrd r4, r5, [sp, #20]
ldrd r6, r7, [sp, #28]
add r6, r6, #2 // w += 2
tst r7, #1 // LR_HAVE_LEFT
beq 1f
cmp r4, #0
bne 0f
// LR_HAVE_LEFT && left == NULL
sub r5, r5, #3
vld1.8 {q0}, [r5]!
b 2f
0:
vpop {q4-q7}
pop {r4-r11,pc}
// LR_HAVE_LEFT, left != NULL
vld1.8 {q0}, [r5]!
vld1.32 {d3[]}, [r4]
// Move r5 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub r5, r5, #3
vext.8 q0, q1, q0, #13
b 2f
1:
vld1.8 {q0}, [r5]!
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q0 to have 3x the first byte at the front.
vdup.8 q1, d0[0]
// Move r5 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub r5, r5, #3
vext.8 q0, q1, q0, #13
2:
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
tst r7, #2 // LR_HAVE_RIGHT
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub lr, r6, #(2 + 16 - 3 + 1)
ldrb lr, [r5, lr]
// Fill q14 with the right padding pixel
vdup.8 q14, lr
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r6, #11
bge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in q0.b[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel_local lr, right_ext_mask, -1
sub lr, lr, r6
vld1.8 {q13}, [lr]
vbit q0, q14, q13
// Update the precalculated squares
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
4: // Loop horizontally
vext.8 d16, d0, d1, #1
vext.8 d17, d0, d1, #2
vext.8 d18, d0, d1, #3
vext.8 d19, d0, d1, #4
vaddl.u8 q3, d16, d17
vaddl.u8 q12, d0, d19
vaddw.u8 q3, q3, d18
vext.8 q8, q1, q2, #2
vext.8 q9, q1, q2, #4
vext.8 q10, q1, q2, #6
vext.8 q11, q1, q2, #8
vst1.16 {q3}, [r1, :128]!
vadd.u16 q3, q3, q12
vaddl.u16 q12, d16, d18
vaddl.u16 q13, d17, d19
vaddl.u16 q8, d2, d22
vaddl.u16 q9, d3, d23
vaddw.u16 q12, q12, d20
vaddw.u16 q13, q13, d21
vst1.32 {q12, q13}, [r0, :128]!
vadd.i32 q12, q12, q8
vadd.i32 q13, q13, q9
subs r6, r6, #8
vst1.16 {q3}, [r3, :128]!
vst1.32 {q12, q13}, [r2, :128]!
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vld1.8 {d6}, [r5]!
vmov q1, q2
vext.8 q0, q0, q3, #8
vmull.u8 q2, d6, d6
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
9:
pop {r4-r7,pc}
endfunc
sgr_funcs 8
@@ -41,761 +41,717 @@ right_ext_mask:
endconst
// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges,
// const pixel *src, const int16_t fh[8],
// const int w,
// enum LrEdgeFlags edges,
// const int bitdepth_max);
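// A sketch of the intermediate rows this pass produces (inferred from the
// constants set up at the top of the function, not an exact lane-by-lane
// description):
//     mid[x] = clip((sum + (1 << (bitdepth + 6)) + (1 << (round_bits_h - 1)))
//                       >> round_bits_h, 0, 0x7fff) - 8192
// with round_bits_h = 3 for 10 bpc and 5 for 12 bpc; the 8192-biased int16
// rows are what the vertical pass consumes.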
function wiener_filter_h_16bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
ldrd r6, r7, [sp, #108]
ldr r8, [sp, #116] // bitdepth_max
vld1.16 {q0}, [r4, :128]
clz r8, r8
push {r4-r6,lr}
ldrd r4, r5, [sp, #16]
ldr r6, [sp, #24] // bitdepth_max
vld1.16 {q0}, [r3, :128]
clz r6, r6
vmov.i32 q14, #1
sub r9, r8, #38 // -(bitdepth + 6)
sub r8, r8, #25 // -round_bits_h
neg r9, r9 // bitdepth + 6
vdup.32 q1, r9
vdup.32 q13, r8 // -round_bits_h
sub r12, r6, #38 // -(bitdepth + 6)
sub r6, r6, #25 // -round_bits_h
neg r12, r12 // bitdepth + 6
vdup.32 q1, r12
vdup.32 q13, r6 // -round_bits_h
vmov.i16 q15, #8192
vshl.u32 q14, q14, q1 // 1 << (bitdepth + 6)
mov r8, r5
// Calculate mid_stride
add r10, r5, #7
bic r10, r10, #7
lsl r10, r10, #1
// Set up pointers for reading/writing alternate rows
add r12, r0, r10
lsl r10, r10, #1
add lr, r2, r3
lsl r3, r3, #1
// Subtract the aligned width from mid_stride
add r11, r5, #7
bic r11, r11, #7
sub r10, r10, r11, lsl #1
// Subtract the number of pixels read from the source stride
add r11, r11, #8
sub r3, r3, r11, lsl #1
vmvn.i16 q12, #0x8000 // 0x7fff = (1 << 15) - 1
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst r7, #1 // LR_HAVE_LEFT
beq 2f
tst r5, #1 // LR_HAVE_LEFT
beq 1f
// LR_HAVE_LEFT
cmp r1, #0
bne 0f
// left == NULL
sub r2, r2, #6
sub lr, lr, #6
b 1f
0: // LR_HAVE_LEFT, left != NULL
2: // !LR_HAVE_LEFT, increase the stride.
// For this case we don't read the left 3 pixels from the src pointer,
// but shift it as if we had done that.
add r3, r3, #6
1: // Loop vertically
vld1.16 {q2, q3}, [r2]!
vld1.16 {q4, q5}, [lr]!
b 2f
tst r7, #1 // LR_HAVE_LEFT
beq 0f
cmp r1, #0
beq 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.16 {q2, q3}, [r2]!
vld1.16 {d3}, [r1]!
// Move r2/lr back to account for the last 3 pixels we loaded earlier,
// Move r2 back to account for the last 3 pixels we loaded earlier,
// which we'll shift out.
sub r2, r2, #6
sub lr, lr, #6
vld1.16 {d13}, [r1]!
vext.8 q3, q2, q3, #10
vext.8 q2, q1, q2, #10
vext.8 q5, q4, q5, #10
vext.8 q4, q6, q4, #10
b 2f
0:
1:
vld1.16 {q2, q3}, [r2]!
// !LR_HAVE_LEFT, fill q1 with the leftmost pixel
// and shift q2/q3 to have 3x the first pixel at the front.
vdup.16 q1, d4[0]
vdup.16 q6, d8[0]
// Move r2 back to account for the last 3 pixels we loaded before,
// which we shifted out.
sub r2, r2, #6
sub lr, lr, #6
vext.8 q3, q2, q3, #10
vext.8 q2, q1, q2, #10
vext.8 q5, q4, q5, #10
vext.8 q4, q6, q4, #10
2:
tst r7, #2 // LR_HAVE_RIGHT
tst r5, #2 // LR_HAVE_RIGHT
bne 4f
// If we'll need to pad the right edge, load that pixel to pad with
// here since we can find it pretty easily from here.
sub r9, r5, #14
lsl r9, r9, #1
ldrh r11, [r2, r9]
ldrh r9, [lr, r9]
// Fill q11/q12 with the right padding pixel
vdup.16 q11, r11
vdup.16 q12, r9
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r5, #11
cmp r4, #11
bge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// The padding pixel is q2/3.h[w+2]. r2 points at the next input, ie
// q2/3.h[16]. Thus read from r2[w-14] to find the padding pixel.
sub r12, r4, #14
lsl r12, r12, #1
// Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel_local r4, right_ext_mask, -6
sub r4, r4, r5, lsl #1
vld1.8 {q9, q10}, [r4]
movrel_local r3, right_ext_mask, -6
ldrh r12, [r2, r12]
sub r3, r3, r4, lsl #1
vdup.16 q11, r12
vld1.8 {q9, q10}, [r3]
vbit q2, q11, q9
vbit q3, q11, q10
vbit q4, q12, q9
vbit q5, q12, q10
4: // Loop horizontally
vext.8 q7, q2, q3, #4
vext.8 q8, q2, q3, #8
vext.8 q6, q2, q3, #2
vext.8 q9, q2, q3, #10
vadd.i16 q8, q8, q7
vadd.i16 q9, q9, q6
vext.8 q6, q2, q3, #12
vext.8 q7, q2, q3, #6
vadd.i16 q2, q2, q6
vmull.s16 q6, d14, d0[3]
vmlal.s16 q6, d16, d1[0]
vmlal.s16 q6, d18, d1[1]
vmlal.s16 q6, d4, d1[2]
vmull.s16 q7, d15, d0[3]
vmlal.s16 q7, d17, d1[0]
vmlal.s16 q7, d19, d1[1]
vmlal.s16 q7, d5, d1[2]
vext.8 q8, q4, q5, #4
vext.8 q10, q4, q5, #8
vext.8 q9, q4, q5, #2
vext.8 q2, q4, q5, #10
vadd.i16 q10, q10, q8
vadd.i16 q2, q2, q9
vext.8 q8, q4, q5, #12
vext.8 q9, q4, q5, #6
vadd.i16 q4, q4, q8
vext.8 q9, q2, q3, #4
vext.8 q10, q2, q3, #8
vext.8 q8, q2, q3, #2
vext.8 q11, q2, q3, #10
vadd.i16 q10, q10, q9
vadd.i16 q11, q11, q8
vext.8 q8, q2, q3, #12
vext.8 q9, q2, q3, #6
vadd.i16 q2, q2, q8
vmull.s16 q8, d18, d0[3]
vmlal.s16 q8, d20, d1[0]
vmlal.s16 q8, d4, d1[1]
vmlal.s16 q8, d8, d1[2]
vmlal.s16 q8, d22, d1[1]
vmlal.s16 q8, d4, d1[2]
vmull.s16 q9, d19, d0[3]
vmlal.s16 q9, d21, d1[0]
vmlal.s16 q9, d5, d1[1]
vmlal.s16 q9, d9, d1[2]
vmlal.s16 q9, d23, d1[1]
vmlal.s16 q9, d5, d1[2]
vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
vadd.i32 q6, q6, q14
vadd.i32 q7, q7, q14
vadd.i32 q8, q8, q14
vadd.i32 q9, q9, q14
vrshl.s32 q6, q6, q13
vrshl.s32 q7, q7, q13
vrshl.s32 q8, q8, q13
vrshl.s32 q9, q9, q13
vqmovun.s32 d12, q6
vqmovun.s32 d13, q7
vqmovun.s32 d14, q8
vqmovun.s32 d15, q9
vmin.u16 q6, q6, q10
vmin.u16 q7, q7, q10
vsub.i16 q6, q6, q15
vsub.i16 q7, q7, q15
subs r5, r5, #8
vst1.16 {q6}, [r0, :128]!
vst1.16 {q7}, [r12, :128]!
vqmovun.s32 d16, q8
vqmovun.s32 d17, q9
vmin.u16 q8, q8, q12
vsub.i16 q8, q8, q15
subs r4, r4, #8
vst1.16 {q8}, [r0, :128]!
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vmov q2, q3
vmov q4, q5
tst r5, #2 // LR_HAVE_RIGHT
vld1.16 {q3}, [r2]!
vld1.16 {q5}, [lr]!
bne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
9:
subs r6, r6, #2
ble 0f
// Jump to the next row and loop horizontally
add r0, r0, r10
add r12, r12, r10
add r2, r2, r3
add lr, lr, r3
mov r5, r8
b 1b
0:
vpop {q4-q7}
pop {r4-r11,pc}
pop {r4-r6,pc}
endfunc
// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride, const int bitdepth_max);
// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, int16_t **ptrs,
// const int16_t fv[8], const int w,
// const int bitdepth_max);
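// A hedged C-style sketch of one output pixel in this layout (rows[0..5] are
// the six pointers loaded from ptrs[] below; the seventh tap reuses the last
// row, as the pointer-shift comment at the end of the function notes):
//     int32_t sum = 0;
//     for (int t = 0; t < 6; t++)
//         sum += fv[t] * rows[t][x];
//     sum += fv[6] * rows[5][x];
//     dst[x] = clip(round_shift(sum, round_bits_v), 0, bitdepth_max);
// where round_shift()/clip() are shorthand, not real helpers.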
function wiener_filter_v_16bpc_neon, export=1
push {r4-r7,lr}
vpush {q4-q5}
ldrd r4, r5, [sp, #52]
ldrd r6, r7, [sp, #60]
ldr lr, [sp, #68] // bitdepth_max
vld1.16 {q0}, [r5, :128]
vdup.16 q5, lr
push {r4-r9,lr}
vpush {q4-q7}
ldr lr, [sp, #92] // bitdepth_max
vld1.16 {q0}, [r2, :128]
vdup.16 q2, lr
clz lr, lr
sub lr, lr, #11 // round_bits_v
vdup.32 q4, lr
mov lr, r4
vneg.s32 q4, q4 // -round_bits_v
// Calculate the number of rows to move back when looping vertically
mov r12, r4
tst r6, #4 // LR_HAVE_TOP
beq 0f
sub r2, r2, r7, lsl #1
add r12, r12, #2
0:
tst r6, #8 // LR_HAVE_BOTTOM
beq 1f
add r12, r12, #2
1: // Start of horizontal loop; start one vertical filter slice.
// Load rows into q8-q11 and pad properly.
tst r6, #4 // LR_HAVE_TOP
vld1.16 {q8}, [r2, :128], r7
beq 2f
// LR_HAVE_TOP
vld1.16 {q10}, [r2, :128], r7
vmov q9, q8
vld1.16 {q11}, [r2, :128], r7
b 3f
2: // !LR_HAVE_TOP
vmov q9, q8
vmov q10, q8
vmov q11, q8
3:
cmp r4, #4
blt 5f
// Start filtering normally; fill in q12-q14 with unique rows.
vld1.16 {q12}, [r2, :128], r7
vld1.16 {q13}, [r2, :128], r7
vld1.16 {q14}, [r2, :128], r7
4:
.macro filter compare
subs r4, r4, #1
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
vmull.s16 q2, d16, d0[0]
vmlal.s16 q2, d18, d0[1]
vmlal.s16 q2, d20, d0[2]
vmlal.s16 q2, d22, d0[3]
vmlal.s16 q2, d24, d1[0]
vmlal.s16 q2, d26, d1[1]
vmlal.s16 q2, d28, d1[2]
vmull.s16 q3, d17, d0[0]
vmlal.s16 q3, d19, d0[1]
vmlal.s16 q3, d21, d0[2]
vmlal.s16 q3, d23, d0[3]
vmlal.s16 q3, d25, d1[0]
vmlal.s16 q3, d27, d1[1]
vmlal.s16 q3, d29, d1[2]
vrshl.s32 q2, q2, q4 // round_bits_v
vrshl.s32 q3, q3, q4
vqmovun.s32 d4, q2
vqmovun.s32 d5, q3
vmin.u16 q2, q2, q5 // bitdepth_max
vst1.16 {q2}, [r0, :128], r1
.if \compare
cmp r4, #4
.else
ble 9f
.endif
vmov q8, q9
vmov q9, q10
vmov q10, q11
vmov q11, q12
vmov q12, q13
vmov q13, q14
.endm
filter 1
blt 7f
vld1.16 {q14}, [r2, :128], r7
b 4b
5: // Less than 4 rows in total; not all of q12-q13 are filled yet.
tst r6, #8 // LR_HAVE_BOTTOM
beq 6f
// LR_HAVE_BOTTOM
cmp r4, #2
// We load at least 2 rows in all cases.
vld1.16 {q12}, [r2, :128], r7
vld1.16 {q13}, [r2, :128], r7
bgt 53f // 3 rows in total
beq 52f // 2 rows in total
51: // 1 row in total, q11 already loaded, load edge into q12-q14.
vmov q13, q12
b 8f
52: // 2 rows in total, q11 already loaded, load q12 with content data
// and 2 rows of edge.
vld1.16 {q14}, [r2, :128], r7
vmov q15, q14
b 8f
53:
// 3 rows in total, q11 already loaded, load q12 and q13 with content
// and 2 rows of edge.
vld1.16 {q14}, [r2, :128], r7
vld1.16 {q15}, [r2, :128], r7
vmov q1, q15
b 8f
6:
// !LR_HAVE_BOTTOM
cmp r4, #2
bgt 63f // 3 rows in total
beq 62f // 2 rows in total
61: // 1 row in total, q11 already loaded, pad that into q12-q14.
vmov q12, q11
vmov q13, q11
vmov q14, q11
b 8f
62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
vld1.16 {q12}, [r2, :128], r7
vmov q13, q12
vmov q14, q12
vmov q15, q12
b 8f
63:
// 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
vld1.16 {q12}, [r2, :128], r7
vld1.16 {q13}, [r2, :128], r7
vmov q14, q13
vmov q15, q13
vmov q1, q13
b 8f
7:
// All registers up to q13 are filled already, 3 valid rows left.
// < 4 valid rows left; fill in padding and filter the last
// few rows.
tst r6, #8 // LR_HAVE_BOTTOM
beq 71f
// LR_HAVE_BOTTOM; load 2 rows of edge.
vld1.16 {q14}, [r2, :128], r7
vld1.16 {q15}, [r2, :128], r7
vmov q1, q15
b 8f
71:
// !LR_HAVE_BOTTOM, pad 3 rows
vmov q14, q13
vmov q15, q13
vmov q1, q13
8: // At this point, all registers up to q14-q15,q1 are loaded with
// edge/padding (depending on how many rows are left).
filter 0 // This branches to 9f when done
vmov q14, q15
vmov q15, q1
b 8b
9: // End of one vertical slice.
subs r3, r3, #8
ble 0f
// Move pointers back up to the top and loop horizontally.
mls r0, r1, lr, r0
mls r2, r7, r12, r2
add r0, r0, #16
add r2, r2, #16
mov r4, lr
b 1b
0:
vpop {q4-q5}
pop {r4-r7,pc}
.purgem filter
endfunc
#define SUM_STRIDE (384+16)
vdup.32 q1, lr
ldrd r4, r5, [r1]
ldrd r6, r7, [r1, #8]
ldrd r8, r9, [r1, #16]
vneg.s32 q1, q1 // -round_bits_v
1:
vld1.16 {q4, q5}, [r4, :128]!
vld1.16 {q6, q7}, [r5, :128]!
vld1.16 {q8, q9}, [r6, :128]!
vld1.16 {q10, q11}, [r7, :128]!
vld1.16 {q12, q13}, [r8, :128]!
vld1.16 {q14, q15}, [r9, :128]!
subs r3, r3, #16
vmull.s16 q3, d8, d0[0]
vmlal.s16 q3, d12, d0[1]
vmlal.s16 q3, d16, d0[2]
vmlal.s16 q3, d20, d0[3]
vmlal.s16 q3, d24, d1[0]
vmlal.s16 q3, d28, d1[1]
vmlal.s16 q3, d28, d1[2]
vmull.s16 q4, d9, d0[0]
vmlal.s16 q4, d13, d0[1]
vmlal.s16 q4, d17, d0[2]
vmlal.s16 q4, d21, d0[3]
vmlal.s16 q4, d25, d1[0]
vmlal.s16 q4, d29, d1[1]
vmlal.s16 q4, d29, d1[2]
vmull.s16 q6, d10, d0[0]
vmlal.s16 q6, d14, d0[1]
vmlal.s16 q6, d18, d0[2]
vmlal.s16 q6, d22, d0[3]
vmlal.s16 q6, d26, d1[0]
vmlal.s16 q6, d30, d1[1]
vmlal.s16 q6, d30, d1[2]
vmull.s16 q5, d11, d0[0]
vmlal.s16 q5, d15, d0[1]
vmlal.s16 q5, d19, d0[2]
vmlal.s16 q5, d23, d0[3]
vmlal.s16 q5, d27, d1[0]
vmlal.s16 q5, d31, d1[1]
vmlal.s16 q5, d31, d1[2]
vrshl.s32 q3, q3, q1 // round_bits_v
vrshl.s32 q4, q4, q1
vrshl.s32 q6, q6, q1
vrshl.s32 q5, q5, q1
vqmovun.s32 d6, q3
vqmovun.s32 d7, q4
vqmovun.s32 d8, q6
vqmovun.s32 d9, q5
vmin.u16 q3, q3, q2 // bitdepth_max
vmin.u16 q4, q4, q2
vst1.16 {q3, q4}, [r0, :128]!
bgt 1b
// Shift the pointers, but only update the first 5; the 6th pointer is
// kept as it was before (and the 7th is implicitly identical to the
// 6th).
ldrd r4, r5, [r1, #4]
ldrd r6, r7, [r1, #12]
ldr r8, [r1, #20]
strd r4, r5, [r1]
strd r6, r7, [r1, #8]
str r8, [r1, #16]
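// Roughly, in C terms (a sketch of the shift above):
//     for (int i = 0; i < 5; i++)
//         ptrs[i] = ptrs[i + 1];
//     // ptrs[5] stays in place; the seventh row remains an alias of the sixth.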
#include "looprestoration_tmpl.S"
vpop {q4-q7}
pop {r4-r9,pc}
endfunc
// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_16bpc_neon, export=1
// void dav1d_wiener_filter_hv_16bpc_neon(pixel *dst, const pixel (*left)[4],
// const pixel *src,
// const int16_t filter[2][8],
// const int w,
// const enum LrEdgeFlags edges,
// int16_t **ptrs,
// const int bitdepth_max);
function wiener_filter_hv_16bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
ldrd r6, r7, [sp, #108]
add r5, r5, #2 // w += 2
// Set up pointers for reading/writing alternate rows
add r10, r0, #(4*SUM_STRIDE) // sumsq
add r11, r1, #(2*SUM_STRIDE) // sum
add r12, r3, r4 // src
lsl r4, r4, #1
mov r9, #(2*2*SUM_STRIDE) // double sum stride
// Subtract the aligned width from the output stride.
add lr, r5, #7
bic lr, lr, #7
sub r9, r9, lr, lsl #1
// Store the width for the vertical loop
mov r8, r5
vld1.16 {q0, q1}, [r3, :128]
vdup.16 q11, r7 // bitdepth_max
clz r7, r7
vmov.i32 q14, #1
sub r12, r7, #38 // -(bitdepth + 6)
sub lr, r7, #11 // round_bits_v
sub r7, r7, #25 // -round_bits_h
neg r12, r12 // bitdepth + 6
vdup.32 q2, r12
vdup.32 q13, r7 // -round_bits_h
vdup.32 q10, lr // round_bits_v
mov lr, r6
vmov.i16 q15, #8192
vshl.u32 q14, q14, q2 // 1 << (bitdepth + 6)
vneg.s32 q10, q10 // -round_bits_v
// Subtract the number of pixels read from the input from the stride
add lr, lr, #8
sub r4, r4, lr, lsl #1
ldrd r6, r7, [lr]
ldrd r8, r9, [lr, #8]
ldrd r10, r11, [lr, #16]
ldr r12, [lr, #24]
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst r7, #1 // LR_HAVE_LEFT
beq 2f
tst r5, #1 // LR_HAVE_LEFT
beq 1f
// LR_HAVE_LEFT
cmp r2, #0
cmp r1, #0
bne 0f
// left == NULL
sub r3, r3, #4
sub r12, r12, #4
b 1f
0: // LR_HAVE_LEFT, left != NULL
2: // !LR_HAVE_LEFT, increase the stride.
// For this case we don't read the left 2 pixels from the src pointer,
// but shift it as if we had done that.
add r4, r4, #4
sub r2, r2, #6
vld1.16 {q2, q3}, [r2]!
b 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.16 {q2, q3}, [r2]!
vld1.16 {d9}, [r1]!
// Move r2 back to account for the last 3 pixels we loaded earlier,
// which we'll shift out.
sub r2, r2, #6
vext.8 q3, q2, q3, #10
vext.8 q2, q4, q2, #10
b 2f
1:
vld1.16 {q2, q3}, [r2]!
// !LR_HAVE_LEFT, fill q4 with the leftmost pixel
// and shift q2/q3 to have 3x the first pixel at the front.
vdup.16 q4, d4[0]
// Move r2 back to account for the last 3 pixels we loaded before,
// which we shifted out.
sub r2, r2, #6
vext.8 q3, q2, q3, #10
vext.8 q2, q4, q2, #10
1: // Loop vertically
vld1.16 {q0, q1}, [r3]!
vld1.16 {q4, q5}, [r12]!
2:
tst r5, #2 // LR_HAVE_RIGHT
bne 4f
tst r7, #1 // LR_HAVE_LEFT
beq 0f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r4, #11
bge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// The padding pixel is q2/3.h[w+2]. r2 points at the next input, ie
// q2/3.h[16]. Thus read from r2[w-14] to find the padding pixel.
sub lr, r4, #14
lsl lr, lr, #1
// Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel_local r3, right_ext_mask, -6
ldrh lr, [r2, lr]
sub r3, r3, r4, lsl #1
vdup.16 q4, lr
vld1.8 {q8, q9}, [r3]
vbit q2, q4, q8
vbit q3, q4, q9
4: // Loop horizontally
vext.8 q5, q2, q3, #4
vext.8 q6, q2, q3, #8
vext.8 q4, q2, q3, #2
vext.8 q7, q2, q3, #10
vadd.i16 q6, q6, q5
vadd.i16 q7, q7, q4
vext.8 q4, q2, q3, #12
vext.8 q5, q2, q3, #6
vadd.i16 q2, q2, q4
vld1.16 {q4}, [r6, :128]!
vmull.s16 q8, d10, d0[3]
vmlal.s16 q8, d12, d1[0]
vmlal.s16 q8, d14, d1[1]
vmlal.s16 q8, d4, d1[2]
vmull.s16 q9, d11, d0[3]
vmlal.s16 q9, d13, d1[0]
vmlal.s16 q9, d15, d1[1]
vmlal.s16 q9, d5, d1[2]
vld1.16 {q5}, [r7, :128]!
vmvn.i16 q12, #0x8000 // 0x7fff = (1 << 15) - 1
vadd.i32 q8, q8, q14
vadd.i32 q9, q9, q14
vld1.16 {q6}, [r8, :128]!
vrshl.s32 q8, q8, q13
vrshl.s32 q9, q9, q13
vqmovun.s32 d16, q8
vqmovun.s32 d17, q9
vld1.16 {q7}, [r9, :128]!
vmin.u16 q8, q8, q12
vld1.16 {q9}, [r10, :128]!
vsub.i16 q8, q8, q15
vld1.16 {q2}, [r11, :128]!
vmull.s16 q12, d8, d2[0]
vmlal.s16 q12, d10, d2[1]
vmlal.s16 q12, d12, d2[2]
vmlal.s16 q12, d14, d2[3]
vmlal.s16 q12, d18, d3[0]
vmlal.s16 q12, d4, d3[1]
vmlal.s16 q12, d16, d3[2]
vmull.s16 q4, d9, d2[0]
vmlal.s16 q4, d11, d2[1]
vmlal.s16 q4, d13, d2[2]
vmlal.s16 q4, d15, d2[3]
vmlal.s16 q4, d19, d3[0]
vmlal.s16 q4, d5, d3[1]
vmlal.s16 q4, d17, d3[2]
vrshl.s32 q12, q12, q10 // round_bits_v
vrshl.s32 q4, q4, q10
vqmovun.s32 d24, q12
vqmovun.s32 d25, q4
vst1.16 {q8}, [r12, :128]!
vmin.u16 q12, q12, q11 // bitdepth_max
subs r4, r4, #8
vst1.16 {q12}, [r0, :128]!
ble 9f
vmov q2, q3
tst r5, #2 // LR_HAVE_RIGHT
vld1.16 {q3}, [r2]!
bne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
9:
// Reload ptrs from arguments on the stack
ldr lr, [sp, #108]
// Rotate the window of pointers. Shift the 6 pointers downwards one step.
ldrd r6, r7, [lr, #4]
ldrd r8, r9, [lr, #12]
ldrd r10, r11, [lr, #20]
strd r6, r7, [lr]
strd r8, r9, [lr, #8]
strd r10, r11, [lr, #16]
// The topmost pointer, ptrs[6], which isn't used as input, is set to
// ptrs[0], which will be used as output for the next _hv call.
// At the start of the filtering, the caller may set ptrs[6] to the
// right next buffer to fill in, instead.
str r6, [lr, #24]
vpop {q4-q7}
pop {r4-r11,pc}
endfunc
#include "looprestoration_tmpl.S"
// void dav1d_sgr_box3_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const int w,
// const enum LrEdgeFlags edges);
function sgr_box3_row_h_16bpc_neon, export=1
push {r4-r5,lr}
ldrd r4, r5, [sp, #12]
add r4, r4, #2 // w += 2
tst r5, #1 // LR_HAVE_LEFT
beq 1f
cmp r2, #0
beq 2f
bne 0f
// LR_HAVE_LEFT && left == NULL
sub r3, r3, #4
vld1.8 {q0, q1}, [r3]!
b 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.16 {d5}, [r2]!
// Move r3/r12 back to account for the last 2 pixels we loaded earlier,
vld1.8 {q0, q1}, [r3]!
vld1.16 {d5}, [r2]
// Move r3 back to account for the last 2 pixels we loaded earlier,
// which we'll shift out.
sub r3, r3, #4
sub r12, r12, #4
vld1.16 {d13}, [r2]!
vext.8 q1, q0, q1, #12
vext.8 q0, q2, q0, #12
vext.8 q5, q4, q5, #12
vext.8 q4, q6, q4, #12
b 2f
0:
// !LR_HAVE_LEFT, fill q2 with the leftmost pixel
// and shift q0 to have 2x the first byte at the front.
1:
vld1.8 {q0, q1}, [r3]!
// !LR_HAVE_LEFT, fill q2 with the leftmost pixel
// and shift q0/q1 to have 2x the first pixel at the front.
vdup.16 q2, d0[0]
vdup.16 q6, d8[0]
// Move r3 back to account for the last 2 pixels we loaded before,
// which we shifted out.
sub r3, r3, #4
sub r12, r12, #4
vext.8 q1, q0, q1, #12
vext.8 q0, q2, q0, #12
vext.8 q5, q4, q5, #12
vext.8 q4, q6, q4, #12
2:
tst r7, #2 // LR_HAVE_RIGHT
tst r5, #2 // LR_HAVE_RIGHT
bne 4f
// If we'll need to pad the right edge, load that pixel to pad with
// here since we can find it pretty easily from here.
sub lr, r5, #(2 + 16 - 2 + 1)
sub lr, r4, #(2 + 16 - 2 + 1)
lsl lr, lr, #1
ldrh r11, [r3, lr]
ldrh lr, [r12, lr]
// Fill q14/q15 with the right padding pixel
vdup.16 q14, r11
vdup.16 q15, lr
// Restore r11 after using it for a temporary value
add r11, r1, #(2*SUM_STRIDE)
ldrh lr, [r3, lr]
// Fill q14 with the right padding pixel
vdup.16 q14, lr
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r5, #10
cmp r4, #10
bge 4f // If w >= 10, all used input pixels are valid
// 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
// again; it's not strictly needed in those cases (we pad enough here),
// but keeping the code as simple as possible.
// Insert padding in q0/1.h[w] onwards
// Insert padding in q0.h[w] onwards
movrel_local lr, right_ext_mask
sub lr, lr, r5, lsl #1
sub lr, lr, r4, lsl #1
vld1.8 {q12, q13}, [lr]
vbit q0, q14, q12
vbit q1, q14, q13
vbit q4, q15, q12
vbit q5, q15, q13
4: // Loop horizontally
vext.8 q8, q0, q1, #2
vext.8 q10, q4, q5, #2
vext.8 q9, q0, q1, #4
vext.8 q11, q4, q5, #4
vadd.i16 q2, q0, q8
vadd.i16 q3, q4, q10
vmull.u16 q12, d0, d0
vmlal.u16 q12, d16, d16
vmlal.u16 q12, d18, d18
vadd.i16 q2, q2, q9
vadd.i16 q3, q3, q11
vmull.u16 q6, d0, d0
vmlal.u16 q6, d16, d16
vmlal.u16 q6, d18, d18
vmull.u16 q12, d8, d8
vmlal.u16 q12, d20, d20
vmlal.u16 q12, d22, d22
vmull.u16 q7, d1, d1
vmlal.u16 q7, d17, d17
vmlal.u16 q7, d19, d19
vmull.u16 q13, d9, d9
vmlal.u16 q13, d21, d21
vmlal.u16 q13, d23, d23
subs r5, r5, #8
vmull.u16 q13, d1, d1
vmlal.u16 q13, d17, d17
vmlal.u16 q13, d19, d19
subs r4, r4, #8
vst1.16 {q2}, [r1, :128]!
vst1.16 {q3}, [r11, :128]!
vst1.32 {q6, q7}, [r0, :128]!
vst1.32 {q12, q13}, [r10, :128]!
vst1.32 {q12, q13}, [r0, :128]!
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
tst r5, #2 // LR_HAVE_RIGHT
vmov q0, q1
vmov q4, q5
vld1.16 {q1}, [r3]!
vld1.16 {q5}, [r12]!
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
9:
subs r6, r6, #2
ble 0f
// Jump to the next row and loop horizontally
add r0, r0, r9, lsl #1
add r10, r10, r9, lsl #1
add r1, r1, r9
add r11, r11, r9
add r3, r3, r4
add r12, r12, r4
mov r5, r8
b 1b
0:
vpop {q4-q7}
pop {r4-r11,pc}
pop {r4-r5,pc}
endfunc
// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_16bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
ldrd r6, r7, [sp, #108]
add r5, r5, #2 // w += 2
// Set up pointers for reading/writing alternate rows
add r10, r0, #(4*SUM_STRIDE) // sumsq
add r11, r1, #(2*SUM_STRIDE) // sum
add r12, r3, r4 // src
lsl r4, r4, #1
mov r9, #(2*2*SUM_STRIDE) // double sum stride
// Subtract the aligned width from the output stride.
add lr, r5, #7
bic lr, lr, #7
sub r9, r9, lr, lsl #1
add lr, lr, #8
sub r4, r4, lr, lsl #1
// Store the width for the vertical loop
mov r8, r5
// void dav1d_sgr_box5_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const int w,
// const enum LrEdgeFlags edges);
function sgr_box5_row_h_16bpc_neon, export=1
push {r4-r5,lr}
ldrd r4, r5, [sp, #12]
add r4, r4, #2 // w += 2
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst r7, #1 // LR_HAVE_LEFT
beq 2f
// LR_HAVE_LEFT
tst r5, #1 // LR_HAVE_LEFT
beq 1f
cmp r2, #0
bne 0f
// left == NULL
// LR_HAVE_LEFT && left == NULL
sub r3, r3, #6
sub r12, r12, #6
b 1f
0: // LR_HAVE_LEFT, left != NULL
2: // !LR_HAVE_LEFT, increase the stride.
// For this case we don't read the left 3 pixels from the src pointer,
// but shift it as if we had done that.
add r4, r4, #6
1: // Loop vertically
vld1.16 {q0, q1}, [r3]!
vld1.16 {q4, q5}, [r12]!
vld1.8 {q0, q1}, [r3]!
b 2f
tst r7, #1 // LR_HAVE_LEFT
beq 0f
cmp r2, #0
beq 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.16 {d5}, [r2]!
// Move r3/r12 back to account for the last 3 pixels we loaded earlier,
vld1.8 {q0, q1}, [r3]!
vld1.16 {d5}, [r2]
// Move r3 back to account for the last 3 pixels we loaded earlier,
// which we'll shift out.
sub r3, r3, #6
sub r12, r12, #6
vld1.16 {d13}, [r2]!
vext.8 q1, q0, q1, #10
vext.8 q0, q2, q0, #10
vext.8 q5, q4, q5, #10
vext.8 q4, q6, q4, #10
b 2f
0:
// !LR_HAVE_LEFT, fill q2 with the leftmost pixel
// and shift q0 to have 3x the first pixel at the front.
1:
vld1.8 {q0, q1}, [r3]!
// !LR_HAVE_LEFT, fill q2 with the leftmost pixel
// and shift q0/q1 to have 3x the first pixel at the front.
vdup.16 q2, d0[0]
vdup.16 q6, d8[0]
// Move r3 back to account for the last 3 pixels we loaded before,
// which we shifted out.
sub r3, r3, #6
sub r12, r12, #6
vext.8 q1, q0, q1, #10
vext.8 q0, q2, q0, #10
vext.8 q5, q4, q5, #10
vext.8 q4, q6, q4, #10
2:
tst r7, #2 // LR_HAVE_RIGHT
tst r5, #2 // LR_HAVE_RIGHT
bne 4f
// If we'll need to pad the right edge, load that pixel to pad with
// here since we can find it pretty easily from here.
sub lr, r5, #(2 + 16 - 3 + 1)
sub lr, r4, #(2 + 16 - 3 + 1)
lsl lr, lr, #1
ldrh r11, [r3, lr]
ldrh lr, [r12, lr]
// Fill q14/q15 with the right padding pixel
vdup.16 q14, r11
vdup.16 q15, lr
// Restore r11 after using it for a temporary value
add r11, r1, #(2*SUM_STRIDE)
ldrh lr, [r3, lr]
// Fill q14 with the right padding pixel
vdup.16 q14, lr
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r5, #11
cmp r4, #11
bge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the
// Insert padding in q0.h[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel_local lr, right_ext_mask, -2
sub lr, lr, r5, lsl #1
sub lr, lr, r4, lsl #1
vld1.8 {q12, q13}, [lr]
vbit q0, q14, q12
vbit q1, q14, q13
vbit q4, q15, q12
vbit q5, q15, q13
4: // Loop horizontally
vext.8 q8, q0, q1, #2
vext.8 q10, q4, q5, #2
vext.8 q9, q0, q1, #4
vext.8 q11, q4, q5, #4
vadd.i16 q2, q0, q8
vadd.i16 q3, q4, q10
vmull.u16 q12, d0, d0
vmlal.u16 q12, d16, d16
vmlal.u16 q12, d18, d18
vadd.i16 q2, q2, q9
vadd.i16 q3, q3, q11
vmull.u16 q6, d0, d0
vmlal.u16 q6, d16, d16
vmlal.u16 q6, d18, d18
vmull.u16 q12, d8, d8
vmlal.u16 q12, d20, d20
vmlal.u16 q12, d22, d22
vmull.u16 q7, d1, d1
vmlal.u16 q7, d17, d17
vmlal.u16 q7, d19, d19
vmull.u16 q13, d9, d9
vmlal.u16 q13, d21, d21
vmlal.u16 q13, d23, d23
vmull.u16 q13, d1, d1
vmlal.u16 q13, d17, d17
vmlal.u16 q13, d19, d19
vext.8 q8, q0, q1, #6
vext.8 q10, q4, q5, #6
vext.8 q9, q0, q1, #8
vext.8 q11, q4, q5, #8
vadd.i16 q2, q2, q8
vadd.i16 q3, q3, q10
vmlal.u16 q12, d16, d16
vmlal.u16 q12, d1, d1
vadd.i16 q2, q2, q9
vadd.i16 q3, q3, q11
vmlal.u16 q13, d17, d17
vmlal.u16 q13, d19, d19
subs r4, r4, #8
vst1.16 {q2}, [r1, :128]!
vst1.32 {q12, q13}, [r0, :128]!
ble 9f
tst r5, #2 // LR_HAVE_RIGHT
vmov q0, q1
vld1.16 {q1}, [r3]!
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
9:
pop {r4-r5,pc}
endfunc
// void dav1d_sgr_box35_row_h_16bpc_neon(int32_t *sumsq3, int16_t *sum3,
// int32_t *sumsq5, int16_t *sum5,
// const pixel (*left)[4],
// const pixel *src, const int w,
// const enum LrEdgeFlags edges);
function sgr_box35_row_h_16bpc_neon, export=1
push {r4-r7,lr}
ldrd r4, r5, [sp, #20]
ldrd r6, r7, [sp, #28]
add r6, r6, #2 // w += 2
tst r7, #1 // LR_HAVE_LEFT
beq 1f
cmp r4, #0
bne 0f
// LR_HAVE_LEFT && left == NULL
sub r5, r5, #6
vld1.8 {q0, q1}, [r5]!
b 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.8 {q0, q1}, [r5]!
vld1.16 {d5}, [r4]
// Move r5 back to account for the last 3 pixels we loaded earlier,
// which we'll shift out.
sub r5, r5, #6
vext.8 q1, q0, q1, #10
vext.8 q0, q2, q0, #10
b 2f
1:
vld1.8 {q0, q1}, [r5]!
// !LR_HAVE_LEFT, fill q2 with the leftmost pixel
// and shift q0/q1 to have 3x the first pixel at the front.
vdup.16 q2, d0[0]
// Move r5 back to account for the last 3 pixels we loaded before,
// which we shifted out.
sub r5, r5, #6
vext.8 q1, q0, q1, #10
vext.8 q0, q2, q0, #10
2:
tst r7, #2 // LR_HAVE_RIGHT
bne 4f
// If we'll need to pad the right edge, load that pixel to pad with
// here since we can find it pretty easily from here.
sub lr, r6, #(2 + 16 - 3 + 1)
lsl lr, lr, #1
ldrh lr, [r5, lr]
// Fill q14 with the right padding pixel
vdup.16 q14, lr
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r6, #11
bge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in q0.h[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel_local lr, right_ext_mask, -2
sub lr, lr, r6, lsl #1
vld1.8 {q12, q13}, [lr]
vbit q0, q14, q12
vbit q1, q14, q13
4: // Loop horizontally
vext.8 q8, q0, q1, #2
vext.8 q9, q0, q1, #4
vext.8 q10, q0, q1, #6
vext.8 q11, q0, q1, #8
vadd.i16 q2, q8, q9
vadd.i16 q3, q0, q11
vadd.i16 q2, q2, q10
vmlal.u16 q6, d16, d16
vmlal.u16 q6, d1, d1
vmull.u16 q12, d16, d16
vmlal.u16 q12, d18, d18
vmlal.u16 q12, d20, d20
vmlal.u16 q12, d9, d9
vmlal.u16 q7, d17, d17
vmlal.u16 q7, d19, d19
vmull.u16 q13, d17, d17
vmlal.u16 q13, d19, d19
vmlal.u16 q13, d21, d21
vmlal.u16 q13, d23, d23
subs r5, r5, #8
vadd.i16 q3, q3, q2
vst1.16 {q2}, [r1, :128]!
vst1.16 {q3}, [r11, :128]!
vst1.32 {q6, q7}, [r0, :128]!
vst1.32 {q12, q13}, [r10, :128]!
vst1.32 {q12, q13}, [r0, :128]!
vmlal.u16 q12, d0, d0
vmlal.u16 q12, d22, d22
vmlal.u16 q13, d1, d1
vmlal.u16 q13, d23, d23
subs r6, r6, #8
vst1.16 {q3}, [r3, :128]!
vst1.32 {q12, q13}, [r2, :128]!
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vmov q0, q1
vmov q4, q5
vld1.16 {q1}, [r3]!
vld1.16 {q5}, [r12]!
vld1.16 {q1}, [r5]!
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
9:
subs r6, r6, #2
ble 0f
// Jump to the next row and loop horizontally
add r0, r0, r9, lsl #1
add r10, r10, r9, lsl #1
add r1, r1, r9
add r11, r11, r9
add r3, r3, r4
add r12, r12, r4
mov r5, r8
b 1b
0:
vpop {q4-q7}
pop {r4-r11,pc}
pop {r4-r7,pc}
endfunc
sgr_funcs 16
@@ -28,341 +28,119 @@
#include "src/arm/asm.S"
#include "util.S"
#define SUM_STRIDE (384+16)
// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_v_neon, export=1
// void dav1d_sgr_box3_row_v_neon(int32_t **sumsq, int16_t **sum,
// int32_t *sumsq_out, int16_t *sum_out,
// const int w);
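// A minimal sketch of this helper, per output element x:
//     sumsq_out[x] = sumsq[0][x] + sumsq[1][x] + sumsq[2][x]
//     sum_out[x]   = sum[0][x]   + sum[1][x]   + sum[2][x]
// i.e. three precomputed horizontal-sum rows are added vertically; the row
// pointer arrays are assumed to be maintained by the caller between calls.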
function sgr_box3_row_v_neon, export=1
push {r4-r9,lr}
ldr r4, [sp, #28]
add r12, r3, #2 // Number of output rows to move back
mov lr, r3 // Number of input rows to move back
add r2, r2, #2 // Actual summed width
mov r7, #(4*SUM_STRIDE) // sumsq stride
mov r8, #(2*SUM_STRIDE) // sum stride
sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
ldrd r6, r7, [r0]
ldr r0, [r0, #8]
add r4, r4, #2
ldrd r8, r9, [r1]
ldr r1, [r1, #8]
tst r4, #4 // LR_HAVE_TOP
beq 0f
// If have top, read from row -2.
sub r5, r0, #(4*SUM_STRIDE)
sub r6, r1, #(2*SUM_STRIDE)
add lr, lr, #2
b 1f
0:
// !LR_HAVE_TOP
// If we don't have top, read from row 0 even if
// we start writing to row -1.
add r5, r0, #(4*SUM_STRIDE)
add r6, r1, #(2*SUM_STRIDE)
1:
vld1.32 {q8, q9}, [r6]!
vld1.32 {q10, q11}, [r7]!
vld1.16 {q14}, [r8]!
vld1.16 {q15}, [r9]!
subs r4, r4, #8
tst r4, #8 // LR_HAVE_BOTTOM
beq 1f
// LR_HAVE_BOTTOM
add r3, r3, #2 // Sum all h+2 lines with the main loop
add lr, lr, #2
1:
mov r9, r3 // Backup of h for next loops
vadd.i32 q8, q8, q10
vadd.i32 q9, q9, q11
1:
// Start of horizontal loop; start one vertical filter slice.
// Start loading rows into q8-q13 and q0-q2 taking top
// padding into consideration.
tst r4, #4 // LR_HAVE_TOP
vld1.32 {q8, q9}, [r5, :128], r7
vld1.16 {q0}, [r6, :128], r8
beq 2f
// LR_HAVE_TOP
vld1.32 {q10, q11}, [r5, :128], r7
vld1.16 {q1}, [r6, :128], r8
vld1.32 {q12, q13}, [r5, :128], r7
vld1.16 {q2}, [r6, :128], r8
b 3f
2: // !LR_HAVE_TOP
vmov q10, q8
vmov q11, q9
vmov q1, q0
vmov q12, q8
vmov q13, q9
vmov q2, q0
vld1.32 {q12, q13}, [r0]!
3:
subs r3, r3, #1
.macro add3
vadd.i32 q8, q8, q10
vadd.i32 q9, q9, q11
vadd.i16 q0, q0, q1
vadd.i16 q14, q14, q15
vld1.16 {q15}, [r1]!
vadd.i32 q8, q8, q12
vadd.i32 q9, q9, q13
vadd.i16 q0, q0, q2
vst1.32 {q8, q9}, [r0, :128], r7
vst1.16 {q0}, [r1, :128], r8
.endm
add3
vmov q8, q10
vmov q9, q11
vmov q0, q1
vmov q10, q12
vmov q11, q13
vmov q1, q2
ble 4f
vld1.32 {q12, q13}, [r5, :128], r7
vld1.16 {q2}, [r6, :128], r8
b 3b
vadd.i16 q14, q14, q15
4:
tst r4, #8 // LR_HAVE_BOTTOM
bne 5f
// !LR_HAVE_BOTTOM
// Produce two more rows, extending the already loaded rows.
add3
vmov q8, q10
vmov q9, q11
vmov q0, q1
add3
5: // End of one vertical slice.
subs r2, r2, #8
ble 0f
// Move pointers back up to the top and loop horizontally.
// Input pointers
mls r5, r7, lr, r5
mls r6, r8, lr, r6
// Output pointers
mls r0, r7, r12, r0
mls r1, r8, r12, r1
add r0, r0, #32
add r1, r1, #16
add r5, r5, #32
add r6, r6, #16
mov r3, r9
b 1b
vst1.32 {q8, q9}, [r2]!
vst1.16 {q14}, [r3]!
0:
bgt 1b
pop {r4-r9,pc}
.purgem add3
endfunc
// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_v_neon, export=1
push {r4-r9,lr}
vpush {q5-q7}
ldr r4, [sp, #76]
add r12, r3, #2 // Number of output rows to move back
mov lr, r3 // Number of input rows to move back
add r2, r2, #8 // Actual summed width
mov r7, #(4*SUM_STRIDE) // sumsq stride
mov r8, #(2*SUM_STRIDE) // sum stride
sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
// void dav1d_sgr_box5_row_v_neon(int32_t **sumsq, int16_t **sum,
// int32_t *sumsq_out, int16_t *sum_out,
// const int w);
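// Same idea as the box3 row helper above, but adding five rows:
//     sumsq_out[x] = sumsq[0][x] + sumsq[1][x] + sumsq[2][x] + sumsq[3][x] + sumsq[4][x]
//     sum_out[x]   = sum[0][x] + sum[1][x] + sum[2][x] + sum[3][x] + sum[4][x]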
function sgr_box5_row_v_neon, export=1
push {r4-r11,lr}
ldr lr, [sp, #36]
tst r4, #4 // LR_HAVE_TOP
beq 0f
// If have top, read from row -2.
sub r5, r0, #(4*SUM_STRIDE)
sub r6, r1, #(2*SUM_STRIDE)
ldrd r4, r5, [r0]
ldrd r6, r7, [r0, #8]
ldr r0, [r0, #16]
add lr, lr, #2
b 1f
0:
// !LR_HAVE_TOP
// If we don't have top, read from row 0 even if
// we start writing to row -1.
add r5, r0, #(4*SUM_STRIDE)
add r6, r1, #(2*SUM_STRIDE)
1:
ldrd r8, r9, [r1]
ldrd r10, r11, [r1, #8]
ldr r1, [r1, #16]
tst r4, #8 // LR_HAVE_BOTTOM
beq 0f
// LR_HAVE_BOTTOM
add r3, r3, #2 // Handle h+2 lines with the main loop
add lr, lr, #2
b 1f
0:
// !LR_HAVE_BOTTOM
sub r3, r3, #1 // Handle h-1 lines with the main loop
1:
mov r9, r3 // Backup of h for next loops
vld1.32 {q8, q9}, [r4]!
vld1.32 {q10, q11}, [r5]!
vld1.32 {q12, q13}, [r6]!
vld1.32 {q14, q15}, [r7]!
vld1.16 {q0}, [r8]!
vld1.16 {q1}, [r9]!
vld1.16 {q2}, [r10]!
vld1.16 {q3}, [r11]!
subs lr, lr, #8
1:
// Start of horizontal loop; start one vertical filter slice.
// Start loading rows into q6-q15 and q0-q3,q5 taking top
// padding into consideration.
tst r4, #4 // LR_HAVE_TOP
vld1.32 {q6, q7}, [r5, :128], r7
vld1.16 {q0}, [r6, :128], r8
beq 2f
// LR_HAVE_TOP
vld1.32 {q10, q11}, [r5, :128], r7
vld1.16 {q2}, [r6, :128], r8
vmov q8, q6
vmov q9, q7
vmov q1, q0
vld1.32 {q12, q13}, [r5, :128], r7
vld1.16 {q3}, [r6, :128], r8
b 3f
2: // !LR_HAVE_TOP
vmov q8, q6
vmov q9, q7
vmov q1, q0
vmov q10, q6
vmov q11, q7
vmov q2, q0
vmov q12, q6
vmov q13, q7
vmov q3, q0
vadd.i32 q8, q8, q10
vadd.i32 q9, q9, q11
vadd.i32 q12, q12, q14
vadd.i32 q13, q13, q15
3:
cmp r3, #0
beq 4f
vld1.32 {q14, q15}, [r5, :128], r7
vld1.16 {q5}, [r6, :128], r8
vld1.32 {q14, q15}, [r0]!
3:
// Start of vertical loop
subs r3, r3, #2
.macro add5
vadd.i32 q6, q6, q8
vadd.i32 q7, q7, q9
vadd.i16 q0, q0, q1
vadd.i32 q6, q6, q10
vadd.i32 q7, q7, q11
vadd.i16 q0, q0, q2
vadd.i32 q6, q6, q12
vadd.i32 q7, q7, q13
vadd.i16 q0, q0, q3
vadd.i32 q6, q6, q14
vadd.i32 q7, q7, q15
vadd.i16 q0, q0, q5
vst1.32 {q6, q7}, [r0, :128], r7
vst1.16 {q0}, [r1, :128], r8
.endm
add5
.macro shift2
vmov q6, q10
vmov q7, q11
vmov q0, q2
vmov q8, q12
vmov q9, q13
vmov q1, q3
vmov q10, q14
vmov q11, q15
vmov q2, q5
.endm
shift2
add r0, r0, r7
add r1, r1, r8
ble 5f
vld1.32 {q12, q13}, [r5, :128], r7
vld1.16 {q3}, [r6, :128], r8
vld1.32 {q14, q15}, [r5, :128], r7
vld1.16 {q5}, [r6, :128], r8
b 3b
4:
// h == 1, !LR_HAVE_BOTTOM.
// Pad the last row with the only content row, and add.
vmov q14, q12
vmov q15, q13
vmov q5, q3
add5
shift2
add r0, r0, r7
add r1, r1, r8
add5
b 6f
vadd.i16 q2, q2, q3
5:
tst r4, #8 // LR_HAVE_BOTTOM
bne 6f
// !LR_HAVE_BOTTOM
cmp r3, #0
bne 5f
// The intended three edge rows left; output the one at h-2 and
// the past edge one at h.
vld1.32 {q12, q13}, [r5, :128], r7
vld1.16 {q3}, [r6, :128], r8
// Pad the past-edge row from the last content row.
vmov q14, q12
vmov q15, q13
vmov q5, q3
add5
shift2
add r0, r0, r7
add r1, r1, r8
// The last two rows are already padded properly here.
add5
b 6f
vld1.16 {q3}, [r1]!
vadd.i32 q8, q8, q12
vadd.i32 q9, q9, q13
vadd.i16 q0, q0, q2
5:
// r3 == -1, two rows left, output one.
// Pad the last two rows from the mid one.
vmov q12, q10
vmov q13, q11
vmov q3, q2
vmov q14, q10
vmov q15, q11
vmov q5, q2
add5
add r0, r0, r7
add r1, r1, r8
b 6f
vadd.i32 q8, q8, q14
vadd.i32 q9, q9, q15
vadd.i16 q0, q0, q3
6: // End of one vertical slice.
subs r2, r2, #8
ble 0f
// Move pointers back up to the top and loop horizontally.
// Input pointers
mls r5, r7, lr, r5
mls r6, r8, lr, r6
// Output pointers
mls r0, r7, r12, r0
mls r1, r8, r12, r1
add r0, r0, #32
add r1, r1, #16
add r5, r5, #32
add r6, r6, #16
mov r3, r9
b 1b
vst1.32 {q8, q9}, [r2]!
vst1.16 {q0}, [r3]!
0:
vpop {q5-q7}
pop {r4-r9,pc}
.purgem add5
bgt 1b
pop {r4-r11,pc}
endfunc
// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
// const int w, const int h, const int strength,
// const int bitdepth_max);
// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
// const int w, const int h, const int strength,
// const int bitdepth_max);
function sgr_calc_ab1_neon, export=1
// void dav1d_sgr_calc_row_ab1_neon(int32_t *a, int16_t *b,
// const int w, const int strength,
// const int bitdepth_max);
// void dav1d_sgr_calc_row_ab2_neon(int32_t *a, int16_t *b,
// const int w, const int strength,
// const int bitdepth_max);
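// Shared notes for the two entry points below (a sketch of the reference
// math, not of the exact vector code): ab1 covers a 3x3 box (n = 9,
// one_by_x = 455 ~ (1 << 12) / 9), ab2 a 5x5 box (n = 25,
// one_by_x = 164 ~ (1 << 12) / 25). Per pixel, with box sums a (squares)
// and b (values), strength s and the x-by-x table lookup x:
//     p = max(a * n - b * b, 0)      // a, b first scaled down above 8 bpc
//     z = (p * s + (1 << 19)) >> 20, x = x_by_x[min(z, 255)]
//     a_out = (x * b * one_by_x + (1 << 11)) >> 12
//     b_out = 256 - x
// matching the "x * BB[i] * sgr_one_by_x" and "256 - x" comments in the body.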
function sgr_calc_row_ab1_neon, export=1
push {r4-r7,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #84]
add r3, r3, #2 // h += 2
clz r6, r5
ldr r4, [sp, #84]
clz r6, r4
vmov.i32 q15, #9 // n
movw r5, #455
mov lr, #SUM_STRIDE
b sgr_calc_ab_neon
endfunc
function sgr_calc_ab2_neon, export=1
function sgr_calc_row_ab2_neon, export=1
push {r4-r7,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #84]
add r3, r3, #3 // h += 3
clz r6, r5
asr r3, r3, #1 // h /= 2
ldr r4, [sp, #84]
clz r6, r4
vmov.i32 q15, #25 // n
mov r5, #164
mov lr, #(2*SUM_STRIDE)
endfunc
function sgr_calc_ab_neon
@@ -379,20 +157,14 @@ function sgr_calc_ab_neon
vmov.i8 d14, #254 // idx of last 1
vmov.i8 d15, #32 // elements consumed in first vtbl
add r2, r2, #2 // w += 2
add r12, r2, #7
bic r12, r12, #7 // aligned w
sub r12, lr, r12 // increment between rows
vdup.32 q12, r4
sub r0, r0, #(4*(SUM_STRIDE))
sub r1, r1, #(2*(SUM_STRIDE))
mov r4, r2 // backup of w
vdup.32 q12, r3
vsub.i8 q8, q8, q11
vsub.i8 q9, q9, q11
vsub.i8 q10, q10, q11
vdup.32 q13, r7 // -2*bitdepth_min_8
1:
vld1.32 {q0, q1}, [r0, :128] // a
vld1.16 {q2}, [r1, :128] // b
vdup.32 q13, r7 // -2*bitdepth_min_8
vdup.16 q14, r6 // -bitdepth_min_8
subs r2, r2, #8
vrshl.s32 q0, q0, q13
@@ -426,7 +198,6 @@ function sgr_calc_ab_neon
vadd.i8 d1, d1, d2
vmovl.u8 q0, d1 // x
vmov.i16 q13, #256
vdup.32 q14, r5 // one_by_x
vmull.u16 q1, d0, d4 // x * BB[i]
@@ -435,19 +206,11 @@ function sgr_calc_ab_neon
vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x
vrshr.s32 q1, q1, #12 // AA[i]
vrshr.s32 q2, q2, #12 // AA[i]
vsub.i16 q0, q13, q0 // 256 - x
vst1.32 {q1, q2}, [r0, :128]!
vst1.16 {q0}, [r1, :128]!
bgt 1b
subs r3, r3, #1
ble 0f
add r0, r0, r12, lsl #2
add r1, r1, r12, lsl #1
mov r2, r4
b 1b
0:
vpop {q4-q7}
pop {r4-r7,pc}
endfunc
@@ -30,44 +30,30 @@
#define FILTER_OUT_STRIDE 384
.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter1_\bpc\()bpc_neon, export=1
// void dav1d_sgr_finish_filter_row1_Xbpc_neon(int16_t *tmp,
// const pixel *src,
// const int32_t **a, const int16_t **b,
// const int w);
function sgr_finish_filter_row1_\bpc\()bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
ldr r6, [sp, #108]
sub r7, r3, #(4*SUM_STRIDE)
add r8, r3, #(4*SUM_STRIDE)
sub r9, r4, #(2*SUM_STRIDE)
add r10, r4, #(2*SUM_STRIDE)
mov r11, #SUM_STRIDE
mov r12, #FILTER_OUT_STRIDE
add lr, r5, #3
bic lr, lr, #3 // Aligned width
.if \bpc == 8
sub r2, r2, lr
.else
sub r2, r2, lr, lsl #1
.endif
sub r12, r12, lr
sub r11, r11, lr
sub r11, r11, #4 // We read 4 extra elements from both a and b
mov lr, r5
ldr r4, [sp, #100]
ldrd r6, r7, [r2]
ldr r2, [r2, #8]
ldrd r8, r9, [r3]
ldr r3, [r3, #8]
vmov.i16 q14, #3
vmov.i32 q15, #3
1:
vld1.16 {q0}, [r9, :128]!
vld1.16 {q1}, [r4, :128]!
vld1.16 {q2}, [r10, :128]!
vld1.32 {q8, q9}, [r7, :128]!
vld1.32 {q10, q11}, [r3, :128]!
vld1.32 {q12, q13}, [r8, :128]!
vld1.16 {q0}, [r8, :128]!
vld1.16 {q1}, [r9, :128]!
vld1.16 {q2}, [r3, :128]!
vld1.32 {q8, q9}, [r6, :128]!
vld1.32 {q10, q11}, [r7, :128]!
vld1.32 {q12, q13}, [r2, :128]!
2:
subs r5, r5, #4
subs r4, r4, #4
vext.8 d6, d0, d1, #2 // -stride
vext.8 d7, d2, d3, #2 // 0
vext.8 d8, d4, d5, #2 // +stride
@@ -108,7 +94,7 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1
vmovl.u8 q12, d24 // src
.endif
vmov d0, d1
vmlal.u16 q3, d2, d24 // b + a * src
vmlsl.u16 q3, d2, d24 // b - a * src
vmov d2, d3
vrshrn.i32 d6, q3, #9
vmov d4, d5
@@ -118,67 +104,42 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1
vmov q8, q9
vmov q10, q11
vmov q12, q13
vld1.16 {d1}, [r9, :64]!
vld1.16 {d3}, [r4, :64]!
vld1.16 {d5}, [r10, :64]!
vld1.32 {q9}, [r7, :128]!
vld1.32 {q11}, [r3, :128]!
vld1.32 {q13}, [r8, :128]!
vld1.16 {d1}, [r8, :64]!
vld1.16 {d3}, [r9, :64]!
vld1.16 {d5}, [r3, :64]!
vld1.32 {q9}, [r6, :128]!
vld1.32 {q11}, [r7, :128]!
vld1.32 {q13}, [r2, :128]!
b 2b
3:
subs r6, r6, #1
ble 0f
mov r5, lr
add r0, r0, r12, lsl #1
add r1, r1, r2
add r3, r3, r11, lsl #2
add r7, r7, r11, lsl #2
add r8, r8, r11, lsl #2
add r4, r4, r11, lsl #1
add r9, r9, r11, lsl #1
add r10, r10, r11, lsl #1
b 1b
0:
vpop {q4-q7}
pop {r4-r11,pc}
endfunc
// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter2_\bpc\()bpc_neon, export=1
// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t **a, const int16_t **b,
// const int w, const int h);
function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
ldr r6, [sp, #108]
add r7, r3, #(4*(SUM_STRIDE))
sub r3, r3, #(4*(SUM_STRIDE))
add r8, r4, #(2*(SUM_STRIDE))
sub r4, r4, #(2*(SUM_STRIDE))
mov r9, #(2*SUM_STRIDE)
mov r10, #FILTER_OUT_STRIDE
add r11, r5, #7
bic r11, r11, #7 // Aligned width
.if \bpc == 8
sub r2, r2, r11
.else
sub r2, r2, r11, lsl #1
.endif
sub r10, r10, r11
sub r9, r9, r11
sub r9, r9, #4 // We read 4 extra elements from a
sub r12, r9, #4 // We read 8 extra elements from b
ldrd r8, r9, [r3]
ldrd r10, r11, [r4]
mov r7, #2*FILTER_OUT_STRIDE
add r2, r1, r2
add r7, r7, r0
mov lr, r5
1:
vld1.16 {q0, q1}, [r4, :128]!
vld1.16 {q2, q3}, [r8, :128]!
vld1.32 {q8, q9}, [r3, :128]!
vld1.32 {q11, q12}, [r7, :128]!
vld1.32 {q10}, [r3, :128]!
vld1.32 {q13}, [r7, :128]!
vld1.16 {q0, q1}, [r10, :128]!
vld1.16 {q2, q3}, [r11, :128]!
vld1.32 {q8, q9}, [r8, :128]!
vld1.32 {q11, q12}, [r9, :128]!
vld1.32 {q10}, [r8, :128]!
vld1.32 {q13}, [r9, :128]!
2:
vmov.i16 q14, #5
@@ -229,8 +190,8 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
.if \bpc == 8
vmovl.u8 q2, d4
.endif
vmlal.u16 q4, d0, d4 // b + a * src
vmlal.u16 q5, d1, d5 // b + a * src
vmlsl.u16 q4, d0, d4 // b - a * src
vmlsl.u16 q5, d1, d5 // b - a * src
vmov q0, q1
vrshrn.i32 d8, q4, #9
vrshrn.i32 d9, q5, #9
@@ -240,26 +201,24 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
ble 3f
vmov q8, q10
vmov q11, q13
vld1.16 {q1}, [r4, :128]!
vld1.16 {q3}, [r8, :128]!
vld1.32 {q9, q10}, [r3, :128]!
vld1.32 {q12, q13}, [r7, :128]!
vld1.16 {q1}, [r10, :128]!
vld1.16 {q3}, [r11, :128]!
vld1.32 {q9, q10}, [r8, :128]!
vld1.32 {q12, q13}, [r9, :128]!
b 2b
3:
subs r6, r6, #1
ble 0f
mov r5, lr
add r0, r0, r10, lsl #1
add r1, r1, r2
add r3, r3, r9, lsl #2
add r7, r7, r9, lsl #2
add r4, r4, r12, lsl #1
add r8, r8, r12, lsl #1
ldrd r8, r9, [r3]
ldrd r10, r11, [r4]
mov r0, r7
mov r1, r2
vld1.32 {q8, q9}, [r3, :128]!
vld1.16 {q0, q1}, [r4, :128]!
vld1.32 {q10}, [r3, :128]!
vld1.32 {q8, q9}, [r9, :128]!
vld1.16 {q0, q1}, [r11, :128]!
vld1.32 {q10}, [r9, :128]!
vmov.i16 q12, #5
vmov.i16 q13, #6
@@ -291,8 +250,8 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
vmul.i32 q5, q5, q15 // * 6
vmla.i32 q5, q9, q14 // * 5 -> b
vmlal.u16 q4, d4, d22 // b + a * src
vmlal.u16 q5, d5, d23
vmlsl.u16 q4, d4, d22 // b - a * src
vmlsl.u16 q5, d5, d23
vmov q0, q1
vrshrn.i32 d8, q4, #8
vrshrn.i32 d9, q5, #8
@@ -300,301 +259,152 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
vst1.16 {q4}, [r0, :128]!
ble 5f
vld1.16 {q1}, [r4, :128]!
vld1.32 {q9, q10}, [r3, :128]!
vld1.16 {q1}, [r11, :128]!
vld1.32 {q9, q10}, [r9, :128]!
b 4b
5:
subs r6, r6, #1
ble 0f
mov r5, lr
sub r3, r3, r11, lsl #2 // Rewind r3/r4 to where they started
sub r4, r4, r11, lsl #1
add r0, r0, r10, lsl #1
add r1, r1, r2
sub r3, r3, #16
sub r4, r4, #16
b 1b
0:
vpop {q4-q7}
pop {r4-r11,pc}
endfunc
// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
// const pixel *src, const ptrdiff_t src_stride,
// const int16_t *t1, const int w, const int h,
// const int wt, const int bitdepth_max);
function sgr_weighted1_\bpc\()bpc_neon, export=1
push {r4-r9,lr}
ldrd r4, r5, [sp, #28]
ldrd r6, r7, [sp, #36]
// void dav1d_sgr_weighted_row1_Xbpc_neon(pixel *dst,
// const int16_t *t1, const int w,
// const int w1, const int bitdepth_max);
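// In this row-based form, t1 is assumed to already hold the difference
// between the self-guided filter output and the (scaled) source, so the
// blend reduces to (sketch):
//     dst[x] = clip(dst[x] + ((w1 * t1[x] + (1 << 10)) >> 11), 0, pixel_max)
// where pixel_max is 255 for 8 bpc and bitdepth_max otherwise.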
function sgr_weighted_row1_\bpc\()bpc_neon, export=1
push {lr}
.if \bpc == 16
ldr r8, [sp, #44]
ldr lr, [sp, #4]
.endif
vdup.16 d31, r7
cmp r6, #2
vdup.16 d31, r3
.if \bpc == 16
vdup.16 q14, r8
vmov.i16 q13, #0
vdup.16 q14, lr
.endif
add r9, r0, r1
add r12, r2, r3
add lr, r4, #2*FILTER_OUT_STRIDE
mov r7, #(4*FILTER_OUT_STRIDE)
lsl r1, r1, #1
lsl r3, r3, #1
add r8, r5, #7
bic r8, r8, #7 // Aligned width
.if \bpc == 8
sub r1, r1, r8
sub r3, r3, r8
.else
sub r1, r1, r8, lsl #1
sub r3, r3, r8, lsl #1
.endif
sub r7, r7, r8, lsl #1
mov r8, r5
blt 2f
1:
.if \bpc == 8
vld1.8 {d0}, [r2, :64]!
vld1.8 {d16}, [r12, :64]!
vld1.8 {d0}, [r0, :64]
.else
vld1.16 {q0}, [r2, :128]!
vld1.16 {q8}, [r12, :128]!
vld1.16 {q0}, [r0, :128]
.endif
vld1.16 {q1}, [r4, :128]!
vld1.16 {q9}, [lr, :128]!
subs r5, r5, #8
.if \bpc == 8
vshll.u8 q0, d0, #4 // u
vshll.u8 q8, d16, #4 // u
.else
vshl.i16 q0, q0, #4 // u
vshl.i16 q8, q8, #4 // u
.endif
vsub.i16 q1, q1, q0 // t1 - u
vsub.i16 q9, q9, q8 // t1 - u
vshll.u16 q2, d0, #7 // u << 7
vshll.u16 q3, d1, #7 // u << 7
vshll.u16 q10, d16, #7 // u << 7
vshll.u16 q11, d17, #7 // u << 7
vmlal.s16 q2, d2, d31 // v
vmlal.s16 q3, d3, d31 // v
vmlal.s16 q10, d18, d31 // v
vmlal.s16 q11, d19, d31 // v
.if \bpc == 8
vld1.16 {q1}, [r1, :128]!
subs r2, r2, #8
vmull.s16 q2, d2, d31 // v
vmull.s16 q3, d3, d31 // v
vrshrn.i32 d4, q2, #11
vrshrn.i32 d5, q3, #11
vrshrn.i32 d20, q10, #11
vrshrn.i32 d21, q11, #11
vqmovun.s16 d4, q2
vqmovun.s16 d20, q10
vst1.8 {d4}, [r0, :64]!
vst1.8 {d20}, [r9, :64]!
.else
vqrshrun.s32 d4, q2, #11
vqrshrun.s32 d5, q3, #11
vqrshrun.s32 d20, q10, #11
vqrshrun.s32 d21, q11, #11
vmin.u16 q2, q2, q14
vmin.u16 q10, q10, q14
vst1.16 {q2}, [r0, :128]!
vst1.16 {q10}, [r9, :128]!
.endif
bgt 1b
sub r6, r6, #2
cmp r6, #1
blt 0f
mov r5, r8
add r0, r0, r1
add r9, r9, r1
add r2, r2, r3
add r12, r12, r3
add r4, r4, r7
add lr, lr, r7
beq 2f
b 1b
2:
.if \bpc == 8
vld1.8 {d0}, [r2, :64]!
.else
vld1.16 {q0}, [r2, :128]!
.endif
vld1.16 {q1}, [r4, :128]!
subs r5, r5, #8
.if \bpc == 8
vshll.u8 q0, d0, #4 // u
.else
vshl.i16 q0, q0, #4 // u
.endif
vsub.i16 q1, q1, q0 // t1 - u
vshll.u16 q2, d0, #7 // u << 7
vshll.u16 q3, d1, #7 // u << 7
vmlal.s16 q2, d2, d31 // v
vmlal.s16 q3, d3, d31 // v
.if \bpc == 8
vrshrn.i32 d4, q2, #11
vrshrn.i32 d5, q3, #11
vaddw.u8 q2, q2, d0
vqmovun.s16 d2, q2
vst1.8 {d2}, [r0, :64]!
.else
vqrshrun.s32 d4, q2, #11
vqrshrun.s32 d5, q3, #11
vadd.i16 q2, q2, q0
vmax.s16 q2, q2, q13
vmin.u16 q2, q2, q14
vst1.16 {q2}, [r0, :128]!
.endif
bgt 2b
bgt 1b
0:
pop {r4-r9,pc}
pop {pc}
endfunc
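For reference, the single-row variant above updates dst in place from t1 and the single weight w1. A minimal C sketch of the 8 bpc per-pixel operation, inferred from the NEON sequence (vmull.s16, vrshrn.i32 #11, vaddw.u8, vqmovun.s16); this is an illustration, not dav1d code, and the function/helper names and the uint8_t pixel type are assumptions:

    #include <stdint.h>

    static inline int clip(int v, int lo, int hi) {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* dst: one row of 8 bpc pixels, updated in place; t1: filtered row. */
    static void sgr_weighted_row1_sketch(uint8_t *dst, const int16_t *t1,
                                         int w, int w1)
    {
        for (int i = 0; i < w; i++) {
            const int v = (w1 * t1[i] + (1 << 10)) >> 11; /* round by 11 bits */
            dst[i] = (uint8_t)clip(dst[i] + v, 0, 255);   /* add and saturate */
        }
    }

The 16 bpc path is analogous, clamping the final sum against bitdepth_max instead of 255.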
// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *src, const ptrdiff_t src_stride,
// const int16_t *t1, const int16_t *t2,
// const int w, const int h,
// const int16_t wt[2], const int bitdepth_max);
function sgr_weighted2_\bpc\()bpc_neon, export=1
push {r4-r11,lr}
ldrd r4, r5, [sp, #36]
ldrd r6, r7, [sp, #44]
push {r4-r8,lr}
ldrd r4, r5, [sp, #24]
.if \bpc == 8
ldr r8, [sp, #52]
ldr r6, [sp, #32]
.else
ldrd r8, r9, [sp, #52]
ldrd r6, r7, [sp, #32]
.endif
cmp r7, #2
add r10, r0, r1
add r11, r2, r3
add r12, r4, #2*FILTER_OUT_STRIDE
add lr, r5, #2*FILTER_OUT_STRIDE
vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1]
cmp r5, #2
add r8, r0, r1
add r12, r2, #2*FILTER_OUT_STRIDE
add lr, r3, #2*FILTER_OUT_STRIDE
vld2.16 {d30[], d31[]}, [r6] // wt[0], wt[1]
.if \bpc == 16
vdup.16 q14, r9
vdup.16 q14, r7
.endif
mov r8, #4*FILTER_OUT_STRIDE
lsl r1, r1, #1
lsl r3, r3, #1
add r9, r6, #7
bic r9, r9, #7 // Aligned width
.if \bpc == 8
sub r1, r1, r9
sub r3, r3, r9
.else
sub r1, r1, r9, lsl #1
sub r3, r3, r9, lsl #1
.endif
sub r8, r8, r9, lsl #1
mov r9, r6
blt 2f
1:
.if \bpc == 8
vld1.8 {d0}, [r2, :64]!
vld1.8 {d16}, [r11, :64]!
vld1.8 {d0}, [r0, :64]
vld1.8 {d16}, [r8, :64]
.else
vld1.16 {q0}, [r2, :128]!
vld1.16 {q8}, [r11, :128]!
vld1.16 {q0}, [r0, :128]
vld1.16 {q8}, [r8, :128]
.endif
vld1.16 {q1}, [r4, :128]!
vld1.16 {q1}, [r2, :128]!
vld1.16 {q9}, [r12, :128]!
vld1.16 {q2}, [r5, :128]!
vld1.16 {q2}, [r3, :128]!
vld1.16 {q10}, [lr, :128]!
subs r6, r6, #8
.if \bpc == 8
vshll.u8 q0, d0, #4 // u
vshll.u8 q8, d16, #4 // u
.else
vshl.i16 q0, q0, #4 // u
vshl.i16 q8, q8, #4 // u
.endif
vsub.i16 q1, q1, q0 // t1 - u
vsub.i16 q2, q2, q0 // t2 - u
vsub.i16 q9, q9, q8 // t1 - u
vsub.i16 q10, q10, q8 // t2 - u
vshll.u16 q3, d0, #7 // u << 7
vshll.u16 q0, d1, #7 // u << 7
vshll.u16 q11, d16, #7 // u << 7
vshll.u16 q8, d17, #7 // u << 7
vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
vmlal.s16 q11, d18, d30 // wt[0] * (t1 - u)
vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u)
vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u)
vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u)
.if \bpc == 8
subs r4, r4, #8
vmull.s16 q3, d2, d30 // wt[0] * t1
vmlal.s16 q3, d4, d31 // wt[1] * t2
vmull.s16 q12, d3, d30 // wt[0] * t1
vmlal.s16 q12, d5, d31 // wt[1] * t2
vmull.s16 q11, d18, d30 // wt[0] * t1
vmlal.s16 q11, d20, d31 // wt[1] * t2
vmull.s16 q13, d19, d30 // wt[0] * t1
vmlal.s16 q13, d21, d31 // wt[1] * t2
vrshrn.i32 d6, q3, #11
vrshrn.i32 d7, q0, #11
vrshrn.i32 d7, q12, #11
vrshrn.i32 d22, q11, #11
vrshrn.i32 d23, q8, #11
vrshrn.i32 d23, q13, #11
.if \bpc == 8
vaddw.u8 q3, q3, d0
vaddw.u8 q11, q11, d16
vqmovun.s16 d6, q3
vqmovun.s16 d22, q11
vst1.8 {d6}, [r0, :64]!
vst1.8 {d22}, [r10, :64]!
vst1.8 {d22}, [r8, :64]!
.else
vqrshrun.s32 d6, q3, #11
vqrshrun.s32 d7, q0, #11
vqrshrun.s32 d22, q11, #11
vqrshrun.s32 d23, q8, #11
vmov.i16 q13, #0
vadd.i16 q3, q3, q0
vadd.i16 q11, q11, q8
vmax.s16 q3, q3, q13
vmax.s16 q11, q11, q13
vmin.u16 q3, q3, q14
vmin.u16 q11, q11, q14
vst1.16 {q3}, [r0, :128]!
vst1.16 {q11}, [r10, :128]!
vst1.16 {q11}, [r8, :128]!
.endif
bgt 1b
subs r7, r7, #2
cmp r7, #1
blt 0f
mov r6, r9
add r0, r0, r1
add r10, r10, r1
add r2, r2, r3
add r11, r11, r3
add r4, r4, r8
add r12, r12, r8
add r5, r5, r8
add lr, lr, r8
beq 2f
b 1b
b 0f
2:
.if \bpc == 8
vld1.8 {d0}, [r2, :64]!
vld1.8 {d0}, [r0, :64]
.else
vld1.16 {q0}, [r2, :128]!
vld1.16 {q0}, [r0, :128]
.endif
vld1.16 {q1}, [r4, :128]!
vld1.16 {q2}, [r5, :128]!
subs r6, r6, #8
.if \bpc == 8
vshll.u8 q0, d0, #4 // u
.else
vshl.i16 q0, q0, #4 // u
.endif
vsub.i16 q1, q1, q0 // t1 - u
vsub.i16 q2, q2, q0 // t2 - u
vshll.u16 q3, d0, #7 // u << 7
vshll.u16 q0, d1, #7 // u << 7
vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
.if \bpc == 8
vld1.16 {q1}, [r2, :128]!
vld1.16 {q2}, [r3, :128]!
subs r4, r4, #8
vmull.s16 q3, d2, d30 // wt[0] * t1
vmlal.s16 q3, d4, d31 // wt[1] * t2
vmull.s16 q11, d3, d30 // wt[0] * t1
vmlal.s16 q11, d5, d31 // wt[1] * t2
vrshrn.i32 d6, q3, #11
vrshrn.i32 d7, q0, #11
vrshrn.i32 d7, q11, #11
.if \bpc == 8
vaddw.u8 q3, q3, d0
vqmovun.s16 d6, q3
vst1.8 {d6}, [r0, :64]!
.else
vqrshrun.s32 d6, q3, #11
vqrshrun.s32 d7, q0, #11
vmov.i16 q13, #0
vadd.i16 q3, q3, q0
vmax.s16 q3, q3, q13
vmin.u16 q3, q3, q14
vst1.16 {q3}, [r0, :128]!
.endif
bgt 1b
bgt 2b
0:
pop {r4-r11,pc}
pop {r4-r8,pc}
endfunc
.endm
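Similarly, a minimal C sketch of the 8 bpc per-pixel blend performed by sgr_weighted2 above, where both filtered planes are weighted, rounded by 11 bits and accumulated onto dst in place (illustration only, not dav1d code; the function/helper names and the uint8_t pixel type are assumptions):

    #include <stdint.h>

    static inline int clip(int v, int lo, int hi) {
        return v < lo ? lo : v > hi ? hi : v;
    }

    static void sgr_weighted2_sketch(uint8_t *dst, const int16_t *t1,
                                     const int16_t *t2, int w,
                                     const int16_t wt[2])
    {
        for (int i = 0; i < w; i++) {
            /* vmull/vmlal.s16 then vrshrn.i32 #11 */
            const int v = (wt[0] * t1[i] + wt[1] * t2[i] + (1 << 10)) >> 11;
            /* vaddw.u8 then vqmovun.s16: add to dst, saturate to 8 bit */
            dst[i] = (uint8_t)clip(dst[i] + v, 0, 255);
        }
    }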